| field | value | date |
|---|---|---|
| author | Paul Gortmaker <paul.gortmaker@windriver.com> | 2018-02-23 12:39:07 -0500 |
| committer | Paul Gortmaker <paul.gortmaker@windriver.com> | 2018-02-23 12:39:07 -0500 |
| commit | 3f691eccc8fa85d0011699ea40a8b852cc36e802 (patch) | |
| tree | ae7a400517aadcd3afcd32f5a4cb2a1e1edea19f | |
| parent | 6d9970971f8fd5ff1aa4a7378e09aec11a98d75b (diff) | |
| download | longterm-queue-4.8-3f691eccc8fa85d0011699ea40a8b852cc36e802.tar.gz | |
patches taken directly from the stable-queue git repository
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
137 files changed, 16915 insertions, 0 deletions
diff --git a/queue/array_index_nospec-sanitize-speculative-array-de-references.patch b/queue/array_index_nospec-sanitize-speculative-array-de-references.patch new file mode 100644 index 0000000..99fe9a2 --- /dev/null +++ b/queue/array_index_nospec-sanitize-speculative-array-de-references.patch @@ -0,0 +1,117 @@ +From foo@baz Thu Feb 8 03:32:24 CET 2018 +From: Dan Williams <dan.j.williams@intel.com> +Date: Mon, 29 Jan 2018 17:02:22 -0800 +Subject: array_index_nospec: Sanitize speculative array de-references + +From: Dan Williams <dan.j.williams@intel.com> + + +(cherry picked from commit f3804203306e098dae9ca51540fcd5eb700d7f40) + +array_index_nospec() is proposed as a generic mechanism to mitigate +against Spectre-variant-1 attacks, i.e. an attack that bypasses boundary +checks via speculative execution. The array_index_nospec() +implementation is expected to be safe for current generation CPUs across +multiple architectures (ARM, x86). + +Based on an original implementation by Linus Torvalds, tweaked to remove +speculative flows by Alexei Starovoitov, and tweaked again by Linus to +introduce an x86 assembly implementation for the mask generation. + +Co-developed-by: Linus Torvalds <torvalds@linux-foundation.org> +Co-developed-by: Alexei Starovoitov <ast@kernel.org> +Suggested-by: Cyril Novikov <cnovikov@lynx.com> +Signed-off-by: Dan Williams <dan.j.williams@intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: linux-arch@vger.kernel.org +Cc: kernel-hardening@lists.openwall.com +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Catalin Marinas <catalin.marinas@arm.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: Russell King <linux@armlinux.org.uk> +Cc: gregkh@linuxfoundation.org +Cc: torvalds@linux-foundation.org +Cc: alan@linux.intel.com +Link: https://lkml.kernel.org/r/151727414229.33451.18411580953862676575.stgit@dwillia2-desk3.amr.corp.intel.com +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + include/linux/nospec.h | 72 +++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 72 insertions(+) + create mode 100644 include/linux/nospec.h + +--- /dev/null ++++ b/include/linux/nospec.h +@@ -0,0 +1,72 @@ ++// SPDX-License-Identifier: GPL-2.0 ++// Copyright(c) 2018 Linus Torvalds. All rights reserved. ++// Copyright(c) 2018 Alexei Starovoitov. All rights reserved. ++// Copyright(c) 2018 Intel Corporation. All rights reserved. ++ ++#ifndef _LINUX_NOSPEC_H ++#define _LINUX_NOSPEC_H ++ ++/** ++ * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise ++ * @index: array element index ++ * @size: number of elements in array ++ * ++ * When @index is out of bounds (@index >= @size), the sign bit will be ++ * set. Extend the sign bit to all bits and invert, giving a result of ++ * zero for an out of bounds index, or ~0 if within bounds [0, @size). ++ */ ++#ifndef array_index_mask_nospec ++static inline unsigned long array_index_mask_nospec(unsigned long index, ++ unsigned long size) ++{ ++ /* ++ * Warn developers about inappropriate array_index_nospec() usage. ++ * ++ * Even if the CPU speculates past the WARN_ONCE branch, the ++ * sign bit of @index is taken into account when generating the ++ * mask. ++ * ++ * This warning is compiled out when the compiler can infer that ++ * @index and @size are less than LONG_MAX. 
++ */ ++ if (WARN_ONCE(index > LONG_MAX || size > LONG_MAX, ++ "array_index_nospec() limited to range of [0, LONG_MAX]\n")) ++ return 0; ++ ++ /* ++ * Always calculate and emit the mask even if the compiler ++ * thinks the mask is not needed. The compiler does not take ++ * into account the value of @index under speculation. ++ */ ++ OPTIMIZER_HIDE_VAR(index); ++ return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1); ++} ++#endif ++ ++/* ++ * array_index_nospec - sanitize an array index after a bounds check ++ * ++ * For a code sequence like: ++ * ++ * if (index < size) { ++ * index = array_index_nospec(index, size); ++ * val = array[index]; ++ * } ++ * ++ * ...if the CPU speculates past the bounds check then ++ * array_index_nospec() will clamp the index within the range of [0, ++ * size). ++ */ ++#define array_index_nospec(index, size) \ ++({ \ ++ typeof(index) _i = (index); \ ++ typeof(size) _s = (size); \ ++ unsigned long _mask = array_index_mask_nospec(_i, _s); \ ++ \ ++ BUILD_BUG_ON(sizeof(_i) > sizeof(long)); \ ++ BUILD_BUG_ON(sizeof(_s) > sizeof(long)); \ ++ \ ++ _i &= _mask; \ ++ _i; \ ++}) ++#endif /* _LINUX_NOSPEC_H */ diff --git a/queue/documentation-document-array_index_nospec.patch b/queue/documentation-document-array_index_nospec.patch new file mode 100644 index 0000000..5bad41f --- /dev/null +++ b/queue/documentation-document-array_index_nospec.patch @@ -0,0 +1,125 @@ +From foo@baz Thu Feb 8 03:32:24 CET 2018 +From: Mark Rutland <mark.rutland@arm.com> +Date: Mon, 29 Jan 2018 17:02:16 -0800 +Subject: Documentation: Document array_index_nospec + +From: Mark Rutland <mark.rutland@arm.com> + + +(cherry picked from commit f84a56f73dddaeac1dba8045b007f742f61cd2da) + +Document the rationale and usage of the new array_index_nospec() helper. + +Signed-off-by: Mark Rutland <mark.rutland@arm.com> +Signed-off-by: Will Deacon <will.deacon@arm.com> +Signed-off-by: Dan Williams <dan.j.williams@intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Kees Cook <keescook@chromium.org> +Cc: linux-arch@vger.kernel.org +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: gregkh@linuxfoundation.org +Cc: kernel-hardening@lists.openwall.com +Cc: torvalds@linux-foundation.org +Cc: alan@linux.intel.com +Link: https://lkml.kernel.org/r/151727413645.33451.15878817161436755393.stgit@dwillia2-desk3.amr.corp.intel.com +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + Documentation/speculation.txt | 90 ++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 90 insertions(+) + create mode 100644 Documentation/speculation.txt + +--- /dev/null ++++ b/Documentation/speculation.txt +@@ -0,0 +1,90 @@ ++This document explains potential effects of speculation, and how undesirable ++effects can be mitigated portably using common APIs. ++ ++=========== ++Speculation ++=========== ++ ++To improve performance and minimize average latencies, many contemporary CPUs ++employ speculative execution techniques such as branch prediction, performing ++work which may be discarded at a later stage. ++ ++Typically speculative execution cannot be observed from architectural state, ++such as the contents of registers. However, in some cases it is possible to ++observe its impact on microarchitectural state, such as the presence or ++absence of data in caches. Such state may form side-channels which can be ++observed to extract secret information. 
++ ++For example, in the presence of branch prediction, it is possible for bounds ++checks to be ignored by code which is speculatively executed. Consider the ++following code: ++ ++ int load_array(int *array, unsigned int index) ++ { ++ if (index >= MAX_ARRAY_ELEMS) ++ return 0; ++ else ++ return array[index]; ++ } ++ ++Which, on arm64, may be compiled to an assembly sequence such as: ++ ++ CMP <index>, #MAX_ARRAY_ELEMS ++ B.LT less ++ MOV <returnval>, #0 ++ RET ++ less: ++ LDR <returnval>, [<array>, <index>] ++ RET ++ ++It is possible that a CPU mis-predicts the conditional branch, and ++speculatively loads array[index], even if index >= MAX_ARRAY_ELEMS. This ++value will subsequently be discarded, but the speculated load may affect ++microarchitectural state which can be subsequently measured. ++ ++More complex sequences involving multiple dependent memory accesses may ++result in sensitive information being leaked. Consider the following ++code, building on the prior example: ++ ++ int load_dependent_arrays(int *arr1, int *arr2, int index) ++ { ++ int val1, val2, ++ ++ val1 = load_array(arr1, index); ++ val2 = load_array(arr2, val1); ++ ++ return val2; ++ } ++ ++Under speculation, the first call to load_array() may return the value ++of an out-of-bounds address, while the second call will influence ++microarchitectural state dependent on this value. This may provide an ++arbitrary read primitive. ++ ++==================================== ++Mitigating speculation side-channels ++==================================== ++ ++The kernel provides a generic API to ensure that bounds checks are ++respected even under speculation. Architectures which are affected by ++speculation-based side-channels are expected to implement these ++primitives. ++ ++The array_index_nospec() helper in <linux/nospec.h> can be used to ++prevent information from being leaked via side-channels. ++ ++A call to array_index_nospec(index, size) returns a sanitized index ++value that is bounded to [0, size) even under cpu speculation ++conditions. ++ ++This can be used to protect the earlier load_array() example: ++ ++ int load_array(int *array, unsigned int index) ++ { ++ if (index >= MAX_ARRAY_ELEMS) ++ return 0; ++ else { ++ index = array_index_nospec(index, MAX_ARRAY_ELEMS); ++ return array[index]; ++ } ++ } diff --git a/queue/kaiser-add-nokaiser-boot-option-using-alternative.patch b/queue/kaiser-add-nokaiser-boot-option-using-alternative.patch new file mode 100644 index 0000000..b8d1005 --- /dev/null +++ b/queue/kaiser-add-nokaiser-boot-option-using-alternative.patch @@ -0,0 +1,652 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Sun, 24 Sep 2017 16:59:49 -0700 +Subject: kaiser: add "nokaiser" boot option, using ALTERNATIVE + +From: Hugh Dickins <hughd@google.com> + + +Added "nokaiser" boot option: an early param like "noinvpcid". +Most places now check int kaiser_enabled (#defined 0 when not +CONFIG_KAISER) instead of #ifdef CONFIG_KAISER; but entry_64.S +and entry_64_compat.S are using the ALTERNATIVE technique, which +patches in the preferred instructions at runtime. That technique +is tied to x86 cpu features, so X86_FEATURE_KAISER is fabricated. 
+ +Prior to "nokaiser", Kaiser #defined _PAGE_GLOBAL 0: revert that, +but be careful with both _PAGE_GLOBAL and CR4.PGE: setting them when +nokaiser like when !CONFIG_KAISER, but not setting either when kaiser - +neither matters on its own, but it's hard to be sure that _PAGE_GLOBAL +won't get set in some obscure corner, or something add PGE into CR4. +By omitting _PAGE_GLOBAL from __supported_pte_mask when kaiser_enabled, +all page table setup which uses pte_pfn() masks it out of the ptes. + +It's slightly shameful that the same declaration versus definition of +kaiser_enabled appears in not one, not two, but in three header files +(asm/kaiser.h, asm/pgtable.h, asm/tlbflush.h). I felt safer that way, +than with #including any of those in any of the others; and did not +feel it worth an asm/kaiser_enabled.h - kernel/cpu/common.c includes +them all, so we shall hear about it if they get out of synch. + +Cleanups while in the area: removed the silly #ifdef CONFIG_KAISER +from kaiser.c; removed the unused native_get_normal_pgd(); removed +the spurious reg clutter from SWITCH_*_CR3 macro stubs; corrected some +comments. But more interestingly, set CR4.PSE in secondary_startup_64: +the manual is clear that it does not matter whether it's 0 or 1 when +4-level-pts are enabled, but I was distracted to find cr4 different on +BSP and auxiliaries - BSP alone was adding PSE, in probe_page_size_mask(). + +Signed-off-by: Hugh Dickins <hughd@google.com> +Acked-by: Jiri Kosina <jkosina@suse.cz> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + Documentation/kernel-parameters.txt | 2 + + arch/x86/entry/entry_64.S | 15 ++++++----- + arch/x86/include/asm/cpufeatures.h | 3 ++ + arch/x86/include/asm/kaiser.h | 27 +++++++++++++++------ + arch/x86/include/asm/pgtable.h | 20 +++++++++++---- + arch/x86/include/asm/pgtable_64.h | 13 +++------- + arch/x86/include/asm/pgtable_types.h | 4 --- + arch/x86/include/asm/tlbflush.h | 39 +++++++++++++++++++------------ + arch/x86/kernel/cpu/common.c | 28 +++++++++++++++++++++- + arch/x86/kernel/espfix_64.c | 3 +- + arch/x86/kernel/head_64.S | 4 +-- + arch/x86/mm/init.c | 2 - + arch/x86/mm/init_64.c | 10 +++++++ + arch/x86/mm/kaiser.c | 26 +++++++++++++++++--- + arch/x86/mm/pgtable.c | 8 +----- + arch/x86/mm/tlb.c | 4 --- + tools/arch/x86/include/asm/cpufeatures.h | 3 ++ + 17 files changed, 146 insertions(+), 65 deletions(-) + +--- a/Documentation/kernel-parameters.txt ++++ b/Documentation/kernel-parameters.txt +@@ -2763,6 +2763,8 @@ bytes respectively. Such letter suffixes + + nojitter [IA-64] Disables jitter checking for ITC timers. + ++ nokaiser [X86-64] Disable KAISER isolation of kernel from user. ++ + no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver + + no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -1079,7 +1079,7 @@ ENTRY(paranoid_entry) + * unconditionally, but we need to find out whether the reverse + * should be done on return (conveyed to paranoid_exit in %ebx). + */ +- movq %cr3, %rax ++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER + testl $KAISER_SHADOW_PGD_OFFSET, %eax + jz 2f + orl $2, %ebx +@@ -1111,6 +1111,7 @@ ENTRY(paranoid_exit) + TRACE_IRQS_OFF_DEBUG + TRACE_IRQS_IRETQ_DEBUG + #ifdef CONFIG_KAISER ++ /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */ + testl $2, %ebx /* SWITCH_USER_CR3 needed? 
*/ + jz paranoid_exit_no_switch + SWITCH_USER_CR3 +@@ -1341,13 +1342,14 @@ ENTRY(nmi) + #ifdef CONFIG_KAISER + /* Unconditionally use kernel CR3 for do_nmi() */ + /* %rax is saved above, so OK to clobber here */ +- movq %cr3, %rax ++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER + /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ + orq x86_cr3_pcid_noflush, %rax + pushq %rax + /* mask off "user" bit of pgd address and 12 PCID bits: */ + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax + movq %rax, %cr3 ++2: + #endif + call do_nmi + +@@ -1357,8 +1359,7 @@ ENTRY(nmi) + * kernel code that needs user CR3, but do we ever return + * to "user mode" where we need the kernel CR3? + */ +- popq %rax +- mov %rax, %cr3 ++ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER + #endif + + /* +@@ -1585,13 +1586,14 @@ end_repeat_nmi: + #ifdef CONFIG_KAISER + /* Unconditionally use kernel CR3 for do_nmi() */ + /* %rax is saved above, so OK to clobber here */ +- movq %cr3, %rax ++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER + /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ + orq x86_cr3_pcid_noflush, %rax + pushq %rax + /* mask off "user" bit of pgd address and 12 PCID bits: */ + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax + movq %rax, %cr3 ++2: + #endif + + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ +@@ -1603,8 +1605,7 @@ end_repeat_nmi: + * kernel code that needs user CR3, like just just before + * a sysret. + */ +- popq %rax +- mov %rax, %cr3 ++ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER + #endif + + testl %ebx, %ebx /* swapgs needed? */ +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -198,6 +198,9 @@ + #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ + #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ + ++/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... 
*/ ++#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */ ++ + /* Virtualization flags: Linux defined, word 8 */ + #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ + #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ +--- a/arch/x86/include/asm/kaiser.h ++++ b/arch/x86/include/asm/kaiser.h +@@ -46,28 +46,33 @@ movq \reg, %cr3 + .endm + + .macro SWITCH_KERNEL_CR3 +-pushq %rax ++ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER + _SWITCH_TO_KERNEL_CR3 %rax + popq %rax ++8: + .endm + + .macro SWITCH_USER_CR3 +-pushq %rax ++ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER + _SWITCH_TO_USER_CR3 %rax %al + popq %rax ++8: + .endm + + .macro SWITCH_KERNEL_CR3_NO_STACK +-movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) ++ALTERNATIVE "jmp 8f", \ ++ __stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \ ++ X86_FEATURE_KAISER + _SWITCH_TO_KERNEL_CR3 %rax + movq PER_CPU_VAR(unsafe_stack_register_backup), %rax ++8: + .endm + + #else /* CONFIG_KAISER */ + +-.macro SWITCH_KERNEL_CR3 reg ++.macro SWITCH_KERNEL_CR3 + .endm +-.macro SWITCH_USER_CR3 reg regb ++.macro SWITCH_USER_CR3 + .endm + .macro SWITCH_KERNEL_CR3_NO_STACK + .endm +@@ -90,6 +95,16 @@ DECLARE_PER_CPU(unsigned long, x86_cr3_p + + extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; + ++extern int kaiser_enabled; ++#else ++#define kaiser_enabled 0 ++#endif /* CONFIG_KAISER */ ++ ++/* ++ * Kaiser function prototypes are needed even when CONFIG_KAISER is not set, ++ * so as to build with tests on kaiser_enabled instead of #ifdefs. ++ */ ++ + /** + * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping + * @addr: the start address of the range +@@ -119,8 +134,6 @@ extern void kaiser_remove_mapping(unsign + */ + extern void kaiser_init(void); + +-#endif /* CONFIG_KAISER */ +- + #endif /* __ASSEMBLY */ + + #endif /* _ASM_X86_KAISER_H */ +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -18,6 +18,12 @@ + #ifndef __ASSEMBLY__ + #include <asm/x86_init.h> + ++#ifdef CONFIG_KAISER ++extern int kaiser_enabled; ++#else ++#define kaiser_enabled 0 ++#endif ++ + void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); + void ptdump_walk_pgd_level_checkwx(void); + +@@ -697,7 +703,7 @@ static inline int pgd_bad(pgd_t pgd) + * page table by accident; it will fault on the first + * instruction it tries to run. See native_set_pgd(). 
+ */ +- if (IS_ENABLED(CONFIG_KAISER)) ++ if (kaiser_enabled) + ignore_flags |= _PAGE_NX; + + return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; +@@ -913,12 +919,14 @@ static inline void pmdp_set_wrprotect(st + */ + static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) + { +- memcpy(dst, src, count * sizeof(pgd_t)); ++ memcpy(dst, src, count * sizeof(pgd_t)); + #ifdef CONFIG_KAISER +- /* Clone the shadow pgd part as well */ +- memcpy(native_get_shadow_pgd(dst), +- native_get_shadow_pgd(src), +- count * sizeof(pgd_t)); ++ if (kaiser_enabled) { ++ /* Clone the shadow pgd part as well */ ++ memcpy(native_get_shadow_pgd(dst), ++ native_get_shadow_pgd(src), ++ count * sizeof(pgd_t)); ++ } + #endif + } + +--- a/arch/x86/include/asm/pgtable_64.h ++++ b/arch/x86/include/asm/pgtable_64.h +@@ -111,13 +111,12 @@ extern pgd_t kaiser_set_shadow_pgd(pgd_t + + static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) + { ++#ifdef CONFIG_DEBUG_VM ++ /* linux/mmdebug.h may not have been included at this point */ ++ BUG_ON(!kaiser_enabled); ++#endif + return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE); + } +- +-static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) +-{ +- return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE); +-} + #else + static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) + { +@@ -128,10 +127,6 @@ static inline pgd_t *native_get_shadow_p + BUILD_BUG_ON(1); + return NULL; + } +-static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) +-{ +- return pgdp; +-} + #endif /* CONFIG_KAISER */ + + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) +--- a/arch/x86/include/asm/pgtable_types.h ++++ b/arch/x86/include/asm/pgtable_types.h +@@ -45,11 +45,7 @@ + #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) + #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) + #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) +-#ifdef CONFIG_KAISER +-#define _PAGE_GLOBAL (_AT(pteval_t, 0)) +-#else + #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +-#endif + #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) + #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) + #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -137,9 +137,11 @@ static inline void cr4_set_bits_and_upda + * to avoid the need for asm/kaiser.h in unexpected places. 
+ */ + #ifdef CONFIG_KAISER ++extern int kaiser_enabled; + extern void kaiser_setup_pcid(void); + extern void kaiser_flush_tlb_on_return_to_user(void); + #else ++#define kaiser_enabled 0 + static inline void kaiser_setup_pcid(void) + { + } +@@ -164,7 +166,7 @@ static inline void __native_flush_tlb(vo + * back: + */ + preempt_disable(); +- if (this_cpu_has(X86_FEATURE_PCID)) ++ if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) + kaiser_flush_tlb_on_return_to_user(); + native_write_cr3(native_read_cr3()); + preempt_enable(); +@@ -175,20 +177,30 @@ static inline void __native_flush_tlb_gl + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); +- /* clear PGE */ +- native_write_cr4(cr4 & ~X86_CR4_PGE); +- /* write old PGE again and flush TLBs */ +- native_write_cr4(cr4); ++ if (cr4 & X86_CR4_PGE) { ++ /* clear PGE and flush TLB of all entries */ ++ native_write_cr4(cr4 & ~X86_CR4_PGE); ++ /* restore PGE as it was before */ ++ native_write_cr4(cr4); ++ } else { ++ /* ++ * x86_64 microcode update comes this way when CR4.PGE is not ++ * enabled, and it's safer for all callers to allow this case. ++ */ ++ native_write_cr3(native_read_cr3()); ++ } + } + + static inline void __native_flush_tlb_global(void) + { +-#ifdef CONFIG_KAISER +- /* Globals are not used at all */ +- __native_flush_tlb(); +-#else + unsigned long flags; + ++ if (kaiser_enabled) { ++ /* Globals are not used at all */ ++ __native_flush_tlb(); ++ return; ++ } ++ + if (this_cpu_has(X86_FEATURE_INVPCID)) { + /* + * Using INVPCID is considerably faster than a pair of writes +@@ -208,7 +220,6 @@ static inline void __native_flush_tlb_gl + raw_local_irq_save(flags); + __native_flush_tlb_global_irq_disabled(); + raw_local_irq_restore(flags); +-#endif + } + + static inline void __native_flush_tlb_single(unsigned long addr) +@@ -223,7 +234,7 @@ static inline void __native_flush_tlb_si + */ + + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { +- if (this_cpu_has(X86_FEATURE_PCID)) ++ if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) + kaiser_flush_tlb_on_return_to_user(); + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + return; +@@ -238,9 +249,9 @@ static inline void __native_flush_tlb_si + * Make sure to do only a single invpcid when KAISER is + * disabled and we have only a single ASID. 
+ */ +- if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER) +- invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); +- invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); ++ if (kaiser_enabled) ++ invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); ++ invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); + } + + static inline void __flush_tlb_all(void) +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -179,6 +179,20 @@ static int __init x86_pcid_setup(char *s + return 1; + } + __setup("nopcid", x86_pcid_setup); ++ ++static int __init x86_nokaiser_setup(char *s) ++{ ++ /* nokaiser doesn't accept parameters */ ++ if (s) ++ return -EINVAL; ++#ifdef CONFIG_KAISER ++ kaiser_enabled = 0; ++ setup_clear_cpu_cap(X86_FEATURE_KAISER); ++ pr_info("nokaiser: KAISER feature disabled\n"); ++#endif ++ return 0; ++} ++early_param("nokaiser", x86_nokaiser_setup); + #endif + + static int __init x86_noinvpcid_setup(char *s) +@@ -327,7 +341,7 @@ static __always_inline void setup_smap(s + static void setup_pcid(struct cpuinfo_x86 *c) + { + if (cpu_has(c, X86_FEATURE_PCID)) { +- if (cpu_has(c, X86_FEATURE_PGE)) { ++ if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) { + cr4_set_bits(X86_CR4_PCIDE); + /* + * INVPCID has two "groups" of types: +@@ -799,6 +813,10 @@ void get_cpu_cap(struct cpuinfo_x86 *c) + c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); + + init_scattered_cpuid_features(c); ++#ifdef CONFIG_KAISER ++ if (kaiser_enabled) ++ set_cpu_cap(c, X86_FEATURE_KAISER); ++#endif + } + + static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) +@@ -1537,6 +1555,14 @@ void cpu_init(void) + * try to read it. + */ + cr4_init_shadow(); ++ if (!kaiser_enabled) { ++ /* ++ * secondary_startup_64() deferred setting PGE in cr4: ++ * probe_page_size_mask() sets it on the boot cpu, ++ * but it needs to be set on each secondary cpu. ++ */ ++ cr4_set_bits(X86_CR4_PGE); ++ } + + /* + * Load microcode on this cpu if a valid microcode is available. +--- a/arch/x86/kernel/espfix_64.c ++++ b/arch/x86/kernel/espfix_64.c +@@ -132,9 +132,10 @@ void __init init_espfix_bsp(void) + * area to ensure it is mapped into the shadow user page + * tables. + */ +- if (IS_ENABLED(CONFIG_KAISER)) ++ if (kaiser_enabled) { + set_pgd(native_get_shadow_pgd(pgd_p), + __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page))); ++ } + + /* Randomize the locations */ + init_espfix_random(); +--- a/arch/x86/kernel/head_64.S ++++ b/arch/x86/kernel/head_64.S +@@ -190,8 +190,8 @@ ENTRY(secondary_startup_64) + movq $(init_level4_pgt - __START_KERNEL_map), %rax + 1: + +- /* Enable PAE mode and PGE */ +- movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx ++ /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */ ++ movl $(X86_CR4_PAE | X86_CR4_PSE), %ecx + movq %rcx, %cr4 + + /* Setup early boot stage 4 level pagetables. */ +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -177,7 +177,7 @@ static void __init probe_page_size_mask( + cr4_set_bits_and_update_boot(X86_CR4_PSE); + + /* Enable PGE if available */ +- if (boot_cpu_has(X86_FEATURE_PGE)) { ++ if (boot_cpu_has(X86_FEATURE_PGE) && !kaiser_enabled) { + cr4_set_bits_and_update_boot(X86_CR4_PGE); + __supported_pte_mask |= _PAGE_GLOBAL; + } else +--- a/arch/x86/mm/init_64.c ++++ b/arch/x86/mm/init_64.c +@@ -324,6 +324,16 @@ void __init cleanup_highmap(void) + continue; + if (vaddr < (unsigned long) _text || vaddr > end) + set_pmd(pmd, __pmd(0)); ++ else if (kaiser_enabled) { ++ /* ++ * level2_kernel_pgt is initialized with _PAGE_GLOBAL: ++ * clear that now. 
This is not important, so long as ++ * CR4.PGE remains clear, but it removes an anomaly. ++ * Physical mapping setup below avoids _PAGE_GLOBAL ++ * by use of massage_pgprot() inside pfn_pte() etc. ++ */ ++ set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL)); ++ } + } + } + +--- a/arch/x86/mm/kaiser.c ++++ b/arch/x86/mm/kaiser.c +@@ -16,7 +16,9 @@ + #include <asm/pgalloc.h> + #include <asm/desc.h> + +-#ifdef CONFIG_KAISER ++int kaiser_enabled __read_mostly = 1; ++EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */ ++ + __visible + DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + +@@ -167,8 +169,8 @@ static pte_t *kaiser_pagetable_walk(unsi + return pte_offset_kernel(pmd, address); + } + +-int kaiser_add_user_map(const void *__start_addr, unsigned long size, +- unsigned long flags) ++static int kaiser_add_user_map(const void *__start_addr, unsigned long size, ++ unsigned long flags) + { + int ret = 0; + pte_t *pte; +@@ -177,6 +179,15 @@ int kaiser_add_user_map(const void *__st + unsigned long end_addr = PAGE_ALIGN(start_addr + size); + unsigned long target_address; + ++ /* ++ * It is convenient for callers to pass in __PAGE_KERNEL etc, ++ * and there is no actual harm from setting _PAGE_GLOBAL, so ++ * long as CR4.PGE is not set. But it is nonetheless troubling ++ * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser" ++ * requires that not to be #defined to 0): so mask it off here. ++ */ ++ flags &= ~_PAGE_GLOBAL; ++ + for (; address < end_addr; address += PAGE_SIZE) { + target_address = get_pa_from_mapping(address); + if (target_address == -1) { +@@ -263,6 +274,8 @@ void __init kaiser_init(void) + { + int cpu; + ++ if (!kaiser_enabled) ++ return; + kaiser_init_all_pgds(); + + for_each_possible_cpu(cpu) { +@@ -311,6 +324,8 @@ void __init kaiser_init(void) + /* Add a mapping to the shadow mapping, and synchronize the mappings */ + int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) + { ++ if (!kaiser_enabled) ++ return 0; + return kaiser_add_user_map((const void *)addr, size, flags); + } + +@@ -322,6 +337,8 @@ void kaiser_remove_mapping(unsigned long + unsigned long addr, next; + pgd_t *pgd; + ++ if (!kaiser_enabled) ++ return; + pgd = native_get_shadow_pgd(pgd_offset_k(start)); + for (addr = start; addr < end; pgd++, addr = next) { + next = pgd_addr_end(addr, end); +@@ -343,6 +360,8 @@ static inline bool is_userspace_pgd(pgd_ + + pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) + { ++ if (!kaiser_enabled) ++ return pgd; + /* + * Do we need to also populate the shadow pgd? Check _PAGE_USER to + * skip cases like kexec and EFI which make temporary low mappings. +@@ -399,4 +418,3 @@ void kaiser_flush_tlb_on_return_to_user( + X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); + } + EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); +-#endif /* CONFIG_KAISER */ +--- a/arch/x86/mm/pgtable.c ++++ b/arch/x86/mm/pgtable.c +@@ -345,16 +345,12 @@ static inline void _pgd_free(pgd_t *pgd) + } + #else + +-#ifdef CONFIG_KAISER + /* +- * Instead of one pmd, we aquire two pmds. Being order-1, it is ++ * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is + * both 8k in size and 8k-aligned. That lets us just flip bit 12 + * in a pointer to swap between the two 4k halves. 
+ */ +-#define PGD_ALLOCATION_ORDER 1 +-#else +-#define PGD_ALLOCATION_ORDER 0 +-#endif ++#define PGD_ALLOCATION_ORDER kaiser_enabled + + static inline pgd_t *_pgd_alloc(void) + { +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -39,8 +39,7 @@ static void load_new_mm_cr3(pgd_t *pgdir + { + unsigned long new_mm_cr3 = __pa(pgdir); + +-#ifdef CONFIG_KAISER +- if (this_cpu_has(X86_FEATURE_PCID)) { ++ if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) { + /* + * We reuse the same PCID for different tasks, so we must + * flush all the entries for the PCID out when we change tasks. +@@ -57,7 +56,6 @@ static void load_new_mm_cr3(pgd_t *pgdir + new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH; + kaiser_flush_tlb_on_return_to_user(); + } +-#endif /* CONFIG_KAISER */ + + /* + * Caution: many callers of this function expect +--- a/tools/arch/x86/include/asm/cpufeatures.h ++++ b/tools/arch/x86/include/asm/cpufeatures.h +@@ -197,6 +197,9 @@ + #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ + #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ + ++/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ ++#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */ ++ + /* Virtualization flags: Linux defined, word 8 */ + #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ + #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ diff --git a/queue/kaiser-align-addition-to-x86-mm-makefile.patch b/queue/kaiser-align-addition-to-x86-mm-makefile.patch new file mode 100644 index 0000000..d8217dd --- /dev/null +++ b/queue/kaiser-align-addition-to-x86-mm-makefile.patch @@ -0,0 +1,26 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Sun, 3 Sep 2017 19:51:10 -0700 +Subject: kaiser: align addition to x86/mm/Makefile + +From: Hugh Dickins <hughd@google.com> + + +Use tab not space so they line up properly, kaslr.o also. + +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/mm/Makefile | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/x86/mm/Makefile ++++ b/arch/x86/mm/Makefile +@@ -37,5 +37,5 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulatio + + obj-$(CONFIG_X86_INTEL_MPX) += mpx.o + obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o +-obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +-obj-$(CONFIG_KAISER) += kaiser.o ++obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o ++obj-$(CONFIG_KAISER) += kaiser.o diff --git a/queue/kaiser-asm-tlbflush.h-handle-nopge-at-lower-level.patch b/queue/kaiser-asm-tlbflush.h-handle-nopge-at-lower-level.patch new file mode 100644 index 0000000..2d9ee7e --- /dev/null +++ b/queue/kaiser-asm-tlbflush.h-handle-nopge-at-lower-level.patch @@ -0,0 +1,86 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Sat, 4 Nov 2017 18:23:24 -0700 +Subject: kaiser: asm/tlbflush.h handle noPGE at lower level + +From: Hugh Dickins <hughd@google.com> + + +I found asm/tlbflush.h too twisty, and think it safer not to avoid +__native_flush_tlb_global_irq_disabled() in the kaiser_enabled case, +but instead let it handle kaiser_enabled along with cr3: it can just +use __native_flush_tlb() for that, no harm in re-disabling preemption. 
+ +(This is not the same change as Kirill and Dave have suggested for +upstream, flipping PGE in cr4: that's neat, but needs a cpu_has_pge +check; cr3 is enough for kaiser, and thought to be cheaper than cr4.) + +Also delete the X86_FEATURE_INVPCID invpcid_flush_all_nonglobals() +preference from __native_flush_tlb(): unlike the invpcid_flush_all() +preference in __native_flush_tlb_global(), it's not seen in upstream +4.14, and was recently reported to be surprisingly slow. + +Signed-off-by: Hugh Dickins <hughd@google.com> +Acked-by: Jiri Kosina <jkosina@suse.cz> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/tlbflush.h | 27 +++------------------------ + 1 file changed, 3 insertions(+), 24 deletions(-) + +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -152,14 +152,6 @@ static inline void kaiser_flush_tlb_on_r + + static inline void __native_flush_tlb(void) + { +- if (this_cpu_has(X86_FEATURE_INVPCID)) { +- /* +- * Note, this works with CR4.PCIDE=0 or 1. +- */ +- invpcid_flush_all_nonglobals(); +- return; +- } +- + /* + * If current->mm == NULL then we borrow a mm which may change during a + * task switch and therefore we must not be preempted while we write CR3 +@@ -183,11 +175,8 @@ static inline void __native_flush_tlb_gl + /* restore PGE as it was before */ + native_write_cr4(cr4); + } else { +- /* +- * x86_64 microcode update comes this way when CR4.PGE is not +- * enabled, and it's safer for all callers to allow this case. +- */ +- native_write_cr3(native_read_cr3()); ++ /* do it with cr3, letting kaiser flush user PCID */ ++ __native_flush_tlb(); + } + } + +@@ -195,12 +184,6 @@ static inline void __native_flush_tlb_gl + { + unsigned long flags; + +- if (kaiser_enabled) { +- /* Globals are not used at all */ +- __native_flush_tlb(); +- return; +- } +- + if (this_cpu_has(X86_FEATURE_INVPCID)) { + /* + * Using INVPCID is considerably faster than a pair of writes +@@ -256,11 +239,7 @@ static inline void __native_flush_tlb_si + + static inline void __flush_tlb_all(void) + { +- if (boot_cpu_has(X86_FEATURE_PGE)) +- __flush_tlb_global(); +- else +- __flush_tlb(); +- ++ __flush_tlb_global(); + /* + * Note: if we somehow had PCID but not PGE, then this wouldn't work -- + * we'd end up flushing kernel translations for the current ASID but diff --git a/queue/kaiser-cleanups-while-trying-for-gold-link.patch b/queue/kaiser-cleanups-while-trying-for-gold-link.patch new file mode 100644 index 0000000..5e95f5b --- /dev/null +++ b/queue/kaiser-cleanups-while-trying-for-gold-link.patch @@ -0,0 +1,134 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Mon, 21 Aug 2017 20:11:43 -0700 +Subject: kaiser: cleanups while trying for gold link + +From: Hugh Dickins <hughd@google.com> + + +While trying to get our gold link to work, four cleanups: +matched the gdt_page declaration to its definition; +in fiddling unsuccessfully with PERCPU_INPUT(), lined up backslashes; +lined up the backslashes according to convention in percpu-defs.h; +deleted the unused irq_stack_pointer addition to irq_stack_union. + +Sad to report that aligning backslashes does not appear to help gold +align to 8192: but while these did not help, they are worth keeping. 
+ +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/desc.h | 2 +- + arch/x86/include/asm/processor.h | 5 ----- + include/asm-generic/vmlinux.lds.h | 18 ++++++++---------- + include/linux/percpu-defs.h | 24 ++++++++++++------------ + 4 files changed, 21 insertions(+), 28 deletions(-) + +--- a/arch/x86/include/asm/desc.h ++++ b/arch/x86/include/asm/desc.h +@@ -43,7 +43,7 @@ struct gdt_page { + struct desc_struct gdt[GDT_ENTRIES]; + } __attribute__((aligned(PAGE_SIZE))); + +-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); ++DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page); + + static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) + { +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -335,11 +335,6 @@ union irq_stack_union { + char gs_base[40]; + unsigned long stack_canary; + }; +- +- struct { +- char irq_stack_pointer[64]; +- char unused[IRQ_STACK_SIZE - 64]; +- }; + }; + + DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; +--- a/include/asm-generic/vmlinux.lds.h ++++ b/include/asm-generic/vmlinux.lds.h +@@ -778,16 +778,14 @@ + */ + #define PERCPU_INPUT(cacheline) \ + VMLINUX_SYMBOL(__per_cpu_start) = .; \ +- \ +- VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \ +- *(.data..percpu..first) \ +- . = ALIGN(cacheline); \ +- *(.data..percpu..user_mapped) \ +- *(.data..percpu..user_mapped..shared_aligned) \ +- . = ALIGN(PAGE_SIZE); \ +- *(.data..percpu..user_mapped..page_aligned) \ +- VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \ +- \ ++ VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \ ++ *(.data..percpu..first) \ ++ . = ALIGN(cacheline); \ ++ *(.data..percpu..user_mapped) \ ++ *(.data..percpu..user_mapped..shared_aligned) \ ++ . = ALIGN(PAGE_SIZE); \ ++ *(.data..percpu..user_mapped..page_aligned) \ ++ VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \ + . = ALIGN(PAGE_SIZE); \ + *(.data..percpu..page_aligned) \ + . = ALIGN(cacheline); \ +--- a/include/linux/percpu-defs.h ++++ b/include/linux/percpu-defs.h +@@ -121,10 +121,10 @@ + #define DEFINE_PER_CPU(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, "") + +-#define DECLARE_PER_CPU_USER_MAPPED(type, name) \ ++#define DECLARE_PER_CPU_USER_MAPPED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION) + +-#define DEFINE_PER_CPU_USER_MAPPED(type, name) \ ++#define DEFINE_PER_CPU_USER_MAPPED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION) + + /* +@@ -156,11 +156,11 @@ + DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \ + ____cacheline_aligned_in_smp + +-#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ ++#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \ + ____cacheline_aligned_in_smp + +-#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ ++#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \ + ____cacheline_aligned_in_smp + +@@ -185,18 +185,18 @@ + /* + * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode. 
+ */ +-#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ +- DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ +- __aligned(PAGE_SIZE) +- +-#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ +- DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ +- __aligned(PAGE_SIZE) ++#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ ++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ ++ __aligned(PAGE_SIZE) ++ ++#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ ++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ ++ __aligned(PAGE_SIZE) + + /* + * Declaration/definition used for per-CPU variables that must be read mostly. + */ +-#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \ ++#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, "..read_mostly") + + #define DEFINE_PER_CPU_READ_MOSTLY(type, name) \ diff --git a/queue/kaiser-delete-kaiser_real_switch-option.patch b/queue/kaiser-delete-kaiser_real_switch-option.patch new file mode 100644 index 0000000..bea2039 --- /dev/null +++ b/queue/kaiser-delete-kaiser_real_switch-option.patch @@ -0,0 +1,79 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Sun, 3 Sep 2017 18:30:43 -0700 +Subject: kaiser: delete KAISER_REAL_SWITCH option + +From: Hugh Dickins <hughd@google.com> + + +We fail to see what CONFIG_KAISER_REAL_SWITCH is for: it seems to be +left over from early development, and now just obscures tricky parts +of the code. Delete it before adding PCIDs, or nokaiser boot option. + +(Or if there is some good reason to keep the option, then it needs +a help text - and a "depends on KAISER", so that all those without +KAISER are not asked the question. But we'd much rather delete it.) + +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/entry/entry_64.S | 4 ---- + arch/x86/include/asm/kaiser.h | 4 ---- + security/Kconfig | 4 ---- + 3 files changed, 12 deletions(-) + +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -1317,9 +1317,7 @@ ENTRY(nmi) + /* %rax is saved above, so OK to clobber here */ + movq %cr3, %rax + pushq %rax +-#ifdef CONFIG_KAISER_REAL_SWITCH + andq $(~KAISER_SHADOW_PGD_OFFSET), %rax +-#endif + movq %rax, %cr3 + #endif + call do_nmi +@@ -1560,9 +1558,7 @@ end_repeat_nmi: + /* %rax is saved above, so OK to clobber here */ + movq %cr3, %rax + pushq %rax +-#ifdef CONFIG_KAISER_REAL_SWITCH + andq $(~KAISER_SHADOW_PGD_OFFSET), %rax +-#endif + movq %rax, %cr3 + #endif + +--- a/arch/x86/include/asm/kaiser.h ++++ b/arch/x86/include/asm/kaiser.h +@@ -21,17 +21,13 @@ + + .macro _SWITCH_TO_KERNEL_CR3 reg + movq %cr3, \reg +-#ifdef CONFIG_KAISER_REAL_SWITCH + andq $(~KAISER_SHADOW_PGD_OFFSET), \reg +-#endif + movq \reg, %cr3 + .endm + + .macro _SWITCH_TO_USER_CR3 reg + movq %cr3, \reg +-#ifdef CONFIG_KAISER_REAL_SWITCH + orq $(KAISER_SHADOW_PGD_OFFSET), \reg +-#endif + movq \reg, %cr3 + .endm + +--- a/security/Kconfig ++++ b/security/Kconfig +@@ -41,10 +41,6 @@ config KAISER + + If you are unsure how to answer this question, answer Y. 
+ +-config KAISER_REAL_SWITCH +- bool "KAISER: actually switch page tables" +- default y +- + config SECURITYFS + bool "Enable the securityfs filesystem" + help diff --git a/queue/kaiser-disabled-on-xen-pv.patch b/queue/kaiser-disabled-on-xen-pv.patch new file mode 100644 index 0000000..c306014 --- /dev/null +++ b/queue/kaiser-disabled-on-xen-pv.patch @@ -0,0 +1,42 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Jiri Kosina <jkosina@suse.cz> +Date: Tue, 2 Jan 2018 14:19:49 +0100 +Subject: kaiser: disabled on Xen PV + +From: Jiri Kosina <jkosina@suse.cz> + + +Kaiser cannot be used on paravirtualized MMUs (namely reading and writing CR3). +This does not work with KAISER as the CR3 switch from and to user space PGD +would require to map the whole XEN_PV machinery into both. + +More importantly, enabling KAISER on Xen PV doesn't make too much sense, as PV +guests use distinct %cr3 values for kernel and user already. + +Signed-off-by: Jiri Kosina <jkosina@suse.cz> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/mm/kaiser.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/arch/x86/mm/kaiser.c ++++ b/arch/x86/mm/kaiser.c +@@ -263,6 +263,9 @@ void __init kaiser_check_boottime_disabl + char arg[5]; + int ret; + ++ if (boot_cpu_has(X86_FEATURE_XENPV)) ++ goto silent_disable; ++ + ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); + if (ret > 0) { + if (!strncmp(arg, "on", 2)) +@@ -290,6 +293,8 @@ enable: + + disable: + pr_info("Kernel/User page tables isolation: disabled\n"); ++ ++silent_disable: + kaiser_enabled = 0; + setup_clear_cpu_cap(X86_FEATURE_KAISER); + } diff --git a/queue/kaiser-do-not-set-_page_nx-on-pgd_none.patch b/queue/kaiser-do-not-set-_page_nx-on-pgd_none.patch new file mode 100644 index 0000000..63a8639 --- /dev/null +++ b/queue/kaiser-do-not-set-_page_nx-on-pgd_none.patch @@ -0,0 +1,204 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Tue, 5 Sep 2017 12:05:01 -0700 +Subject: kaiser: do not set _PAGE_NX on pgd_none + +From: Hugh Dickins <hughd@google.com> + + +native_pgd_clear() uses native_set_pgd(), so native_set_pgd() must +avoid setting the _PAGE_NX bit on an otherwise pgd_none() entry: +usually that just generated a warning on exit, but sometimes +more mysterious and damaging failures (our production machines +could not complete booting). + +The original fix to this just avoided adding _PAGE_NX to +an empty entry; but eventually more problems surfaced with kexec, +and EFI mapping expected to be a problem too. So now instead +change native_set_pgd() to update shadow only if _PAGE_USER: + +A few places (kernel/machine_kexec_64.c, platform/efi/efi_64.c for sure) +use set_pgd() to set up a temporary internal virtual address space, with +physical pages remapped at what Kaiser regards as userspace addresses: +Kaiser then assumes a shadow pgd follows, which it will try to corrupt. + +This appears to be responsible for the recent kexec and kdump failures; +though it's unclear how those did not manifest as a problem before. +Ah, the shadow pgd will only be assumed to "follow" if the requested +pgd is on an even-numbered page: so I suppose it was going wrong 50% +of the time all along. + +What we need is a flag to set_pgd(), to tell it we're dealing with +userspace. Er, isn't that what the pgd's _PAGE_USER bit is saying? +Add a test for that. But we cannot do the same for pgd_clear() +(which may be called to clear corrupted entries - set aside the +question of "corrupt in which pgd?" 
until later), so there just +rely on pgd_clear() not being called in the problematic cases - +with a WARN_ON_ONCE() which should fire half the time if it is. + +But this is getting too big for an inline function: move it into +arch/x86/mm/kaiser.c (which then demands a boot/compressed mod); +and de-void and de-space native_get_shadow/normal_pgd() while here. + +Also make an unnecessary change to KASLR's init_trampoline(): it was +using set_pgd() to assign a pgd-value to a global variable (not in a +pg directory page), which was rather scary given Kaiser's previous +set_pgd() implementation: not a problem now, but too scary to leave +as was, it could easily blow up if we have to change set_pgd() again. + +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/boot/compressed/misc.h | 1 + arch/x86/include/asm/pgtable_64.h | 51 +++++++++----------------------------- + arch/x86/mm/kaiser.c | 42 +++++++++++++++++++++++++++++++ + arch/x86/mm/kaslr.c | 4 +- + 4 files changed, 58 insertions(+), 40 deletions(-) + +--- a/arch/x86/boot/compressed/misc.h ++++ b/arch/x86/boot/compressed/misc.h +@@ -9,6 +9,7 @@ + */ + #undef CONFIG_PARAVIRT + #undef CONFIG_PARAVIRT_SPINLOCKS ++#undef CONFIG_KAISER + #undef CONFIG_KASAN + + #include <linux/linkage.h> +--- a/arch/x86/include/asm/pgtable_64.h ++++ b/arch/x86/include/asm/pgtable_64.h +@@ -107,61 +107,36 @@ static inline void native_pud_clear(pud_ + } + + #ifdef CONFIG_KAISER +-static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) ++extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd); ++ ++static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) + { +- return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE); ++ return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE); + } + +-static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) ++static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) + { +- return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE); ++ return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE); + } + #else +-static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) ++static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) ++{ ++ return pgd; ++} ++static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) + { + BUILD_BUG_ON(1); + return NULL; + } +-static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) ++static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) + { + return pgdp; + } + #endif /* CONFIG_KAISER */ + +-/* +- * Page table pages are page-aligned. The lower half of the top +- * level is used for userspace and the top half for the kernel. +- * This returns true for user pages that need to get copied into +- * both the user and kernel copies of the page tables, and false +- * for kernel pages that should only be in the kernel copy. +- */ +-static inline bool is_userspace_pgd(void *__ptr) +-{ +- unsigned long ptr = (unsigned long)__ptr; +- +- return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2)); +-} +- + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) + { +-#ifdef CONFIG_KAISER +- pteval_t extra_kern_pgd_flags = 0; +- /* Do we need to also populate the shadow pgd? */ +- if (is_userspace_pgd(pgdp)) { +- native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; +- /* +- * Even if the entry is *mapping* userspace, ensure +- * that userspace can not use it. This way, if we +- * get out to userspace running on the kernel CR3, +- * userspace will crash instead of running. 
+- */ +- extra_kern_pgd_flags = _PAGE_NX; +- } +- pgdp->pgd = pgd.pgd; +- pgdp->pgd |= extra_kern_pgd_flags; +-#else /* CONFIG_KAISER */ +- *pgdp = pgd; +-#endif ++ *pgdp = kaiser_set_shadow_pgd(pgdp, pgd); + } + + static inline void native_pgd_clear(pgd_t *pgd) +--- a/arch/x86/mm/kaiser.c ++++ b/arch/x86/mm/kaiser.c +@@ -302,4 +302,46 @@ void kaiser_remove_mapping(unsigned long + unmap_pud_range_nofree(pgd, addr, end); + } + } ++ ++/* ++ * Page table pages are page-aligned. The lower half of the top ++ * level is used for userspace and the top half for the kernel. ++ * This returns true for user pages that need to get copied into ++ * both the user and kernel copies of the page tables, and false ++ * for kernel pages that should only be in the kernel copy. ++ */ ++static inline bool is_userspace_pgd(pgd_t *pgdp) ++{ ++ return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2); ++} ++ ++pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) ++{ ++ /* ++ * Do we need to also populate the shadow pgd? Check _PAGE_USER to ++ * skip cases like kexec and EFI which make temporary low mappings. ++ */ ++ if (pgd.pgd & _PAGE_USER) { ++ if (is_userspace_pgd(pgdp)) { ++ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; ++ /* ++ * Even if the entry is *mapping* userspace, ensure ++ * that userspace can not use it. This way, if we ++ * get out to userspace running on the kernel CR3, ++ * userspace will crash instead of running. ++ */ ++ pgd.pgd |= _PAGE_NX; ++ } ++ } else if (!pgd.pgd) { ++ /* ++ * pgd_clear() cannot check _PAGE_USER, and is even used to ++ * clear corrupted pgd entries: so just rely on cases like ++ * kexec and EFI never to be using pgd_clear(). ++ */ ++ if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) && ++ is_userspace_pgd(pgdp)) ++ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; ++ } ++ return pgd; ++} + #endif /* CONFIG_KAISER */ +--- a/arch/x86/mm/kaslr.c ++++ b/arch/x86/mm/kaslr.c +@@ -189,6 +189,6 @@ void __meminit init_trampoline(void) + *pud_tramp = *pud; + } + +- set_pgd(&trampoline_pgd_entry, +- __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); ++ /* Avoid set_pgd(), in case it's complicated by CONFIG_KAISER */ ++ trampoline_pgd_entry = __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)); + } diff --git a/queue/kaiser-drop-is_atomic-arg-to-kaiser_pagetable_walk.patch b/queue/kaiser-drop-is_atomic-arg-to-kaiser_pagetable_walk.patch new file mode 100644 index 0000000..2fed5fc --- /dev/null +++ b/queue/kaiser-drop-is_atomic-arg-to-kaiser_pagetable_walk.patch @@ -0,0 +1,53 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Sun, 29 Oct 2017 11:36:19 -0700 +Subject: kaiser: drop is_atomic arg to kaiser_pagetable_walk() + +From: Hugh Dickins <hughd@google.com> + + +I have not observed a might_sleep() warning from setup_fixmap_gdt()'s +use of kaiser_add_mapping() in our tree (why not?), but like upstream +we have not provided a way for that to pass is_atomic true down to +kaiser_pagetable_walk(), and at startup it's far from a likely source +of trouble: so just delete the walk's is_atomic arg and might_sleep(). + +Signed-off-by: Hugh Dickins <hughd@google.com> +Acked-by: Jiri Kosina <jkosina@suse.cz> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/mm/kaiser.c | 10 ++-------- + 1 file changed, 2 insertions(+), 8 deletions(-) + +--- a/arch/x86/mm/kaiser.c ++++ b/arch/x86/mm/kaiser.c +@@ -107,19 +107,13 @@ static inline unsigned long get_pa_from_ + * + * Returns a pointer to a PTE on success, or NULL on failure. 
+ */ +-static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic) ++static pte_t *kaiser_pagetable_walk(unsigned long address) + { + pmd_t *pmd; + pud_t *pud; + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + +- if (is_atomic) { +- gfp &= ~GFP_KERNEL; +- gfp |= __GFP_HIGH | __GFP_ATOMIC; +- } else +- might_sleep(); +- + if (pgd_none(*pgd)) { + WARN_ONCE(1, "All shadow pgds should have been populated"); + return NULL; +@@ -194,7 +188,7 @@ static int kaiser_add_user_map(const voi + ret = -EIO; + break; + } +- pte = kaiser_pagetable_walk(address, false); ++ pte = kaiser_pagetable_walk(address); + if (!pte) { + ret = -ENOMEM; + break; diff --git a/queue/kaiser-enhanced-by-kernel-and-user-pcids.patch b/queue/kaiser-enhanced-by-kernel-and-user-pcids.patch new file mode 100644 index 0000000..84c6fd8 --- /dev/null +++ b/queue/kaiser-enhanced-by-kernel-and-user-pcids.patch @@ -0,0 +1,403 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Wed, 30 Aug 2017 16:23:00 -0700 +Subject: kaiser: enhanced by kernel and user PCIDs + +From: Hugh Dickins <hughd@google.com> + + +Merged performance improvements to Kaiser, using distinct kernel +and user Process Context Identifiers to minimize the TLB flushing. + +[This work actually all from Dave Hansen 2017-08-30: +still omitting trackswitch mods, and KAISER_REAL_SWITCH deleted.] + +Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/entry/entry_64.S | 10 ++++- + arch/x86/entry/entry_64_compat.S | 1 + arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/kaiser.h | 15 ++++++- + arch/x86/include/asm/pgtable_types.h | 26 +++++++++++++ + arch/x86/include/asm/tlbflush.h | 54 +++++++++++++++++++++++----- + arch/x86/include/uapi/asm/processor-flags.h | 3 + + arch/x86/kernel/cpu/common.c | 34 +++++++++++++++++ + arch/x86/kvm/x86.c | 3 + + arch/x86/mm/kaiser.c | 7 +++ + arch/x86/mm/tlb.c | 46 ++++++++++++++++++++++- + 11 files changed, 182 insertions(+), 18 deletions(-) + +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -1317,7 +1317,10 @@ ENTRY(nmi) + /* %rax is saved above, so OK to clobber here */ + movq %cr3, %rax + pushq %rax +- andq $(~KAISER_SHADOW_PGD_OFFSET), %rax ++ /* mask off "user" bit of pgd address and 12 PCID bits: */ ++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax ++ /* Add back kernel PCID and "no flush" bit */ ++ orq X86_CR3_PCID_KERN_VAR, %rax + movq %rax, %cr3 + #endif + call do_nmi +@@ -1558,7 +1561,10 @@ end_repeat_nmi: + /* %rax is saved above, so OK to clobber here */ + movq %cr3, %rax + pushq %rax +- andq $(~KAISER_SHADOW_PGD_OFFSET), %rax ++ /* mask off "user" bit of pgd address and 12 PCID bits: */ ++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax ++ /* Add back kernel PCID and "no flush" bit */ ++ orq X86_CR3_PCID_KERN_VAR, %rax + movq %rax, %cr3 + #endif + +--- a/arch/x86/entry/entry_64_compat.S ++++ b/arch/x86/entry/entry_64_compat.S +@@ -13,6 +13,7 @@ + #include <asm/irqflags.h> + #include <asm/asm.h> + #include <asm/smap.h> ++#include <asm/pgtable_types.h> + #include <asm/kaiser.h> + #include <linux/linkage.h> + #include <linux/err.h> +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -189,6 +189,7 @@ + + #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ + 
#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ ++#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */ + + #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ + #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ +--- a/arch/x86/include/asm/kaiser.h ++++ b/arch/x86/include/asm/kaiser.h +@@ -1,5 +1,8 @@ + #ifndef _ASM_X86_KAISER_H + #define _ASM_X86_KAISER_H ++ ++#include <uapi/asm/processor-flags.h> /* For PCID constants */ ++ + /* + * This file includes the definitions for the KAISER feature. + * KAISER is a counter measure against x86_64 side channel attacks on +@@ -21,13 +24,21 @@ + + .macro _SWITCH_TO_KERNEL_CR3 reg + movq %cr3, \reg +-andq $(~KAISER_SHADOW_PGD_OFFSET), \reg ++andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg ++orq X86_CR3_PCID_KERN_VAR, \reg + movq \reg, %cr3 + .endm + + .macro _SWITCH_TO_USER_CR3 reg + movq %cr3, \reg +-orq $(KAISER_SHADOW_PGD_OFFSET), \reg ++andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg ++/* ++ * This can obviously be one instruction by putting the ++ * KAISER_SHADOW_PGD_OFFSET bit in the X86_CR3_PCID_USER_VAR. ++ * But, just leave it now for simplicity. ++ */ ++orq X86_CR3_PCID_USER_VAR, \reg ++orq $(KAISER_SHADOW_PGD_OFFSET), \reg + movq \reg, %cr3 + .endm + +--- a/arch/x86/include/asm/pgtable_types.h ++++ b/arch/x86/include/asm/pgtable_types.h +@@ -141,6 +141,32 @@ + _PAGE_SOFT_DIRTY) + #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) + ++/* The ASID is the lower 12 bits of CR3 */ ++#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL)) ++ ++/* Mask for all the PCID-related bits in CR3: */ ++#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK) ++#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64) ++#define X86_CR3_PCID_ASID_KERN (_AC(0x4,UL)) ++#define X86_CR3_PCID_ASID_USER (_AC(0x6,UL)) ++ ++#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN) ++#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER) ++#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN) ++#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER) ++#else ++#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL)) ++#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL)) ++/* ++ * PCIDs are unsupported on 32-bit and none of these bits can be ++ * set in CR3: ++ */ ++#define X86_CR3_PCID_KERN_FLUSH (0) ++#define X86_CR3_PCID_USER_FLUSH (0) ++#define X86_CR3_PCID_KERN_NOFLUSH (0) ++#define X86_CR3_PCID_USER_NOFLUSH (0) ++#endif ++ + /* + * The cache modes defined here are used to translate between pure SW usage + * and the HW defined cache mode bits and/or PAT entries. 
+--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -13,7 +13,6 @@ static inline void __invpcid(unsigned lo + unsigned long type) + { + struct { u64 d[2]; } desc = { { pcid, addr } }; +- + /* + * The memory clobber is because the whole point is to invalidate + * stale TLB entries and, especially if we're flushing global +@@ -134,14 +133,25 @@ static inline void cr4_set_bits_and_upda + + static inline void __native_flush_tlb(void) + { ++ if (!cpu_feature_enabled(X86_FEATURE_INVPCID)) { ++ /* ++ * If current->mm == NULL then we borrow a mm which may change during a ++ * task switch and therefore we must not be preempted while we write CR3 ++ * back: ++ */ ++ preempt_disable(); ++ native_write_cr3(native_read_cr3()); ++ preempt_enable(); ++ return; ++ } + /* +- * If current->mm == NULL then we borrow a mm which may change during a +- * task switch and therefore we must not be preempted while we write CR3 +- * back: +- */ +- preempt_disable(); +- native_write_cr3(native_read_cr3()); +- preempt_enable(); ++ * We are no longer using globals with KAISER, so a ++ * "nonglobals" flush would work too. But, this is more ++ * conservative. ++ * ++ * Note, this works with CR4.PCIDE=0 or 1. ++ */ ++ invpcid_flush_all(); + } + + static inline void __native_flush_tlb_global_irq_disabled(void) +@@ -163,6 +173,8 @@ static inline void __native_flush_tlb_gl + /* + * Using INVPCID is considerably faster than a pair of writes + * to CR4 sandwiched inside an IRQ flag save/restore. ++ * ++ * Note, this works with CR4.PCIDE=0 or 1. + */ + invpcid_flush_all(); + return; +@@ -182,7 +194,31 @@ static inline void __native_flush_tlb_gl + + static inline void __native_flush_tlb_single(unsigned long addr) + { +- asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); ++ /* ++ * SIMICS #GP's if you run INVPCID with type 2/3 ++ * and X86_CR4_PCIDE clear. Shame! ++ * ++ * The ASIDs used below are hard-coded. But, we must not ++ * call invpcid(type=1/2) before CR4.PCIDE=1. Just call ++ * invpcid in the case we are called early. ++ */ ++ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { ++ asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); ++ return; ++ } ++ /* Flush the address out of both PCIDs. */ ++ /* ++ * An optimization here might be to determine addresses ++ * that are only kernel-mapped and only flush the kernel ++ * ASID. But, userspace flushes are probably much more ++ * important performance-wise. ++ * ++ * Make sure to do only a single invpcid when KAISER is ++ * disabled and we have only a single ASID. ++ */ ++ if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER) ++ invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); ++ invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); + } + + static inline void __flush_tlb_all(void) +--- a/arch/x86/include/uapi/asm/processor-flags.h ++++ b/arch/x86/include/uapi/asm/processor-flags.h +@@ -77,7 +77,8 @@ + #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT) + #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */ + #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT) +-#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */ ++#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ ++#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT) + + /* + * Intel CPU features in CR4 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -324,11 +324,45 @@ static __always_inline void setup_smap(s + } + } + ++/* ++ * These can have bit 63 set, so we can not just use a plain "or" ++ * instruction to get their value or'd into CR3. 
It would take ++ * another register. So, we use a memory reference to these ++ * instead. ++ * ++ * This is also handy because systems that do not support ++ * PCIDs just end up or'ing a 0 into their CR3, which does ++ * no harm. ++ */ ++__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR = 0; ++__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_USER_VAR = 0; ++ + static void setup_pcid(struct cpuinfo_x86 *c) + { + if (cpu_has(c, X86_FEATURE_PCID)) { + if (cpu_has(c, X86_FEATURE_PGE)) { + cr4_set_bits(X86_CR4_PCIDE); ++ /* ++ * These variables are used by the entry/exit ++ * code to change PCIDs. ++ */ ++#ifdef CONFIG_KAISER ++ X86_CR3_PCID_KERN_VAR = X86_CR3_PCID_KERN_NOFLUSH; ++ X86_CR3_PCID_USER_VAR = X86_CR3_PCID_USER_NOFLUSH; ++#endif ++ /* ++ * INVPCID has two "groups" of types: ++ * 1/2: Invalidate an individual address ++ * 3/4: Invalidate all contexts ++ * ++ * 1/2 take a PCID, but 3/4 do not. So, 3/4 ++ * ignore the PCID argument in the descriptor. ++ * But, we have to be careful not to call 1/2 ++ * with an actual non-zero PCID in them before ++ * we do the above cr4_set_bits(). ++ */ ++ if (cpu_has(c, X86_FEATURE_INVPCID)) ++ set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE); + } else { + /* + * flush_tlb_all(), as currently implemented, won't +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -773,7 +773,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, u + return 1; + + /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ +- if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) ++ if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) || ++ !is_long_mode(vcpu)) + return 1; + } + +--- a/arch/x86/mm/kaiser.c ++++ b/arch/x86/mm/kaiser.c +@@ -239,6 +239,8 @@ static void __init kaiser_init_all_pgds( + } while (0) + + extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; ++extern unsigned long X86_CR3_PCID_KERN_VAR; ++extern unsigned long X86_CR3_PCID_USER_VAR; + /* + * If anything in here fails, we will likely die on one of the + * first kernel->user transitions and init will die. But, we +@@ -289,6 +291,11 @@ void __init kaiser_init(void) + kaiser_add_user_map_early(&debug_idt_table, + sizeof(gate_desc) * NR_VECTORS, + __PAGE_KERNEL); ++ ++ kaiser_add_user_map_early(&X86_CR3_PCID_KERN_VAR, PAGE_SIZE, ++ __PAGE_KERNEL); ++ kaiser_add_user_map_early(&X86_CR3_PCID_USER_VAR, PAGE_SIZE, ++ __PAGE_KERNEL); + } + + /* Add a mapping to the shadow mapping, and synchronize the mappings */ +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -34,6 +34,46 @@ struct flush_tlb_info { + unsigned long flush_end; + }; + ++static void load_new_mm_cr3(pgd_t *pgdir) ++{ ++ unsigned long new_mm_cr3 = __pa(pgdir); ++ ++ /* ++ * KAISER, plus PCIDs needs some extra work here. But, ++ * if either of features is not present, we need no ++ * PCIDs here and just do a normal, full TLB flush with ++ * the write_cr3() ++ */ ++ if (!IS_ENABLED(CONFIG_KAISER) || ++ !cpu_feature_enabled(X86_FEATURE_PCID)) ++ goto out_set_cr3; ++ /* ++ * We reuse the same PCID for different tasks, so we must ++ * flush all the entires for the PCID out when we change ++ * tasks. ++ */ ++ new_mm_cr3 = X86_CR3_PCID_KERN_FLUSH | __pa(pgdir); ++ ++ /* ++ * The flush from load_cr3() may leave old TLB entries ++ * for userspace in place. We must flush that context ++ * separately. We can theoretically delay doing this ++ * until we actually load up the userspace CR3, but ++ * that's a bit tricky. 
We have to have the "need to ++ * flush userspace PCID" bit per-cpu and check it in the ++ * exit-to-userspace paths. ++ */ ++ invpcid_flush_single_context(X86_CR3_PCID_ASID_USER); ++ ++out_set_cr3: ++ /* ++ * Caution: many callers of this function expect ++ * that load_cr3() is serializing and orders TLB ++ * fills with respect to the mm_cpumask writes. ++ */ ++ write_cr3(new_mm_cr3); ++} ++ + /* + * We cannot call mmdrop() because we are in interrupt context, + * instead update mm->cpu_vm_mask. +@@ -45,7 +85,7 @@ void leave_mm(int cpu) + BUG(); + if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { + cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); +- load_cr3(swapper_pg_dir); ++ load_new_mm_cr3(swapper_pg_dir); + /* + * This gets called in the idle path where RCU + * functions differently. Tracing normally +@@ -120,7 +160,7 @@ void switch_mm_irqs_off(struct mm_struct + * ordering guarantee we need. + * + */ +- load_cr3(next->pgd); ++ load_new_mm_cr3(next->pgd); + + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + +@@ -167,7 +207,7 @@ void switch_mm_irqs_off(struct mm_struct + * As above, load_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask write. + */ +- load_cr3(next->pgd); ++ load_new_mm_cr3(next->pgd); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + load_mm_cr4(next); + load_mm_ldt(next); diff --git a/queue/kaiser-enomem-if-kaiser_pagetable_walk-null.patch b/queue/kaiser-enomem-if-kaiser_pagetable_walk-null.patch new file mode 100644 index 0000000..c2bd8bc --- /dev/null +++ b/queue/kaiser-enomem-if-kaiser_pagetable_walk-null.patch @@ -0,0 +1,52 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Sun, 3 Sep 2017 18:48:02 -0700 +Subject: kaiser: ENOMEM if kaiser_pagetable_walk() NULL + +From: Hugh Dickins <hughd@google.com> + + +kaiser_add_user_map() took no notice when kaiser_pagetable_walk() failed. +And avoid its might_sleep() when atomic (though atomic at present unused). 
+ +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/mm/kaiser.c | 10 +++++++--- + 1 file changed, 7 insertions(+), 3 deletions(-) + +--- a/arch/x86/mm/kaiser.c ++++ b/arch/x86/mm/kaiser.c +@@ -98,11 +98,11 @@ static pte_t *kaiser_pagetable_walk(unsi + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + +- might_sleep(); + if (is_atomic) { + gfp &= ~GFP_KERNEL; + gfp |= __GFP_HIGH | __GFP_ATOMIC; +- } ++ } else ++ might_sleep(); + + if (pgd_none(*pgd)) { + WARN_ONCE(1, "All shadow pgds should have been populated"); +@@ -159,13 +159,17 @@ int kaiser_add_user_map(const void *__st + unsigned long end_addr = PAGE_ALIGN(start_addr + size); + unsigned long target_address; + +- for (;address < end_addr; address += PAGE_SIZE) { ++ for (; address < end_addr; address += PAGE_SIZE) { + target_address = get_pa_from_mapping(address); + if (target_address == -1) { + ret = -EIO; + break; + } + pte = kaiser_pagetable_walk(address, false); ++ if (!pte) { ++ ret = -ENOMEM; ++ break; ++ } + if (pte_none(*pte)) { + set_pte(pte, __pte(flags | target_address)); + } else { diff --git a/queue/kaiser-fix-build-and-fixme-in-alloc_ldt_struct.patch b/queue/kaiser-fix-build-and-fixme-in-alloc_ldt_struct.patch new file mode 100644 index 0000000..92d279d --- /dev/null +++ b/queue/kaiser-fix-build-and-fixme-in-alloc_ldt_struct.patch @@ -0,0 +1,53 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Sun, 3 Sep 2017 17:09:44 -0700 +Subject: kaiser: fix build and FIXME in alloc_ldt_struct() + +From: Hugh Dickins <hughd@google.com> + + +Include linux/kaiser.h instead of asm/kaiser.h to build ldt.c without +CONFIG_KAISER. kaiser_add_mapping() does already return an error code, +so fix the FIXME. 
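+
+As a sketch of the resulting pattern (standalone C with invented names,
+not the ldt.c code itself): take the return value, and on failure undo
+the partial allocation before returning NULL.
+
+        #include <stdlib.h>
+
+        struct ldt_sketch { long *entries; };
+
+        /* stand-in for kaiser_add_mapping(): 0 on success, -errno on failure */
+        static int add_mapping_sketch(void *addr, unsigned long size)
+        {
+                (void)addr; (void)size;
+                return 0;
+        }
+
+        struct ldt_sketch *alloc_ldt_sketch(unsigned long size)
+        {
+                struct ldt_sketch *new_ldt = malloc(sizeof(*new_ldt));
+
+                if (!new_ldt)
+                        return NULL;
+                new_ldt->entries = calloc(size, sizeof(long));
+                if (!new_ldt->entries) {
+                        free(new_ldt);
+                        return NULL;
+                }
+                if (add_mapping_sketch(new_ldt->entries, size * sizeof(long))) {
+                        free(new_ldt->entries); /* undo on error, as the hunk does */
+                        free(new_ldt);
+                        return NULL;
+                }
+                return new_ldt;
+        }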
+ +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kernel/ldt.c | 10 ++++------ + 1 file changed, 4 insertions(+), 6 deletions(-) + +--- a/arch/x86/kernel/ldt.c ++++ b/arch/x86/kernel/ldt.c +@@ -16,9 +16,9 @@ + #include <linux/slab.h> + #include <linux/vmalloc.h> + #include <linux/uaccess.h> ++#include <linux/kaiser.h> + + #include <asm/ldt.h> +-#include <asm/kaiser.h> + #include <asm/desc.h> + #include <asm/mmu_context.h> + #include <asm/syscalls.h> +@@ -49,7 +49,7 @@ static struct ldt_struct *alloc_ldt_stru + { + struct ldt_struct *new_ldt; + int alloc_size; +- int ret = 0; ++ int ret; + + if (size > LDT_ENTRIES) + return NULL; +@@ -77,10 +77,8 @@ static struct ldt_struct *alloc_ldt_stru + return NULL; + } + +- // FIXME: make kaiser_add_mapping() return an error code +- // when it fails +- kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, +- __PAGE_KERNEL); ++ ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, ++ __PAGE_KERNEL); + if (ret) { + __free_ldt_struct(new_ldt); + return NULL; diff --git a/queue/kaiser-fix-compile-error-without-vsyscall.patch b/queue/kaiser-fix-compile-error-without-vsyscall.patch new file mode 100644 index 0000000..1c5369d --- /dev/null +++ b/queue/kaiser-fix-compile-error-without-vsyscall.patch @@ -0,0 +1,46 @@ +From foo@baz Tue Feb 13 16:45:20 CET 2018 +Date: Tue, 13 Feb 2018 16:45:20 +0100 +To: Greg KH <gregkh@linuxfoundation.org> +From: Hugh Dickins <hughd@google.com> +Subject: kaiser: fix compile error without vsyscall + +From: Hugh Dickins <hughd@google.com> + +Tobias noticed a compile error on 4.4.115, and it's the same on 4.9.80: +arch/x86/mm/kaiser.c: In function ‘kaiser_init’: +arch/x86/mm/kaiser.c:348:8: error: ‘vsyscall_pgprot’ undeclared + (first use in this function) + +It seems like his combination of kernel options doesn't work for KAISER. +X86_VSYSCALL_EMULATION is not set on his system, while LEGACY_VSYSCALL +is set to NONE (LEGACY_VSYSCALL_NONE=y). He managed to get things +compiling again, by moving the 'extern unsigned long vsyscall_pgprot' +outside of the preprocessor statement. This works because the optimizer +removes that code (vsyscall_enabled() is always false) - and that's how +it was done in some older backports. 
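+
+In miniature, the pattern being relied on is (a standalone sketch, not
+the kernel headers): an extern that is never defined may be declared
+freely, provided every use of it sits behind a condition the compiler
+can prove false, so the reference is optimized away before link time
+(kernel builds always optimize, so this holds there).
+
+        extern unsigned long only_declared;     /* no definition anywhere */
+
+        static inline int feature_enabled(void)
+        {
+                return 0;                       /* like vsyscall_enabled() here */
+        }
+
+        unsigned long probe(void)
+        {
+                if (feature_enabled())          /* provably false: use removed */
+                        return only_declared;
+                return 0;
+        }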
+ +Reported-by: Tobias Jakobi <tjakobi@math.uni-bielefeld.de> +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/include/asm/vsyscall.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/include/asm/vsyscall.h ++++ b/arch/x86/include/asm/vsyscall.h +@@ -13,7 +13,6 @@ extern void map_vsyscall(void); + */ + extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); + extern bool vsyscall_enabled(void); +-extern unsigned long vsyscall_pgprot; + #else + static inline void map_vsyscall(void) {} + static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) +@@ -22,5 +21,6 @@ static inline bool emulate_vsyscall(stru + } + static inline bool vsyscall_enabled(void) { return false; } + #endif ++extern unsigned long vsyscall_pgprot; + + #endif /* _ASM_X86_VSYSCALL_H */ diff --git a/queue/kaiser-fix-intel_bts-perf-crashes.patch b/queue/kaiser-fix-intel_bts-perf-crashes.patch new file mode 100644 index 0000000..5ea9a08 --- /dev/null +++ b/queue/kaiser-fix-intel_bts-perf-crashes.patch @@ -0,0 +1,132 @@ +From hughd@google.com Mon Feb 5 04:59:18 2018 +From: Hugh Dickins <hughd@google.com> +Date: Mon, 29 Jan 2018 18:16:55 -0800 +Subject: kaiser: fix intel_bts perf crashes +To: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Cc: Hugh Dickins <hughd@google.com>, Thomas Gleixner <tglx@linutronix.de>, Ingo Molnar <mingo@kernel.org>, Andy Lutomirski <luto@amacapital.net>, Alexander Shishkin <alexander.shishkin@linux.intel.com>, Linus Torvalds <torvalds@linux-foundation.org>, Vince Weaver <vince@deater.net>, stable@vger.kernel.org, Jiri Kosina <jkosina@suse.cz> +Message-ID: <20180130021655.229155-1-hughd@google.com> + +From: Hugh Dickins <hughd@google.com> + +Vince reported perf_fuzzer quickly locks up on 4.15-rc7 with PTI; +Robert reported Bad RIP with KPTI and Intel BTS also on 4.15-rc7: +honggfuzz -f /tmp/somedirectorywithatleastonefile \ + --linux_perf_bts_edge -s -- /bin/true +(honggfuzz from https://github.com/google/honggfuzz) crashed with +BUG: unable to handle kernel paging request at ffff9d3215100000 +(then narrowed it down to +perf record --per-thread -e intel_bts//u -- /bin/ls). + +The intel_bts driver does not use the 'normal' BTS buffer which is +exposed through kaiser_add_mapping(), but instead uses the memory +allocated for the perf AUX buffer. + +This obviously comes apart when using PTI, because then the kernel +mapping, which includes that AUX buffer memory, disappears while +switched to user page tables. + +Easily fixed in old-Kaiser backports, by applying kaiser_add_mapping() +to those pages; perhaps not so easy for upstream, where 4.15-rc8 commit +99a9dc98ba52 ("x86,perf: Disable intel_bts when PTI") disables for now. + +Slightly reorganized surrounding code in bts_buffer_setup_aux(), +so it can better match bts_buffer_free_aux(): free_aux with an #ifdef +to avoid the loop when PTI is off, but setup_aux needs to loop anyway +(and kaiser_add_mapping() is cheap when PTI config is off or "pti=off"). 
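+
+The shape of the fix, as a standalone sketch (plain C, invented names):
+map each page on the way in, and if one mapping fails, keep the count
+of what has succeeded so far and reuse the normal teardown path to
+unwind, which is what bts_buffer_setup_aux() does below.
+
+        struct aux_sketch { int nr; void *page[16]; };
+
+        static int map_page_sketch(void *p)    { (void)p; return 0; }  /* stub */
+        static void unmap_page_sketch(void *p) { (void)p; }            /* stub */
+
+        static void free_aux_sketch(struct aux_sketch *b)
+        {
+                for (int i = 0; i < b->nr; i++)
+                        unmap_page_sketch(b->page[i]);
+        }
+
+        int setup_aux_sketch(struct aux_sketch *b, void **pages, int n)
+        {
+                b->nr = 0;
+                for (int i = 0; i < n && i < 16; i++) {
+                        if (map_page_sketch(pages[i])) {
+                                /* b->nr already counts only the mapped pages */
+                                free_aux_sketch(b);
+                                return -1;
+                        }
+                        b->page[i] = pages[i];
+                        b->nr = i + 1;
+                }
+                return 0;
+        }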
+ +Reported-by: Vince Weaver <vincent.weaver@maine.edu> +Reported-by: Robert Święcki <robert@swiecki.net> +Analyzed-by: Peter Zijlstra <peterz@infradead.org> +Analyzed-by: Stephane Eranian <eranian@google.com> +Cc: Thomas Gleixner <tglx@linutronix.de> +Cc: Ingo Molnar <mingo@kernel.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Vince Weaver <vince@deater.net> +Cc: Jiri Kosina <jkosina@suse.cz> +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/events/intel/bts.c | 44 +++++++++++++++++++++++++++++++++----------- + 1 file changed, 33 insertions(+), 11 deletions(-) + +--- a/arch/x86/events/intel/bts.c ++++ b/arch/x86/events/intel/bts.c +@@ -22,6 +22,7 @@ + #include <linux/debugfs.h> + #include <linux/device.h> + #include <linux/coredump.h> ++#include <linux/kaiser.h> + + #include <asm-generic/sizes.h> + #include <asm/perf_event.h> +@@ -77,6 +78,23 @@ static size_t buf_size(struct page *page + return 1 << (PAGE_SHIFT + page_private(page)); + } + ++static void bts_buffer_free_aux(void *data) ++{ ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++ struct bts_buffer *buf = data; ++ int nbuf; ++ ++ for (nbuf = 0; nbuf < buf->nr_bufs; nbuf++) { ++ struct page *page = buf->buf[nbuf].page; ++ void *kaddr = page_address(page); ++ size_t page_size = buf_size(page); ++ ++ kaiser_remove_mapping((unsigned long)kaddr, page_size); ++ } ++#endif ++ kfree(data); ++} ++ + static void * + bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite) + { +@@ -113,29 +131,33 @@ bts_buffer_setup_aux(int cpu, void **pag + buf->real_size = size - size % BTS_RECORD_SIZE; + + for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) { +- unsigned int __nr_pages; ++ void *kaddr = pages[pg]; ++ size_t page_size; ++ ++ page = virt_to_page(kaddr); ++ page_size = buf_size(page); ++ ++ if (kaiser_add_mapping((unsigned long)kaddr, ++ page_size, __PAGE_KERNEL) < 0) { ++ buf->nr_bufs = nbuf; ++ bts_buffer_free_aux(buf); ++ return NULL; ++ } + +- page = virt_to_page(pages[pg]); +- __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1; + buf->buf[nbuf].page = page; + buf->buf[nbuf].offset = offset; + buf->buf[nbuf].displacement = (pad ? 
BTS_RECORD_SIZE - pad : 0); +- buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement; ++ buf->buf[nbuf].size = page_size - buf->buf[nbuf].displacement; + pad = buf->buf[nbuf].size % BTS_RECORD_SIZE; + buf->buf[nbuf].size -= pad; + +- pg += __nr_pages; +- offset += __nr_pages << PAGE_SHIFT; ++ pg += page_size >> PAGE_SHIFT; ++ offset += page_size; + } + + return buf; + } + +-static void bts_buffer_free_aux(void *data) +-{ +- kfree(data); +-} +- + static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx) + { + return buf->buf[idx].offset + buf->buf[idx].displacement; diff --git a/queue/kaiser-fix-perf-crashes.patch b/queue/kaiser-fix-perf-crashes.patch new file mode 100644 index 0000000..6a9286c --- /dev/null +++ b/queue/kaiser-fix-perf-crashes.patch @@ -0,0 +1,150 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Wed, 23 Aug 2017 14:21:14 -0700 +Subject: kaiser: fix perf crashes + +From: Hugh Dickins <hughd@google.com> + + +Avoid perf crashes: place debug_store in the user-mapped per-cpu area +instead of allocating, and use page allocator plus kaiser_add_mapping() +to keep the BTS and PEBS buffers user-mapped (that is, present in the +user mapping, though visible only to kernel and hardware). The PEBS +fixup buffer does not need this treatment. + +The need for a user-mapped struct debug_store showed up before doing +any conscious perf testing: in a couple of kernel paging oopses on +Westmere, implicating the debug_store offset of the per-cpu area. + +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/events/intel/ds.c | 57 +++++++++++++++++++++++++++++++++++---------- + 1 file changed, 45 insertions(+), 12 deletions(-) + +--- a/arch/x86/events/intel/ds.c ++++ b/arch/x86/events/intel/ds.c +@@ -2,11 +2,15 @@ + #include <linux/types.h> + #include <linux/slab.h> + ++#include <asm/kaiser.h> + #include <asm/perf_event.h> + #include <asm/insn.h> + + #include "../perf_event.h" + ++static ++DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store); ++ + /* The size of a BTS record in bytes: */ + #define BTS_RECORD_SIZE 24 + +@@ -268,6 +272,39 @@ void fini_debug_store_on_cpu(int cpu) + + static DEFINE_PER_CPU(void *, insn_buffer); + ++static void *dsalloc(size_t size, gfp_t flags, int node) ++{ ++#ifdef CONFIG_KAISER ++ unsigned int order = get_order(size); ++ struct page *page; ++ unsigned long addr; ++ ++ page = __alloc_pages_node(node, flags | __GFP_ZERO, order); ++ if (!page) ++ return NULL; ++ addr = (unsigned long)page_address(page); ++ if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) { ++ __free_pages(page, order); ++ addr = 0; ++ } ++ return (void *)addr; ++#else ++ return kmalloc_node(size, flags | __GFP_ZERO, node); ++#endif ++} ++ ++static void dsfree(const void *buffer, size_t size) ++{ ++#ifdef CONFIG_KAISER ++ if (!buffer) ++ return; ++ kaiser_remove_mapping((unsigned long)buffer, size); ++ free_pages((unsigned long)buffer, get_order(size)); ++#else ++ kfree(buffer); ++#endif ++} ++ + static int alloc_pebs_buffer(int cpu) + { + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; +@@ -278,7 +315,7 @@ static int alloc_pebs_buffer(int cpu) + if (!x86_pmu.pebs) + return 0; + +- buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); ++ buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); + if (unlikely(!buffer)) + return -ENOMEM; + +@@ -289,7 +326,7 @@ static int alloc_pebs_buffer(int cpu) 
+ if (x86_pmu.intel_cap.pebs_format < 2) { + ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); + if (!ibuffer) { +- kfree(buffer); ++ dsfree(buffer, x86_pmu.pebs_buffer_size); + return -ENOMEM; + } + per_cpu(insn_buffer, cpu) = ibuffer; +@@ -315,7 +352,8 @@ static void release_pebs_buffer(int cpu) + kfree(per_cpu(insn_buffer, cpu)); + per_cpu(insn_buffer, cpu) = NULL; + +- kfree((void *)(unsigned long)ds->pebs_buffer_base); ++ dsfree((void *)(unsigned long)ds->pebs_buffer_base, ++ x86_pmu.pebs_buffer_size); + ds->pebs_buffer_base = 0; + } + +@@ -329,7 +367,7 @@ static int alloc_bts_buffer(int cpu) + if (!x86_pmu.bts) + return 0; + +- buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); ++ buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); + if (unlikely(!buffer)) { + WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__); + return -ENOMEM; +@@ -355,19 +393,15 @@ static void release_bts_buffer(int cpu) + if (!ds || !x86_pmu.bts) + return; + +- kfree((void *)(unsigned long)ds->bts_buffer_base); ++ dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE); + ds->bts_buffer_base = 0; + } + + static int alloc_ds_buffer(int cpu) + { +- int node = cpu_to_node(cpu); +- struct debug_store *ds; +- +- ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); +- if (unlikely(!ds)) +- return -ENOMEM; ++ struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu); + ++ memset(ds, 0, sizeof(*ds)); + per_cpu(cpu_hw_events, cpu).ds = ds; + + return 0; +@@ -381,7 +415,6 @@ static void release_ds_buffer(int cpu) + return; + + per_cpu(cpu_hw_events, cpu).ds = NULL; +- kfree(ds); + } + + void release_ds_buffers(void) diff --git a/queue/kaiser-fix-regs-to-do_nmi-ifndef-config_kaiser.patch b/queue/kaiser-fix-regs-to-do_nmi-ifndef-config_kaiser.patch new file mode 100644 index 0000000..babde9a --- /dev/null +++ b/queue/kaiser-fix-regs-to-do_nmi-ifndef-config_kaiser.patch @@ -0,0 +1,72 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Thu, 21 Sep 2017 20:39:56 -0700 +Subject: kaiser: fix regs to do_nmi() ifndef CONFIG_KAISER + +From: Hugh Dickins <hughd@google.com> + + +pjt has observed that nmi's second (nmi_from_kernel) call to do_nmi() +adjusted the %rdi regs arg, rightly when CONFIG_KAISER, but wrongly +when not CONFIG_KAISER. + +Although the minimal change is to add an #ifdef CONFIG_KAISER around +the addq line, that looks cluttered, and I prefer how the first call +to do_nmi() handled it: prepare args in %rdi and %rsi before getting +into the CONFIG_KAISER block, since it does not touch them at all. + +And while we're here, place the "#ifdef CONFIG_KAISER" that follows +each, to enclose the "Unconditionally restore CR3" comment: matching +how the "Unconditionally use kernel CR3" comment above is enclosed. + +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/entry/entry_64.S | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -1323,12 +1323,13 @@ ENTRY(nmi) + movq %rax, %cr3 + #endif + call do_nmi ++ ++#ifdef CONFIG_KAISER + /* + * Unconditionally restore CR3. I know we return to + * kernel code that needs user CR3, but do we ever return + * to "user mode" where we need the kernel CR3? 
+ */ +-#ifdef CONFIG_KAISER + popq %rax + mov %rax, %cr3 + #endif +@@ -1552,6 +1553,8 @@ end_repeat_nmi: + SWAPGS + xorl %ebx, %ebx + 1: ++ movq %rsp, %rdi ++ movq $-1, %rsi + #ifdef CONFIG_KAISER + /* Unconditionally use kernel CR3 for do_nmi() */ + /* %rax is saved above, so OK to clobber here */ +@@ -1564,16 +1567,14 @@ end_repeat_nmi: + #endif + + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ +- movq %rsp, %rdi +- addq $8, %rdi /* point %rdi at ptregs, fixed up for CR3 */ +- movq $-1, %rsi + call do_nmi ++ ++#ifdef CONFIG_KAISER + /* + * Unconditionally restore CR3. We might be returning to + * kernel code that needs user CR3, like just just before + * a sysret. + */ +-#ifdef CONFIG_KAISER + popq %rax + mov %rax, %cr3 + #endif diff --git a/queue/kaiser-fix-unlikely-error-in-alloc_ldt_struct.patch b/queue/kaiser-fix-unlikely-error-in-alloc_ldt_struct.patch new file mode 100644 index 0000000..6da60ee --- /dev/null +++ b/queue/kaiser-fix-unlikely-error-in-alloc_ldt_struct.patch @@ -0,0 +1,33 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Mon, 4 Dec 2017 20:13:35 -0800 +Subject: kaiser: fix unlikely error in alloc_ldt_struct() + +From: Hugh Dickins <hughd@google.com> + + +An error from kaiser_add_mapping() here is not at all likely, but +Eric Biggers rightly points out that __free_ldt_struct() relies on +new_ldt->size being initialized: move that up. + +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kernel/ldt.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/kernel/ldt.c ++++ b/arch/x86/kernel/ldt.c +@@ -79,11 +79,11 @@ static struct ldt_struct *alloc_ldt_stru + + ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, + __PAGE_KERNEL); ++ new_ldt->size = size; + if (ret) { + __free_ldt_struct(new_ldt); + return NULL; + } +- new_ldt->size = size; + return new_ldt; + } + diff --git a/queue/kaiser-kaiser-depends-on-smp.patch b/queue/kaiser-kaiser-depends-on-smp.patch new file mode 100644 index 0000000..d9a4854 --- /dev/null +++ b/queue/kaiser-kaiser-depends-on-smp.patch @@ -0,0 +1,54 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Wed, 13 Sep 2017 14:03:10 -0700 +Subject: kaiser: KAISER depends on SMP + +From: Hugh Dickins <hughd@google.com> + + +It is absurd that KAISER should depend on SMP, but apparently nobody +has tried a UP build before: which breaks on implicit declaration of +function 'per_cpu_offset' in arch/x86/mm/kaiser.c. + +Now, you would expect that to be trivially fixed up; but looking at +the System.map when that block is #ifdef'ed out of kaiser_init(), +I see that in a UP build __per_cpu_user_mapped_end is precisely at +__per_cpu_user_mapped_start, and the items carefully gathered into +that section for user-mapping on SMP, dispersed elsewhere on UP. + +So, some other kind of section assignment will be needed on UP, +but implementing that is not a priority: just make KAISER depend +on SMP for now. + +Also inserted a blank line before the option, tidied up the +brief Kconfig help message, and added an "If unsure, Y". + +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + security/Kconfig | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +--- a/security/Kconfig ++++ b/security/Kconfig +@@ -30,14 +30,16 @@ config SECURITY + model will be used. 
+ + If you are unsure how to answer this question, answer N. ++ + config KAISER + bool "Remove the kernel mapping in user mode" + default y +- depends on X86_64 +- depends on !PARAVIRT ++ depends on X86_64 && SMP && !PARAVIRT + help +- This enforces a strict kernel and user space isolation in order to close +- hardware side channels on kernel address information. ++ This enforces a strict kernel and user space isolation, in order ++ to close hardware side channels on kernel address information. ++ ++ If you are unsure how to answer this question, answer Y. + + config KAISER_REAL_SWITCH + bool "KAISER: actually switch page tables" diff --git a/queue/kaiser-kaiser_flush_tlb_on_return_to_user-check-pcid.patch b/queue/kaiser-kaiser_flush_tlb_on_return_to_user-check-pcid.patch new file mode 100644 index 0000000..75c1365 --- /dev/null +++ b/queue/kaiser-kaiser_flush_tlb_on_return_to_user-check-pcid.patch @@ -0,0 +1,86 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Sat, 4 Nov 2017 18:43:06 -0700 +Subject: kaiser: kaiser_flush_tlb_on_return_to_user() check PCID + +From: Hugh Dickins <hughd@google.com> + + +Let kaiser_flush_tlb_on_return_to_user() do the X86_FEATURE_PCID +check, instead of each caller doing it inline first: nobody needs +to optimize for the noPCID case, it's clearer this way, and better +suits later changes. Replace those no-op X86_CR3_PCID_KERN_FLUSH lines +by a BUILD_BUG_ON() in load_new_mm_cr3(), in case something changes. + +Signed-off-by: Hugh Dickins <hughd@google.com> +Acked-by: Jiri Kosina <jkosina@suse.cz> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/tlbflush.h | 4 ++-- + arch/x86/mm/kaiser.c | 6 +++--- + arch/x86/mm/tlb.c | 8 ++++---- + 3 files changed, 9 insertions(+), 9 deletions(-) + +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -158,7 +158,7 @@ static inline void __native_flush_tlb(vo + * back: + */ + preempt_disable(); +- if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) ++ if (kaiser_enabled) + kaiser_flush_tlb_on_return_to_user(); + native_write_cr3(native_read_cr3()); + preempt_enable(); +@@ -217,7 +217,7 @@ static inline void __native_flush_tlb_si + */ + + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { +- if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) ++ if (kaiser_enabled) + kaiser_flush_tlb_on_return_to_user(); + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + return; +--- a/arch/x86/mm/kaiser.c ++++ b/arch/x86/mm/kaiser.c +@@ -435,12 +435,12 @@ void kaiser_setup_pcid(void) + + /* + * Make a note that this cpu will need to flush USER tlb on return to user. +- * Caller checks whether this_cpu_has(X86_FEATURE_PCID) before calling: +- * if cpu does not, then the NOFLUSH bit will never have been set. ++ * If cpu does not have PCID, then the NOFLUSH bit will never have been set. + */ + void kaiser_flush_tlb_on_return_to_user(void) + { +- this_cpu_write(x86_cr3_pcid_user, ++ if (this_cpu_has(X86_FEATURE_PCID)) ++ this_cpu_write(x86_cr3_pcid_user, + X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); + } + EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -39,7 +39,7 @@ static void load_new_mm_cr3(pgd_t *pgdir + { + unsigned long new_mm_cr3 = __pa(pgdir); + +- if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) { ++ if (kaiser_enabled) { + /* + * We reuse the same PCID for different tasks, so we must + * flush all the entries for the PCID out when we change tasks. 
+@@ -50,10 +50,10 @@ static void load_new_mm_cr3(pgd_t *pgdir + * do it here, but can only be used if X86_FEATURE_INVPCID is + * available - and many machines support pcid without invpcid. + * +- * The line below is a no-op: X86_CR3_PCID_KERN_FLUSH is now 0; +- * but keep that line in there in case something changes. ++ * If X86_CR3_PCID_KERN_FLUSH actually added something, then it ++ * would be needed in the write_cr3() below - if PCIDs enabled. + */ +- new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH; ++ BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH); + kaiser_flush_tlb_on_return_to_user(); + } + diff --git a/queue/kaiser-kaiser_remove_mapping-move-along-the-pgd.patch b/queue/kaiser-kaiser_remove_mapping-move-along-the-pgd.patch new file mode 100644 index 0000000..66fe640 --- /dev/null +++ b/queue/kaiser-kaiser_remove_mapping-move-along-the-pgd.patch @@ -0,0 +1,50 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Mon, 2 Oct 2017 10:57:24 -0700 +Subject: kaiser: kaiser_remove_mapping() move along the pgd + +From: Hugh Dickins <hughd@google.com> + + +When removing the bogus comment from kaiser_remove_mapping(), +I really ought to have checked the extent of its bogosity: as +Neel points out, there is nothing to stop unmap_pud_range_nofree() +from continuing beyond the end of a pud (and starting in the wrong +position on the next). + +Fix kaiser_remove_mapping() to constrain the extent and advance pgd +pointer correctly: use pgd_addr_end() macro as used throughout base +mm (but don't assume page-rounded start and size in this case). + +But this bug was very unlikely to trigger in this backport: since +any buddy allocation is contained within a single pud extent, and +we are not using vmapped stacks (and are only mapping one page of +stack anyway): the only way to hit this bug here would be when +freeing a large modified ldt. + +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/mm/kaiser.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +--- a/arch/x86/mm/kaiser.c ++++ b/arch/x86/mm/kaiser.c +@@ -319,11 +319,13 @@ void kaiser_remove_mapping(unsigned long + extern void unmap_pud_range_nofree(pgd_t *pgd, + unsigned long start, unsigned long end); + unsigned long end = start + size; +- unsigned long addr; ++ unsigned long addr, next; ++ pgd_t *pgd; + +- for (addr = start; addr < end; addr += PGDIR_SIZE) { +- pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr)); +- unmap_pud_range_nofree(pgd, addr, end); ++ pgd = native_get_shadow_pgd(pgd_offset_k(start)); ++ for (addr = start; addr < end; pgd++, addr = next) { ++ next = pgd_addr_end(addr, end); ++ unmap_pud_range_nofree(pgd, addr, next); + } + } + diff --git a/queue/kaiser-kernel-address-isolation.patch b/queue/kaiser-kernel-address-isolation.patch new file mode 100644 index 0000000..079f4d6 --- /dev/null +++ b/queue/kaiser-kernel-address-isolation.patch @@ -0,0 +1,979 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Richard Fellner <richard.fellner@student.tugraz.at> +Date: Thu, 4 May 2017 14:26:50 +0200 +Subject: KAISER: Kernel Address Isolation + +From: Richard Fellner <richard.fellner@student.tugraz.at> + + +This patch introduces our implementation of KAISER (Kernel Address Isolation to +have Side-channels Efficiently Removed), a kernel isolation technique to close +hardware side channels on kernel address information. 
+ +More information about the patch can be found on: + + https://github.com/IAIK/KAISER + +From: Richard Fellner <richard.fellner@student.tugraz.at> +From: Daniel Gruss <daniel.gruss@iaik.tugraz.at> +Subject: [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode +Date: Thu, 4 May 2017 14:26:50 +0200 +Link: http://marc.info/?l=linux-kernel&m=149390087310405&w=2 +Kaiser-4.10-SHA1: c4b1831d44c6144d3762ccc72f0c4e71a0c713e5 + +To: <linux-kernel@vger.kernel.org> +To: <kernel-hardening@lists.openwall.com> +Cc: <clementine.maurice@iaik.tugraz.at> +Cc: <moritz.lipp@iaik.tugraz.at> +Cc: Michael Schwarz <michael.schwarz@iaik.tugraz.at> +Cc: Richard Fellner <richard.fellner@student.tugraz.at> +Cc: Ingo Molnar <mingo@kernel.org> +Cc: <kirill.shutemov@linux.intel.com> +Cc: <anders.fogh@gdata-adan.de> + +After several recent works [1,2,3] KASLR on x86_64 was basically +considered dead by many researchers. We have been working on an +efficient but effective fix for this problem and found that not mapping +the kernel space when running in user mode is the solution to this +problem [4] (the corresponding paper [5] will be presented at ESSoS17). + +With this RFC patch we allow anybody to configure their kernel with the +flag CONFIG_KAISER to add our defense mechanism. + +If there are any questions we would love to answer them. +We also appreciate any comments! + +Cheers, +Daniel (+ the KAISER team from Graz University of Technology) + +[1] http://www.ieee-security.org/TC/SP2013/papers/4977a191.pdf +[2] https://www.blackhat.com/docs/us-16/materials/us-16-Fogh-Using-Undocumented-CPU-Behaviour-To-See-Into-Kernel-Mode-And-Break-KASLR-In-The-Process.pdf +[3] https://www.blackhat.com/docs/us-16/materials/us-16-Jang-Breaking-Kernel-Address-Space-Layout-Randomization-KASLR-With-Intel-TSX.pdf +[4] https://github.com/IAIK/KAISER +[5] https://gruss.cc/files/kaiser.pdf + +[patch based also on +https://raw.githubusercontent.com/IAIK/KAISER/master/KAISER/0001-KAISER-Kernel-Address-Isolation.patch] + +Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at> +Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at> +Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at> +Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at> +Acked-by: Jiri Kosina <jkosina@suse.cz> +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/entry/entry_64.S | 17 +++ + arch/x86/entry/entry_64_compat.S | 7 + + arch/x86/include/asm/hw_irq.h | 2 + arch/x86/include/asm/kaiser.h | 113 ++++++++++++++++++++++++ + arch/x86/include/asm/pgtable.h | 4 + arch/x86/include/asm/pgtable_64.h | 21 ++++ + arch/x86/include/asm/pgtable_types.h | 12 ++ + arch/x86/include/asm/processor.h | 7 + + arch/x86/kernel/cpu/common.c | 4 + arch/x86/kernel/espfix_64.c | 6 + + arch/x86/kernel/head_64.S | 16 ++- + arch/x86/kernel/irqinit.c | 2 + arch/x86/kernel/process.c | 2 + arch/x86/mm/Makefile | 2 + arch/x86/mm/kaiser.c | 160 +++++++++++++++++++++++++++++++++++ + arch/x86/mm/pageattr.c | 2 + arch/x86/mm/pgtable.c | 26 +++++ + include/asm-generic/vmlinux.lds.h | 11 ++ + include/linux/percpu-defs.h | 30 ++++++ + init/main.c | 6 + + kernel/fork.c | 8 + + security/Kconfig | 7 + + 22 files changed, 449 insertions(+), 16 deletions(-) + create mode 100644 arch/x86/include/asm/kaiser.h + create mode 100644 arch/x86/mm/kaiser.c + +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -36,6 +36,7 @@ + #include <asm/smap.h> + #include <asm/pgtable_types.h> + #include 
<asm/export.h> ++#include <asm/kaiser.h> + #include <linux/err.h> + + /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ +@@ -146,6 +147,7 @@ ENTRY(entry_SYSCALL_64) + * it is too small to ever cause noticeable irq latency. + */ + SWAPGS_UNSAFE_STACK ++ SWITCH_KERNEL_CR3_NO_STACK + /* + * A hypervisor implementation might want to use a label + * after the swapgs, so that it can do the swapgs +@@ -228,6 +230,7 @@ entry_SYSCALL_64_fastpath: + movq RIP(%rsp), %rcx + movq EFLAGS(%rsp), %r11 + RESTORE_C_REGS_EXCEPT_RCX_R11 ++ SWITCH_USER_CR3 + movq RSP(%rsp), %rsp + USERGS_SYSRET64 + +@@ -323,10 +326,12 @@ return_from_SYSCALL_64: + syscall_return_via_sysret: + /* rcx and r11 are already restored (see code above) */ + RESTORE_C_REGS_EXCEPT_RCX_R11 ++ SWITCH_USER_CR3 + movq RSP(%rsp), %rsp + USERGS_SYSRET64 + + opportunistic_sysret_failed: ++ SWITCH_USER_CR3 + SWAPGS + jmp restore_c_regs_and_iret + END(entry_SYSCALL_64) +@@ -424,6 +429,7 @@ ENTRY(ret_from_fork) + movq %rsp, %rdi + call syscall_return_slowpath /* returns with IRQs disabled */ + TRACE_IRQS_ON /* user mode is traced as IRQS on */ ++ SWITCH_USER_CR3 + SWAPGS + jmp restore_regs_and_iret + +@@ -478,6 +484,7 @@ END(irq_entries_start) + * tracking that we're in kernel mode. + */ + SWAPGS ++ SWITCH_KERNEL_CR3 + + /* + * We need to tell lockdep that IRQs are off. We can't do this until +@@ -535,6 +542,7 @@ GLOBAL(retint_user) + mov %rsp,%rdi + call prepare_exit_to_usermode + TRACE_IRQS_IRETQ ++ SWITCH_USER_CR3 + SWAPGS + jmp restore_regs_and_iret + +@@ -612,6 +620,7 @@ native_irq_return_ldt: + + pushq %rdi /* Stash user RDI */ + SWAPGS ++ SWITCH_KERNEL_CR3 + movq PER_CPU_VAR(espfix_waddr), %rdi + movq %rax, (0*8)(%rdi) /* user RAX */ + movq (1*8)(%rsp), %rax /* user RIP */ +@@ -638,6 +647,7 @@ native_irq_return_ldt: + * still points to an RO alias of the ESPFIX stack. + */ + orq PER_CPU_VAR(espfix_stack), %rax ++ SWITCH_USER_CR3 + SWAPGS + movq %rax, %rsp + +@@ -1034,6 +1044,7 @@ ENTRY(paranoid_entry) + testl %edx, %edx + js 1f /* negative -> in kernel */ + SWAPGS ++ SWITCH_KERNEL_CR3 + xorl %ebx, %ebx + 1: ret + END(paranoid_entry) +@@ -1056,6 +1067,7 @@ ENTRY(paranoid_exit) + testl %ebx, %ebx /* swapgs needed? */ + jnz paranoid_exit_no_swapgs + TRACE_IRQS_IRETQ ++ SWITCH_USER_CR3_NO_STACK + SWAPGS_UNSAFE_STACK + jmp paranoid_exit_restore + paranoid_exit_no_swapgs: +@@ -1084,6 +1096,7 @@ ENTRY(error_entry) + * from user mode due to an IRET fault. + */ + SWAPGS ++ SWITCH_KERNEL_CR3 + + .Lerror_entry_from_usermode_after_swapgs: + /* +@@ -1135,6 +1148,7 @@ ENTRY(error_entry) + * Switch to kernel gsbase: + */ + SWAPGS ++ SWITCH_KERNEL_CR3 + + /* + * Pretend that the exception came from user mode: set up pt_regs +@@ -1235,6 +1249,7 @@ ENTRY(nmi) + */ + + SWAPGS_UNSAFE_STACK ++ SWITCH_KERNEL_CR3_NO_STACK + cld + movq %rsp, %rdx + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp +@@ -1275,6 +1290,7 @@ ENTRY(nmi) + * work, because we don't want to enable interrupts. Fortunately, + * do_nmi doesn't modify pt_regs. + */ ++ SWITCH_USER_CR3 + SWAPGS + jmp restore_c_regs_and_iret + +@@ -1486,6 +1502,7 @@ end_repeat_nmi: + testl %ebx, %ebx /* swapgs needed? 
*/ + jnz nmi_restore + nmi_swapgs: ++ SWITCH_USER_CR3_NO_STACK + SWAPGS_UNSAFE_STACK + nmi_restore: + RESTORE_EXTRA_REGS +--- a/arch/x86/entry/entry_64_compat.S ++++ b/arch/x86/entry/entry_64_compat.S +@@ -13,6 +13,7 @@ + #include <asm/irqflags.h> + #include <asm/asm.h> + #include <asm/smap.h> ++#include <asm/kaiser.h> + #include <linux/linkage.h> + #include <linux/err.h> + +@@ -48,6 +49,7 @@ + ENTRY(entry_SYSENTER_compat) + /* Interrupts are off on entry. */ + SWAPGS_UNSAFE_STACK ++ SWITCH_KERNEL_CR3_NO_STACK + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + + /* +@@ -184,6 +186,7 @@ ENDPROC(entry_SYSENTER_compat) + ENTRY(entry_SYSCALL_compat) + /* Interrupts are off on entry. */ + SWAPGS_UNSAFE_STACK ++ SWITCH_KERNEL_CR3_NO_STACK + + /* Stash user ESP and switch to the kernel stack. */ + movl %esp, %r8d +@@ -259,6 +262,7 @@ sysret32_from_system_call: + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r10 ++ SWITCH_USER_CR3 + movq RSP-ORIG_RAX(%rsp), %rsp + swapgs + sysretl +@@ -297,7 +301,7 @@ ENTRY(entry_INT80_compat) + PARAVIRT_ADJUST_EXCEPTION_FRAME + ASM_CLAC /* Do this early to minimize exposure */ + SWAPGS +- ++ SWITCH_KERNEL_CR3_NO_STACK + /* + * User tracing code (ptrace or signal handlers) might assume that + * the saved RAX contains a 32-bit number when we're invoking a 32-bit +@@ -338,6 +342,7 @@ ENTRY(entry_INT80_compat) + + /* Go back to user mode. */ + TRACE_IRQS_ON ++ SWITCH_USER_CR3_NO_STACK + SWAPGS + jmp restore_regs_and_iret + END(entry_INT80_compat) +--- a/arch/x86/include/asm/hw_irq.h ++++ b/arch/x86/include/asm/hw_irq.h +@@ -178,7 +178,7 @@ extern char irq_entries_start[]; + #define VECTOR_RETRIGGERED ((void *)~0UL) + + typedef struct irq_desc* vector_irq_t[NR_VECTORS]; +-DECLARE_PER_CPU(vector_irq_t, vector_irq); ++DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq); + + #endif /* !ASSEMBLY_ */ + +--- /dev/null ++++ b/arch/x86/include/asm/kaiser.h +@@ -0,0 +1,113 @@ ++#ifndef _ASM_X86_KAISER_H ++#define _ASM_X86_KAISER_H ++ ++/* This file includes the definitions for the KAISER feature. ++ * KAISER is a counter measure against x86_64 side channel attacks on the kernel virtual memory. ++ * It has a shodow-pgd for every process. the shadow-pgd has a minimalistic kernel-set mapped, ++ * but includes the whole user memory. Within a kernel context switch, or when an interrupt is handled, ++ * the pgd is switched to the normal one. When the system switches to user mode, the shadow pgd is enabled. ++ * By this, the virtual memory chaches are freed, and the user may not attack the whole kernel memory. ++ * ++ * A minimalistic kernel mapping holds the parts needed to be mapped in user mode, as the entry/exit functions ++ * of the user space, or the stacks. 
++ */ ++#ifdef __ASSEMBLY__ ++#ifdef CONFIG_KAISER ++ ++.macro _SWITCH_TO_KERNEL_CR3 reg ++movq %cr3, \reg ++andq $(~0x1000), \reg ++movq \reg, %cr3 ++.endm ++ ++.macro _SWITCH_TO_USER_CR3 reg ++movq %cr3, \reg ++orq $(0x1000), \reg ++movq \reg, %cr3 ++.endm ++ ++.macro SWITCH_KERNEL_CR3 ++pushq %rax ++_SWITCH_TO_KERNEL_CR3 %rax ++popq %rax ++.endm ++ ++.macro SWITCH_USER_CR3 ++pushq %rax ++_SWITCH_TO_USER_CR3 %rax ++popq %rax ++.endm ++ ++.macro SWITCH_KERNEL_CR3_NO_STACK ++movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) ++_SWITCH_TO_KERNEL_CR3 %rax ++movq PER_CPU_VAR(unsafe_stack_register_backup), %rax ++.endm ++ ++ ++.macro SWITCH_USER_CR3_NO_STACK ++ ++movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) ++_SWITCH_TO_USER_CR3 %rax ++movq PER_CPU_VAR(unsafe_stack_register_backup), %rax ++ ++.endm ++ ++#else /* CONFIG_KAISER */ ++ ++.macro SWITCH_KERNEL_CR3 reg ++.endm ++.macro SWITCH_USER_CR3 reg ++.endm ++.macro SWITCH_USER_CR3_NO_STACK ++.endm ++.macro SWITCH_KERNEL_CR3_NO_STACK ++.endm ++ ++#endif /* CONFIG_KAISER */ ++#else /* __ASSEMBLY__ */ ++ ++ ++#ifdef CONFIG_KAISER ++// Upon kernel/user mode switch, it may happen that ++// the address space has to be switched before the registers have been stored. ++// To change the address space, another register is needed. ++// A register therefore has to be stored/restored. ++// ++DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); ++ ++#endif /* CONFIG_KAISER */ ++ ++/** ++ * shadowmem_add_mapping - map a virtual memory part to the shadow mapping ++ * @addr: the start address of the range ++ * @size: the size of the range ++ * @flags: The mapping flags of the pages ++ * ++ * the mapping is done on a global scope, so no bigger synchronization has to be done. ++ * the pages have to be manually unmapped again when they are not needed any longer. ++ */ ++extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); ++ ++ ++/** ++ * shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping ++ * @addr: the start address of the range ++ * @size: the size of the range ++ */ ++extern void kaiser_remove_mapping(unsigned long start, unsigned long size); ++ ++/** ++ * shadowmem_initialize_mapping - Initalize the shadow mapping ++ * ++ * most parts of the shadow mapping can be mapped upon boot time. ++ * only the thread stacks have to be mapped on runtime. ++ * the mapped regions are not unmapped at all. 
++ */ ++extern void kaiser_init(void); ++ ++#endif ++ ++ ++ ++#endif /* _ASM_X86_KAISER_H */ +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -904,6 +904,10 @@ static inline void pmdp_set_wrprotect(st + static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) + { + memcpy(dst, src, count * sizeof(pgd_t)); ++#ifdef CONFIG_KAISER ++ // clone the shadow pgd part as well ++ memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t)); ++#endif + } + + #define PTE_SHIFT ilog2(PTRS_PER_PTE) +--- a/arch/x86/include/asm/pgtable_64.h ++++ b/arch/x86/include/asm/pgtable_64.h +@@ -106,9 +106,30 @@ static inline void native_pud_clear(pud_ + native_set_pud(pud, native_make_pud(0)); + } + ++#ifdef CONFIG_KAISER ++static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) { ++ return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE); ++} ++ ++static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) { ++ return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE); ++} ++#endif /* CONFIG_KAISER */ ++ + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) + { ++#ifdef CONFIG_KAISER ++ // We know that a pgd is page aligned. ++ // Therefore the lower indices have to be mapped to user space. ++ // These pages are mapped to the shadow mapping. ++ if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) { ++ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; ++ } ++ ++ pgdp->pgd = pgd.pgd & ~_PAGE_USER; ++#else /* CONFIG_KAISER */ + *pgdp = pgd; ++#endif + } + + static inline void native_pgd_clear(pgd_t *pgd) +--- a/arch/x86/include/asm/pgtable_types.h ++++ b/arch/x86/include/asm/pgtable_types.h +@@ -45,7 +45,11 @@ + #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) + #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) + #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) +-#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) ++#ifdef CONFIG_KAISER ++#define _PAGE_GLOBAL (_AT(pteval_t, 0)) ++#else ++#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) ++#endif + #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) + #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) + #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) +@@ -119,7 +123,11 @@ + #define _PAGE_DEVMAP (_AT(pteval_t, 0)) + #endif + +-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) ++#ifdef CONFIG_KAISER ++#define _PAGE_PROTNONE (_AT(pteval_t, 0)) ++#else ++#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) ++#endif + + #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_DIRTY) +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -308,7 +308,7 @@ struct tss_struct { + + } ____cacheline_aligned; + +-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); ++DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss); + + #ifdef CONFIG_X86_32 + DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); +@@ -335,6 +335,11 @@ union irq_stack_union { + char gs_base[40]; + unsigned long stack_canary; + }; ++ ++ struct { ++ char irq_stack_pointer[64]; ++ char unused[IRQ_STACK_SIZE - 64]; ++ }; + }; + + DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -93,7 +93,7 @@ static const struct cpu_dev default_cpu + + static const struct cpu_dev *this_cpu = &default_cpu; + 
+-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { ++DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = { + #ifdef CONFIG_X86_64 + /* + * We need valid kernel segments for data and code in long mode too +@@ -1365,7 +1365,7 @@ static const unsigned int exception_stac + [DEBUG_STACK - 1] = DEBUG_STKSZ + }; + +-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks ++DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); + + /* May not be marked __init: used by software suspend */ +--- a/arch/x86/kernel/espfix_64.c ++++ b/arch/x86/kernel/espfix_64.c +@@ -41,6 +41,7 @@ + #include <asm/pgalloc.h> + #include <asm/setup.h> + #include <asm/espfix.h> ++#include <asm/kaiser.h> + + /* + * Note: we only need 6*8 = 48 bytes for the espfix stack, but round +@@ -126,6 +127,11 @@ void __init init_espfix_bsp(void) + /* Install the espfix pud into the kernel page directory */ + pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; + pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); ++#ifdef CONFIG_KAISER ++ // add the esp stack pud to the shadow mapping here. ++ // This can be done directly, because the fixup stack has its own pud ++ set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page))); ++#endif + + /* Randomize the locations */ + init_espfix_random(); +--- a/arch/x86/kernel/head_64.S ++++ b/arch/x86/kernel/head_64.S +@@ -405,6 +405,14 @@ GLOBAL(early_recursion_flag) + .balign PAGE_SIZE; \ + GLOBAL(name) + ++#ifdef CONFIG_KAISER ++#define NEXT_PGD_PAGE(name) \ ++ .balign 2 * PAGE_SIZE; \ ++GLOBAL(name) ++#else ++#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) ++#endif ++ + /* Automate the creation of 1 to 1 mapping pmd entries */ + #define PMDS(START, PERM, COUNT) \ + i = 0 ; \ +@@ -414,7 +422,7 @@ GLOBAL(name) + .endr + + __INITDATA +-NEXT_PAGE(early_level4_pgt) ++NEXT_PGD_PAGE(early_level4_pgt) + .fill 511,8,0 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + +@@ -424,10 +432,10 @@ NEXT_PAGE(early_dynamic_pgts) + .data + + #ifndef CONFIG_XEN +-NEXT_PAGE(init_level4_pgt) +- .fill 512,8,0 ++NEXT_PGD_PAGE(init_level4_pgt) ++ .fill 2*512,8,0 + #else +-NEXT_PAGE(init_level4_pgt) ++NEXT_PGD_PAGE(init_level4_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE +--- a/arch/x86/kernel/irqinit.c ++++ b/arch/x86/kernel/irqinit.c +@@ -51,7 +51,7 @@ static struct irqaction irq2 = { + .flags = IRQF_NO_THREAD, + }; + +-DEFINE_PER_CPU(vector_irq_t, vector_irq) = { ++DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = { + [0 ... NR_VECTORS - 1] = VECTOR_UNUSED, + }; + +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -41,7 +41,7 @@ + * section. Since TSS's are completely CPU-local, we want them + * on exact cacheline boundaries, to eliminate cacheline ping-pong. 
+ */ +-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { ++__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = { + .x86_tss = { + .sp0 = TOP_OF_INIT_STACK, + #ifdef CONFIG_X86_32 +--- a/arch/x86/mm/Makefile ++++ b/arch/x86/mm/Makefile +@@ -38,4 +38,4 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulatio + obj-$(CONFIG_X86_INTEL_MPX) += mpx.o + obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o + obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +- ++obj-$(CONFIG_KAISER) += kaiser.o +--- /dev/null ++++ b/arch/x86/mm/kaiser.c +@@ -0,0 +1,160 @@ ++ ++ ++#include <linux/kernel.h> ++#include <linux/errno.h> ++#include <linux/string.h> ++#include <linux/types.h> ++#include <linux/bug.h> ++#include <linux/init.h> ++#include <linux/spinlock.h> ++#include <linux/mm.h> ++ ++#include <linux/uaccess.h> ++#include <asm/pgtable.h> ++#include <asm/pgalloc.h> ++#include <asm/desc.h> ++#ifdef CONFIG_KAISER ++ ++__visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); ++ ++/** ++ * Get the real ppn from a address in kernel mapping. ++ * @param address The virtual adrress ++ * @return the physical address ++ */ ++static inline unsigned long get_pa_from_mapping (unsigned long address) ++{ ++ pgd_t *pgd; ++ pud_t *pud; ++ pmd_t *pmd; ++ pte_t *pte; ++ ++ pgd = pgd_offset_k(address); ++ BUG_ON(pgd_none(*pgd) || pgd_large(*pgd)); ++ ++ pud = pud_offset(pgd, address); ++ BUG_ON(pud_none(*pud)); ++ ++ if (pud_large(*pud)) { ++ return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK); ++ } ++ ++ pmd = pmd_offset(pud, address); ++ BUG_ON(pmd_none(*pmd)); ++ ++ if (pmd_large(*pmd)) { ++ return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK); ++ } ++ ++ pte = pte_offset_kernel(pmd, address); ++ BUG_ON(pte_none(*pte)); ++ ++ return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK); ++} ++ ++void _kaiser_copy (unsigned long start_addr, unsigned long size, ++ unsigned long flags) ++{ ++ pgd_t *pgd; ++ pud_t *pud; ++ pmd_t *pmd; ++ pte_t *pte; ++ unsigned long address; ++ unsigned long end_addr = start_addr + size; ++ unsigned long target_address; ++ ++ for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1)); ++ address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) { ++ target_address = get_pa_from_mapping(address); ++ ++ pgd = native_get_shadow_pgd(pgd_offset_k(address)); ++ ++ BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n"); ++ BUG_ON(pgd_large(*pgd)); ++ ++ pud = pud_offset(pgd, address); ++ if (pud_none(*pud)) { ++ set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address)))); ++ } ++ BUG_ON(pud_large(*pud)); ++ ++ pmd = pmd_offset(pud, address); ++ if (pmd_none(*pmd)) { ++ set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address)))); ++ } ++ BUG_ON(pmd_large(*pmd)); ++ ++ pte = pte_offset_kernel(pmd, address); ++ if (pte_none(*pte)) { ++ set_pte(pte, __pte(flags | target_address)); ++ } else { ++ BUG_ON(__pa(pte_page(*pte)) != target_address); ++ } ++ } ++} ++ ++// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping ++static inline void __init _kaiser_init(void) ++{ ++ pgd_t *pgd; ++ int i = 0; ++ ++ pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0)); ++ for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { ++ set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0)))); ++ } ++} ++ ++extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; ++spinlock_t shadow_table_lock; ++void __init kaiser_init(void) ++{ ++ int cpu; 
++ spin_lock_init(&shadow_table_lock); ++ ++ spin_lock(&shadow_table_lock); ++ ++ _kaiser_init(); ++ ++ for_each_possible_cpu(cpu) { ++ // map the per cpu user variables ++ _kaiser_copy( ++ (unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)), ++ (unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start, ++ __PAGE_KERNEL); ++ } ++ ++ // map the entry/exit text section, which is responsible to switch between user- and kernel mode ++ _kaiser_copy( ++ (unsigned long) __entry_text_start, ++ (unsigned long) __entry_text_end - (unsigned long) __entry_text_start, ++ __PAGE_KERNEL_RX); ++ ++ // the fixed map address of the idt_table ++ _kaiser_copy( ++ (unsigned long) idt_descr.address, ++ sizeof(gate_desc) * NR_VECTORS, ++ __PAGE_KERNEL_RO); ++ ++ spin_unlock(&shadow_table_lock); ++} ++ ++// add a mapping to the shadow-mapping, and synchronize the mappings ++void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) ++{ ++ spin_lock(&shadow_table_lock); ++ _kaiser_copy(addr, size, flags); ++ spin_unlock(&shadow_table_lock); ++} ++ ++extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end); ++void kaiser_remove_mapping(unsigned long start, unsigned long size) ++{ ++ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start)); ++ spin_lock(&shadow_table_lock); ++ do { ++ unmap_pud_range(pgd, start, start + size); ++ } while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size))); ++ spin_unlock(&shadow_table_lock); ++} ++#endif /* CONFIG_KAISER */ +--- a/arch/x86/mm/pageattr.c ++++ b/arch/x86/mm/pageattr.c +@@ -823,7 +823,7 @@ static void unmap_pmd_range(pud_t *pud, + pud_clear(pud); + } + +-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) ++void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) + { + pud_t *pud = pud_offset(pgd, start); + +--- a/arch/x86/mm/pgtable.c ++++ b/arch/x86/mm/pgtable.c +@@ -346,12 +346,38 @@ static inline void _pgd_free(pgd_t *pgd) + #else + static inline pgd_t *_pgd_alloc(void) + { ++#ifdef CONFIG_KAISER ++ // Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory ++ // block. Therefore, we have to allocate at least 3 pages. However, the ++ // __get_free_pages returns us 4 pages. Hence, we store the base pointer at ++ // the beginning of the page of our 8kb-aligned memory block in order to ++ // correctly free it afterwars. ++ ++ unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE)); ++ ++ if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages) ++ { ++ *((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages; ++ return (pgd_t *) pages; ++ } ++ else ++ { ++ *((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages; ++ return (pgd_t *) (pages + PAGE_SIZE); ++ } ++#else + return (pgd_t *)__get_free_page(PGALLOC_GFP); ++#endif + } + + static inline void _pgd_free(pgd_t *pgd) + { ++#ifdef CONFIG_KAISER ++ unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE)); ++ free_pages(pages, get_order(4*PAGE_SIZE)); ++#else + free_page((unsigned long)pgd); ++#endif + } + #endif /* CONFIG_X86_PAE */ + +--- a/include/asm-generic/vmlinux.lds.h ++++ b/include/asm-generic/vmlinux.lds.h +@@ -778,7 +778,16 @@ + */ + #define PERCPU_INPUT(cacheline) \ + VMLINUX_SYMBOL(__per_cpu_start) = .; \ +- *(.data..percpu..first) \ ++ \ ++ VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \ ++ *(.data..percpu..first) \ ++ . 
= ALIGN(cacheline); \ ++ *(.data..percpu..user_mapped) \ ++ *(.data..percpu..user_mapped..shared_aligned) \ ++ . = ALIGN(PAGE_SIZE); \ ++ *(.data..percpu..user_mapped..page_aligned) \ ++ VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \ ++ \ + . = ALIGN(PAGE_SIZE); \ + *(.data..percpu..page_aligned) \ + . = ALIGN(cacheline); \ +--- a/include/linux/percpu-defs.h ++++ b/include/linux/percpu-defs.h +@@ -35,6 +35,12 @@ + + #endif + ++#ifdef CONFIG_KAISER ++#define USER_MAPPED_SECTION "..user_mapped" ++#else ++#define USER_MAPPED_SECTION "" ++#endif ++ + /* + * Base implementations of per-CPU variable declarations and definitions, where + * the section in which the variable is to be placed is provided by the +@@ -115,6 +121,12 @@ + #define DEFINE_PER_CPU(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, "") + ++#define DECLARE_PER_CPU_USER_MAPPED(type, name) \ ++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION) ++ ++#define DEFINE_PER_CPU_USER_MAPPED(type, name) \ ++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION) ++ + /* + * Declaration/definition used for per-CPU variables that must come first in + * the set of variables. +@@ -144,6 +156,14 @@ + DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \ + ____cacheline_aligned_in_smp + ++#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ ++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \ ++ ____cacheline_aligned_in_smp ++ ++#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ ++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \ ++ ____cacheline_aligned_in_smp ++ + #define DECLARE_PER_CPU_ALIGNED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \ + ____cacheline_aligned +@@ -162,6 +182,16 @@ + #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \ + __aligned(PAGE_SIZE) ++/* ++ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode. ++ */ ++#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ ++ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ ++ __aligned(PAGE_SIZE) ++ ++#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ ++ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ ++ __aligned(PAGE_SIZE) + + /* + * Declaration/definition used for per-CPU variables that must be read mostly. 
+--- a/init/main.c ++++ b/init/main.c +@@ -86,6 +86,9 @@ + #include <asm/setup.h> + #include <asm/sections.h> + #include <asm/cacheflush.h> ++#ifdef CONFIG_KAISER ++#include <asm/kaiser.h> ++#endif + + static int kernel_init(void *); + +@@ -473,6 +476,9 @@ static void __init mm_init(void) + pgtable_init(); + vmalloc_init(); + ioremap_huge_init(); ++#ifdef CONFIG_KAISER ++ kaiser_init(); ++#endif + } + + asmlinkage __visible void __init start_kernel(void) +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -211,8 +211,12 @@ static unsigned long *alloc_thread_stack + #endif + } + ++extern void kaiser_remove_mapping(unsigned long start_addr, unsigned long size); + static inline void free_thread_stack(struct task_struct *tsk) + { ++#ifdef CONFIG_KAISER ++ kaiser_remove_mapping((unsigned long)tsk->stack, THREAD_SIZE); ++#endif + #ifdef CONFIG_VMAP_STACK + if (task_stack_vm_area(tsk)) { + unsigned long flags; +@@ -468,6 +472,7 @@ void set_task_stack_end_magic(struct tas + *stackend = STACK_END_MAGIC; /* for overflow detection */ + } + ++extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); + static struct task_struct *dup_task_struct(struct task_struct *orig, int node) + { + struct task_struct *tsk; +@@ -495,6 +500,9 @@ static struct task_struct *dup_task_stru + * functions again. + */ + tsk->stack = stack; ++#ifdef CONFIG_KAISER ++ kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL); ++#endif + #ifdef CONFIG_VMAP_STACK + tsk->stack_vm_area = stack_vm_area; + #endif +--- a/security/Kconfig ++++ b/security/Kconfig +@@ -30,6 +30,13 @@ config SECURITY + model will be used. + + If you are unsure how to answer this question, answer N. ++config KAISER ++ bool "Remove the kernel mapping in user mode" ++ depends on X86_64 ++ depends on !PARAVIRT ++ help ++ This enforces a strict kernel and user space isolation in order to close ++ hardware side channels on kernel address information. + + config SECURITYFS + bool "Enable the securityfs filesystem" diff --git a/queue/kaiser-load_new_mm_cr3-let-switch_user_cr3-flush-user.patch b/queue/kaiser-load_new_mm_cr3-let-switch_user_cr3-flush-user.patch new file mode 100644 index 0000000..3e82917 --- /dev/null +++ b/queue/kaiser-load_new_mm_cr3-let-switch_user_cr3-flush-user.patch @@ -0,0 +1,392 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Thu, 17 Aug 2017 15:00:37 -0700 +Subject: kaiser: load_new_mm_cr3() let SWITCH_USER_CR3 flush user + +From: Hugh Dickins <hughd@google.com> + + +We have many machines (Westmere, Sandybridge, Ivybridge) supporting +PCID but not INVPCID: on these load_new_mm_cr3() simply crashed. + +Flushing user context inside load_new_mm_cr3() without the use of +invpcid is difficult: momentarily switch from kernel to user context +and back to do so? I'm not sure whether that can be safely done at +all, and would risk polluting user context with kernel internals, +and kernel context with stale user externals. + +Instead, follow the hint in the comment that was there: change +X86_CR3_PCID_USER_VAR to be a per-cpu variable, then load_new_mm_cr3() +can leave a note in it, for SWITCH_USER_CR3 on return to userspace to +flush user context TLB, instead of default X86_CR3_PCID_USER_NOFLUSH. + +Which works well enough that there's no need to do it this way only +when invpcid is unsupported: it's a good alternative to invpcid here. 
+But there's a couple of inlines in asm/tlbflush.h that need to do the +same trick, so it's best to localize all this per-cpu business in +mm/kaiser.c: moving that part of the initialization from setup_pcid() +to kaiser_setup_pcid(); with kaiser_flush_tlb_on_return_to_user() the +function for noting an X86_CR3_PCID_USER_FLUSH. And let's keep a +KAISER_SHADOW_PGD_OFFSET in there, to avoid the extra OR on exit. + +I did try to make the feature tests in asm/tlbflush.h more consistent +with each other: there seem to be far too many ways of performing such +tests, and I don't have a good grasp of their differences. At first +I converted them all to be static_cpu_has(): but that proved to be a +mistake, as the comment in __native_flush_tlb_single() hints; so then +I reversed and made them all this_cpu_has(). Probably all gratuitous +change, but that's the way it's working at present. + +I am slightly bothered by the way non-per-cpu X86_CR3_PCID_KERN_VAR +gets re-initialized by each cpu (before and after these changes): +no problem when (as usual) all cpus on a machine have the same +features, but in principle incorrect. However, my experiment +to per-cpu-ify that one did not end well... + +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/kaiser.h | 18 +++++++----- + arch/x86/include/asm/tlbflush.h | 56 +++++++++++++++++++++++++++------------- + arch/x86/kernel/cpu/common.c | 22 --------------- + arch/x86/mm/kaiser.c | 50 +++++++++++++++++++++++++++++++---- + arch/x86/mm/tlb.c | 46 ++++++++++++-------------------- + 5 files changed, 113 insertions(+), 79 deletions(-) + +--- a/arch/x86/include/asm/kaiser.h ++++ b/arch/x86/include/asm/kaiser.h +@@ -32,13 +32,12 @@ movq \reg, %cr3 + .macro _SWITCH_TO_USER_CR3 reg + movq %cr3, \reg + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg +-/* +- * This can obviously be one instruction by putting the +- * KAISER_SHADOW_PGD_OFFSET bit in the X86_CR3_PCID_USER_VAR. +- * But, just leave it now for simplicity. +- */ +-orq X86_CR3_PCID_USER_VAR, \reg +-orq $(KAISER_SHADOW_PGD_OFFSET), \reg ++orq PER_CPU_VAR(X86_CR3_PCID_USER_VAR), \reg ++js 9f ++// FLUSH this time, reset to NOFLUSH for next time ++// But if nopcid? Consider using 0x80 for user pcid? ++movb $(0x80), PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7) ++9: + movq \reg, %cr3 + .endm + +@@ -90,6 +89,11 @@ movq PER_CPU_VAR(unsafe_stack_register_b + */ + DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + ++extern unsigned long X86_CR3_PCID_KERN_VAR; ++DECLARE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR); ++ ++extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; ++ + /** + * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping + * @addr: the start address of the range +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -13,6 +13,7 @@ static inline void __invpcid(unsigned lo + unsigned long type) + { + struct { u64 d[2]; } desc = { { pcid, addr } }; ++ + /* + * The memory clobber is because the whole point is to invalidate + * stale TLB entries and, especially if we're flushing global +@@ -131,27 +132,42 @@ static inline void cr4_set_bits_and_upda + cr4_set_bits(mask); + } + ++/* ++ * Declare a couple of kaiser interfaces here for convenience, ++ * to avoid the need for asm/kaiser.h in unexpected places. 
++ */ ++#ifdef CONFIG_KAISER ++extern void kaiser_setup_pcid(void); ++extern void kaiser_flush_tlb_on_return_to_user(void); ++#else ++static inline void kaiser_setup_pcid(void) ++{ ++} ++static inline void kaiser_flush_tlb_on_return_to_user(void) ++{ ++} ++#endif ++ + static inline void __native_flush_tlb(void) + { +- if (!cpu_feature_enabled(X86_FEATURE_INVPCID)) { ++ if (this_cpu_has(X86_FEATURE_INVPCID)) { + /* +- * If current->mm == NULL then we borrow a mm which may change during a +- * task switch and therefore we must not be preempted while we write CR3 +- * back: ++ * Note, this works with CR4.PCIDE=0 or 1. + */ +- preempt_disable(); +- native_write_cr3(native_read_cr3()); +- preempt_enable(); ++ invpcid_flush_all_nonglobals(); + return; + } ++ + /* +- * We are no longer using globals with KAISER, so a +- * "nonglobals" flush would work too. But, this is more +- * conservative. +- * +- * Note, this works with CR4.PCIDE=0 or 1. ++ * If current->mm == NULL then we borrow a mm which may change during a ++ * task switch and therefore we must not be preempted while we write CR3 ++ * back: + */ +- invpcid_flush_all(); ++ preempt_disable(); ++ if (this_cpu_has(X86_FEATURE_PCID)) ++ kaiser_flush_tlb_on_return_to_user(); ++ native_write_cr3(native_read_cr3()); ++ preempt_enable(); + } + + static inline void __native_flush_tlb_global_irq_disabled(void) +@@ -167,9 +183,13 @@ static inline void __native_flush_tlb_gl + + static inline void __native_flush_tlb_global(void) + { ++#ifdef CONFIG_KAISER ++ /* Globals are not used at all */ ++ __native_flush_tlb(); ++#else + unsigned long flags; + +- if (static_cpu_has(X86_FEATURE_INVPCID)) { ++ if (this_cpu_has(X86_FEATURE_INVPCID)) { + /* + * Using INVPCID is considerably faster than a pair of writes + * to CR4 sandwiched inside an IRQ flag save/restore. +@@ -186,10 +206,9 @@ static inline void __native_flush_tlb_gl + * be called from deep inside debugging code.) + */ + raw_local_irq_save(flags); +- + __native_flush_tlb_global_irq_disabled(); +- + raw_local_irq_restore(flags); ++#endif + } + + static inline void __native_flush_tlb_single(unsigned long addr) +@@ -200,9 +219,12 @@ static inline void __native_flush_tlb_si + * + * The ASIDs used below are hard-coded. But, we must not + * call invpcid(type=1/2) before CR4.PCIDE=1. Just call +- * invpcid in the case we are called early. ++ * invlpg in the case we are called early. + */ ++ + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { ++ if (this_cpu_has(X86_FEATURE_PCID)) ++ kaiser_flush_tlb_on_return_to_user(); + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + return; + } +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -324,33 +324,12 @@ static __always_inline void setup_smap(s + } + } + +-/* +- * These can have bit 63 set, so we can not just use a plain "or" +- * instruction to get their value or'd into CR3. It would take +- * another register. So, we use a memory reference to these +- * instead. +- * +- * This is also handy because systems that do not support +- * PCIDs just end up or'ing a 0 into their CR3, which does +- * no harm. +- */ +-__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR = 0; +-__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_USER_VAR = 0; +- + static void setup_pcid(struct cpuinfo_x86 *c) + { + if (cpu_has(c, X86_FEATURE_PCID)) { + if (cpu_has(c, X86_FEATURE_PGE)) { + cr4_set_bits(X86_CR4_PCIDE); + /* +- * These variables are used by the entry/exit +- * code to change PCIDs. 
+- */ +-#ifdef CONFIG_KAISER +- X86_CR3_PCID_KERN_VAR = X86_CR3_PCID_KERN_NOFLUSH; +- X86_CR3_PCID_USER_VAR = X86_CR3_PCID_USER_NOFLUSH; +-#endif +- /* + * INVPCID has two "groups" of types: + * 1/2: Invalidate an individual address + * 3/4: Invalidate all contexts +@@ -375,6 +354,7 @@ static void setup_pcid(struct cpuinfo_x8 + clear_cpu_cap(c, X86_FEATURE_PCID); + } + } ++ kaiser_setup_pcid(); + } + + /* +--- a/arch/x86/mm/kaiser.c ++++ b/arch/x86/mm/kaiser.c +@@ -11,12 +11,26 @@ + #include <linux/uaccess.h> + + #include <asm/kaiser.h> ++#include <asm/tlbflush.h> /* to verify its kaiser declarations */ + #include <asm/pgtable.h> + #include <asm/pgalloc.h> + #include <asm/desc.h> ++ + #ifdef CONFIG_KAISER ++__visible ++DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); ++ ++/* ++ * These can have bit 63 set, so we can not just use a plain "or" ++ * instruction to get their value or'd into CR3. It would take ++ * another register. So, we use a memory reference to these instead. ++ * ++ * This is also handy because systems that do not support PCIDs ++ * just end up or'ing a 0 into their CR3, which does no harm. ++ */ ++__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR; ++DEFINE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR); + +-__visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + /* + * At runtime, the only things we map are some things for CPU + * hotplug, and stacks for new processes. No two CPUs will ever +@@ -238,9 +252,6 @@ static void __init kaiser_init_all_pgds( + WARN_ON(__ret); \ + } while (0) + +-extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; +-extern unsigned long X86_CR3_PCID_KERN_VAR; +-extern unsigned long X86_CR3_PCID_USER_VAR; + /* + * If anything in here fails, we will likely die on one of the + * first kernel->user transitions and init will die. But, we +@@ -294,8 +305,6 @@ void __init kaiser_init(void) + + kaiser_add_user_map_early(&X86_CR3_PCID_KERN_VAR, PAGE_SIZE, + __PAGE_KERNEL); +- kaiser_add_user_map_early(&X86_CR3_PCID_USER_VAR, PAGE_SIZE, +- __PAGE_KERNEL); + } + + /* Add a mapping to the shadow mapping, and synchronize the mappings */ +@@ -358,4 +367,33 @@ pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, + } + return pgd; + } ++ ++void kaiser_setup_pcid(void) ++{ ++ unsigned long kern_cr3 = 0; ++ unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET; ++ ++ if (this_cpu_has(X86_FEATURE_PCID)) { ++ kern_cr3 |= X86_CR3_PCID_KERN_NOFLUSH; ++ user_cr3 |= X86_CR3_PCID_USER_NOFLUSH; ++ } ++ /* ++ * These variables are used by the entry/exit ++ * code to change PCID and pgd and TLB flushing. ++ */ ++ X86_CR3_PCID_KERN_VAR = kern_cr3; ++ this_cpu_write(X86_CR3_PCID_USER_VAR, user_cr3); ++} ++ ++/* ++ * Make a note that this cpu will need to flush USER tlb on return to user. ++ * Caller checks whether this_cpu_has(X86_FEATURE_PCID) before calling: ++ * if cpu does not, then the NOFLUSH bit will never have been set. 
++ */ ++void kaiser_flush_tlb_on_return_to_user(void) ++{ ++ this_cpu_write(X86_CR3_PCID_USER_VAR, ++ X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); ++} ++EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); + #endif /* CONFIG_KAISER */ +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -6,13 +6,14 @@ + #include <linux/interrupt.h> + #include <linux/export.h> + #include <linux/cpu.h> ++#include <linux/debugfs.h> + + #include <asm/tlbflush.h> + #include <asm/mmu_context.h> + #include <asm/cache.h> + #include <asm/apic.h> + #include <asm/uv/uv.h> +-#include <linux/debugfs.h> ++#include <asm/kaiser.h> + + /* + * TLB flushing, formerly SMP-only +@@ -38,34 +39,23 @@ static void load_new_mm_cr3(pgd_t *pgdir + { + unsigned long new_mm_cr3 = __pa(pgdir); + +- /* +- * KAISER, plus PCIDs needs some extra work here. But, +- * if either of features is not present, we need no +- * PCIDs here and just do a normal, full TLB flush with +- * the write_cr3() +- */ +- if (!IS_ENABLED(CONFIG_KAISER) || +- !cpu_feature_enabled(X86_FEATURE_PCID)) +- goto out_set_cr3; +- /* +- * We reuse the same PCID for different tasks, so we must +- * flush all the entires for the PCID out when we change +- * tasks. +- */ +- new_mm_cr3 = X86_CR3_PCID_KERN_FLUSH | __pa(pgdir); +- +- /* +- * The flush from load_cr3() may leave old TLB entries +- * for userspace in place. We must flush that context +- * separately. We can theoretically delay doing this +- * until we actually load up the userspace CR3, but +- * that's a bit tricky. We have to have the "need to +- * flush userspace PCID" bit per-cpu and check it in the +- * exit-to-userspace paths. +- */ +- invpcid_flush_single_context(X86_CR3_PCID_ASID_USER); ++#ifdef CONFIG_KAISER ++ if (this_cpu_has(X86_FEATURE_PCID)) { ++ /* ++ * We reuse the same PCID for different tasks, so we must ++ * flush all the entries for the PCID out when we change tasks. ++ * Flush KERN below, flush USER when returning to userspace in ++ * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro. ++ * ++ * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could ++ * do it here, but can only be used if X86_FEATURE_INVPCID is ++ * available - and many machines support pcid without invpcid. ++ */ ++ new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH; ++ kaiser_flush_tlb_on_return_to_user(); ++ } ++#endif /* CONFIG_KAISER */ + +-out_set_cr3: + /* + * Caution: many callers of this function expect + * that load_cr3() is serializing and orders TLB diff --git a/queue/kaiser-merged-update.patch b/queue/kaiser-merged-update.patch new file mode 100644 index 0000000..8a0e3fe --- /dev/null +++ b/queue/kaiser-merged-update.patch @@ -0,0 +1,1298 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Dave Hansen <dave.hansen@linux.intel.com> +Date: Wed, 30 Aug 2017 16:23:00 -0700 +Subject: kaiser: merged update + +From: Dave Hansen <dave.hansen@linux.intel.com> + + +Merged fixes and cleanups, rebased to 4.9.51 tree (no 5-level paging). 
+ +Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/entry/entry_64.S | 105 ++++++++++- + arch/x86/include/asm/kaiser.h | 43 ++-- + arch/x86/include/asm/pgtable.h | 18 + + arch/x86/include/asm/pgtable_64.h | 48 ++++- + arch/x86/include/asm/pgtable_types.h | 6 + arch/x86/kernel/espfix_64.c | 13 - + arch/x86/kernel/head_64.S | 19 +- + arch/x86/kernel/ldt.c | 27 ++ + arch/x86/kernel/tracepoint.c | 2 + arch/x86/mm/kaiser.c | 317 +++++++++++++++++++++++++---------- + arch/x86/mm/pageattr.c | 63 +++++- + arch/x86/mm/pgtable.c | 40 +--- + include/linux/kaiser.h | 26 ++ + kernel/fork.c | 9 + security/Kconfig | 5 + 15 files changed, 551 insertions(+), 190 deletions(-) + create mode 100644 include/linux/kaiser.h + +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -230,6 +230,13 @@ entry_SYSCALL_64_fastpath: + movq RIP(%rsp), %rcx + movq EFLAGS(%rsp), %r11 + RESTORE_C_REGS_EXCEPT_RCX_R11 ++ /* ++ * This opens a window where we have a user CR3, but are ++ * running in the kernel. This makes using the CS ++ * register useless for telling whether or not we need to ++ * switch CR3 in NMIs. Normal interrupts are OK because ++ * they are off here. ++ */ + SWITCH_USER_CR3 + movq RSP(%rsp), %rsp + USERGS_SYSRET64 +@@ -326,11 +333,25 @@ return_from_SYSCALL_64: + syscall_return_via_sysret: + /* rcx and r11 are already restored (see code above) */ + RESTORE_C_REGS_EXCEPT_RCX_R11 ++ /* ++ * This opens a window where we have a user CR3, but are ++ * running in the kernel. This makes using the CS ++ * register useless for telling whether or not we need to ++ * switch CR3 in NMIs. Normal interrupts are OK because ++ * they are off here. ++ */ + SWITCH_USER_CR3 + movq RSP(%rsp), %rsp + USERGS_SYSRET64 + + opportunistic_sysret_failed: ++ /* ++ * This opens a window where we have a user CR3, but are ++ * running in the kernel. This makes using the CS ++ * register useless for telling whether or not we need to ++ * switch CR3 in NMIs. Normal interrupts are OK because ++ * they are off here. ++ */ + SWITCH_USER_CR3 + SWAPGS + jmp restore_c_regs_and_iret +@@ -1087,6 +1108,13 @@ ENTRY(error_entry) + cld + SAVE_C_REGS 8 + SAVE_EXTRA_REGS 8 ++ /* ++ * error_entry() always returns with a kernel gsbase and ++ * CR3. We must also have a kernel CR3/gsbase before ++ * calling TRACE_IRQS_*. Just unconditionally switch to ++ * the kernel CR3 here. ++ */ ++ SWITCH_KERNEL_CR3 + xorl %ebx, %ebx + testb $3, CS+8(%rsp) + jz .Lerror_kernelspace +@@ -1096,7 +1124,6 @@ ENTRY(error_entry) + * from user mode due to an IRET fault. + */ + SWAPGS +- SWITCH_KERNEL_CR3 + + .Lerror_entry_from_usermode_after_swapgs: + /* +@@ -1148,7 +1175,6 @@ ENTRY(error_entry) + * Switch to kernel gsbase: + */ + SWAPGS +- SWITCH_KERNEL_CR3 + + /* + * Pretend that the exception came from user mode: set up pt_regs +@@ -1249,7 +1275,10 @@ ENTRY(nmi) + */ + + SWAPGS_UNSAFE_STACK +- SWITCH_KERNEL_CR3_NO_STACK ++ /* ++ * percpu variables are mapped with user CR3, so no need ++ * to switch CR3 here. 
++ */ + cld + movq %rsp, %rdx + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp +@@ -1283,14 +1312,33 @@ ENTRY(nmi) + + movq %rsp, %rdi + movq $-1, %rsi ++#ifdef CONFIG_KAISER ++ /* Unconditionally use kernel CR3 for do_nmi() */ ++ /* %rax is saved above, so OK to clobber here */ ++ movq %cr3, %rax ++ pushq %rax ++#ifdef CONFIG_KAISER_REAL_SWITCH ++ andq $(~0x1000), %rax ++#endif ++ movq %rax, %cr3 ++#endif + call do_nmi ++ /* ++ * Unconditionally restore CR3. I know we return to ++ * kernel code that needs user CR3, but do we ever return ++ * to "user mode" where we need the kernel CR3? ++ */ ++#ifdef CONFIG_KAISER ++ popq %rax ++ mov %rax, %cr3 ++#endif + + /* + * Return back to user mode. We must *not* do the normal exit +- * work, because we don't want to enable interrupts. Fortunately, +- * do_nmi doesn't modify pt_regs. ++ * work, because we don't want to enable interrupts. Do not ++ * switch to user CR3: we might be going back to kernel code ++ * that had a user CR3 set. + */ +- SWITCH_USER_CR3 + SWAPGS + jmp restore_c_regs_and_iret + +@@ -1486,23 +1534,54 @@ end_repeat_nmi: + ALLOC_PT_GPREGS_ON_STACK + + /* +- * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit +- * as we should not be calling schedule in NMI context. +- * Even with normal interrupts enabled. An NMI should not be +- * setting NEED_RESCHED or anything that normal interrupts and +- * exceptions might do. ++ * Use the same approach as paranoid_entry to handle SWAPGS, but ++ * without CR3 handling since we do that differently in NMIs. No ++ * need to use paranoid_exit as we should not be calling schedule ++ * in NMI context. Even with normal interrupts enabled. An NMI ++ * should not be setting NEED_RESCHED or anything that normal ++ * interrupts and exceptions might do. + */ +- call paranoid_entry ++ cld ++ SAVE_C_REGS ++ SAVE_EXTRA_REGS ++ movl $1, %ebx ++ movl $MSR_GS_BASE, %ecx ++ rdmsr ++ testl %edx, %edx ++ js 1f /* negative -> in kernel */ ++ SWAPGS ++ xorl %ebx, %ebx ++1: ++#ifdef CONFIG_KAISER ++ /* Unconditionally use kernel CR3 for do_nmi() */ ++ /* %rax is saved above, so OK to clobber here */ ++ movq %cr3, %rax ++ pushq %rax ++#ifdef CONFIG_KAISER_REAL_SWITCH ++ andq $(~0x1000), %rax ++#endif ++ movq %rax, %cr3 ++#endif + + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ + movq %rsp, %rdi ++ addq $8, %rdi /* point %rdi at ptregs, fixed up for CR3 */ + movq $-1, %rsi + call do_nmi ++ /* ++ * Unconditionally restore CR3. We might be returning to ++ * kernel code that needs user CR3, like just just before ++ * a sysret. ++ */ ++#ifdef CONFIG_KAISER ++ popq %rax ++ mov %rax, %cr3 ++#endif + + testl %ebx, %ebx /* swapgs needed? */ + jnz nmi_restore + nmi_swapgs: +- SWITCH_USER_CR3_NO_STACK ++ /* We fixed up CR3 above, so no need to switch it here */ + SWAPGS_UNSAFE_STACK + nmi_restore: + RESTORE_EXTRA_REGS +--- a/arch/x86/include/asm/kaiser.h ++++ b/arch/x86/include/asm/kaiser.h +@@ -16,13 +16,17 @@ + + .macro _SWITCH_TO_KERNEL_CR3 reg + movq %cr3, \reg ++#ifdef CONFIG_KAISER_REAL_SWITCH + andq $(~0x1000), \reg ++#endif + movq \reg, %cr3 + .endm + + .macro _SWITCH_TO_USER_CR3 reg + movq %cr3, \reg ++#ifdef CONFIG_KAISER_REAL_SWITCH + orq $(0x1000), \reg ++#endif + movq \reg, %cr3 + .endm + +@@ -65,48 +69,53 @@ movq PER_CPU_VAR(unsafe_stack_register_b + .endm + + #endif /* CONFIG_KAISER */ ++ + #else /* __ASSEMBLY__ */ + + + #ifdef CONFIG_KAISER +-// Upon kernel/user mode switch, it may happen that +-// the address space has to be switched before the registers have been stored. 
+-// To change the address space, another register is needed. +-// A register therefore has to be stored/restored. +-// +-DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); ++/* ++ * Upon kernel/user mode switch, it may happen that the address ++ * space has to be switched before the registers have been ++ * stored. To change the address space, another register is ++ * needed. A register therefore has to be stored/restored. ++*/ + +-#endif /* CONFIG_KAISER */ ++DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + + /** +- * shadowmem_add_mapping - map a virtual memory part to the shadow mapping ++ * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping + * @addr: the start address of the range + * @size: the size of the range + * @flags: The mapping flags of the pages + * +- * the mapping is done on a global scope, so no bigger synchronization has to be done. +- * the pages have to be manually unmapped again when they are not needed any longer. ++ * The mapping is done on a global scope, so no bigger ++ * synchronization has to be done. the pages have to be ++ * manually unmapped again when they are not needed any longer. + */ +-extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); ++extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); + + + /** +- * shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping ++ * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping + * @addr: the start address of the range + * @size: the size of the range + */ + extern void kaiser_remove_mapping(unsigned long start, unsigned long size); + + /** +- * shadowmem_initialize_mapping - Initalize the shadow mapping ++ * kaiser_initialize_mapping - Initalize the shadow mapping + * +- * most parts of the shadow mapping can be mapped upon boot time. +- * only the thread stacks have to be mapped on runtime. +- * the mapped regions are not unmapped at all. ++ * Most parts of the shadow mapping can be mapped upon boot ++ * time. Only per-process things like the thread stacks ++ * or a new LDT have to be mapped at runtime. These boot- ++ * time mappings are permanent and nevertunmapped. + */ + extern void kaiser_init(void); + +-#endif ++#endif /* CONFIG_KAISER */ ++ ++#endif /* __ASSEMBLY */ + + + +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -690,7 +690,17 @@ static inline pud_t *pud_offset(pgd_t *p + + static inline int pgd_bad(pgd_t pgd) + { +- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; ++ pgdval_t ignore_flags = _PAGE_USER; ++ /* ++ * We set NX on KAISER pgds that map userspace memory so ++ * that userspace can not meaningfully use the kernel ++ * page table by accident; it will fault on the first ++ * instruction it tries to run. See native_set_pgd(). 
++ */ ++ if (IS_ENABLED(CONFIG_KAISER)) ++ ignore_flags |= _PAGE_NX; ++ ++ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; + } + + static inline int pgd_none(pgd_t pgd) +@@ -905,8 +915,10 @@ static inline void clone_pgd_range(pgd_t + { + memcpy(dst, src, count * sizeof(pgd_t)); + #ifdef CONFIG_KAISER +- // clone the shadow pgd part as well +- memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t)); ++ /* Clone the shadow pgd part as well */ ++ memcpy(native_get_shadow_pgd(dst), ++ native_get_shadow_pgd(src), ++ count * sizeof(pgd_t)); + #endif + } + +--- a/arch/x86/include/asm/pgtable_64.h ++++ b/arch/x86/include/asm/pgtable_64.h +@@ -107,26 +107,58 @@ static inline void native_pud_clear(pud_ + } + + #ifdef CONFIG_KAISER +-static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) { ++static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) ++{ + return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE); + } + +-static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) { ++static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) ++{ + return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE); + } ++#else ++static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) ++{ ++ BUILD_BUG_ON(1); ++ return NULL; ++} ++static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) ++{ ++ return pgdp; ++} + #endif /* CONFIG_KAISER */ + ++/* ++ * Page table pages are page-aligned. The lower half of the top ++ * level is used for userspace and the top half for the kernel. ++ * This returns true for user pages that need to get copied into ++ * both the user and kernel copies of the page tables, and false ++ * for kernel pages that should only be in the kernel copy. ++ */ ++static inline bool is_userspace_pgd(void *__ptr) ++{ ++ unsigned long ptr = (unsigned long)__ptr; ++ ++ return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2)); ++} ++ + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) + { + #ifdef CONFIG_KAISER +- // We know that a pgd is page aligned. +- // Therefore the lower indices have to be mapped to user space. +- // These pages are mapped to the shadow mapping. +- if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) { ++ pteval_t extra_kern_pgd_flags = 0; ++ /* Do we need to also populate the shadow pgd? */ ++ if (is_userspace_pgd(pgdp)) { + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; ++ /* ++ * Even if the entry is *mapping* userspace, ensure ++ * that userspace can not use it. This way, if we ++ * get out to userspace running on the kernel CR3, ++ * userspace will crash instead of running. 
++ */ ++ extra_kern_pgd_flags = _PAGE_NX; + } +- +- pgdp->pgd = pgd.pgd & ~_PAGE_USER; ++ pgdp->pgd = pgd.pgd; ++ pgdp->pgd |= extra_kern_pgd_flags; + #else /* CONFIG_KAISER */ + *pgdp = pgd; + #endif +--- a/arch/x86/include/asm/pgtable_types.h ++++ b/arch/x86/include/asm/pgtable_types.h +@@ -48,7 +48,7 @@ + #ifdef CONFIG_KAISER + #define _PAGE_GLOBAL (_AT(pteval_t, 0)) + #else +-#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) ++#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) + #endif + #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) + #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) +@@ -123,11 +123,7 @@ + #define _PAGE_DEVMAP (_AT(pteval_t, 0)) + #endif + +-#ifdef CONFIG_KAISER +-#define _PAGE_PROTNONE (_AT(pteval_t, 0)) +-#else + #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) +-#endif + + #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_DIRTY) +--- a/arch/x86/kernel/espfix_64.c ++++ b/arch/x86/kernel/espfix_64.c +@@ -127,11 +127,14 @@ void __init init_espfix_bsp(void) + /* Install the espfix pud into the kernel page directory */ + pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; + pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); +-#ifdef CONFIG_KAISER +- // add the esp stack pud to the shadow mapping here. +- // This can be done directly, because the fixup stack has its own pud +- set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page))); +-#endif ++ /* ++ * Just copy the top-level PGD that is mapping the espfix ++ * area to ensure it is mapped into the shadow user page ++ * tables. ++ */ ++ if (IS_ENABLED(CONFIG_KAISER)) ++ set_pgd(native_get_shadow_pgd(pgd_p), ++ __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page))); + + /* Randomize the locations */ + init_espfix_random(); +--- a/arch/x86/kernel/head_64.S ++++ b/arch/x86/kernel/head_64.S +@@ -406,11 +406,24 @@ GLOBAL(early_recursion_flag) + GLOBAL(name) + + #ifdef CONFIG_KAISER ++/* ++ * Each PGD needs to be 8k long and 8k aligned. We do not ++ * ever go out to userspace with these, so we do not ++ * strictly *need* the second page, but this allows us to ++ * have a single set_pgd() implementation that does not ++ * need to worry about whether it has 4k or 8k to work ++ * with. 
++ * ++ * This ensures PGDs are 8k long: ++ */ ++#define KAISER_USER_PGD_FILL 512 ++/* This ensures they are 8k-aligned: */ + #define NEXT_PGD_PAGE(name) \ + .balign 2 * PAGE_SIZE; \ + GLOBAL(name) + #else + #define NEXT_PGD_PAGE(name) NEXT_PAGE(name) ++#define KAISER_USER_PGD_FILL 0 + #endif + + /* Automate the creation of 1 to 1 mapping pmd entries */ +@@ -425,6 +438,7 @@ GLOBAL(name) + NEXT_PGD_PAGE(early_level4_pgt) + .fill 511,8,0 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE ++ .fill KAISER_USER_PGD_FILL,8,0 + + NEXT_PAGE(early_dynamic_pgts) + .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 +@@ -433,7 +447,8 @@ NEXT_PAGE(early_dynamic_pgts) + + #ifndef CONFIG_XEN + NEXT_PGD_PAGE(init_level4_pgt) +- .fill 2*512,8,0 ++ .fill 512,8,0 ++ .fill KAISER_USER_PGD_FILL,8,0 + #else + NEXT_PGD_PAGE(init_level4_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE +@@ -442,6 +457,7 @@ NEXT_PGD_PAGE(init_level4_pgt) + .org init_level4_pgt + L4_START_KERNEL*8, 0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE ++ .fill KAISER_USER_PGD_FILL,8,0 + + NEXT_PAGE(level3_ident_pgt) + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE +@@ -452,6 +468,7 @@ NEXT_PAGE(level2_ident_pgt) + */ + PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) + #endif ++ .fill KAISER_USER_PGD_FILL,8,0 + + NEXT_PAGE(level3_kernel_pgt) + .fill L3_START_KERNEL,8,0 +--- a/arch/x86/kernel/ldt.c ++++ b/arch/x86/kernel/ldt.c +@@ -18,6 +18,7 @@ + #include <linux/uaccess.h> + + #include <asm/ldt.h> ++#include <asm/kaiser.h> + #include <asm/desc.h> + #include <asm/mmu_context.h> + #include <asm/syscalls.h> +@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm) + set_ldt(pc->ldt->entries, pc->ldt->size); + } + ++static void __free_ldt_struct(struct ldt_struct *ldt) ++{ ++ if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) ++ vfree(ldt->entries); ++ else ++ free_page((unsigned long)ldt->entries); ++ kfree(ldt); ++} ++ + /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ + static struct ldt_struct *alloc_ldt_struct(int size) + { + struct ldt_struct *new_ldt; + int alloc_size; ++ int ret = 0; + + if (size > LDT_ENTRIES) + return NULL; +@@ -66,6 +77,14 @@ static struct ldt_struct *alloc_ldt_stru + return NULL; + } + ++ // FIXME: make kaiser_add_mapping() return an error code ++ // when it fails ++ kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, ++ __PAGE_KERNEL); ++ if (ret) { ++ __free_ldt_struct(new_ldt); ++ return NULL; ++ } + new_ldt->size = size; + return new_ldt; + } +@@ -92,12 +111,10 @@ static void free_ldt_struct(struct ldt_s + if (likely(!ldt)) + return; + ++ kaiser_remove_mapping((unsigned long)ldt->entries, ++ ldt->size * LDT_ENTRY_SIZE); + paravirt_free_ldt(ldt->entries, ldt->size); +- if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) +- vfree(ldt->entries); +- else +- free_page((unsigned long)ldt->entries); +- kfree(ldt); ++ __free_ldt_struct(ldt); + } + + /* +--- a/arch/x86/kernel/tracepoint.c ++++ b/arch/x86/kernel/tracepoint.c +@@ -9,10 +9,12 @@ + #include <linux/atomic.h> + + atomic_t trace_idt_ctr = ATOMIC_INIT(0); ++__aligned(PAGE_SIZE) + struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, + (unsigned long) trace_idt_table }; + + /* No need to be aligned, but done to keep all IDTs defined the same way. 
*/ ++__aligned(PAGE_SIZE) + gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; + + static int trace_irq_vector_refcount; +--- a/arch/x86/mm/kaiser.c ++++ b/arch/x86/mm/kaiser.c +@@ -1,160 +1,305 @@ +- +- ++#include <linux/bug.h> + #include <linux/kernel.h> + #include <linux/errno.h> + #include <linux/string.h> + #include <linux/types.h> + #include <linux/bug.h> + #include <linux/init.h> ++#include <linux/interrupt.h> + #include <linux/spinlock.h> + #include <linux/mm.h> +- + #include <linux/uaccess.h> ++ ++#include <asm/kaiser.h> + #include <asm/pgtable.h> + #include <asm/pgalloc.h> + #include <asm/desc.h> + #ifdef CONFIG_KAISER + + __visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); ++/* ++ * At runtime, the only things we map are some things for CPU ++ * hotplug, and stacks for new processes. No two CPUs will ever ++ * be populating the same addresses, so we only need to ensure ++ * that we protect between two CPUs trying to allocate and ++ * populate the same page table page. ++ * ++ * Only take this lock when doing a set_p[4um]d(), but it is not ++ * needed for doing a set_pte(). We assume that only the *owner* ++ * of a given allocation will be doing this for _their_ ++ * allocation. ++ * ++ * This ensures that once a system has been running for a while ++ * and there have been stacks all over and these page tables ++ * are fully populated, there will be no further acquisitions of ++ * this lock. ++ */ ++static DEFINE_SPINLOCK(shadow_table_allocation_lock); + +-/** +- * Get the real ppn from a address in kernel mapping. +- * @param address The virtual adrress +- * @return the physical address ++/* ++ * Returns -1 on error. + */ +-static inline unsigned long get_pa_from_mapping (unsigned long address) ++static inline unsigned long get_pa_from_mapping(unsigned long vaddr) + { + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + +- pgd = pgd_offset_k(address); +- BUG_ON(pgd_none(*pgd) || pgd_large(*pgd)); ++ pgd = pgd_offset_k(vaddr); ++ /* ++ * We made all the kernel PGDs present in kaiser_init(). ++ * We expect them to stay that way. ++ */ ++ BUG_ON(pgd_none(*pgd)); ++ /* ++ * PGDs are either 512GB or 128TB on all x86_64 ++ * configurations. We don't handle these. ++ */ ++ BUG_ON(pgd_large(*pgd)); ++ ++ pud = pud_offset(pgd, vaddr); ++ if (pud_none(*pud)) { ++ WARN_ON_ONCE(1); ++ return -1; ++ } + +- pud = pud_offset(pgd, address); +- BUG_ON(pud_none(*pud)); ++ if (pud_large(*pud)) ++ return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK); + +- if (pud_large(*pud)) { +- return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK); ++ pmd = pmd_offset(pud, vaddr); ++ if (pmd_none(*pmd)) { ++ WARN_ON_ONCE(1); ++ return -1; + } + +- pmd = pmd_offset(pud, address); +- BUG_ON(pmd_none(*pmd)); ++ if (pmd_large(*pmd)) ++ return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK); + +- if (pmd_large(*pmd)) { +- return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK); ++ pte = pte_offset_kernel(pmd, vaddr); ++ if (pte_none(*pte)) { ++ WARN_ON_ONCE(1); ++ return -1; + } + +- pte = pte_offset_kernel(pmd, address); +- BUG_ON(pte_none(*pte)); +- +- return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK); ++ return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); + } + +-void _kaiser_copy (unsigned long start_addr, unsigned long size, +- unsigned long flags) ++/* ++ * This is a relatively normal page table walk, except that it ++ * also tries to allocate page tables pages along the way. 
++ * ++ * Returns a pointer to a PTE on success, or NULL on failure. ++ */ ++static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic) + { +- pgd_t *pgd; +- pud_t *pud; + pmd_t *pmd; +- pte_t *pte; +- unsigned long address; +- unsigned long end_addr = start_addr + size; +- unsigned long target_address; ++ pud_t *pud; ++ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); ++ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + +- for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1)); +- address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) { +- target_address = get_pa_from_mapping(address); ++ might_sleep(); ++ if (is_atomic) { ++ gfp &= ~GFP_KERNEL; ++ gfp |= __GFP_HIGH | __GFP_ATOMIC; ++ } + +- pgd = native_get_shadow_pgd(pgd_offset_k(address)); ++ if (pgd_none(*pgd)) { ++ WARN_ONCE(1, "All shadow pgds should have been populated"); ++ return NULL; ++ } ++ BUILD_BUG_ON(pgd_large(*pgd) != 0); + +- BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n"); +- BUG_ON(pgd_large(*pgd)); ++ pud = pud_offset(pgd, address); ++ /* The shadow page tables do not use large mappings: */ ++ if (pud_large(*pud)) { ++ WARN_ON(1); ++ return NULL; ++ } ++ if (pud_none(*pud)) { ++ unsigned long new_pmd_page = __get_free_page(gfp); ++ if (!new_pmd_page) ++ return NULL; ++ spin_lock(&shadow_table_allocation_lock); ++ if (pud_none(*pud)) ++ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); ++ else ++ free_page(new_pmd_page); ++ spin_unlock(&shadow_table_allocation_lock); ++ } + +- pud = pud_offset(pgd, address); +- if (pud_none(*pud)) { +- set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address)))); +- } +- BUG_ON(pud_large(*pud)); ++ pmd = pmd_offset(pud, address); ++ /* The shadow page tables do not use large mappings: */ ++ if (pmd_large(*pmd)) { ++ WARN_ON(1); ++ return NULL; ++ } ++ if (pmd_none(*pmd)) { ++ unsigned long new_pte_page = __get_free_page(gfp); ++ if (!new_pte_page) ++ return NULL; ++ spin_lock(&shadow_table_allocation_lock); ++ if (pmd_none(*pmd)) ++ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); ++ else ++ free_page(new_pte_page); ++ spin_unlock(&shadow_table_allocation_lock); ++ } + +- pmd = pmd_offset(pud, address); +- if (pmd_none(*pmd)) { +- set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address)))); +- } +- BUG_ON(pmd_large(*pmd)); ++ return pte_offset_kernel(pmd, address); ++} + +- pte = pte_offset_kernel(pmd, address); ++int kaiser_add_user_map(const void *__start_addr, unsigned long size, ++ unsigned long flags) ++{ ++ int ret = 0; ++ pte_t *pte; ++ unsigned long start_addr = (unsigned long )__start_addr; ++ unsigned long address = start_addr & PAGE_MASK; ++ unsigned long end_addr = PAGE_ALIGN(start_addr + size); ++ unsigned long target_address; ++ ++ for (;address < end_addr; address += PAGE_SIZE) { ++ target_address = get_pa_from_mapping(address); ++ if (target_address == -1) { ++ ret = -EIO; ++ break; ++ } ++ pte = kaiser_pagetable_walk(address, false); + if (pte_none(*pte)) { + set_pte(pte, __pte(flags | target_address)); + } else { +- BUG_ON(__pa(pte_page(*pte)) != target_address); ++ pte_t tmp; ++ set_pte(&tmp, __pte(flags | target_address)); ++ WARN_ON_ONCE(!pte_same(*pte, tmp)); + } + } ++ return ret; + } + +-// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping +-static inline void __init _kaiser_init(void) ++static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags) ++{ ++ unsigned long size = end - start; ++ ++ 
return kaiser_add_user_map(start, size, flags); ++} ++ ++/* ++ * Ensure that the top level of the (shadow) page tables are ++ * entirely populated. This ensures that all processes that get ++ * forked have the same entries. This way, we do not have to ++ * ever go set up new entries in older processes. ++ * ++ * Note: we never free these, so there are no updates to them ++ * after this. ++ */ ++static void __init kaiser_init_all_pgds(void) + { + pgd_t *pgd; + int i = 0; + + pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0)); + for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { +- set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0)))); ++ pgd_t new_pgd; ++ pud_t *pud = pud_alloc_one(&init_mm, PAGE_OFFSET + i * PGDIR_SIZE); ++ if (!pud) { ++ WARN_ON(1); ++ break; ++ } ++ new_pgd = __pgd(_KERNPG_TABLE |__pa(pud)); ++ /* ++ * Make sure not to stomp on some other pgd entry. ++ */ ++ if (!pgd_none(pgd[i])) { ++ WARN_ON(1); ++ continue; ++ } ++ set_pgd(pgd + i, new_pgd); + } + } + ++#define kaiser_add_user_map_early(start, size, flags) do { \ ++ int __ret = kaiser_add_user_map(start, size, flags); \ ++ WARN_ON(__ret); \ ++} while (0) ++ ++#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \ ++ int __ret = kaiser_add_user_map_ptrs(start, end, flags); \ ++ WARN_ON(__ret); \ ++} while (0) ++ + extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; +-spinlock_t shadow_table_lock; ++/* ++ * If anything in here fails, we will likely die on one of the ++ * first kernel->user transitions and init will die. But, we ++ * will have most of the kernel up by then and should be able to ++ * get a clean warning out of it. If we BUG_ON() here, we run ++ * the risk of being before we have good console output. ++ */ + void __init kaiser_init(void) + { + int cpu; +- spin_lock_init(&shadow_table_lock); +- +- spin_lock(&shadow_table_lock); + +- _kaiser_init(); ++ kaiser_init_all_pgds(); + + for_each_possible_cpu(cpu) { +- // map the per cpu user variables +- _kaiser_copy( +- (unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)), +- (unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start, +- __PAGE_KERNEL); +- } +- +- // map the entry/exit text section, which is responsible to switch between user- and kernel mode +- _kaiser_copy( +- (unsigned long) __entry_text_start, +- (unsigned long) __entry_text_end - (unsigned long) __entry_text_start, +- __PAGE_KERNEL_RX); +- +- // the fixed map address of the idt_table +- _kaiser_copy( +- (unsigned long) idt_descr.address, +- sizeof(gate_desc) * NR_VECTORS, +- __PAGE_KERNEL_RO); ++ void *percpu_vaddr = __per_cpu_user_mapped_start + ++ per_cpu_offset(cpu); ++ unsigned long percpu_sz = __per_cpu_user_mapped_end - ++ __per_cpu_user_mapped_start; ++ kaiser_add_user_map_early(percpu_vaddr, percpu_sz, ++ __PAGE_KERNEL); ++ } + +- spin_unlock(&shadow_table_lock); ++ /* ++ * Map the entry/exit text section, which is needed at ++ * switches from user to and from kernel. 
++ */ ++ kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end, ++ __PAGE_KERNEL_RX); ++ ++#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) ++ kaiser_add_user_map_ptrs_early(__irqentry_text_start, ++ __irqentry_text_end, ++ __PAGE_KERNEL_RX); ++#endif ++ kaiser_add_user_map_early((void *)idt_descr.address, ++ sizeof(gate_desc) * NR_VECTORS, ++ __PAGE_KERNEL_RO); ++#ifdef CONFIG_TRACING ++ kaiser_add_user_map_early(&trace_idt_descr, ++ sizeof(trace_idt_descr), ++ __PAGE_KERNEL); ++ kaiser_add_user_map_early(&trace_idt_table, ++ sizeof(gate_desc) * NR_VECTORS, ++ __PAGE_KERNEL); ++#endif ++ kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr), ++ __PAGE_KERNEL); ++ kaiser_add_user_map_early(&debug_idt_table, ++ sizeof(gate_desc) * NR_VECTORS, ++ __PAGE_KERNEL); + } + ++extern void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end); + // add a mapping to the shadow-mapping, and synchronize the mappings +-void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) ++int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) + { +- spin_lock(&shadow_table_lock); +- _kaiser_copy(addr, size, flags); +- spin_unlock(&shadow_table_lock); ++ return kaiser_add_user_map((const void *)addr, size, flags); + } + +-extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end); + void kaiser_remove_mapping(unsigned long start, unsigned long size) + { +- pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start)); +- spin_lock(&shadow_table_lock); +- do { +- unmap_pud_range(pgd, start, start + size); +- } while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size))); +- spin_unlock(&shadow_table_lock); ++ unsigned long end = start + size; ++ unsigned long addr; ++ ++ for (addr = start; addr < end; addr += PGDIR_SIZE) { ++ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr)); ++ /* ++ * unmap_p4d_range() handles > P4D_SIZE unmaps, ++ * so no need to trim 'end'. 
++ */ ++ unmap_pud_range_nofree(pgd, addr, end); ++ } + } + #endif /* CONFIG_KAISER */ +--- a/arch/x86/mm/pageattr.c ++++ b/arch/x86/mm/pageattr.c +@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock); + #define CPA_FLUSHTLB 1 + #define CPA_ARRAY 2 + #define CPA_PAGES_ARRAY 4 ++#define CPA_FREE_PAGETABLES 8 + + #ifdef CONFIG_PROC_FS + static unsigned long direct_pages_count[PG_LEVEL_NUM]; +@@ -729,10 +730,13 @@ static int split_large_page(struct cpa_d + return 0; + } + +-static bool try_to_free_pte_page(pte_t *pte) ++static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte) + { + int i; + ++ if (!(cpa->flags & CPA_FREE_PAGETABLES)) ++ return false; ++ + for (i = 0; i < PTRS_PER_PTE; i++) + if (!pte_none(pte[i])) + return false; +@@ -741,10 +745,13 @@ static bool try_to_free_pte_page(pte_t * + return true; + } + +-static bool try_to_free_pmd_page(pmd_t *pmd) ++static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd) + { + int i; + ++ if (!(cpa->flags & CPA_FREE_PAGETABLES)) ++ return false; ++ + for (i = 0; i < PTRS_PER_PMD; i++) + if (!pmd_none(pmd[i])) + return false; +@@ -753,7 +760,9 @@ static bool try_to_free_pmd_page(pmd_t * + return true; + } + +-static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) ++static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd, ++ unsigned long start, ++ unsigned long end) + { + pte_t *pte = pte_offset_kernel(pmd, start); + +@@ -764,22 +773,23 @@ static bool unmap_pte_range(pmd_t *pmd, + pte++; + } + +- if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { ++ if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) { + pmd_clear(pmd); + return true; + } + return false; + } + +-static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, ++static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd, + unsigned long start, unsigned long end) + { +- if (unmap_pte_range(pmd, start, end)) +- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) ++ if (unmap_pte_range(cpa, pmd, start, end)) ++ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) + pud_clear(pud); + } + +-static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) ++static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, ++ unsigned long start, unsigned long end) + { + pmd_t *pmd = pmd_offset(pud, start); + +@@ -790,7 +800,7 @@ static void unmap_pmd_range(pud_t *pud, + unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; + unsigned long pre_end = min_t(unsigned long, end, next_page); + +- __unmap_pmd_range(pud, pmd, start, pre_end); ++ __unmap_pmd_range(cpa, pud, pmd, start, pre_end); + + start = pre_end; + pmd++; +@@ -803,7 +813,8 @@ static void unmap_pmd_range(pud_t *pud, + if (pmd_large(*pmd)) + pmd_clear(pmd); + else +- __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); ++ __unmap_pmd_range(cpa, pud, pmd, ++ start, start + PMD_SIZE); + + start += PMD_SIZE; + pmd++; +@@ -813,17 +824,19 @@ static void unmap_pmd_range(pud_t *pud, + * 4K leftovers? + */ + if (start < end) +- return __unmap_pmd_range(pud, pmd, start, end); ++ return __unmap_pmd_range(cpa, pud, pmd, start, end); + + /* + * Try again to free the PMD page if haven't succeeded above. 
+ */ + if (!pud_none(*pud)) +- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) ++ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) + pud_clear(pud); + } + +-void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) ++static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd, ++ unsigned long start, ++ unsigned long end) + { + pud_t *pud = pud_offset(pgd, start); + +@@ -834,7 +847,7 @@ void unmap_pud_range(pgd_t *pgd, unsigne + unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; + unsigned long pre_end = min_t(unsigned long, end, next_page); + +- unmap_pmd_range(pud, start, pre_end); ++ unmap_pmd_range(cpa, pud, start, pre_end); + + start = pre_end; + pud++; +@@ -848,7 +861,7 @@ void unmap_pud_range(pgd_t *pgd, unsigne + if (pud_large(*pud)) + pud_clear(pud); + else +- unmap_pmd_range(pud, start, start + PUD_SIZE); ++ unmap_pmd_range(cpa, pud, start, start + PUD_SIZE); + + start += PUD_SIZE; + pud++; +@@ -858,7 +871,7 @@ void unmap_pud_range(pgd_t *pgd, unsigne + * 2M leftovers? + */ + if (start < end) +- unmap_pmd_range(pud, start, end); ++ unmap_pmd_range(cpa, pud, start, end); + + /* + * No need to try to free the PUD page because we'll free it in +@@ -866,6 +879,24 @@ void unmap_pud_range(pgd_t *pgd, unsigne + */ + } + ++static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) ++{ ++ struct cpa_data cpa = { ++ .flags = CPA_FREE_PAGETABLES, ++ }; ++ ++ __unmap_pud_range(&cpa, pgd, start, end); ++} ++ ++void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end) ++{ ++ struct cpa_data cpa = { ++ .flags = 0, ++ }; ++ ++ __unmap_pud_range(&cpa, pgd, start, end); ++} ++ + static int alloc_pte_page(pmd_t *pmd) + { + pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); +--- a/arch/x86/mm/pgtable.c ++++ b/arch/x86/mm/pgtable.c +@@ -344,40 +344,26 @@ static inline void _pgd_free(pgd_t *pgd) + kmem_cache_free(pgd_cache, pgd); + } + #else +-static inline pgd_t *_pgd_alloc(void) +-{ +-#ifdef CONFIG_KAISER +- // Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory +- // block. Therefore, we have to allocate at least 3 pages. However, the +- // __get_free_pages returns us 4 pages. Hence, we store the base pointer at +- // the beginning of the page of our 8kb-aligned memory block in order to +- // correctly free it afterwars. + +- unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE)); +- +- if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages) +- { +- *((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages; +- return (pgd_t *) pages; +- } +- else +- { +- *((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages; +- return (pgd_t *) (pages + PAGE_SIZE); +- } ++#ifdef CONFIG_KAISER ++/* ++ * Instead of one pmd, we aquire two pmds. Being order-1, it is ++ * both 8k in size and 8k-aligned. That lets us just flip bit 12 ++ * in a pointer to swap between the two 4k halves. 
++ */ ++#define PGD_ALLOCATION_ORDER 1 + #else +- return (pgd_t *)__get_free_page(PGALLOC_GFP); ++#define PGD_ALLOCATION_ORDER 0 + #endif ++ ++static inline pgd_t *_pgd_alloc(void) ++{ ++ return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); + } + + static inline void _pgd_free(pgd_t *pgd) + { +-#ifdef CONFIG_KAISER +- unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE)); +- free_pages(pages, get_order(4*PAGE_SIZE)); +-#else +- free_page((unsigned long)pgd); +-#endif ++ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); + } + #endif /* CONFIG_X86_PAE */ + +--- /dev/null ++++ b/include/linux/kaiser.h +@@ -0,0 +1,26 @@ ++#ifndef _INCLUDE_KAISER_H ++#define _INCLUDE_KAISER_H ++ ++#ifdef CONFIG_KAISER ++#include <asm/kaiser.h> ++#else ++ ++/* ++ * These stubs are used whenever CONFIG_KAISER is off, which ++ * includes architectures that support KAISER, but have it ++ * disabled. ++ */ ++ ++static inline void kaiser_init(void) ++{ ++} ++static inline void kaiser_remove_mapping(unsigned long start, unsigned long size) ++{ ++} ++static inline int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) ++{ ++ return 0; ++} ++ ++#endif /* !CONFIG_KAISER */ ++#endif /* _INCLUDE_KAISER_H */ +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -58,6 +58,7 @@ + #include <linux/tsacct_kern.h> + #include <linux/cn_proc.h> + #include <linux/freezer.h> ++#include <linux/kaiser.h> + #include <linux/delayacct.h> + #include <linux/taskstats_kern.h> + #include <linux/random.h> +@@ -472,7 +473,6 @@ void set_task_stack_end_magic(struct tas + *stackend = STACK_END_MAGIC; /* for overflow detection */ + } + +-extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); + static struct task_struct *dup_task_struct(struct task_struct *orig, int node) + { + struct task_struct *tsk; +@@ -500,9 +500,10 @@ static struct task_struct *dup_task_stru + * functions again. + */ + tsk->stack = stack; +-#ifdef CONFIG_KAISER +- kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL); +-#endif ++ ++ err= kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL); ++ if (err) ++ goto free_stack; + #ifdef CONFIG_VMAP_STACK + tsk->stack_vm_area = stack_vm_area; + #endif +--- a/security/Kconfig ++++ b/security/Kconfig +@@ -32,12 +32,17 @@ config SECURITY + If you are unsure how to answer this question, answer N. + config KAISER + bool "Remove the kernel mapping in user mode" ++ default y + depends on X86_64 + depends on !PARAVIRT + help + This enforces a strict kernel and user space isolation in order to close + hardware side channels on kernel address information. + ++config KAISER_REAL_SWITCH ++ bool "KAISER: actually switch page tables" ++ default y ++ + config SECURITYFS + bool "Enable the securityfs filesystem" + help diff --git a/queue/kaiser-name-that-0x1000-kaiser_shadow_pgd_offset.patch b/queue/kaiser-name-that-0x1000-kaiser_shadow_pgd_offset.patch new file mode 100644 index 0000000..447c040 --- /dev/null +++ b/queue/kaiser-name-that-0x1000-kaiser_shadow_pgd_offset.patch @@ -0,0 +1,66 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Sat, 9 Sep 2017 17:31:18 -0700 +Subject: kaiser: name that 0x1000 KAISER_SHADOW_PGD_OFFSET + +From: Hugh Dickins <hughd@google.com> + + +There's a 0x1000 in various places, which looks better with a name. 
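+
+[Editor's aside, not part of the upstream commit: the sketch below is only
+illustrative.] The 0x1000 is the one-page offset between the two halves of
+the order-1 pgd allocation, so bit 12 of a cr3 value selects between the
+kernel pgd and the shadow pgd.  Roughly, in C (the helper names here are
+invented for illustration, not kernel API):
+
+        #define KAISER_SHADOW_PGD_OFFSET        0x1000
+
+        /* what _SWITCH_TO_USER_CR3 does to the pgd address bits */
+        static inline unsigned long cr3_to_shadow(unsigned long cr3)
+        {
+                return cr3 | KAISER_SHADOW_PGD_OFFSET;
+        }
+
+        /* what _SWITCH_TO_KERNEL_CR3 does to the pgd address bits */
+        static inline unsigned long cr3_to_kernel(unsigned long cr3)
+        {
+                return cr3 & ~(unsigned long)KAISER_SHADOW_PGD_OFFSET;
+        }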
+ +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/entry/entry_64.S | 4 ++-- + arch/x86/include/asm/kaiser.h | 7 +++++-- + 2 files changed, 7 insertions(+), 4 deletions(-) + +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -1318,7 +1318,7 @@ ENTRY(nmi) + movq %cr3, %rax + pushq %rax + #ifdef CONFIG_KAISER_REAL_SWITCH +- andq $(~0x1000), %rax ++ andq $(~KAISER_SHADOW_PGD_OFFSET), %rax + #endif + movq %rax, %cr3 + #endif +@@ -1561,7 +1561,7 @@ end_repeat_nmi: + movq %cr3, %rax + pushq %rax + #ifdef CONFIG_KAISER_REAL_SWITCH +- andq $(~0x1000), %rax ++ andq $(~KAISER_SHADOW_PGD_OFFSET), %rax + #endif + movq %rax, %cr3 + #endif +--- a/arch/x86/include/asm/kaiser.h ++++ b/arch/x86/include/asm/kaiser.h +@@ -13,13 +13,16 @@ + * A minimalistic kernel mapping holds the parts needed to be mapped in user + * mode, such as the entry/exit functions of the user space, or the stacks. + */ ++ ++#define KAISER_SHADOW_PGD_OFFSET 0x1000 ++ + #ifdef __ASSEMBLY__ + #ifdef CONFIG_KAISER + + .macro _SWITCH_TO_KERNEL_CR3 reg + movq %cr3, \reg + #ifdef CONFIG_KAISER_REAL_SWITCH +-andq $(~0x1000), \reg ++andq $(~KAISER_SHADOW_PGD_OFFSET), \reg + #endif + movq \reg, %cr3 + .endm +@@ -27,7 +30,7 @@ movq \reg, %cr3 + .macro _SWITCH_TO_USER_CR3 reg + movq %cr3, \reg + #ifdef CONFIG_KAISER_REAL_SWITCH +-orq $(0x1000), \reg ++orq $(KAISER_SHADOW_PGD_OFFSET), \reg + #endif + movq \reg, %cr3 + .endm diff --git a/queue/kaiser-paranoid_entry-pass-cr3-need-to-paranoid_exit.patch b/queue/kaiser-paranoid_entry-pass-cr3-need-to-paranoid_exit.patch new file mode 100644 index 0000000..f0ec889 --- /dev/null +++ b/queue/kaiser-paranoid_entry-pass-cr3-need-to-paranoid_exit.patch @@ -0,0 +1,166 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Tue, 26 Sep 2017 18:43:07 -0700 +Subject: kaiser: paranoid_entry pass cr3 need to paranoid_exit + +From: Hugh Dickins <hughd@google.com> + + +Neel Natu points out that paranoid_entry() was wrong to assume that +an entry that did not need swapgs would not need SWITCH_KERNEL_CR3: +paranoid_entry (used for debug breakpoint, int3, double fault or MCE; +though I think it's only the MCE case that is cause for concern here) +can break in at an awkward time, between cr3 switch and swapgs, but +its handling always needs kernel gs and kernel cr3. + +Easy to fix in itself, but paranoid_entry() also needs to convey to +paranoid_exit() (and my reading of macro idtentry says paranoid_entry +and paranoid_exit are always paired) how to restore the prior state. +The swapgs state is already conveyed by %ebx (0 or 1), so extend that +also to convey when SWITCH_USER_CR3 will be needed (2 or 3). + +(Yes, I'd much prefer that 0 meant no swapgs, whereas it's the other +way round: and a convention shared with error_entry() and error_exit(), +which I don't want to touch. Perhaps I should have inverted the bit +for switch cr3 too, but did not.) + +paranoid_exit() would be straightforward, except for TRACE_IRQS: it +did TRACE_IRQS_IRETQ when doing swapgs, but TRACE_IRQS_IRETQ_DEBUG +when not: which is it supposed to use when SWITCH_USER_CR3 is split +apart from that? 
As best as I can determine, commit 5963e317b1e9 +("ftrace/x86: Do not change stacks in DEBUG when calling lockdep") +missed the swapgs case, and should have used TRACE_IRQS_IRETQ_DEBUG +there too (the discrepancy has nothing to do with the liberal use +of _NO_STACK and _UNSAFE_STACK hereabouts: TRACE_IRQS_OFF_DEBUG has +just been used in all cases); discrepancy lovingly preserved across +several paranoid_exit() cleanups, but I'm now removing it. + +Neel further indicates that to use SWITCH_USER_CR3_NO_STACK there in +paranoid_exit() is now not only unnecessary but unsafe: might corrupt +syscall entry's unsafe_stack_register_backup of %rax. Just use +SWITCH_USER_CR3: and delete SWITCH_USER_CR3_NO_STACK altogether, +before we make the mistake of using it again. + +hughd adds: this commit fixes an issue in the Kaiser-without-PCIDs +part of the series, and ought to be moved earlier, if you decided +to make a release of Kaiser-without-PCIDs. + +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/entry/entry_64.S | 46 ++++++++++++++++++++++++++++++--------- + arch/x86/entry/entry_64_compat.S | 2 - + arch/x86/include/asm/kaiser.h | 8 ------ + 3 files changed, 37 insertions(+), 19 deletions(-) + +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -1053,7 +1053,11 @@ idtentry machine_check has_error_cod + /* + * Save all registers in pt_regs, and switch gs if needed. + * Use slow, but surefire "are we in kernel?" check. +- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise ++ * ++ * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit ++ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit ++ * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit ++ * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit + */ + ENTRY(paranoid_entry) + cld +@@ -1065,9 +1069,26 @@ ENTRY(paranoid_entry) + testl %edx, %edx + js 1f /* negative -> in kernel */ + SWAPGS +- SWITCH_KERNEL_CR3 + xorl %ebx, %ebx +-1: ret ++1: ++#ifdef CONFIG_KAISER ++ /* ++ * We might have come in between a swapgs and a SWITCH_KERNEL_CR3 ++ * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit. ++ * Do a conditional SWITCH_KERNEL_CR3: this could safely be done ++ * unconditionally, but we need to find out whether the reverse ++ * should be done on return (conveyed to paranoid_exit in %ebx). ++ */ ++ movq %cr3, %rax ++ testl $KAISER_SHADOW_PGD_OFFSET, %eax ++ jz 2f ++ orl $2, %ebx ++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax ++ orq x86_cr3_pcid_noflush, %rax ++ movq %rax, %cr3 ++2: ++#endif ++ ret + END(paranoid_entry) + + /* +@@ -1080,20 +1101,25 @@ END(paranoid_entry) + * be complicated. Fortunately, we there's no good reason + * to try to handle preemption here. + * +- * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) ++ * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3 ++ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 ++ * ebx=2: needs both swapgs and SWITCH_USER_CR3 ++ * ebx=3: needs SWITCH_USER_CR3 but not swapgs + */ + ENTRY(paranoid_exit) + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF_DEBUG +- testl %ebx, %ebx /* swapgs needed? */ ++ TRACE_IRQS_IRETQ_DEBUG ++#ifdef CONFIG_KAISER ++ testl $2, %ebx /* SWITCH_USER_CR3 needed? */ ++ jz paranoid_exit_no_switch ++ SWITCH_USER_CR3 ++paranoid_exit_no_switch: ++#endif ++ testl $1, %ebx /* swapgs needed? 
*/ + jnz paranoid_exit_no_swapgs +- TRACE_IRQS_IRETQ +- SWITCH_USER_CR3_NO_STACK + SWAPGS_UNSAFE_STACK +- jmp paranoid_exit_restore + paranoid_exit_no_swapgs: +- TRACE_IRQS_IRETQ_DEBUG +-paranoid_exit_restore: + RESTORE_EXTRA_REGS + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 +--- a/arch/x86/entry/entry_64_compat.S ++++ b/arch/x86/entry/entry_64_compat.S +@@ -343,7 +343,7 @@ ENTRY(entry_INT80_compat) + + /* Go back to user mode. */ + TRACE_IRQS_ON +- SWITCH_USER_CR3_NO_STACK ++ SWITCH_USER_CR3 + SWAPGS + jmp restore_regs_and_iret + END(entry_INT80_compat) +--- a/arch/x86/include/asm/kaiser.h ++++ b/arch/x86/include/asm/kaiser.h +@@ -63,20 +63,12 @@ _SWITCH_TO_KERNEL_CR3 %rax + movq PER_CPU_VAR(unsafe_stack_register_backup), %rax + .endm + +-.macro SWITCH_USER_CR3_NO_STACK +-movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) +-_SWITCH_TO_USER_CR3 %rax %al +-movq PER_CPU_VAR(unsafe_stack_register_backup), %rax +-.endm +- + #else /* CONFIG_KAISER */ + + .macro SWITCH_KERNEL_CR3 reg + .endm + .macro SWITCH_USER_CR3 reg regb + .endm +-.macro SWITCH_USER_CR3_NO_STACK +-.endm + .macro SWITCH_KERNEL_CR3_NO_STACK + .endm + diff --git a/queue/kaiser-pcid-0-for-kernel-and-128-for-user.patch b/queue/kaiser-pcid-0-for-kernel-and-128-for-user.patch new file mode 100644 index 0000000..7e8f7e2 --- /dev/null +++ b/queue/kaiser-pcid-0-for-kernel-and-128-for-user.patch @@ -0,0 +1,129 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Fri, 8 Sep 2017 19:26:30 -0700 +Subject: kaiser: PCID 0 for kernel and 128 for user + +From: Hugh Dickins <hughd@google.com> + + +Why was 4 chosen for kernel PCID and 6 for user PCID? +No good reason in a backport where PCIDs are only used for Kaiser. + +If we continue with those, then we shall need to add Andy Lutomirski's +4.13 commit 6c690ee1039b ("x86/mm: Split read_cr3() into read_cr3_pa() +and __read_cr3()"), which deals with the problem of read_cr3() callers +finding stray bits in the cr3 that they expected to be page-aligned; +and for hibernation, his 4.14 commit f34902c5c6c0 ("x86/hibernate/64: +Mask off CR3's PCID bits in the saved CR3"). + +But if 0 is used for kernel PCID, then there's no need to add in those +commits - whenever the kernel looks, it sees 0 in the lower bits; and +0 for kernel seems an obvious choice. + +And I naughtily propose 128 for user PCID. Because there's a place +in _SWITCH_TO_USER_CR3 where it takes note of the need for TLB FLUSH, +but needs to reset that to NOFLUSH for the next occasion. Currently +it does so with a "movb $(0x80)" into the high byte of the per-cpu +quadword, but that will cause a machine without PCID support to crash. +Now, if %al just happened to have 0x80 in it at that point, on a +machine with PCID support, but 0 on a machine without PCID support... + +(That will go badly wrong once the pgd can be at a physical address +above 2^56, but even with 5-level paging, physical goes up to 2^52.) 
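+
+[Editor's aside, not part of the upstream commit: a sketch of the byte trick
+described above, assuming the usual little-endian x86-64 layout; the helper
+name is invented for illustration.] X86_CR3_PCID_NOFLUSH is bit 63, i.e.
+0x80 in the top byte of the quadword, and the user ASID is now 0x80 in the
+bottom byte, so the same low byte of the register serves both purposes:
+
+        #define X86_CR3_PCID_NOFLUSH    (1UL << 63)     /* top byte = 0x80 */
+        #define X86_CR3_PCID_ASID_USER  0x80UL          /* low byte of user cr3 */
+
+        /*
+         * What "movb \regb, PER_CPU_VAR(..._USER_VAR+7)" amounts to: store
+         * the low byte of the user cr3 (0x80 with PCID, 0x00 without) into
+         * byte 7 of the per-cpu quadword, re-arming NOFLUSH for the next
+         * switch only when PCIDs are actually in use.
+         */
+        static inline void rearm_noflush(unsigned long *user_cr3_var,
+                                         unsigned long user_cr3)
+        {
+                ((unsigned char *)user_cr3_var)[7] = (unsigned char)user_cr3;
+        }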
+ +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/kaiser.h | 19 ++++++++++++------- + arch/x86/include/asm/pgtable_types.h | 7 ++++--- + arch/x86/mm/tlb.c | 3 +++ + 3 files changed, 19 insertions(+), 10 deletions(-) + +--- a/arch/x86/include/asm/kaiser.h ++++ b/arch/x86/include/asm/kaiser.h +@@ -29,14 +29,19 @@ orq X86_CR3_PCID_KERN_VAR, \reg + movq \reg, %cr3 + .endm + +-.macro _SWITCH_TO_USER_CR3 reg ++.macro _SWITCH_TO_USER_CR3 reg regb ++/* ++ * regb must be the low byte portion of reg: because we have arranged ++ * for the low byte of the user PCID to serve as the high byte of NOFLUSH ++ * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are ++ * not enabled): so that the one register can update both memory and cr3. ++ */ + movq %cr3, \reg + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg + orq PER_CPU_VAR(X86_CR3_PCID_USER_VAR), \reg + js 9f +-// FLUSH this time, reset to NOFLUSH for next time +-// But if nopcid? Consider using 0x80 for user pcid? +-movb $(0x80), PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7) ++/* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */ ++movb \regb, PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7) + 9: + movq \reg, %cr3 + .endm +@@ -49,7 +54,7 @@ popq %rax + + .macro SWITCH_USER_CR3 + pushq %rax +-_SWITCH_TO_USER_CR3 %rax ++_SWITCH_TO_USER_CR3 %rax %al + popq %rax + .endm + +@@ -61,7 +66,7 @@ movq PER_CPU_VAR(unsafe_stack_register_b + + .macro SWITCH_USER_CR3_NO_STACK + movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) +-_SWITCH_TO_USER_CR3 %rax ++_SWITCH_TO_USER_CR3 %rax %al + movq PER_CPU_VAR(unsafe_stack_register_backup), %rax + .endm + +@@ -69,7 +74,7 @@ movq PER_CPU_VAR(unsafe_stack_register_b + + .macro SWITCH_KERNEL_CR3 reg + .endm +-.macro SWITCH_USER_CR3 reg ++.macro SWITCH_USER_CR3 reg regb + .endm + .macro SWITCH_USER_CR3_NO_STACK + .endm +--- a/arch/x86/include/asm/pgtable_types.h ++++ b/arch/x86/include/asm/pgtable_types.h +@@ -146,16 +146,17 @@ + + /* Mask for all the PCID-related bits in CR3: */ + #define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK) ++#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL)) ++ + #if defined(CONFIG_KAISER) && defined(CONFIG_X86_64) +-#define X86_CR3_PCID_ASID_KERN (_AC(0x4,UL)) +-#define X86_CR3_PCID_ASID_USER (_AC(0x6,UL)) ++/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */ ++#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL)) + + #define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN) + #define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER) + #define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN) + #define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER) + #else +-#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL)) + #define X86_CR3_PCID_ASID_USER (_AC(0x0,UL)) + /* + * PCIDs are unsupported on 32-bit and none of these bits can be +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -50,6 +50,9 @@ static void load_new_mm_cr3(pgd_t *pgdir + * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could + * do it here, but can only be used if X86_FEATURE_INVPCID is + * available - and many machines support pcid without invpcid. ++ * ++ * The line below is a no-op: X86_CR3_PCID_KERN_FLUSH is now 0; ++ * but keep that line in there in case something changes. 
+ */ + new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH; + kaiser_flush_tlb_on_return_to_user(); diff --git a/queue/kaiser-set-_page_nx-only-if-supported.patch b/queue/kaiser-set-_page_nx-only-if-supported.patch new file mode 100644 index 0000000..0e59d80 --- /dev/null +++ b/queue/kaiser-set-_page_nx-only-if-supported.patch @@ -0,0 +1,118 @@ +From: Guenter Roeck <groeck@chromium.org> +Date: Thu, 4 Jan 2018 13:41:55 -0800 +Subject: kaiser: Set _PAGE_NX only if supported + +From: Guenter Roeck <groeck@chromium.org> + +This resolves a crash if loaded under qemu + haxm under windows. +See https://www.spinics.net/lists/kernel/msg2689835.html for details. +Here is a boot log (the log is from chromeos-4.4, but Tao Wu says that +the same log is also seen with vanilla v4.4.110-rc1). + +[ 0.712750] Freeing unused kernel memory: 552K +[ 0.721821] init: Corrupted page table at address 57b029b332e0 +[ 0.722761] PGD 80000000bb238067 PUD bc36a067 PMD bc369067 PTE 45d2067 +[ 0.722761] Bad pagetable: 000b [#1] PREEMPT SMP +[ 0.722761] Modules linked in: +[ 0.722761] CPU: 1 PID: 1 Comm: init Not tainted 4.4.96 #31 +[ 0.722761] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS +rel-1.7.5.1-0-g8936dbb-20141113_115728-nilsson.home.kraxel.org 04/01/2014 +[ 0.722761] task: ffff8800bc290000 ti: ffff8800bc28c000 task.ti: ffff8800bc28c000 +[ 0.722761] RIP: 0010:[<ffffffff83f4129e>] [<ffffffff83f4129e>] __clear_user+0x42/0x67 +[ 0.722761] RSP: 0000:ffff8800bc28fcf8 EFLAGS: 00010202 +[ 0.722761] RAX: 0000000000000000 RBX: 00000000000001a4 RCX: 00000000000001a4 +[ 0.722761] RDX: 0000000000000000 RSI: 0000000000000008 RDI: 000057b029b332e0 +[ 0.722761] RBP: ffff8800bc28fd08 R08: ffff8800bc290000 R09: ffff8800bb2f4000 +[ 0.722761] R10: ffff8800bc290000 R11: ffff8800bb2f4000 R12: 000057b029b332e0 +[ 0.722761] R13: 0000000000000000 R14: 000057b029b33340 R15: ffff8800bb1e2a00 +[ 0.722761] FS: 0000000000000000(0000) GS:ffff8800bfb00000(0000) knlGS:0000000000000000 +[ 0.722761] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b +[ 0.722761] CR2: 000057b029b332e0 CR3: 00000000bb2f8000 CR4: 00000000000006e0 +[ 0.722761] Stack: +[ 0.722761] 000057b029b332e0 ffff8800bb95fa80 ffff8800bc28fd18 ffffffff83f4120c +[ 0.722761] ffff8800bc28fe18 ffffffff83e9e7a1 ffff8800bc28fd68 0000000000000000 +[ 0.722761] ffff8800bc290000 ffff8800bc290000 ffff8800bc290000 ffff8800bc290000 +[ 0.722761] Call Trace: +[ 0.722761] [<ffffffff83f4120c>] clear_user+0x2e/0x30 +[ 0.722761] [<ffffffff83e9e7a1>] load_elf_binary+0xa7f/0x18f7 +[ 0.722761] [<ffffffff83de2088>] search_binary_handler+0x86/0x19c +[ 0.722761] [<ffffffff83de389e>] do_execveat_common.isra.26+0x909/0xf98 +[ 0.722761] [<ffffffff844febe0>] ? rest_init+0x87/0x87 +[ 0.722761] [<ffffffff83de40be>] do_execve+0x23/0x25 +[ 0.722761] [<ffffffff83c002e3>] run_init_process+0x2b/0x2d +[ 0.722761] [<ffffffff844fec4d>] kernel_init+0x6d/0xda +[ 0.722761] [<ffffffff84505b2f>] ret_from_fork+0x3f/0x70 +[ 0.722761] [<ffffffff844febe0>] ? 
rest_init+0x87/0x87 +[ 0.722761] Code: 86 84 be 12 00 00 00 e8 87 0d e8 ff 66 66 90 48 89 d8 48 c1 +eb 03 4c 89 e7 83 e0 07 48 89 d9 be 08 00 00 00 31 d2 48 85 c9 74 0a <48> 89 17 +48 01 f7 ff c9 75 f6 48 89 c1 85 c9 74 09 88 17 48 ff +[ 0.722761] RIP [<ffffffff83f4129e>] __clear_user+0x42/0x67 +[ 0.722761] RSP <ffff8800bc28fcf8> +[ 0.722761] ---[ end trace def703879b4ff090 ]--- +[ 0.722761] BUG: sleeping function called from invalid context at /mnt/host/source/src/third_party/kernel/v4.4/kernel/locking/rwsem.c:21 +[ 0.722761] in_atomic(): 0, irqs_disabled(): 1, pid: 1, name: init +[ 0.722761] CPU: 1 PID: 1 Comm: init Tainted: G D 4.4.96 #31 +[ 0.722761] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5.1-0-g8936dbb-20141113_115728-nilsson.home.kraxel.org 04/01/2014 +[ 0.722761] 0000000000000086 dcb5d76098c89836 ffff8800bc28fa30 ffffffff83f34004 +[ 0.722761] ffffffff84839dc2 0000000000000015 ffff8800bc28fa40 ffffffff83d57dc9 +[ 0.722761] ffff8800bc28fa68 ffffffff83d57e6a ffffffff84a53640 0000000000000000 +[ 0.722761] Call Trace: +[ 0.722761] [<ffffffff83f34004>] dump_stack+0x4d/0x63 +[ 0.722761] [<ffffffff83d57dc9>] ___might_sleep+0x13a/0x13c +[ 0.722761] [<ffffffff83d57e6a>] __might_sleep+0x9f/0xa6 +[ 0.722761] [<ffffffff84502788>] down_read+0x20/0x31 +[ 0.722761] [<ffffffff83cc5d9b>] __blocking_notifier_call_chain+0x35/0x63 +[ 0.722761] [<ffffffff83cc5ddd>] blocking_notifier_call_chain+0x14/0x16 +[ 0.800374] usb 1-1: new full-speed USB device number 2 using uhci_hcd +[ 0.722761] [<ffffffff83cefe97>] profile_task_exit+0x1a/0x1c +[ 0.802309] [<ffffffff83cac84e>] do_exit+0x39/0xe7f +[ 0.802309] [<ffffffff83ce5938>] ? vprintk_default+0x1d/0x1f +[ 0.802309] [<ffffffff83d7bb95>] ? printk+0x57/0x73 +[ 0.802309] [<ffffffff83c46e25>] oops_end+0x80/0x85 +[ 0.802309] [<ffffffff83c7b747>] pgtable_bad+0x8a/0x95 +[ 0.802309] [<ffffffff83ca7f4a>] __do_page_fault+0x8c/0x352 +[ 0.802309] [<ffffffff83eefba5>] ? file_has_perm+0xc4/0xe5 +[ 0.802309] [<ffffffff83ca821c>] do_page_fault+0xc/0xe +[ 0.802309] [<ffffffff84507682>] page_fault+0x22/0x30 +[ 0.802309] [<ffffffff83f4129e>] ? __clear_user+0x42/0x67 +[ 0.802309] [<ffffffff83f4127f>] ? __clear_user+0x23/0x67 +[ 0.802309] [<ffffffff83f4120c>] clear_user+0x2e/0x30 +[ 0.802309] [<ffffffff83e9e7a1>] load_elf_binary+0xa7f/0x18f7 +[ 0.802309] [<ffffffff83de2088>] search_binary_handler+0x86/0x19c +[ 0.802309] [<ffffffff83de389e>] do_execveat_common.isra.26+0x909/0xf98 +[ 0.802309] [<ffffffff844febe0>] ? rest_init+0x87/0x87 +[ 0.802309] [<ffffffff83de40be>] do_execve+0x23/0x25 +[ 0.802309] [<ffffffff83c002e3>] run_init_process+0x2b/0x2d +[ 0.802309] [<ffffffff844fec4d>] kernel_init+0x6d/0xda +[ 0.802309] [<ffffffff84505b2f>] ret_from_fork+0x3f/0x70 +[ 0.802309] [<ffffffff844febe0>] ? rest_init+0x87/0x87 +[ 0.830559] Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009 +[ 0.830559] +[ 0.831305] Kernel Offset: 0x2c00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) +[ 0.831305] ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009 + +The crash part of this problem may be solved with the following patch +(thanks to Hugh for the hint). There is still another problem, though - +with this patch applied, the qemu session aborts with "VCPU Shutdown +request", whatever that means. 
+ +Cc: lepton <ytht.net@gmail.com> +Signed-off-by: Guenter Roeck <groeck@chromium.org> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/mm/kaiser.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/arch/x86/mm/kaiser.c ++++ b/arch/x86/mm/kaiser.c +@@ -413,7 +413,8 @@ pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, + * get out to userspace running on the kernel CR3, + * userspace will crash instead of running. + */ +- pgd.pgd |= _PAGE_NX; ++ if (__supported_pte_mask & _PAGE_NX) ++ pgd.pgd |= _PAGE_NX; + } + } else if (!pgd.pgd) { + /* diff --git a/queue/kaiser-stack-map-page_size-at-thread_size-page_size.patch b/queue/kaiser-stack-map-page_size-at-thread_size-page_size.patch new file mode 100644 index 0000000..6ea3930 --- /dev/null +++ b/queue/kaiser-stack-map-page_size-at-thread_size-page_size.patch @@ -0,0 +1,139 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Sun, 3 Sep 2017 18:57:03 -0700 +Subject: kaiser: stack map PAGE_SIZE at THREAD_SIZE-PAGE_SIZE + +From: Hugh Dickins <hughd@google.com> + + +Kaiser only needs to map one page of the stack; and +kernel/fork.c did not build on powerpc (no __PAGE_KERNEL). +It's all cleaner if linux/kaiser.h provides kaiser_map_thread_stack() +and kaiser_unmap_thread_stack() wrappers around asm/kaiser.h's +kaiser_add_mapping() and kaiser_remove_mapping(). And use +linux/kaiser.h in init/main.c to avoid the #ifdefs there. + +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + include/linux/kaiser.h | 40 +++++++++++++++++++++++++++++++++------- + init/main.c | 6 +----- + kernel/fork.c | 7 ++----- + 3 files changed, 36 insertions(+), 17 deletions(-) + +--- a/include/linux/kaiser.h ++++ b/include/linux/kaiser.h +@@ -1,26 +1,52 @@ +-#ifndef _INCLUDE_KAISER_H +-#define _INCLUDE_KAISER_H ++#ifndef _LINUX_KAISER_H ++#define _LINUX_KAISER_H + + #ifdef CONFIG_KAISER + #include <asm/kaiser.h> ++ ++static inline int kaiser_map_thread_stack(void *stack) ++{ ++ /* ++ * Map that page of kernel stack on which we enter from user context. ++ */ ++ return kaiser_add_mapping((unsigned long)stack + ++ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL); ++} ++ ++static inline void kaiser_unmap_thread_stack(void *stack) ++{ ++ /* ++ * Note: may be called even when kaiser_map_thread_stack() failed. ++ */ ++ kaiser_remove_mapping((unsigned long)stack + ++ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE); ++} + #else + + /* + * These stubs are used whenever CONFIG_KAISER is off, which +- * includes architectures that support KAISER, but have it +- * disabled. ++ * includes architectures that support KAISER, but have it disabled. 
+ */ + + static inline void kaiser_init(void) + { + } +-static inline void kaiser_remove_mapping(unsigned long start, unsigned long size) ++static inline int kaiser_add_mapping(unsigned long addr, ++ unsigned long size, unsigned long flags) ++{ ++ return 0; ++} ++static inline void kaiser_remove_mapping(unsigned long start, ++ unsigned long size) + { + } +-static inline int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) ++static inline int kaiser_map_thread_stack(void *stack) + { + return 0; + } ++static inline void kaiser_unmap_thread_stack(void *stack) ++{ ++} + + #endif /* !CONFIG_KAISER */ +-#endif /* _INCLUDE_KAISER_H */ ++#endif /* _LINUX_KAISER_H */ +--- a/init/main.c ++++ b/init/main.c +@@ -80,15 +80,13 @@ + #include <linux/integrity.h> + #include <linux/proc_ns.h> + #include <linux/io.h> ++#include <linux/kaiser.h> + + #include <asm/io.h> + #include <asm/bugs.h> + #include <asm/setup.h> + #include <asm/sections.h> + #include <asm/cacheflush.h> +-#ifdef CONFIG_KAISER +-#include <asm/kaiser.h> +-#endif + + static int kernel_init(void *); + +@@ -476,9 +474,7 @@ static void __init mm_init(void) + pgtable_init(); + vmalloc_init(); + ioremap_huge_init(); +-#ifdef CONFIG_KAISER + kaiser_init(); +-#endif + } + + asmlinkage __visible void __init start_kernel(void) +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -212,12 +212,9 @@ static unsigned long *alloc_thread_stack + #endif + } + +-extern void kaiser_remove_mapping(unsigned long start_addr, unsigned long size); + static inline void free_thread_stack(struct task_struct *tsk) + { +-#ifdef CONFIG_KAISER +- kaiser_remove_mapping((unsigned long)tsk->stack, THREAD_SIZE); +-#endif ++ kaiser_unmap_thread_stack(tsk->stack); + #ifdef CONFIG_VMAP_STACK + if (task_stack_vm_area(tsk)) { + unsigned long flags; +@@ -501,7 +498,7 @@ static struct task_struct *dup_task_stru + */ + tsk->stack = stack; + +- err= kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL); ++ err= kaiser_map_thread_stack(tsk->stack); + if (err) + goto free_stack; + #ifdef CONFIG_VMAP_STACK diff --git a/queue/kaiser-tidied-up-asm-kaiser.h-somewhat.patch b/queue/kaiser-tidied-up-asm-kaiser.h-somewhat.patch new file mode 100644 index 0000000..2007d66 --- /dev/null +++ b/queue/kaiser-tidied-up-asm-kaiser.h-somewhat.patch @@ -0,0 +1,105 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Sun, 3 Sep 2017 19:18:07 -0700 +Subject: kaiser: tidied up asm/kaiser.h somewhat + +From: Hugh Dickins <hughd@google.com> + + +Mainly deleting a surfeit of blank lines, and reflowing header comment. + +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/kaiser.h | 32 +++++++++++++------------------- + 1 file changed, 13 insertions(+), 19 deletions(-) + +--- a/arch/x86/include/asm/kaiser.h ++++ b/arch/x86/include/asm/kaiser.h +@@ -1,15 +1,17 @@ + #ifndef _ASM_X86_KAISER_H + #define _ASM_X86_KAISER_H +- +-/* This file includes the definitions for the KAISER feature. +- * KAISER is a counter measure against x86_64 side channel attacks on the kernel virtual memory. +- * It has a shodow-pgd for every process. the shadow-pgd has a minimalistic kernel-set mapped, +- * but includes the whole user memory. Within a kernel context switch, or when an interrupt is handled, +- * the pgd is switched to the normal one. When the system switches to user mode, the shadow pgd is enabled. 
+- * By this, the virtual memory chaches are freed, and the user may not attack the whole kernel memory. ++/* ++ * This file includes the definitions for the KAISER feature. ++ * KAISER is a counter measure against x86_64 side channel attacks on ++ * the kernel virtual memory. It has a shadow pgd for every process: the ++ * shadow pgd has a minimalistic kernel-set mapped, but includes the whole ++ * user memory. Within a kernel context switch, or when an interrupt is handled, ++ * the pgd is switched to the normal one. When the system switches to user mode, ++ * the shadow pgd is enabled. By this, the virtual memory caches are freed, ++ * and the user may not attack the whole kernel memory. + * +- * A minimalistic kernel mapping holds the parts needed to be mapped in user mode, as the entry/exit functions +- * of the user space, or the stacks. ++ * A minimalistic kernel mapping holds the parts needed to be mapped in user ++ * mode, such as the entry/exit functions of the user space, or the stacks. + */ + #ifdef __ASSEMBLY__ + #ifdef CONFIG_KAISER +@@ -48,13 +50,10 @@ _SWITCH_TO_KERNEL_CR3 %rax + movq PER_CPU_VAR(unsafe_stack_register_backup), %rax + .endm + +- + .macro SWITCH_USER_CR3_NO_STACK +- + movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) + _SWITCH_TO_USER_CR3 %rax + movq PER_CPU_VAR(unsafe_stack_register_backup), %rax +- + .endm + + #else /* CONFIG_KAISER */ +@@ -72,7 +71,6 @@ movq PER_CPU_VAR(unsafe_stack_register_b + + #else /* __ASSEMBLY__ */ + +- + #ifdef CONFIG_KAISER + /* + * Upon kernel/user mode switch, it may happen that the address +@@ -80,7 +78,6 @@ movq PER_CPU_VAR(unsafe_stack_register_b + * stored. To change the address space, another register is + * needed. A register therefore has to be stored/restored. + */ +- + DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + + /** +@@ -95,7 +92,6 @@ DECLARE_PER_CPU_USER_MAPPED(unsigned lon + */ + extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); + +- + /** + * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping + * @addr: the start address of the range +@@ -104,12 +100,12 @@ extern int kaiser_add_mapping(unsigned l + extern void kaiser_remove_mapping(unsigned long start, unsigned long size); + + /** +- * kaiser_initialize_mapping - Initalize the shadow mapping ++ * kaiser_init - Initialize the shadow mapping + * + * Most parts of the shadow mapping can be mapped upon boot + * time. Only per-process things like the thread stacks + * or a new LDT have to be mapped at runtime. These boot- +- * time mappings are permanent and nevertunmapped. ++ * time mappings are permanent and never unmapped. + */ + extern void kaiser_init(void); + +@@ -117,6 +113,4 @@ extern void kaiser_init(void); + + #endif /* __ASSEMBLY */ + +- +- + #endif /* _ASM_X86_KAISER_H */ diff --git a/queue/kaiser-tidied-up-kaiser_add-remove_mapping-slightly.patch b/queue/kaiser-tidied-up-kaiser_add-remove_mapping-slightly.patch new file mode 100644 index 0000000..4bb0110 --- /dev/null +++ b/queue/kaiser-tidied-up-kaiser_add-remove_mapping-slightly.patch @@ -0,0 +1,50 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Sun, 3 Sep 2017 19:23:08 -0700 +Subject: kaiser: tidied up kaiser_add/remove_mapping slightly + +From: Hugh Dickins <hughd@google.com> + + +Yes, unmap_pud_range_nofree()'s declaration ought to be in a +header file really, but I'm not sure we want to use it anyway: +so for now just declare it inside kaiser_remove_mapping(). 
+And there doesn't seem to be such a thing as unmap_p4d_range(), +even in a 5-level paging tree. + +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/mm/kaiser.c | 9 +++------ + 1 file changed, 3 insertions(+), 6 deletions(-) + +--- a/arch/x86/mm/kaiser.c ++++ b/arch/x86/mm/kaiser.c +@@ -285,8 +285,7 @@ void __init kaiser_init(void) + __PAGE_KERNEL); + } + +-extern void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end); +-// add a mapping to the shadow-mapping, and synchronize the mappings ++/* Add a mapping to the shadow mapping, and synchronize the mappings */ + int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) + { + return kaiser_add_user_map((const void *)addr, size, flags); +@@ -294,15 +293,13 @@ int kaiser_add_mapping(unsigned long add + + void kaiser_remove_mapping(unsigned long start, unsigned long size) + { ++ extern void unmap_pud_range_nofree(pgd_t *pgd, ++ unsigned long start, unsigned long end); + unsigned long end = start + size; + unsigned long addr; + + for (addr = start; addr < end; addr += PGDIR_SIZE) { + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr)); +- /* +- * unmap_p4d_range() handles > P4D_SIZE unmaps, +- * so no need to trim 'end'. +- */ + unmap_pud_range_nofree(pgd, addr, end); + } + } diff --git a/queue/kaiser-use-alternative-instead-of-x86_cr3_pcid_noflush.patch b/queue/kaiser-use-alternative-instead-of-x86_cr3_pcid_noflush.patch new file mode 100644 index 0000000..44bbb7a --- /dev/null +++ b/queue/kaiser-use-alternative-instead-of-x86_cr3_pcid_noflush.patch @@ -0,0 +1,130 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Tue, 3 Oct 2017 20:49:04 -0700 +Subject: kaiser: use ALTERNATIVE instead of x86_cr3_pcid_noflush + +From: Hugh Dickins <hughd@google.com> + + +Now that we're playing the ALTERNATIVE game, use that more efficient +method: instead of user-mapping an extra page, and reading an extra +cacheline each time for x86_cr3_pcid_noflush. + +Neel has found that __stringify(bts $X86_CR3_PCID_NOFLUSH_BIT, %rax) +is a working substitute for the "bts $63, %rax" in these ALTERNATIVEs; +but the one line with $63 in looks clearer, so let's stick with that. + +Worried about what happens with an ALTERNATIVE between the jump and +jump label in another ALTERNATIVE? I was, but have checked the +combinations in SWITCH_KERNEL_CR3_NO_STACK at entry_SYSCALL_64, +and it does a good job. 
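+
+[Editor's aside, not part of the upstream commit: a rough C rendering of what
+the kernel-direction ALTERNATIVE achieves at run time, for readers less used
+to alternatives; the function name is invented for illustration.]
+
+        static inline unsigned long make_kernel_cr3(unsigned long cr3)
+        {
+                cr3 &= ~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET);
+                /* nops on CPUs without PCID, "bts $63" patched in otherwise */
+                if (boot_cpu_has(X86_FEATURE_PCID))
+                        cr3 |= 1UL << 63;       /* X86_CR3_PCID_NOFLUSH */
+                return cr3;
+        }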
+ +Signed-off-by: Hugh Dickins <hughd@google.com> +Acked-by: Jiri Kosina <jkosina@suse.cz> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/entry/entry_64.S | 7 ++++--- + arch/x86/include/asm/kaiser.h | 6 +++--- + arch/x86/mm/kaiser.c | 11 +---------- + 3 files changed, 8 insertions(+), 16 deletions(-) + +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -1084,7 +1084,8 @@ ENTRY(paranoid_entry) + jz 2f + orl $2, %ebx + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax +- orq x86_cr3_pcid_noflush, %rax ++ /* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */ ++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID + movq %rax, %cr3 + 2: + #endif +@@ -1344,7 +1345,7 @@ ENTRY(nmi) + /* %rax is saved above, so OK to clobber here */ + ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER + /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ +- orq x86_cr3_pcid_noflush, %rax ++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID + pushq %rax + /* mask off "user" bit of pgd address and 12 PCID bits: */ + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax +@@ -1588,7 +1589,7 @@ end_repeat_nmi: + /* %rax is saved above, so OK to clobber here */ + ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER + /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ +- orq x86_cr3_pcid_noflush, %rax ++ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID + pushq %rax + /* mask off "user" bit of pgd address and 12 PCID bits: */ + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax +--- a/arch/x86/include/asm/kaiser.h ++++ b/arch/x86/include/asm/kaiser.h +@@ -25,7 +25,8 @@ + .macro _SWITCH_TO_KERNEL_CR3 reg + movq %cr3, \reg + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg +-orq x86_cr3_pcid_noflush, \reg ++/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */ ++ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID + movq \reg, %cr3 + .endm + +@@ -39,7 +40,7 @@ movq \reg, %cr3 + movq %cr3, \reg + orq PER_CPU_VAR(x86_cr3_pcid_user), \reg + js 9f +-/* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */ ++/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */ + movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7) + 9: + movq \reg, %cr3 +@@ -90,7 +91,6 @@ movq PER_CPU_VAR(unsafe_stack_register_b + */ + DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + +-extern unsigned long x86_cr3_pcid_noflush; + DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); + + extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; +--- a/arch/x86/mm/kaiser.c ++++ b/arch/x86/mm/kaiser.c +@@ -31,7 +31,6 @@ DEFINE_PER_CPU_USER_MAPPED(unsigned long + * This is also handy because systems that do not support PCIDs + * just end up or'ing a 0 into their CR3, which does no harm. 
+ */ +-unsigned long x86_cr3_pcid_noflush __read_mostly; + DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user); + + /* +@@ -356,10 +355,6 @@ void __init kaiser_init(void) + kaiser_add_user_map_early(&debug_idt_table, + sizeof(gate_desc) * NR_VECTORS, + __PAGE_KERNEL); +- +- kaiser_add_user_map_early(&x86_cr3_pcid_noflush, +- sizeof(x86_cr3_pcid_noflush), +- __PAGE_KERNEL); + } + + /* Add a mapping to the shadow mapping, and synchronize the mappings */ +@@ -433,18 +428,14 @@ pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, + + void kaiser_setup_pcid(void) + { +- unsigned long kern_cr3 = 0; + unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET; + +- if (this_cpu_has(X86_FEATURE_PCID)) { +- kern_cr3 |= X86_CR3_PCID_KERN_NOFLUSH; ++ if (this_cpu_has(X86_FEATURE_PCID)) + user_cr3 |= X86_CR3_PCID_USER_NOFLUSH; +- } + /* + * These variables are used by the entry/exit + * code to change PCID and pgd and TLB flushing. + */ +- x86_cr3_pcid_noflush = kern_cr3; + this_cpu_write(x86_cr3_pcid_user, user_cr3); + } + diff --git a/queue/kaiser-vmstat-show-nr_kaisertable-as-nr_overhead.patch b/queue/kaiser-vmstat-show-nr_kaisertable-as-nr_overhead.patch new file mode 100644 index 0000000..f7c6026 --- /dev/null +++ b/queue/kaiser-vmstat-show-nr_kaisertable-as-nr_overhead.patch @@ -0,0 +1,116 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Sat, 9 Sep 2017 21:27:32 -0700 +Subject: kaiser: vmstat show NR_KAISERTABLE as nr_overhead + +From: Hugh Dickins <hughd@google.com> + + +The kaiser update made an interesting choice, never to free any shadow +page tables. Contention on global spinlock was worrying, particularly +with it held across page table scans when freeing. Something had to be +done: I was going to add refcounting; but simply never to free them is +an appealing choice, minimizing contention without complicating the code +(the more a page table is found already, the less the spinlock is used). + +But leaking pages in this way is also a worry: can we get away with it? +At the very least, we need a count to show how bad it actually gets: +in principle, one might end up wasting about 1/256 of memory that way +(1/512 for when direct-mapped pages have to be user-mapped, plus 1/512 +for when they are user-mapped from the vmalloc area on another occasion +(but we don't have vmalloc'ed stacks, so only large ldts are vmalloc'ed). + +Add per-cpu stat NR_KAISERTABLE: including 256 at startup for the +shared pgd entries, and 1 for each intermediate page table added +thereafter for user-mapping - but leave out the 1 per mm, for its +shadow pgd, because that distracts from the monotonic increase. +Shown in /proc/vmstat as nr_overhead (0 if kaiser not enabled). + +In practice, it doesn't look so bad so far: more like 1/12000 after +nine hours of gtests below; and movable pageblock segregation should +tend to cluster the kaiser tables into a subset of the address space +(if not, they will be bad for compaction too). But production may +tell a different story: keep an eye on this number, and bring back +lighter freeing if it gets out of control (maybe a shrinker). + +["nr_overhead" should of course say "nr_kaisertable", if it needs +to stay; but for the moment we are being coy, preferring that when +Joe Blow notices a new line in his /proc/vmstat, he does not get +too curious about what this "kaiser" stuff might be.] 
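+
+[Editor's aside, not part of the upstream commit: where the 1/256 bound comes
+from.] A PTE page is 4kB and maps 512 4kB pages, so fully populated shadow
+page tables cost about 1/512 of the memory they map; if the same memory also
+has to be user-mapped a second time from the vmalloc area, that doubles to
+roughly 1/256.  The pmd and pud levels add a further 1/512 of that, which is
+negligible.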
+ +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/mm/kaiser.c | 16 +++++++++++----- + include/linux/mmzone.h | 3 ++- + mm/vmstat.c | 1 + + 3 files changed, 14 insertions(+), 6 deletions(-) + +--- a/arch/x86/mm/kaiser.c ++++ b/arch/x86/mm/kaiser.c +@@ -121,9 +121,11 @@ static pte_t *kaiser_pagetable_walk(unsi + if (!new_pmd_page) + return NULL; + spin_lock(&shadow_table_allocation_lock); +- if (pud_none(*pud)) ++ if (pud_none(*pud)) { + set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); +- else ++ __inc_zone_page_state(virt_to_page((void *) ++ new_pmd_page), NR_KAISERTABLE); ++ } else + free_page(new_pmd_page); + spin_unlock(&shadow_table_allocation_lock); + } +@@ -139,9 +141,11 @@ static pte_t *kaiser_pagetable_walk(unsi + if (!new_pte_page) + return NULL; + spin_lock(&shadow_table_allocation_lock); +- if (pmd_none(*pmd)) ++ if (pmd_none(*pmd)) { + set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); +- else ++ __inc_zone_page_state(virt_to_page((void *) ++ new_pte_page), NR_KAISERTABLE); ++ } else + free_page(new_pte_page); + spin_unlock(&shadow_table_allocation_lock); + } +@@ -205,11 +209,13 @@ static void __init kaiser_init_all_pgds( + pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0)); + for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { + pgd_t new_pgd; +- pud_t *pud = pud_alloc_one(&init_mm, PAGE_OFFSET + i * PGDIR_SIZE); ++ pud_t *pud = pud_alloc_one(&init_mm, ++ PAGE_OFFSET + i * PGDIR_SIZE); + if (!pud) { + WARN_ON(1); + break; + } ++ inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE); + new_pgd = __pgd(_KERNPG_TABLE |__pa(pud)); + /* + * Make sure not to stomp on some other pgd entry. +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -124,8 +124,9 @@ enum zone_stat_item { + NR_SLAB_UNRECLAIMABLE, + NR_PAGETABLE, /* used for pagetables */ + NR_KERNEL_STACK_KB, /* measured in KiB */ +- /* Second 128 byte cacheline */ ++ NR_KAISERTABLE, + NR_BOUNCE, ++ /* Second 128 byte cacheline */ + #if IS_ENABLED(CONFIG_ZSMALLOC) + NR_ZSPAGES, /* allocated in zsmalloc */ + #endif +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -932,6 +932,7 @@ const char * const vmstat_text[] = { + "nr_slab_unreclaimable", + "nr_page_table_pages", + "nr_kernel_stack", ++ "nr_overhead", + "nr_bounce", + #if IS_ENABLED(CONFIG_ZSMALLOC) + "nr_zspages", diff --git a/queue/kaiser-x86_cr3_pcid_noflush-and-x86_cr3_pcid_user.patch b/queue/kaiser-x86_cr3_pcid_noflush-and-x86_cr3_pcid_user.patch new file mode 100644 index 0000000..2098b48 --- /dev/null +++ b/queue/kaiser-x86_cr3_pcid_noflush-and-x86_cr3_pcid_user.patch @@ -0,0 +1,141 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Hugh Dickins <hughd@google.com> +Date: Sun, 27 Aug 2017 16:24:27 -0700 +Subject: kaiser: x86_cr3_pcid_noflush and x86_cr3_pcid_user + +From: Hugh Dickins <hughd@google.com> + + +Mostly this commit is just unshouting X86_CR3_PCID_KERN_VAR and +X86_CR3_PCID_USER_VAR: we usually name variables in lower-case. + +But why does x86_cr3_pcid_noflush need to be __aligned(PAGE_SIZE)? +Ah, it's a leftover from when kaiser_add_user_map() once complained +about mapping the same page twice. Make it __read_mostly instead. +(I'm a little uneasy about all the unrelated data which shares its +page getting user-mapped too, but that was so before, and not a big +deal: though we call it user-mapped, it's not mapped with _PAGE_USER.) + +And there is a little change around the two calls to do_nmi(). 
+Previously they set the NOFLUSH bit (if PCID supported) when +forcing to kernel context before do_nmi(); now they also have the +NOFLUSH bit set (if PCID supported) when restoring context after: +nothing done in do_nmi() should require a TLB to be flushed here. + +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/entry/entry_64.S | 8 ++++---- + arch/x86/include/asm/kaiser.h | 11 +++++------ + arch/x86/mm/kaiser.c | 13 +++++++------ + 3 files changed, 16 insertions(+), 16 deletions(-) + +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -1316,11 +1316,11 @@ ENTRY(nmi) + /* Unconditionally use kernel CR3 for do_nmi() */ + /* %rax is saved above, so OK to clobber here */ + movq %cr3, %rax ++ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ ++ orq x86_cr3_pcid_noflush, %rax + pushq %rax + /* mask off "user" bit of pgd address and 12 PCID bits: */ + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax +- /* Add back kernel PCID and "no flush" bit */ +- orq X86_CR3_PCID_KERN_VAR, %rax + movq %rax, %cr3 + #endif + call do_nmi +@@ -1560,11 +1560,11 @@ end_repeat_nmi: + /* Unconditionally use kernel CR3 for do_nmi() */ + /* %rax is saved above, so OK to clobber here */ + movq %cr3, %rax ++ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ ++ orq x86_cr3_pcid_noflush, %rax + pushq %rax + /* mask off "user" bit of pgd address and 12 PCID bits: */ + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax +- /* Add back kernel PCID and "no flush" bit */ +- orq X86_CR3_PCID_KERN_VAR, %rax + movq %rax, %cr3 + #endif + +--- a/arch/x86/include/asm/kaiser.h ++++ b/arch/x86/include/asm/kaiser.h +@@ -25,7 +25,7 @@ + .macro _SWITCH_TO_KERNEL_CR3 reg + movq %cr3, \reg + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg +-orq X86_CR3_PCID_KERN_VAR, \reg ++orq x86_cr3_pcid_noflush, \reg + movq \reg, %cr3 + .endm + +@@ -37,11 +37,10 @@ movq \reg, %cr3 + * not enabled): so that the one register can update both memory and cr3. + */ + movq %cr3, \reg +-andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg +-orq PER_CPU_VAR(X86_CR3_PCID_USER_VAR), \reg ++orq PER_CPU_VAR(x86_cr3_pcid_user), \reg + js 9f + /* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */ +-movb \regb, PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7) ++movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7) + 9: + movq \reg, %cr3 + .endm +@@ -94,8 +93,8 @@ movq PER_CPU_VAR(unsafe_stack_register_b + */ + DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + +-extern unsigned long X86_CR3_PCID_KERN_VAR; +-DECLARE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR); ++extern unsigned long x86_cr3_pcid_noflush; ++DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); + + extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; + +--- a/arch/x86/mm/kaiser.c ++++ b/arch/x86/mm/kaiser.c +@@ -28,8 +28,8 @@ DEFINE_PER_CPU_USER_MAPPED(unsigned long + * This is also handy because systems that do not support PCIDs + * just end up or'ing a 0 into their CR3, which does no harm. 
+ */ +-__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR; +-DEFINE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR); ++unsigned long x86_cr3_pcid_noflush __read_mostly; ++DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user); + + /* + * At runtime, the only things we map are some things for CPU +@@ -303,7 +303,8 @@ void __init kaiser_init(void) + sizeof(gate_desc) * NR_VECTORS, + __PAGE_KERNEL); + +- kaiser_add_user_map_early(&X86_CR3_PCID_KERN_VAR, PAGE_SIZE, ++ kaiser_add_user_map_early(&x86_cr3_pcid_noflush, ++ sizeof(x86_cr3_pcid_noflush), + __PAGE_KERNEL); + } + +@@ -381,8 +382,8 @@ void kaiser_setup_pcid(void) + * These variables are used by the entry/exit + * code to change PCID and pgd and TLB flushing. + */ +- X86_CR3_PCID_KERN_VAR = kern_cr3; +- this_cpu_write(X86_CR3_PCID_USER_VAR, user_cr3); ++ x86_cr3_pcid_noflush = kern_cr3; ++ this_cpu_write(x86_cr3_pcid_user, user_cr3); + } + + /* +@@ -392,7 +393,7 @@ void kaiser_setup_pcid(void) + */ + void kaiser_flush_tlb_on_return_to_user(void) + { +- this_cpu_write(X86_CR3_PCID_USER_VAR, ++ this_cpu_write(x86_cr3_pcid_user, + X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); + } + EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); diff --git a/queue/kbuild-add-fno-stack-check-to-kernel-build-options.patch b/queue/kbuild-add-fno-stack-check-to-kernel-build-options.patch new file mode 100644 index 0000000..144d30d --- /dev/null +++ b/queue/kbuild-add-fno-stack-check-to-kernel-build-options.patch @@ -0,0 +1,49 @@ +From 3ce120b16cc548472f80cf8644f90eda958cf1b6 Mon Sep 17 00:00:00 2001 +From: Linus Torvalds <torvalds@linux-foundation.org> +Date: Fri, 29 Dec 2017 17:34:43 -0800 +Subject: kbuild: add '-fno-stack-check' to kernel build options +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Linus Torvalds <torvalds@linux-foundation.org> + +commit 3ce120b16cc548472f80cf8644f90eda958cf1b6 upstream. + +It appears that hardened gentoo enables "-fstack-check" by default for +gcc. + +That doesn't work _at_all_ for the kernel, because the kernel stack +doesn't act like a user stack at all: it's much smaller, and it doesn't +auto-expand on use. So the extra "probe one page below the stack" code +generated by -fstack-check just breaks the kernel in horrible ways, +causing infinite double faults etc. + +[ I have to say, that the particular code gcc generates looks very + stupid even for user space where it works, but that's a separate + issue. 
] + +Reported-and-tested-by: Alexander Tsoy <alexander@tsoy.me> +Reported-and-tested-by: Toralf Förster <toralf.foerster@gmx.de> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Jiri Kosina <jikos@kernel.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + Makefile | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/Makefile ++++ b/Makefile +@@ -788,6 +788,9 @@ KBUILD_CFLAGS += $(call cc-disable-warni + # disable invalid "can't wrap" optimizations for signed / pointers + KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow) + ++# Make sure -fstack-check isn't enabled (like gentoo apparently did) ++KBUILD_CFLAGS += $(call cc-option,-fno-stack-check,) ++ + # conserve stack if available + KBUILD_CFLAGS += $(call cc-option,-fconserve-stack) + diff --git a/queue/kprobes-x86-blacklist-indirect-thunk-functions-for-kprobes.patch b/queue/kprobes-x86-blacklist-indirect-thunk-functions-for-kprobes.patch new file mode 100644 index 0000000..a552f71 --- /dev/null +++ b/queue/kprobes-x86-blacklist-indirect-thunk-functions-for-kprobes.patch @@ -0,0 +1,40 @@ +From c1804a236894ecc942da7dc6c5abe209e56cba93 Mon Sep 17 00:00:00 2001 +From: Masami Hiramatsu <mhiramat@kernel.org> +Date: Fri, 19 Jan 2018 01:14:51 +0900 +Subject: kprobes/x86: Blacklist indirect thunk functions for kprobes + +From: Masami Hiramatsu <mhiramat@kernel.org> + +commit c1804a236894ecc942da7dc6c5abe209e56cba93 upstream. + +Mark __x86_indirect_thunk_* functions as blacklist for kprobes +because those functions can be called from anywhere in the kernel +including blacklist functions of kprobes. + +Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Acked-by: David Woodhouse <dwmw@amazon.co.uk> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com> +Cc: Arjan van de Ven <arjan@linux.intel.com> +Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org> +Link: https://lkml.kernel.org/r/151629209111.10241.5444852823378068683.stgit@devbox +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/lib/retpoline.S | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/arch/x86/lib/retpoline.S ++++ b/arch/x86/lib/retpoline.S +@@ -25,7 +25,8 @@ ENDPROC(__x86_indirect_thunk_\reg) + * than one per register with the correct names. So we do it + * the simple and nasty way... + */ +-#define EXPORT_THUNK(reg) EXPORT_SYMBOL(__x86_indirect_thunk_ ## reg) ++#define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym) ++#define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) + #define GENERATE_THUNK(reg) THUNK reg ; EXPORT_THUNK(reg) + + GENERATE_THUNK(_ASM_AX) diff --git a/queue/kprobes-x86-disable-optimizing-on-the-function-jumps-to-indirect-thunk.patch b/queue/kprobes-x86-disable-optimizing-on-the-function-jumps-to-indirect-thunk.patch new file mode 100644 index 0000000..27f0a7f --- /dev/null +++ b/queue/kprobes-x86-disable-optimizing-on-the-function-jumps-to-indirect-thunk.patch @@ -0,0 +1,80 @@ +From c86a32c09f8ced67971a2310e3b0dda4d1749007 Mon Sep 17 00:00:00 2001 +From: Masami Hiramatsu <mhiramat@kernel.org> +Date: Fri, 19 Jan 2018 01:15:20 +0900 +Subject: kprobes/x86: Disable optimizing on the function jumps to indirect thunk + +From: Masami Hiramatsu <mhiramat@kernel.org> + +commit c86a32c09f8ced67971a2310e3b0dda4d1749007 upstream. 
+ +Since indirect jump instructions will be replaced by jump +to __x86_indirect_thunk_*, those jmp instruction must be +treated as an indirect jump. Since optprobe prohibits to +optimize probes in the function which uses an indirect jump, +it also needs to find out the function which jump to +__x86_indirect_thunk_* and disable optimization. + +Add a check that the jump target address is between the +__indirect_thunk_start/end when optimizing kprobe. + +Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Acked-by: David Woodhouse <dwmw@amazon.co.uk> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com> +Cc: Arjan van de Ven <arjan@linux.intel.com> +Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org> +Link: https://lkml.kernel.org/r/151629212062.10241.6991266100233002273.stgit@devbox +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/kernel/kprobes/opt.c | 23 ++++++++++++++++++++++- + 1 file changed, 22 insertions(+), 1 deletion(-) + +--- a/arch/x86/kernel/kprobes/opt.c ++++ b/arch/x86/kernel/kprobes/opt.c +@@ -37,6 +37,7 @@ + #include <asm/alternative.h> + #include <asm/insn.h> + #include <asm/debugreg.h> ++#include <asm/nospec-branch.h> + + #include "common.h" + +@@ -192,7 +193,7 @@ static int copy_optimized_instructions(u + } + + /* Check whether insn is indirect jump */ +-static int insn_is_indirect_jump(struct insn *insn) ++static int __insn_is_indirect_jump(struct insn *insn) + { + return ((insn->opcode.bytes[0] == 0xff && + (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ +@@ -226,6 +227,26 @@ static int insn_jump_into_range(struct i + return (start <= target && target <= start + len); + } + ++static int insn_is_indirect_jump(struct insn *insn) ++{ ++ int ret = __insn_is_indirect_jump(insn); ++ ++#ifdef CONFIG_RETPOLINE ++ /* ++ * Jump to x86_indirect_thunk_* is treated as an indirect jump. ++ * Note that even with CONFIG_RETPOLINE=y, the kernel compiled with ++ * older gcc may use indirect jump. So we add this check instead of ++ * replace indirect-jump check. ++ */ ++ if (!ret) ++ ret = insn_jump_into_range(insn, ++ (unsigned long)__indirect_thunk_start, ++ (unsigned long)__indirect_thunk_end - ++ (unsigned long)__indirect_thunk_start); ++#endif ++ return ret; ++} ++ + /* Decode whole function to ensure any instructions don't jump into target */ + static int can_optimize(unsigned long paddr) + { diff --git a/queue/kpti-rename-to-page_table_isolation.patch b/queue/kpti-rename-to-page_table_isolation.patch new file mode 100644 index 0000000..f77732d --- /dev/null +++ b/queue/kpti-rename-to-page_table_isolation.patch @@ -0,0 +1,329 @@ +From keescook@chromium.org Wed Jan 3 20:47:22 2018 +From: Kees Cook <keescook@chromium.org> +Date: Wed, 3 Jan 2018 10:17:35 -0800 +Subject: KPTI: Rename to PAGE_TABLE_ISOLATION +To: Greg KH <gregkh@linuxfoundation.org> +Message-ID: <20180103181735.GA33341@beast> +Content-Disposition: inline + +From: Kees Cook <keescook@chromium.org> + +This renames CONFIG_KAISER to CONFIG_PAGE_TABLE_ISOLATION. 
+ +Signed-off-by: Kees Cook <keescook@chromium.org> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/boot/compressed/misc.h | 2 +- + arch/x86/entry/entry_64.S | 12 ++++++------ + arch/x86/events/intel/ds.c | 4 ++-- + arch/x86/include/asm/cpufeatures.h | 2 +- + arch/x86/include/asm/kaiser.h | 12 ++++++------ + arch/x86/include/asm/pgtable.h | 4 ++-- + arch/x86/include/asm/pgtable_64.h | 4 ++-- + arch/x86/include/asm/pgtable_types.h | 2 +- + arch/x86/include/asm/tlbflush.h | 2 +- + arch/x86/kernel/head_64.S | 2 +- + arch/x86/mm/Makefile | 2 +- + arch/x86/mm/kaslr.c | 2 +- + include/linux/kaiser.h | 6 +++--- + include/linux/percpu-defs.h | 2 +- + security/Kconfig | 2 +- + tools/arch/x86/include/asm/cpufeatures.h | 2 +- + 16 files changed, 31 insertions(+), 31 deletions(-) + +--- a/arch/x86/boot/compressed/misc.h ++++ b/arch/x86/boot/compressed/misc.h +@@ -9,7 +9,7 @@ + */ + #undef CONFIG_PARAVIRT + #undef CONFIG_PARAVIRT_SPINLOCKS +-#undef CONFIG_KAISER ++#undef CONFIG_PAGE_TABLE_ISOLATION + #undef CONFIG_KASAN + + #include <linux/linkage.h> +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -1071,7 +1071,7 @@ ENTRY(paranoid_entry) + SWAPGS + xorl %ebx, %ebx + 1: +-#ifdef CONFIG_KAISER ++#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* + * We might have come in between a swapgs and a SWITCH_KERNEL_CR3 + * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit. +@@ -1111,7 +1111,7 @@ ENTRY(paranoid_exit) + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF_DEBUG + TRACE_IRQS_IRETQ_DEBUG +-#ifdef CONFIG_KAISER ++#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */ + testl $2, %ebx /* SWITCH_USER_CR3 needed? */ + jz paranoid_exit_no_switch +@@ -1340,7 +1340,7 @@ ENTRY(nmi) + + movq %rsp, %rdi + movq $-1, %rsi +-#ifdef CONFIG_KAISER ++#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* Unconditionally use kernel CR3 for do_nmi() */ + /* %rax is saved above, so OK to clobber here */ + ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER +@@ -1354,7 +1354,7 @@ ENTRY(nmi) + #endif + call do_nmi + +-#ifdef CONFIG_KAISER ++#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* + * Unconditionally restore CR3. I know we return to + * kernel code that needs user CR3, but do we ever return +@@ -1584,7 +1584,7 @@ end_repeat_nmi: + 1: + movq %rsp, %rdi + movq $-1, %rsi +-#ifdef CONFIG_KAISER ++#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* Unconditionally use kernel CR3 for do_nmi() */ + /* %rax is saved above, so OK to clobber here */ + ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER +@@ -1600,7 +1600,7 @@ end_repeat_nmi: + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ + call do_nmi + +-#ifdef CONFIG_KAISER ++#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* + * Unconditionally restore CR3. 
We might be returning to + * kernel code that needs user CR3, like just just before +--- a/arch/x86/events/intel/ds.c ++++ b/arch/x86/events/intel/ds.c +@@ -274,7 +274,7 @@ static DEFINE_PER_CPU(void *, insn_buffe + + static void *dsalloc(size_t size, gfp_t flags, int node) + { +-#ifdef CONFIG_KAISER ++#ifdef CONFIG_PAGE_TABLE_ISOLATION + unsigned int order = get_order(size); + struct page *page; + unsigned long addr; +@@ -295,7 +295,7 @@ static void *dsalloc(size_t size, gfp_t + + static void dsfree(const void *buffer, size_t size) + { +-#ifdef CONFIG_KAISER ++#ifdef CONFIG_PAGE_TABLE_ISOLATION + if (!buffer) + return; + kaiser_remove_mapping((unsigned long)buffer, size); +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -199,7 +199,7 @@ + #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ + + /* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ +-#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */ ++#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ + + /* Virtualization flags: Linux defined, word 8 */ + #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ +--- a/arch/x86/include/asm/kaiser.h ++++ b/arch/x86/include/asm/kaiser.h +@@ -20,7 +20,7 @@ + #define KAISER_SHADOW_PGD_OFFSET 0x1000 + + #ifdef __ASSEMBLY__ +-#ifdef CONFIG_KAISER ++#ifdef CONFIG_PAGE_TABLE_ISOLATION + + .macro _SWITCH_TO_KERNEL_CR3 reg + movq %cr3, \reg +@@ -69,7 +69,7 @@ movq PER_CPU_VAR(unsafe_stack_register_b + 8: + .endm + +-#else /* CONFIG_KAISER */ ++#else /* CONFIG_PAGE_TABLE_ISOLATION */ + + .macro SWITCH_KERNEL_CR3 + .endm +@@ -78,11 +78,11 @@ movq PER_CPU_VAR(unsafe_stack_register_b + .macro SWITCH_KERNEL_CR3_NO_STACK + .endm + +-#endif /* CONFIG_KAISER */ ++#endif /* CONFIG_PAGE_TABLE_ISOLATION */ + + #else /* __ASSEMBLY__ */ + +-#ifdef CONFIG_KAISER ++#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* + * Upon kernel/user mode switch, it may happen that the address + * space has to be switched before the registers have been +@@ -100,10 +100,10 @@ extern void __init kaiser_check_boottime + #else + #define kaiser_enabled 0 + static inline void __init kaiser_check_boottime_disable(void) {} +-#endif /* CONFIG_KAISER */ ++#endif /* CONFIG_PAGE_TABLE_ISOLATION */ + + /* +- * Kaiser function prototypes are needed even when CONFIG_KAISER is not set, ++ * Kaiser function prototypes are needed even when CONFIG_PAGE_TABLE_ISOLATION is not set, + * so as to build with tests on kaiser_enabled instead of #ifdefs. 
+ */ + +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -18,7 +18,7 @@ + #ifndef __ASSEMBLY__ + #include <asm/x86_init.h> + +-#ifdef CONFIG_KAISER ++#ifdef CONFIG_PAGE_TABLE_ISOLATION + extern int kaiser_enabled; + #else + #define kaiser_enabled 0 +@@ -920,7 +920,7 @@ static inline void pmdp_set_wrprotect(st + static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) + { + memcpy(dst, src, count * sizeof(pgd_t)); +-#ifdef CONFIG_KAISER ++#ifdef CONFIG_PAGE_TABLE_ISOLATION + if (kaiser_enabled) { + /* Clone the shadow pgd part as well */ + memcpy(native_get_shadow_pgd(dst), +--- a/arch/x86/include/asm/pgtable_64.h ++++ b/arch/x86/include/asm/pgtable_64.h +@@ -106,7 +106,7 @@ static inline void native_pud_clear(pud_ + native_set_pud(pud, native_make_pud(0)); + } + +-#ifdef CONFIG_KAISER ++#ifdef CONFIG_PAGE_TABLE_ISOLATION + extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd); + + static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) +@@ -127,7 +127,7 @@ static inline pgd_t *native_get_shadow_p + BUILD_BUG_ON(1); + return NULL; + } +-#endif /* CONFIG_KAISER */ ++#endif /* CONFIG_PAGE_TABLE_ISOLATION */ + + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) + { +--- a/arch/x86/include/asm/pgtable_types.h ++++ b/arch/x86/include/asm/pgtable_types.h +@@ -144,7 +144,7 @@ + #define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK) + #define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL)) + +-#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64) ++#if defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_X86_64) + /* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */ + #define X86_CR3_PCID_ASID_USER (_AC(0x80,UL)) + +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -136,7 +136,7 @@ static inline void cr4_set_bits_and_upda + * Declare a couple of kaiser interfaces here for convenience, + * to avoid the need for asm/kaiser.h in unexpected places. + */ +-#ifdef CONFIG_KAISER ++#ifdef CONFIG_PAGE_TABLE_ISOLATION + extern int kaiser_enabled; + extern void kaiser_setup_pcid(void); + extern void kaiser_flush_tlb_on_return_to_user(void); +--- a/arch/x86/kernel/head_64.S ++++ b/arch/x86/kernel/head_64.S +@@ -405,7 +405,7 @@ GLOBAL(early_recursion_flag) + .balign PAGE_SIZE; \ + GLOBAL(name) + +-#ifdef CONFIG_KAISER ++#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* + * Each PGD needs to be 8k long and 8k aligned. 
We do not + * ever go out to userspace with these, so we do not +--- a/arch/x86/mm/Makefile ++++ b/arch/x86/mm/Makefile +@@ -38,4 +38,4 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulatio + obj-$(CONFIG_X86_INTEL_MPX) += mpx.o + obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o + obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +-obj-$(CONFIG_KAISER) += kaiser.o ++obj-$(CONFIG_PAGE_TABLE_ISOLATION) += kaiser.o +--- a/arch/x86/mm/kaslr.c ++++ b/arch/x86/mm/kaslr.c +@@ -189,6 +189,6 @@ void __meminit init_trampoline(void) + *pud_tramp = *pud; + } + +- /* Avoid set_pgd(), in case it's complicated by CONFIG_KAISER */ ++ /* Avoid set_pgd(), in case it's complicated by CONFIG_PAGE_TABLE_ISOLATION */ + trampoline_pgd_entry = __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)); + } +--- a/include/linux/kaiser.h ++++ b/include/linux/kaiser.h +@@ -1,7 +1,7 @@ + #ifndef _LINUX_KAISER_H + #define _LINUX_KAISER_H + +-#ifdef CONFIG_KAISER ++#ifdef CONFIG_PAGE_TABLE_ISOLATION + #include <asm/kaiser.h> + + static inline int kaiser_map_thread_stack(void *stack) +@@ -24,7 +24,7 @@ static inline void kaiser_unmap_thread_s + #else + + /* +- * These stubs are used whenever CONFIG_KAISER is off, which ++ * These stubs are used whenever CONFIG_PAGE_TABLE_ISOLATION is off, which + * includes architectures that support KAISER, but have it disabled. + */ + +@@ -48,5 +48,5 @@ static inline void kaiser_unmap_thread_s + { + } + +-#endif /* !CONFIG_KAISER */ ++#endif /* !CONFIG_PAGE_TABLE_ISOLATION */ + #endif /* _LINUX_KAISER_H */ +--- a/include/linux/percpu-defs.h ++++ b/include/linux/percpu-defs.h +@@ -35,7 +35,7 @@ + + #endif + +-#ifdef CONFIG_KAISER ++#ifdef CONFIG_PAGE_TABLE_ISOLATION + #define USER_MAPPED_SECTION "..user_mapped" + #else + #define USER_MAPPED_SECTION "" +--- a/security/Kconfig ++++ b/security/Kconfig +@@ -31,7 +31,7 @@ config SECURITY + + If you are unsure how to answer this question, answer N. + +-config KAISER ++config PAGE_TABLE_ISOLATION + bool "Remove the kernel mapping in user mode" + default y + depends on X86_64 && SMP +--- a/tools/arch/x86/include/asm/cpufeatures.h ++++ b/tools/arch/x86/include/asm/cpufeatures.h +@@ -198,7 +198,7 @@ + #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ + + /* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ +-#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */ ++#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ + + /* Virtualization flags: Linux defined, word 8 */ + #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ diff --git a/queue/kpti-report-when-enabled.patch b/queue/kpti-report-when-enabled.patch new file mode 100644 index 0000000..dbd4233 --- /dev/null +++ b/queue/kpti-report-when-enabled.patch @@ -0,0 +1,48 @@ +From keescook@chromium.org Wed Jan 3 20:48:07 2018 +From: Kees Cook <keescook@chromium.org> +Date: Wed, 3 Jan 2018 10:18:01 -0800 +Subject: KPTI: Report when enabled +To: Greg KH <gregkh@linuxfoundation.org> +Message-ID: <20180103181801.GA33383@beast> +Content-Disposition: inline + +From: Kees Cook <keescook@chromium.org> + +Make sure dmesg reports when KPTI is enabled. 
+ +Signed-off-by: Kees Cook <keescook@chromium.org> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/mm/kaiser.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +--- a/arch/x86/mm/kaiser.c ++++ b/arch/x86/mm/kaiser.c +@@ -10,6 +10,9 @@ + #include <linux/mm.h> + #include <linux/uaccess.h> + ++#undef pr_fmt ++#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt ++ + #include <asm/kaiser.h> + #include <asm/tlbflush.h> /* to verify its kaiser declarations */ + #include <asm/pgtable.h> +@@ -292,7 +295,7 @@ enable: + return; + + disable: +- pr_info("Kernel/User page tables isolation: disabled\n"); ++ pr_info("disabled\n"); + + silent_disable: + kaiser_enabled = 0; +@@ -352,6 +355,8 @@ void __init kaiser_init(void) + kaiser_add_user_map_early(&debug_idt_table, + sizeof(gate_desc) * NR_VECTORS, + __PAGE_KERNEL); ++ ++ pr_info("enabled\n"); + } + + /* Add a mapping to the shadow mapping, and synchronize the mappings */ diff --git a/queue/kvm-vmx-make-indirect-call-speculation-safe.patch b/queue/kvm-vmx-make-indirect-call-speculation-safe.patch new file mode 100644 index 0000000..a981c11 --- /dev/null +++ b/queue/kvm-vmx-make-indirect-call-speculation-safe.patch @@ -0,0 +1,57 @@ +From foo@baz Wed Feb 7 19:38:23 CST 2018 +From: Peter Zijlstra <peterz@infradead.org> +Date: Thu, 25 Jan 2018 10:58:14 +0100 +Subject: KVM: VMX: Make indirect call speculation safe + +From: Peter Zijlstra <peterz@infradead.org> + +(cherry picked from commit c940a3fb1e2e9b7d03228ab28f375fb5a47ff699) + +Replace indirect call with CALL_NOSPEC. + +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: David Woodhouse <dwmw@amazon.co.uk> +Cc: Andrea Arcangeli <aarcange@redhat.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Ashok Raj <ashok.raj@intel.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: Jun Nakajima <jun.nakajima@intel.com> +Cc: David Woodhouse <dwmw2@infradead.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: rga@amazon.de +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Asit Mallick <asit.k.mallick@intel.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Jason Baron <jbaron@akamai.com> +Cc: Paolo Bonzini <pbonzini@redhat.com> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Link: https://lkml.kernel.org/r/20180125095843.645776917@infradead.org +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/vmx.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -8676,14 +8676,14 @@ static void vmx_handle_external_intr(str + #endif + "pushf\n\t" + __ASM_SIZE(push) " $%c[cs]\n\t" +- "call *%[entry]\n\t" ++ CALL_NOSPEC + : + #ifdef CONFIG_X86_64 + [sp]"=&r"(tmp), + #endif + "+r"(__sp) + : +- [entry]"r"(entry), ++ THUNK_TARGET(entry), + [ss]"i"(__KERNEL_DS), + [cs]"i"(__KERNEL_CS) + ); diff --git a/queue/kvm-x86-make-indirect-calls-in-emulator-speculation-safe.patch b/queue/kvm-x86-make-indirect-calls-in-emulator-speculation-safe.patch new file mode 100644 index 0000000..86fb749 --- /dev/null +++ b/queue/kvm-x86-make-indirect-calls-in-emulator-speculation-safe.patch @@ -0,0 +1,78 @@ +From foo@baz Wed Feb 7 19:38:23 CST 2018 +From: Peter Zijlstra <peterz@infradead.org> +Date: Thu, 25 Jan 2018 10:58:13 +0100 +Subject: 
KVM: x86: Make indirect calls in emulator speculation safe + +From: Peter Zijlstra <peterz@infradead.org> + +(cherry picked from commit 1a29b5b7f347a1a9230c1e0af5b37e3e571588ab) + +Replace the indirect calls with CALL_NOSPEC. + +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: David Woodhouse <dwmw@amazon.co.uk> +Cc: Andrea Arcangeli <aarcange@redhat.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Ashok Raj <ashok.raj@intel.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: Jun Nakajima <jun.nakajima@intel.com> +Cc: David Woodhouse <dwmw2@infradead.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: rga@amazon.de +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Asit Mallick <asit.k.mallick@intel.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Jason Baron <jbaron@akamai.com> +Cc: Paolo Bonzini <pbonzini@redhat.com> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Link: https://lkml.kernel.org/r/20180125095843.595615683@infradead.org +[dwmw2: Use ASM_CALL_CONSTRAINT like upstream, now we have it] +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kvm/emulate.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +--- a/arch/x86/kvm/emulate.c ++++ b/arch/x86/kvm/emulate.c +@@ -25,6 +25,7 @@ + #include <asm/kvm_emulate.h> + #include <linux/stringify.h> + #include <asm/debugreg.h> ++#include <asm/nospec-branch.h> + + #include "x86.h" + #include "tss.h" +@@ -1012,8 +1013,8 @@ static __always_inline u8 test_cc(unsign + void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf); + + flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF; +- asm("push %[flags]; popf; call *%[fastop]" +- : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags)); ++ asm("push %[flags]; popf; " CALL_NOSPEC ++ : "=a"(rc) : [thunk_target]"r"(fop), [flags]"r"(flags)); + return rc; + } + +@@ -5306,15 +5307,14 @@ static void fetch_possible_mmx_operand(s + + static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) + { +- register void *__sp asm(_ASM_SP); + ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; + + if (!(ctxt->d & ByteOp)) + fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; + +- asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" ++ asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n" + : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags), +- [fastop]"+S"(fop), "+r"(__sp) ++ [thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT + : "c"(ctxt->src2.val)); + + ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); diff --git a/queue/map-the-vsyscall-page-with-_page_user.patch b/queue/map-the-vsyscall-page-with-_page_user.patch new file mode 100644 index 0000000..1b85895 --- /dev/null +++ b/queue/map-the-vsyscall-page-with-_page_user.patch @@ -0,0 +1,143 @@ +From: Borislav Petkov <bp@suse.de> +Date: Thu, 4 Jan 2018 17:42:45 +0100 +Subject: Map the vsyscall page with _PAGE_USER + +From: Borislav Petkov <bp@suse.de> + +This needs to happen early in kaiser_pagetable_walk(), before the +hierarchy is established so that _PAGE_USER permission can be really +set. + +A proper fix would be to teach kaiser_pagetable_walk() to update those +permissions but the vsyscall page is the only exception here so ... 
+ +Signed-off-by: Borislav Petkov <bp@suse.de> +Acked-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/entry/vsyscall/vsyscall_64.c | 5 +++++ + arch/x86/include/asm/vsyscall.h | 2 ++ + arch/x86/mm/kaiser.c | 34 ++++++++++++++++++++++++++++++---- + 3 files changed, 37 insertions(+), 4 deletions(-) + +--- a/arch/x86/entry/vsyscall/vsyscall_64.c ++++ b/arch/x86/entry/vsyscall/vsyscall_64.c +@@ -66,6 +66,11 @@ static int __init vsyscall_setup(char *s + } + early_param("vsyscall", vsyscall_setup); + ++bool vsyscall_enabled(void) ++{ ++ return vsyscall_mode != NONE; ++} ++ + static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, + const char *message) + { +--- a/arch/x86/include/asm/vsyscall.h ++++ b/arch/x86/include/asm/vsyscall.h +@@ -12,12 +12,14 @@ extern void map_vsyscall(void); + * Returns true if handled. + */ + extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); ++extern bool vsyscall_enabled(void); + #else + static inline void map_vsyscall(void) {} + static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) + { + return false; + } ++static inline bool vsyscall_enabled(void) { return false; } + #endif + + #endif /* _ASM_X86_VSYSCALL_H */ +--- a/arch/x86/mm/kaiser.c ++++ b/arch/x86/mm/kaiser.c +@@ -19,6 +19,7 @@ + #include <asm/pgalloc.h> + #include <asm/desc.h> + #include <asm/cmdline.h> ++#include <asm/vsyscall.h> + + int kaiser_enabled __read_mostly = 1; + EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */ +@@ -110,12 +111,13 @@ static inline unsigned long get_pa_from_ + * + * Returns a pointer to a PTE on success, or NULL on failure. + */ +-static pte_t *kaiser_pagetable_walk(unsigned long address) ++static pte_t *kaiser_pagetable_walk(unsigned long address, bool user) + { + pmd_t *pmd; + pud_t *pud; + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); ++ unsigned long prot = _KERNPG_TABLE; + + if (pgd_none(*pgd)) { + WARN_ONCE(1, "All shadow pgds should have been populated"); +@@ -123,6 +125,17 @@ static pte_t *kaiser_pagetable_walk(unsi + } + BUILD_BUG_ON(pgd_large(*pgd) != 0); + ++ if (user) { ++ /* ++ * The vsyscall page is the only page that will have ++ * _PAGE_USER set. Catch everything else. 
++ */ ++ BUG_ON(address != VSYSCALL_ADDR); ++ ++ set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); ++ prot = _PAGE_TABLE; ++ } ++ + pud = pud_offset(pgd, address); + /* The shadow page tables do not use large mappings: */ + if (pud_large(*pud)) { +@@ -135,7 +148,7 @@ static pte_t *kaiser_pagetable_walk(unsi + return NULL; + spin_lock(&shadow_table_allocation_lock); + if (pud_none(*pud)) { +- set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); ++ set_pud(pud, __pud(prot | __pa(new_pmd_page))); + __inc_zone_page_state(virt_to_page((void *) + new_pmd_page), NR_KAISERTABLE); + } else +@@ -155,7 +168,7 @@ static pte_t *kaiser_pagetable_walk(unsi + return NULL; + spin_lock(&shadow_table_allocation_lock); + if (pmd_none(*pmd)) { +- set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); ++ set_pmd(pmd, __pmd(prot | __pa(new_pte_page))); + __inc_zone_page_state(virt_to_page((void *) + new_pte_page), NR_KAISERTABLE); + } else +@@ -191,7 +204,7 @@ static int kaiser_add_user_map(const voi + ret = -EIO; + break; + } +- pte = kaiser_pagetable_walk(address); ++ pte = kaiser_pagetable_walk(address, flags & _PAGE_USER); + if (!pte) { + ret = -ENOMEM; + break; +@@ -318,6 +331,19 @@ void __init kaiser_init(void) + + kaiser_init_all_pgds(); + ++ /* ++ * Note that this sets _PAGE_USER and it needs to happen when the ++ * pagetable hierarchy gets created, i.e., early. Otherwise ++ * kaiser_pagetable_walk() will encounter initialized PTEs in the ++ * hierarchy and not set the proper permissions, leading to the ++ * pagefaults with page-protection violations when trying to read the ++ * vsyscall page. For example. ++ */ ++ if (vsyscall_enabled()) ++ kaiser_add_user_map_early((void *)VSYSCALL_ADDR, ++ PAGE_SIZE, ++ __PAGE_KERNEL_VSYSCALL); ++ + for_each_possible_cpu(cpu) { + void *percpu_vaddr = __per_cpu_user_mapped_start + + per_cpu_offset(cpu); diff --git a/queue/mm-vmstat-make-nr_tlb_remote_flush_received-available-even-on-up.patch b/queue/mm-vmstat-make-nr_tlb_remote_flush_received-available-even-on-up.patch new file mode 100644 index 0000000..df764bb --- /dev/null +++ b/queue/mm-vmstat-make-nr_tlb_remote_flush_received-available-even-on-up.patch @@ -0,0 +1,39 @@ +From 5dd0b16cdaff9b94da06074d5888b03235c0bf17 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Mon, 5 Jun 2017 07:40:25 -0700 +Subject: mm/vmstat: Make NR_TLB_REMOTE_FLUSH_RECEIVED available even on UP + +From: Andy Lutomirski <luto@kernel.org> + +commit 5dd0b16cdaff9b94da06074d5888b03235c0bf17 upstream. + +This fixes CONFIG_SMP=n, CONFIG_DEBUG_TLBFLUSH=y without introducing +further #ifdef soup. Caught by a Kbuild bot randconfig build. 
+ +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Fixes: ce4a4e565f52 ("x86/mm: Remove the UP asm/tlbflush.h code, always use the (formerly) SMP code") +Link: http://lkml.kernel.org/r/76da9a3cc4415996f2ad2c905b93414add322021.1496673616.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + include/linux/vm_event_item.h | 2 -- + 1 file changed, 2 deletions(-) + +--- a/include/linux/vm_event_item.h ++++ b/include/linux/vm_event_item.h +@@ -89,10 +89,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PS + #endif + #endif + #ifdef CONFIG_DEBUG_TLBFLUSH +-#ifdef CONFIG_SMP + NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ + NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */ +-#endif /* CONFIG_SMP */ + NR_TLB_LOCAL_FLUSH_ALL, + NR_TLB_LOCAL_FLUSH_ONE, + #endif /* CONFIG_DEBUG_TLBFLUSH */ diff --git a/queue/module-add-retpoline-tag-to-vermagic.patch b/queue/module-add-retpoline-tag-to-vermagic.patch new file mode 100644 index 0000000..7870bc4 --- /dev/null +++ b/queue/module-add-retpoline-tag-to-vermagic.patch @@ -0,0 +1,53 @@ +From 6cfb521ac0d5b97470883ff9b7facae264b7ab12 Mon Sep 17 00:00:00 2001 +From: Andi Kleen <ak@linux.intel.com> +Date: Tue, 16 Jan 2018 12:52:28 -0800 +Subject: module: Add retpoline tag to VERMAGIC + +From: Andi Kleen <ak@linux.intel.com> + +commit 6cfb521ac0d5b97470883ff9b7facae264b7ab12 upstream. + +Add a marker for retpoline to the module VERMAGIC. This catches the case +when a non RETPOLINE compiled module gets loaded into a retpoline kernel, +making it insecure. + +It doesn't handle the case when retpoline has been runtime disabled. Even +in this case the match of the retcompile status will be enforced. This +implies that even with retpoline run time disabled all modules loaded need +to be recompiled. 
+ +Signed-off-by: Andi Kleen <ak@linux.intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Acked-by: David Woodhouse <dwmw@amazon.co.uk> +Cc: rusty@rustcorp.com.au +Cc: arjan.van.de.ven@intel.com +Cc: jeyu@kernel.org +Cc: torvalds@linux-foundation.org +Link: https://lkml.kernel.org/r/20180116205228.4890-1-andi@firstfloor.org +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + include/linux/vermagic.h | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +--- a/include/linux/vermagic.h ++++ b/include/linux/vermagic.h +@@ -24,10 +24,16 @@ + #ifndef MODULE_ARCH_VERMAGIC + #define MODULE_ARCH_VERMAGIC "" + #endif ++#ifdef RETPOLINE ++#define MODULE_VERMAGIC_RETPOLINE "retpoline " ++#else ++#define MODULE_VERMAGIC_RETPOLINE "" ++#endif + + #define VERMAGIC_STRING \ + UTS_RELEASE " " \ + MODULE_VERMAGIC_SMP MODULE_VERMAGIC_PREEMPT \ + MODULE_VERMAGIC_MODULE_UNLOAD MODULE_VERMAGIC_MODVERSIONS \ +- MODULE_ARCH_VERMAGIC ++ MODULE_ARCH_VERMAGIC \ ++ MODULE_VERMAGIC_RETPOLINE + diff --git a/queue/module-retpoline-warn-about-missing-retpoline-in-module.patch b/queue/module-retpoline-warn-about-missing-retpoline-in-module.patch new file mode 100644 index 0000000..23a00b6 --- /dev/null +++ b/queue/module-retpoline-warn-about-missing-retpoline-in-module.patch @@ -0,0 +1,149 @@ +From foo@baz Wed Feb 7 19:38:23 CST 2018 +From: Andi Kleen <ak@linux.intel.com> +Date: Thu, 25 Jan 2018 15:50:28 -0800 +Subject: module/retpoline: Warn about missing retpoline in module + +From: Andi Kleen <ak@linux.intel.com> + +(cherry picked from commit caf7501a1b4ec964190f31f9c3f163de252273b8) + +There's a risk that a kernel which has full retpoline mitigations becomes +vulnerable when a module gets loaded that hasn't been compiled with the +right compiler or the right option. + +To enable detection of that mismatch at module load time, add a module info +string "retpoline" at build time when the module was compiled with +retpoline support. This only covers compiled C source, but assembler source +or prebuilt object files are not checked. + +If a retpoline enabled kernel detects a non retpoline protected module at +load time, print a warning and report it in the sysfs vulnerability file. 
+ +[ tglx: Massaged changelog ] + +Signed-off-by: Andi Kleen <ak@linux.intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: David Woodhouse <dwmw2@infradead.org> +Cc: gregkh@linuxfoundation.org +Cc: torvalds@linux-foundation.org +Cc: jeyu@kernel.org +Cc: arjan@linux.intel.com +Link: https://lkml.kernel.org/r/20180125235028.31211-1-andi@firstfloor.org +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kernel/cpu/bugs.c | 17 ++++++++++++++++- + include/linux/module.h | 9 +++++++++ + kernel/module.c | 11 +++++++++++ + scripts/mod/modpost.c | 9 +++++++++ + 4 files changed, 45 insertions(+), 1 deletion(-) + +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -10,6 +10,7 @@ + #include <linux/init.h> + #include <linux/utsname.h> + #include <linux/cpu.h> ++#include <linux/module.h> + + #include <asm/nospec-branch.h> + #include <asm/cmdline.h> +@@ -92,6 +93,19 @@ static const char *spectre_v2_strings[] + #define pr_fmt(fmt) "Spectre V2 mitigation: " fmt + + static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; ++static bool spectre_v2_bad_module; ++ ++#ifdef RETPOLINE ++bool retpoline_module_ok(bool has_retpoline) ++{ ++ if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline) ++ return true; ++ ++ pr_err("System may be vunerable to spectre v2\n"); ++ spectre_v2_bad_module = true; ++ return false; ++} ++#endif + + static void __init spec2_print_if_insecure(const char *reason) + { +@@ -277,6 +291,7 @@ ssize_t cpu_show_spectre_v2(struct devic + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + return sprintf(buf, "Not affected\n"); + +- return sprintf(buf, "%s\n", spectre_v2_strings[spectre_v2_enabled]); ++ return sprintf(buf, "%s%s\n", spectre_v2_strings[spectre_v2_enabled], ++ spectre_v2_bad_module ? " - vulnerable module loaded" : ""); + } + #endif +--- a/include/linux/module.h ++++ b/include/linux/module.h +@@ -791,6 +791,15 @@ static inline void module_bug_finalize(c + static inline void module_bug_cleanup(struct module *mod) {} + #endif /* CONFIG_GENERIC_BUG */ + ++#ifdef RETPOLINE ++extern bool retpoline_module_ok(bool has_retpoline); ++#else ++static inline bool retpoline_module_ok(bool has_retpoline) ++{ ++ return true; ++} ++#endif ++ + #ifdef CONFIG_MODULE_SIG + static inline bool module_sig_ok(struct module *module) + { +--- a/kernel/module.c ++++ b/kernel/module.c +@@ -2817,6 +2817,15 @@ static int check_modinfo_livepatch(struc + } + #endif /* CONFIG_LIVEPATCH */ + ++static void check_modinfo_retpoline(struct module *mod, struct load_info *info) ++{ ++ if (retpoline_module_ok(get_modinfo(info, "retpoline"))) ++ return; ++ ++ pr_warn("%s: loading module not compiled with retpoline compiler.\n", ++ mod->name); ++} ++ + /* Sets info->hdr and info->len. 
*/ + static int copy_module_from_user(const void __user *umod, unsigned long len, + struct load_info *info) +@@ -2969,6 +2978,8 @@ static int check_modinfo(struct module * + add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK); + } + ++ check_modinfo_retpoline(mod, info); ++ + if (get_modinfo(info, "staging")) { + add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); + pr_warn("%s: module is from the staging directory, the quality " +--- a/scripts/mod/modpost.c ++++ b/scripts/mod/modpost.c +@@ -2130,6 +2130,14 @@ static void add_intree_flag(struct buffe + buf_printf(b, "\nMODULE_INFO(intree, \"Y\");\n"); + } + ++/* Cannot check for assembler */ ++static void add_retpoline(struct buffer *b) ++{ ++ buf_printf(b, "\n#ifdef RETPOLINE\n"); ++ buf_printf(b, "MODULE_INFO(retpoline, \"Y\");\n"); ++ buf_printf(b, "#endif\n"); ++} ++ + static void add_staging_flag(struct buffer *b, const char *name) + { + static const char *staging_dir = "drivers/staging"; +@@ -2474,6 +2482,7 @@ int main(int argc, char **argv) + + add_header(&buf, mod); + add_intree_flag(&buf, !external_module); ++ add_retpoline(&buf); + add_staging_flag(&buf, mod->name); + err |= add_versions(&buf, mod); + add_depends(&buf, mod, modules); diff --git a/queue/nl80211-sanitize-array-index-in-parse_txq_params.patch b/queue/nl80211-sanitize-array-index-in-parse_txq_params.patch new file mode 100644 index 0000000..5de0872 --- /dev/null +++ b/queue/nl80211-sanitize-array-index-in-parse_txq_params.patch @@ -0,0 +1,72 @@ +From foo@baz Thu Feb 8 03:32:24 CET 2018 +From: Dan Williams <dan.j.williams@intel.com> +Date: Mon, 29 Jan 2018 17:03:15 -0800 +Subject: nl80211: Sanitize array index in parse_txq_params + +From: Dan Williams <dan.j.williams@intel.com> + + +(cherry picked from commit 259d8c1e984318497c84eef547bbb6b1d9f4eb05) + +Wireless drivers rely on parse_txq_params to validate that txq_params->ac +is less than NL80211_NUM_ACS by the time the low-level driver's ->conf_tx() +handler is called. Use a new helper, array_index_nospec(), to sanitize +txq_params->ac with respect to speculation. I.e. ensure that any +speculation into ->conf_tx() handlers is done with a value of +txq_params->ac that is within the bounds of [0, NL80211_NUM_ACS). + +Reported-by: Christian Lamparter <chunkeey@gmail.com> +Reported-by: Elena Reshetova <elena.reshetova@intel.com> +Signed-off-by: Dan Williams <dan.j.williams@intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Acked-by: Johannes Berg <johannes@sipsolutions.net> +Cc: linux-arch@vger.kernel.org +Cc: kernel-hardening@lists.openwall.com +Cc: gregkh@linuxfoundation.org +Cc: linux-wireless@vger.kernel.org +Cc: torvalds@linux-foundation.org +Cc: "David S. 
Miller" <davem@davemloft.net> +Cc: alan@linux.intel.com +Link: https://lkml.kernel.org/r/151727419584.33451.7700736761686184303.stgit@dwillia2-desk3.amr.corp.intel.com +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + net/wireless/nl80211.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +--- a/net/wireless/nl80211.c ++++ b/net/wireless/nl80211.c +@@ -16,6 +16,7 @@ + #include <linux/nl80211.h> + #include <linux/rtnetlink.h> + #include <linux/netlink.h> ++#include <linux/nospec.h> + #include <linux/etherdevice.h> + #include <net/net_namespace.h> + #include <net/genetlink.h> +@@ -2014,20 +2015,22 @@ static const struct nla_policy txq_param + static int parse_txq_params(struct nlattr *tb[], + struct ieee80211_txq_params *txq_params) + { ++ u8 ac; ++ + if (!tb[NL80211_TXQ_ATTR_AC] || !tb[NL80211_TXQ_ATTR_TXOP] || + !tb[NL80211_TXQ_ATTR_CWMIN] || !tb[NL80211_TXQ_ATTR_CWMAX] || + !tb[NL80211_TXQ_ATTR_AIFS]) + return -EINVAL; + +- txq_params->ac = nla_get_u8(tb[NL80211_TXQ_ATTR_AC]); ++ ac = nla_get_u8(tb[NL80211_TXQ_ATTR_AC]); + txq_params->txop = nla_get_u16(tb[NL80211_TXQ_ATTR_TXOP]); + txq_params->cwmin = nla_get_u16(tb[NL80211_TXQ_ATTR_CWMIN]); + txq_params->cwmax = nla_get_u16(tb[NL80211_TXQ_ATTR_CWMAX]); + txq_params->aifs = nla_get_u8(tb[NL80211_TXQ_ATTR_AIFS]); + +- if (txq_params->ac >= NL80211_NUM_ACS) ++ if (ac >= NL80211_NUM_ACS) + return -EINVAL; +- ++ txq_params->ac = array_index_nospec(ac, NL80211_NUM_ACS); + return 0; + } + diff --git a/queue/objtool-allow-alternatives-to-be-ignored.patch b/queue/objtool-allow-alternatives-to-be-ignored.patch new file mode 100644 index 0000000..db02f25 --- /dev/null +++ b/queue/objtool-allow-alternatives-to-be-ignored.patch @@ -0,0 +1,163 @@ +From 258c76059cece01bebae098e81bacb1af2edad17 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf <jpoimboe@redhat.com> +Date: Thu, 11 Jan 2018 21:46:24 +0000 +Subject: objtool: Allow alternatives to be ignored + +From: Josh Poimboeuf <jpoimboe@redhat.com> + +commit 258c76059cece01bebae098e81bacb1af2edad17 upstream. + +Getting objtool to understand retpolines is going to be a bit of a +challenge. For now, take advantage of the fact that retpolines are +patched in with alternatives. Just read the original (sane) +non-alternative instruction, and ignore the patched-in retpoline. + +This allows objtool to understand the control flow *around* the +retpoline, even if it can't yet follow what's inside. This means the +ORC unwinder will fail to unwind from inside a retpoline, but will work +fine otherwise. 
+ +Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com> +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: Rik van Riel <riel@redhat.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: thomas.lendacky@amd.com +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Jiri Kosina <jikos@kernel.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Kees Cook <keescook@google.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org> +Cc: Paul Turner <pjt@google.com> +Link: https://lkml.kernel.org/r/1515707194-20531-3-git-send-email-dwmw@amazon.co.uk +[dwmw2: Applies to tools/objtool/builtin-check.c not check.[ch]] +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + tools/objtool/builtin-check.c | 64 +++++++++++++++++++++++++++++++++++++----- + 1 file changed, 57 insertions(+), 7 deletions(-) + +--- a/tools/objtool/builtin-check.c ++++ b/tools/objtool/builtin-check.c +@@ -51,7 +51,7 @@ struct instruction { + unsigned int len, state; + unsigned char type; + unsigned long immediate; +- bool alt_group, visited; ++ bool alt_group, visited, ignore_alts; + struct symbol *call_dest; + struct instruction *jump_dest; + struct list_head alts; +@@ -353,6 +353,40 @@ static void add_ignores(struct objtool_f + } + + /* ++ * FIXME: For now, just ignore any alternatives which add retpolines. This is ++ * a temporary hack, as it doesn't allow ORC to unwind from inside a retpoline. ++ * But it at least allows objtool to understand the control flow *around* the ++ * retpoline. ++ */ ++static int add_nospec_ignores(struct objtool_file *file) ++{ ++ struct section *sec; ++ struct rela *rela; ++ struct instruction *insn; ++ ++ sec = find_section_by_name(file->elf, ".rela.discard.nospec"); ++ if (!sec) ++ return 0; ++ ++ list_for_each_entry(rela, &sec->rela_list, list) { ++ if (rela->sym->type != STT_SECTION) { ++ WARN("unexpected relocation symbol type in %s", sec->name); ++ return -1; ++ } ++ ++ insn = find_insn(file, rela->sym->sec, rela->addend); ++ if (!insn) { ++ WARN("bad .discard.nospec entry"); ++ return -1; ++ } ++ ++ insn->ignore_alts = true; ++ } ++ ++ return 0; ++} ++ ++/* + * Find the destination instructions for all jumps. + */ + static int add_jump_destinations(struct objtool_file *file) +@@ -435,11 +469,18 @@ static int add_call_destinations(struct + dest_off = insn->offset + insn->len + insn->immediate; + insn->call_dest = find_symbol_by_offset(insn->sec, + dest_off); ++ /* ++ * FIXME: Thanks to retpolines, it's now considered ++ * normal for a function to call within itself. So ++ * disable this warning for now. 
++ */ ++#if 0 + if (!insn->call_dest) { + WARN_FUNC("can't find call dest symbol at offset 0x%lx", + insn->sec, insn->offset, dest_off); + return -1; + } ++#endif + } else if (rela->sym->type == STT_SECTION) { + insn->call_dest = find_symbol_by_offset(rela->sym->sec, + rela->addend+4); +@@ -601,12 +642,6 @@ static int add_special_section_alts(stru + return ret; + + list_for_each_entry_safe(special_alt, tmp, &special_alts, list) { +- alt = malloc(sizeof(*alt)); +- if (!alt) { +- WARN("malloc failed"); +- ret = -1; +- goto out; +- } + + orig_insn = find_insn(file, special_alt->orig_sec, + special_alt->orig_off); +@@ -617,6 +652,10 @@ static int add_special_section_alts(stru + goto out; + } + ++ /* Ignore retpoline alternatives. */ ++ if (orig_insn->ignore_alts) ++ continue; ++ + new_insn = NULL; + if (!special_alt->group || special_alt->new_len) { + new_insn = find_insn(file, special_alt->new_sec, +@@ -642,6 +681,13 @@ static int add_special_section_alts(stru + goto out; + } + ++ alt = malloc(sizeof(*alt)); ++ if (!alt) { ++ WARN("malloc failed"); ++ ret = -1; ++ goto out; ++ } ++ + alt->insn = new_insn; + list_add_tail(&alt->list, &orig_insn->alts); + +@@ -861,6 +907,10 @@ static int decode_sections(struct objtoo + + add_ignores(file); + ++ ret = add_nospec_ignores(file); ++ if (ret) ++ return ret; ++ + ret = add_jump_destinations(file); + if (ret) + return ret; diff --git a/queue/objtool-detect-jumps-to-retpoline-thunks.patch b/queue/objtool-detect-jumps-to-retpoline-thunks.patch new file mode 100644 index 0000000..0079121 --- /dev/null +++ b/queue/objtool-detect-jumps-to-retpoline-thunks.patch @@ -0,0 +1,61 @@ +From 39b735332cb8b33a27c28592d969e4016c86c3ea Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf <jpoimboe@redhat.com> +Date: Thu, 11 Jan 2018 21:46:23 +0000 +Subject: objtool: Detect jumps to retpoline thunks + +From: Josh Poimboeuf <jpoimboe@redhat.com> + +commit 39b735332cb8b33a27c28592d969e4016c86c3ea upstream. + +A direct jump to a retpoline thunk is really an indirect jump in +disguise. Change the objtool instruction type accordingly. + +Objtool needs to know where indirect branches are so it can detect +switch statement jump tables. + +This fixes a bunch of warnings with CONFIG_RETPOLINE like: + + arch/x86/events/intel/uncore_nhmex.o: warning: objtool: nhmex_rbox_msr_enable_event()+0x44: sibling call from callable instruction with modified stack frame + kernel/signal.o: warning: objtool: copy_siginfo_to_user()+0x91: sibling call from callable instruction with modified stack frame + ... 
+ +Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com> +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: Rik van Riel <riel@redhat.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: thomas.lendacky@amd.com +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Jiri Kosina <jikos@kernel.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Kees Cook <keescook@google.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org> +Cc: Paul Turner <pjt@google.com> +Link: https://lkml.kernel.org/r/1515707194-20531-2-git-send-email-dwmw@amazon.co.uk +[dwmw2: Applies to tools/objtool/builtin-check.c not check.c] +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + tools/objtool/builtin-check.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +--- a/tools/objtool/builtin-check.c ++++ b/tools/objtool/builtin-check.c +@@ -382,6 +382,13 @@ static int add_jump_destinations(struct + } else if (rela->sym->sec->idx) { + dest_sec = rela->sym->sec; + dest_off = rela->sym->sym.st_value + rela->addend + 4; ++ } else if (strstr(rela->sym->name, "_indirect_thunk_")) { ++ /* ++ * Retpoline jumps are really dynamic jumps in ++ * disguise, so convert them accordingly. ++ */ ++ insn->type = INSN_JUMP_DYNAMIC; ++ continue; + } else { + /* sibling call */ + insn->jump_dest = 0; diff --git a/queue/objtool-fix-retpoline-support-for-pre-orc-objtool.patch b/queue/objtool-fix-retpoline-support-for-pre-orc-objtool.patch new file mode 100644 index 0000000..a773f63 --- /dev/null +++ b/queue/objtool-fix-retpoline-support-for-pre-orc-objtool.patch @@ -0,0 +1,45 @@ +From jpoimboe@redhat.com Mon Jan 15 18:44:58 2018 +From: Josh Poimboeuf <jpoimboe@redhat.com> +Date: Mon, 15 Jan 2018 11:00:54 -0600 +Subject: objtool: Fix retpoline support for pre-ORC objtool +To: David Woodhouse <dwmw2@infradead.org> +Cc: gregkh@linuxfoundation.org, ak@linux.intel.com, dave.hansen@intel.com, gregkh@linux-foundation.org, jikos@kernel.org, keescook@google.com, luto@amacapital.net, peterz@infradead.org, pjt@google.com, riel@redhat.com, tglx@linutronix.de, tim.c.chen@linux.intel.com, torvalds@linux-foundation.org, stable@vger.kernel.org, stable-commits@vger.kernel.org +Message-ID: <20180115170054.6baepkgihtla4nub@treble> +Content-Disposition: inline + +From: Josh Poimboeuf <jpoimboe@redhat.com> + +Objtool 1.0 (pre-ORC) produces the following warning when it encounters +a retpoline: + + arch/x86/crypto/camellia-aesni-avx2-asm_64.o: warning: objtool: .altinstr_replacement+0xf: return instruction outside of a callable function + +That warning is meant to catch GCC bugs and missing ENTRY/ENDPROC +annotations, neither of which are applicable to alternatives. Silence +the warning for alternative instructions, just like objtool 2.0 already +does. 
+ +Reported-by: David Woodhouse <dwmw2@infradead.org> +Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + tools/objtool/builtin-check.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/tools/objtool/builtin-check.c ++++ b/tools/objtool/builtin-check.c +@@ -1230,6 +1230,14 @@ static int validate_uncallable_instructi + + for_each_insn(file, insn) { + if (!insn->visited && insn->type == INSN_RETURN) { ++ ++ /* ++ * Don't warn about call instructions in unvisited ++ * retpoline alternatives. ++ */ ++ if (!strcmp(insn->sec->name, ".altinstr_replacement")) ++ continue; ++ + WARN_FUNC("return instruction outside of a callable function", + insn->sec, insn->offset); + warnings++; diff --git a/queue/objtool-modules-discard-objtool-annotation-sections-for-modules.patch b/queue/objtool-modules-discard-objtool-annotation-sections-for-modules.patch new file mode 100644 index 0000000..ceb2b5f --- /dev/null +++ b/queue/objtool-modules-discard-objtool-annotation-sections-for-modules.patch @@ -0,0 +1,84 @@ +From e390f9a9689a42f477a6073e2e7df530a4c1b740 Mon Sep 17 00:00:00 2001 +From: Josh Poimboeuf <jpoimboe@redhat.com> +Date: Wed, 1 Mar 2017 12:04:44 -0600 +Subject: objtool, modules: Discard objtool annotation sections for modules + +From: Josh Poimboeuf <jpoimboe@redhat.com> + +commit e390f9a9689a42f477a6073e2e7df530a4c1b740 upstream. + +The '__unreachable' and '__func_stack_frame_non_standard' sections are +only used at compile time. They're discarded for vmlinux but they +should also be discarded for modules. + +Since this is a recurring pattern, prefix the section names with +".discard.". It's a nice convention and vmlinux.lds.h already discards +such sections. + +Also remove the 'a' (allocatable) flag from the __unreachable section +since it doesn't make sense for a discarded section. + +Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> +Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Jessica Yu <jeyu@redhat.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Fixes: d1091c7fa3d5 ("objtool: Improve detection of BUG() and other dead ends") +Link: http://lkml.kernel.org/r/20170301180444.lhd53c5tibc4ns77@treble +Signed-off-by: Ingo Molnar <mingo@kernel.org> +[dwmw2: Remove the unreachable part in backporting since it's not here yet] +Signed-off-by: David Woodhouse <dwmw@amazon.co.ku> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + include/linux/frame.h | 2 +- + scripts/mod/modpost.c | 1 + + scripts/module-common.lds | 5 ++++- + tools/objtool/builtin-check.c | 2 +- + 4 files changed, 7 insertions(+), 3 deletions(-) + +--- a/include/linux/frame.h ++++ b/include/linux/frame.h +@@ -11,7 +11,7 @@ + * For more information, see tools/objtool/Documentation/stack-validation.txt. + */ + #define STACK_FRAME_NON_STANDARD(func) \ +- static void __used __section(__func_stack_frame_non_standard) \ ++ static void __used __section(.discard.func_stack_frame_non_standard) \ + *__func_stack_frame_non_standard_##func = func + + #else /* !CONFIG_STACK_VALIDATION */ +--- a/scripts/mod/modpost.c ++++ b/scripts/mod/modpost.c +@@ -838,6 +838,7 @@ static const char *const section_white_l + ".cmem*", /* EZchip */ + ".fmt_slot*", /* EZchip */ + ".gnu.lto*", ++ ".discard.*", + NULL + }; + +--- a/scripts/module-common.lds ++++ b/scripts/module-common.lds +@@ -4,7 +4,10 @@ + * combine them automatically. 
+ */ + SECTIONS { +- /DISCARD/ : { *(.discard) } ++ /DISCARD/ : { ++ *(.discard) ++ *(.discard.*) ++ } + + __ksymtab 0 : { *(SORT(___ksymtab+*)) } + __ksymtab_gpl 0 : { *(SORT(___ksymtab_gpl+*)) } +--- a/tools/objtool/builtin-check.c ++++ b/tools/objtool/builtin-check.c +@@ -1229,7 +1229,7 @@ int cmd_check(int argc, const char **arg + + INIT_LIST_HEAD(&file.insn_list); + hash_init(file.insn_hash); +- file.whitelist = find_section_by_name(file.elf, "__func_stack_frame_non_standard"); ++ file.whitelist = find_section_by_name(file.elf, ".discard.func_stack_frame_non_standard"); + file.rodata = find_section_by_name(file.elf, ".rodata"); + file.ignore_unreachables = false; + file.c_file = find_section_by_name(file.elf, ".comment"); diff --git a/queue/retpoline-introduce-start-end-markers-of-indirect-thunk.patch b/queue/retpoline-introduce-start-end-markers-of-indirect-thunk.patch new file mode 100644 index 0000000..3fdc39b --- /dev/null +++ b/queue/retpoline-introduce-start-end-markers-of-indirect-thunk.patch @@ -0,0 +1,71 @@ +From 736e80a4213e9bbce40a7c050337047128b472ac Mon Sep 17 00:00:00 2001 +From: Masami Hiramatsu <mhiramat@kernel.org> +Date: Fri, 19 Jan 2018 01:14:21 +0900 +Subject: retpoline: Introduce start/end markers of indirect thunk + +From: Masami Hiramatsu <mhiramat@kernel.org> + +commit 736e80a4213e9bbce40a7c050337047128b472ac upstream. + +Introduce start/end markers of __x86_indirect_thunk_* functions. +To make it easy, consolidate .text.__x86.indirect_thunk.* sections +to one .text.__x86.indirect_thunk section and put it in the +end of kernel text section and adds __indirect_thunk_start/end +so that other subsystem (e.g. kprobes) can identify it. + +Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Acked-by: David Woodhouse <dwmw@amazon.co.uk> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com> +Cc: Arjan van de Ven <arjan@linux.intel.com> +Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org> +Link: https://lkml.kernel.org/r/151629206178.10241.6828804696410044771.stgit@devbox +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/include/asm/nospec-branch.h | 3 +++ + arch/x86/kernel/vmlinux.lds.S | 7 +++++++ + arch/x86/lib/retpoline.S | 2 +- + 3 files changed, 11 insertions(+), 1 deletion(-) + +--- a/arch/x86/include/asm/nospec-branch.h ++++ b/arch/x86/include/asm/nospec-branch.h +@@ -194,6 +194,9 @@ enum spectre_v2_mitigation { + SPECTRE_V2_IBRS, + }; + ++extern char __indirect_thunk_start[]; ++extern char __indirect_thunk_end[]; ++ + /* + * On VMEXIT we must ensure that no RSB predictions learned in the guest + * can be followed in the host, by overwriting the RSB completely. 
Both +--- a/arch/x86/kernel/vmlinux.lds.S ++++ b/arch/x86/kernel/vmlinux.lds.S +@@ -105,6 +105,13 @@ SECTIONS + SOFTIRQENTRY_TEXT + *(.fixup) + *(.gnu.warning) ++ ++#ifdef CONFIG_RETPOLINE ++ __indirect_thunk_start = .; ++ *(.text.__x86.indirect_thunk) ++ __indirect_thunk_end = .; ++#endif ++ + /* End of text section */ + _etext = .; + } :text = 0x9090 +--- a/arch/x86/lib/retpoline.S ++++ b/arch/x86/lib/retpoline.S +@@ -9,7 +9,7 @@ + #include <asm/nospec-branch.h> + + .macro THUNK reg +- .section .text.__x86.indirect_thunk.\reg ++ .section .text.__x86.indirect_thunk + + ENTRY(__x86_indirect_thunk_\reg) + CFI_STARTPROC diff --git a/queue/selftests-x86-add-test_vsyscall.patch b/queue/selftests-x86-add-test_vsyscall.patch new file mode 100644 index 0000000..73478ec --- /dev/null +++ b/queue/selftests-x86-add-test_vsyscall.patch @@ -0,0 +1,569 @@ +From 6fcf09dcfd33e93cfe1808fcb9474087dd40cc05 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Thu, 11 Jan 2018 17:16:51 -0800 +Subject: [PATCH] selftests/x86: Add test_vsyscall + +commit 352909b49ba0d74929b96af6dfbefc854ab6ebb5 upstream. + +This tests that the vsyscall entries do what they're expected to do. +It also confirms that attempts to read the vsyscall page behave as +expected. + +If changes are made to the vsyscall code or its memory map handling, +running this test in all three of vsyscall=none, vsyscall=emulate, +and vsyscall=native are helpful. + +(Because it's easy, this also compares the vsyscall results to their + vDSO equivalents.) + +Note to KAISER backporters: please test this under all three +vsyscall modes. Also, in the emulate and native modes, make sure +that test_vsyscall_64 agrees with the command line or config +option as to which mode you're in. It's quite easy to mess up +the kernel such that native mode accidentally emulates +or vice versa. + +Greg, etc: please backport this to all your Meltdown-patched +kernels. It'll help make sure the patches didn't regress +vsyscalls. + +CSigned-off-by: Andy Lutomirski <luto@kernel.org> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Hugh Dickins <hughd@google.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Kees Cook <keescook@chromium.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Cc: stable@vger.kernel.org +Link: http://lkml.kernel.org/r/2b9c5a174c1d60fd7774461d518aa75598b1d8fd.1515719552.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile +index 4f747ee07f10..38dbdf4bfd89 100644 +--- a/tools/testing/selftests/x86/Makefile ++++ b/tools/testing/selftests/x86/Makefile +@@ -5,7 +5,7 @@ include ../lib.mk + .PHONY: all all_32 all_64 warn_32bit_failure clean + + TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall test_mremap_vdso \ +- check_initial_reg_state sigreturn ldt_gdt iopl mpx-mini-test ++ check_initial_reg_state sigreturn ldt_gdt iopl mpx-mini-test test_vsyscall + TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \ + test_FCMOV test_FCOMI test_FISTTP \ + vdso_restorer +diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c +new file mode 100644 +index 000000000000..6e0bd52ad53d +--- /dev/null ++++ b/tools/testing/selftests/x86/test_vsyscall.c +@@ -0,0 +1,500 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#define _GNU_SOURCE ++ ++#include <stdio.h> ++#include <sys/time.h> ++#include <time.h> ++#include <stdlib.h> ++#include <sys/syscall.h> ++#include <unistd.h> ++#include <dlfcn.h> ++#include <string.h> ++#include <inttypes.h> ++#include <signal.h> ++#include <sys/ucontext.h> ++#include <errno.h> ++#include <err.h> ++#include <sched.h> ++#include <stdbool.h> ++#include <setjmp.h> ++ ++#ifdef __x86_64__ ++# define VSYS(x) (x) ++#else ++# define VSYS(x) 0 ++#endif ++ ++#ifndef SYS_getcpu ++# ifdef __x86_64__ ++# define SYS_getcpu 309 ++# else ++# define SYS_getcpu 318 ++# endif ++#endif ++ ++static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), ++ int flags) ++{ ++ struct sigaction sa; ++ memset(&sa, 0, sizeof(sa)); ++ sa.sa_sigaction = handler; ++ sa.sa_flags = SA_SIGINFO | flags; ++ sigemptyset(&sa.sa_mask); ++ if (sigaction(sig, &sa, 0)) ++ err(1, "sigaction"); ++} ++ ++/* vsyscalls and vDSO */ ++bool should_read_vsyscall = false; ++ ++typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz); ++gtod_t vgtod = (gtod_t)VSYS(0xffffffffff600000); ++gtod_t vdso_gtod; ++ ++typedef int (*vgettime_t)(clockid_t, struct timespec *); ++vgettime_t vdso_gettime; ++ ++typedef long (*time_func_t)(time_t *t); ++time_func_t vtime = (time_func_t)VSYS(0xffffffffff600400); ++time_func_t vdso_time; ++ ++typedef long (*getcpu_t)(unsigned *, unsigned *, void *); ++getcpu_t vgetcpu = (getcpu_t)VSYS(0xffffffffff600800); ++getcpu_t vdso_getcpu; ++ ++static void init_vdso(void) ++{ ++ void *vdso = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); ++ if (!vdso) ++ vdso = dlopen("linux-gate.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); ++ if (!vdso) { ++ printf("[WARN]\tfailed to find vDSO\n"); ++ return; ++ } ++ ++ vdso_gtod = (gtod_t)dlsym(vdso, "__vdso_gettimeofday"); ++ if (!vdso_gtod) ++ printf("[WARN]\tfailed to find gettimeofday in vDSO\n"); ++ ++ vdso_gettime = (vgettime_t)dlsym(vdso, "__vdso_clock_gettime"); ++ if (!vdso_gettime) ++ 
printf("[WARN]\tfailed to find clock_gettime in vDSO\n"); ++ ++ vdso_time = (time_func_t)dlsym(vdso, "__vdso_time"); ++ if (!vdso_time) ++ printf("[WARN]\tfailed to find time in vDSO\n"); ++ ++ vdso_getcpu = (getcpu_t)dlsym(vdso, "__vdso_getcpu"); ++ if (!vdso_getcpu) { ++ /* getcpu() was never wired up in the 32-bit vDSO. */ ++ printf("[%s]\tfailed to find getcpu in vDSO\n", ++ sizeof(long) == 8 ? "WARN" : "NOTE"); ++ } ++} ++ ++static int init_vsys(void) ++{ ++#ifdef __x86_64__ ++ int nerrs = 0; ++ FILE *maps; ++ char line[128]; ++ bool found = false; ++ ++ maps = fopen("/proc/self/maps", "r"); ++ if (!maps) { ++ printf("[WARN]\tCould not open /proc/self/maps -- assuming vsyscall is r-x\n"); ++ should_read_vsyscall = true; ++ return 0; ++ } ++ ++ while (fgets(line, sizeof(line), maps)) { ++ char r, x; ++ void *start, *end; ++ char name[128]; ++ if (sscanf(line, "%p-%p %c-%cp %*x %*x:%*x %*u %s", ++ &start, &end, &r, &x, name) != 5) ++ continue; ++ ++ if (strcmp(name, "[vsyscall]")) ++ continue; ++ ++ printf("\tvsyscall map: %s", line); ++ ++ if (start != (void *)0xffffffffff600000 || ++ end != (void *)0xffffffffff601000) { ++ printf("[FAIL]\taddress range is nonsense\n"); ++ nerrs++; ++ } ++ ++ printf("\tvsyscall permissions are %c-%c\n", r, x); ++ should_read_vsyscall = (r == 'r'); ++ if (x != 'x') { ++ vgtod = NULL; ++ vtime = NULL; ++ vgetcpu = NULL; ++ } ++ ++ found = true; ++ break; ++ } ++ ++ fclose(maps); ++ ++ if (!found) { ++ printf("\tno vsyscall map in /proc/self/maps\n"); ++ should_read_vsyscall = false; ++ vgtod = NULL; ++ vtime = NULL; ++ vgetcpu = NULL; ++ } ++ ++ return nerrs; ++#else ++ return 0; ++#endif ++} ++ ++/* syscalls */ ++static inline long sys_gtod(struct timeval *tv, struct timezone *tz) ++{ ++ return syscall(SYS_gettimeofday, tv, tz); ++} ++ ++static inline int sys_clock_gettime(clockid_t id, struct timespec *ts) ++{ ++ return syscall(SYS_clock_gettime, id, ts); ++} ++ ++static inline long sys_time(time_t *t) ++{ ++ return syscall(SYS_time, t); ++} ++ ++static inline long sys_getcpu(unsigned * cpu, unsigned * node, ++ void* cache) ++{ ++ return syscall(SYS_getcpu, cpu, node, cache); ++} ++ ++static jmp_buf jmpbuf; ++ ++static void sigsegv(int sig, siginfo_t *info, void *ctx_void) ++{ ++ siglongjmp(jmpbuf, 1); ++} ++ ++static double tv_diff(const struct timeval *a, const struct timeval *b) ++{ ++ return (double)(a->tv_sec - b->tv_sec) + ++ (double)((int)a->tv_usec - (int)b->tv_usec) * 1e-6; ++} ++ ++static int check_gtod(const struct timeval *tv_sys1, ++ const struct timeval *tv_sys2, ++ const struct timezone *tz_sys, ++ const char *which, ++ const struct timeval *tv_other, ++ const struct timezone *tz_other) ++{ ++ int nerrs = 0; ++ double d1, d2; ++ ++ if (tz_other && (tz_sys->tz_minuteswest != tz_other->tz_minuteswest || tz_sys->tz_dsttime != tz_other->tz_dsttime)) { ++ printf("[FAIL] %s tz mismatch\n", which); ++ nerrs++; ++ } ++ ++ d1 = tv_diff(tv_other, tv_sys1); ++ d2 = tv_diff(tv_sys2, tv_other); ++ printf("\t%s time offsets: %lf %lf\n", which, d1, d2); ++ ++ if (d1 < 0 || d2 < 0) { ++ printf("[FAIL]\t%s time was inconsistent with the syscall\n", which); ++ nerrs++; ++ } else { ++ printf("[OK]\t%s gettimeofday()'s timeval was okay\n", which); ++ } ++ ++ return nerrs; ++} ++ ++static int test_gtod(void) ++{ ++ struct timeval tv_sys1, tv_sys2, tv_vdso, tv_vsys; ++ struct timezone tz_sys, tz_vdso, tz_vsys; ++ long ret_vdso = -1; ++ long ret_vsys = -1; ++ int nerrs = 0; ++ ++ printf("[RUN]\ttest gettimeofday()\n"); ++ ++ if (sys_gtod(&tv_sys1, &tz_sys) != 
0) ++ err(1, "syscall gettimeofday"); ++ if (vdso_gtod) ++ ret_vdso = vdso_gtod(&tv_vdso, &tz_vdso); ++ if (vgtod) ++ ret_vsys = vgtod(&tv_vsys, &tz_vsys); ++ if (sys_gtod(&tv_sys2, &tz_sys) != 0) ++ err(1, "syscall gettimeofday"); ++ ++ if (vdso_gtod) { ++ if (ret_vdso == 0) { ++ nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vDSO", &tv_vdso, &tz_vdso); ++ } else { ++ printf("[FAIL]\tvDSO gettimeofday() failed: %ld\n", ret_vdso); ++ nerrs++; ++ } ++ } ++ ++ if (vgtod) { ++ if (ret_vsys == 0) { ++ nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vsyscall", &tv_vsys, &tz_vsys); ++ } else { ++ printf("[FAIL]\tvsys gettimeofday() failed: %ld\n", ret_vsys); ++ nerrs++; ++ } ++ } ++ ++ return nerrs; ++} ++ ++static int test_time(void) { ++ int nerrs = 0; ++ ++ printf("[RUN]\ttest time()\n"); ++ long t_sys1, t_sys2, t_vdso = 0, t_vsys = 0; ++ long t2_sys1 = -1, t2_sys2 = -1, t2_vdso = -1, t2_vsys = -1; ++ t_sys1 = sys_time(&t2_sys1); ++ if (vdso_time) ++ t_vdso = vdso_time(&t2_vdso); ++ if (vtime) ++ t_vsys = vtime(&t2_vsys); ++ t_sys2 = sys_time(&t2_sys2); ++ if (t_sys1 < 0 || t_sys1 != t2_sys1 || t_sys2 < 0 || t_sys2 != t2_sys2) { ++ printf("[FAIL]\tsyscall failed (ret1:%ld output1:%ld ret2:%ld output2:%ld)\n", t_sys1, t2_sys1, t_sys2, t2_sys2); ++ nerrs++; ++ return nerrs; ++ } ++ ++ if (vdso_time) { ++ if (t_vdso < 0 || t_vdso != t2_vdso) { ++ printf("[FAIL]\tvDSO failed (ret:%ld output:%ld)\n", t_vdso, t2_vdso); ++ nerrs++; ++ } else if (t_vdso < t_sys1 || t_vdso > t_sys2) { ++ printf("[FAIL]\tvDSO returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vdso, t_sys2); ++ nerrs++; ++ } else { ++ printf("[OK]\tvDSO time() is okay\n"); ++ } ++ } ++ ++ if (vtime) { ++ if (t_vsys < 0 || t_vsys != t2_vsys) { ++ printf("[FAIL]\tvsyscall failed (ret:%ld output:%ld)\n", t_vsys, t2_vsys); ++ nerrs++; ++ } else if (t_vsys < t_sys1 || t_vsys > t_sys2) { ++ printf("[FAIL]\tvsyscall returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vsys, t_sys2); ++ nerrs++; ++ } else { ++ printf("[OK]\tvsyscall time() is okay\n"); ++ } ++ } ++ ++ return nerrs; ++} ++ ++static int test_getcpu(int cpu) ++{ ++ int nerrs = 0; ++ long ret_sys, ret_vdso = -1, ret_vsys = -1; ++ ++ printf("[RUN]\tgetcpu() on CPU %d\n", cpu); ++ ++ cpu_set_t cpuset; ++ CPU_ZERO(&cpuset); ++ CPU_SET(cpu, &cpuset); ++ if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) { ++ printf("[SKIP]\tfailed to force CPU %d\n", cpu); ++ return nerrs; ++ } ++ ++ unsigned cpu_sys, cpu_vdso, cpu_vsys, node_sys, node_vdso, node_vsys; ++ unsigned node = 0; ++ bool have_node = false; ++ ret_sys = sys_getcpu(&cpu_sys, &node_sys, 0); ++ if (vdso_getcpu) ++ ret_vdso = vdso_getcpu(&cpu_vdso, &node_vdso, 0); ++ if (vgetcpu) ++ ret_vsys = vgetcpu(&cpu_vsys, &node_vsys, 0); ++ ++ if (ret_sys == 0) { ++ if (cpu_sys != cpu) { ++ printf("[FAIL]\tsyscall reported CPU %hu but should be %d\n", cpu_sys, cpu); ++ nerrs++; ++ } ++ ++ have_node = true; ++ node = node_sys; ++ } ++ ++ if (vdso_getcpu) { ++ if (ret_vdso) { ++ printf("[FAIL]\tvDSO getcpu() failed\n"); ++ nerrs++; ++ } else { ++ if (!have_node) { ++ have_node = true; ++ node = node_vdso; ++ } ++ ++ if (cpu_vdso != cpu) { ++ printf("[FAIL]\tvDSO reported CPU %hu but should be %d\n", cpu_vdso, cpu); ++ nerrs++; ++ } else { ++ printf("[OK]\tvDSO reported correct CPU\n"); ++ } ++ ++ if (node_vdso != node) { ++ printf("[FAIL]\tvDSO reported node %hu but should be %hu\n", node_vdso, node); ++ nerrs++; ++ } else { ++ printf("[OK]\tvDSO reported correct node\n"); ++ } ++ } ++ } ++ ++ if (vgetcpu) { ++ if (ret_vsys) { ++ 
printf("[FAIL]\tvsyscall getcpu() failed\n"); ++ nerrs++; ++ } else { ++ if (!have_node) { ++ have_node = true; ++ node = node_vsys; ++ } ++ ++ if (cpu_vsys != cpu) { ++ printf("[FAIL]\tvsyscall reported CPU %hu but should be %d\n", cpu_vsys, cpu); ++ nerrs++; ++ } else { ++ printf("[OK]\tvsyscall reported correct CPU\n"); ++ } ++ ++ if (node_vsys != node) { ++ printf("[FAIL]\tvsyscall reported node %hu but should be %hu\n", node_vsys, node); ++ nerrs++; ++ } else { ++ printf("[OK]\tvsyscall reported correct node\n"); ++ } ++ } ++ } ++ ++ return nerrs; ++} ++ ++static int test_vsys_r(void) ++{ ++#ifdef __x86_64__ ++ printf("[RUN]\tChecking read access to the vsyscall page\n"); ++ bool can_read; ++ if (sigsetjmp(jmpbuf, 1) == 0) { ++ *(volatile int *)0xffffffffff600000; ++ can_read = true; ++ } else { ++ can_read = false; ++ } ++ ++ if (can_read && !should_read_vsyscall) { ++ printf("[FAIL]\tWe have read access, but we shouldn't\n"); ++ return 1; ++ } else if (!can_read && should_read_vsyscall) { ++ printf("[FAIL]\tWe don't have read access, but we should\n"); ++ return 1; ++ } else { ++ printf("[OK]\tgot expected result\n"); ++ } ++#endif ++ ++ return 0; ++} ++ ++ ++#ifdef __x86_64__ ++#define X86_EFLAGS_TF (1UL << 8) ++static volatile sig_atomic_t num_vsyscall_traps; ++ ++static unsigned long get_eflags(void) ++{ ++ unsigned long eflags; ++ asm volatile ("pushfq\n\tpopq %0" : "=rm" (eflags)); ++ return eflags; ++} ++ ++static void set_eflags(unsigned long eflags) ++{ ++ asm volatile ("pushq %0\n\tpopfq" : : "rm" (eflags) : "flags"); ++} ++ ++static void sigtrap(int sig, siginfo_t *info, void *ctx_void) ++{ ++ ucontext_t *ctx = (ucontext_t *)ctx_void; ++ unsigned long ip = ctx->uc_mcontext.gregs[REG_RIP]; ++ ++ if (((ip ^ 0xffffffffff600000UL) & ~0xfffUL) == 0) ++ num_vsyscall_traps++; ++} ++ ++static int test_native_vsyscall(void) ++{ ++ time_t tmp; ++ bool is_native; ++ ++ if (!vtime) ++ return 0; ++ ++ printf("[RUN]\tchecking for native vsyscall\n"); ++ sethandler(SIGTRAP, sigtrap, 0); ++ set_eflags(get_eflags() | X86_EFLAGS_TF); ++ vtime(&tmp); ++ set_eflags(get_eflags() & ~X86_EFLAGS_TF); ++ ++ /* ++ * If vsyscalls are emulated, we expect a single trap in the ++ * vsyscall page -- the call instruction will trap with RIP ++ * pointing to the entry point before emulation takes over. ++ * In native mode, we expect two traps, since whatever code ++ * the vsyscall page contains will be more than just a ret ++ * instruction. ++ */ ++ is_native = (num_vsyscall_traps > 1); ++ ++ printf("\tvsyscalls are %s (%d instructions in vsyscall page)\n", ++ (is_native ? "native" : "emulated"), ++ (int)num_vsyscall_traps); ++ ++ return 0; ++} ++#endif ++ ++int main(int argc, char **argv) ++{ ++ int nerrs = 0; ++ ++ init_vdso(); ++ nerrs += init_vsys(); ++ ++ nerrs += test_gtod(); ++ nerrs += test_time(); ++ nerrs += test_getcpu(0); ++ nerrs += test_getcpu(1); ++ ++ sethandler(SIGSEGV, sigsegv, 0); ++ nerrs += test_vsys_r(); ++ ++#ifdef __x86_64__ ++ nerrs += test_native_vsyscall(); ++#endif ++ ++ return nerrs ? 
1 : 0; ++} +-- +2.15.0 + diff --git a/queue/sysfs-cpu-add-vulnerability-folder.patch b/queue/sysfs-cpu-add-vulnerability-folder.patch new file mode 100644 index 0000000..92522ac --- /dev/null +++ b/queue/sysfs-cpu-add-vulnerability-folder.patch @@ -0,0 +1,149 @@ +From 87590ce6e373d1a5401f6539f0c59ef92dd924a9 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Sun, 7 Jan 2018 22:48:00 +0100 +Subject: sysfs/cpu: Add vulnerability folder + +From: Thomas Gleixner <tglx@linutronix.de> + +commit 87590ce6e373d1a5401f6539f0c59ef92dd924a9 upstream. + +As the meltdown/spectre problem affects several CPU architectures, it makes +sense to have common way to express whether a system is affected by a +particular vulnerability or not. If affected the way to express the +mitigation should be common as well. + +Create /sys/devices/system/cpu/vulnerabilities folder and files for +meltdown, spectre_v1 and spectre_v2. + +Allow architectures to override the show function. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Will Deacon <will.deacon@arm.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linuxfoundation.org> +Cc: Borislav Petkov <bp@alien8.de> +Cc: David Woodhouse <dwmw@amazon.co.uk> +Link: https://lkml.kernel.org/r/20180107214913.096657732@linutronix.de +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + Documentation/ABI/testing/sysfs-devices-system-cpu | 16 +++++++ + drivers/base/Kconfig | 3 + + drivers/base/cpu.c | 48 +++++++++++++++++++++ + include/linux/cpu.h | 7 +++ + 4 files changed, 74 insertions(+) + +--- a/Documentation/ABI/testing/sysfs-devices-system-cpu ++++ b/Documentation/ABI/testing/sysfs-devices-system-cpu +@@ -350,3 +350,19 @@ Contact: Linux ARM Kernel Mailing list < + Description: AArch64 CPU registers + 'identification' directory exposes the CPU ID registers for + identifying model and revision of the CPU. ++ ++What: /sys/devices/system/cpu/vulnerabilities ++ /sys/devices/system/cpu/vulnerabilities/meltdown ++ /sys/devices/system/cpu/vulnerabilities/spectre_v1 ++ /sys/devices/system/cpu/vulnerabilities/spectre_v2 ++Date: Januar 2018 ++Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org> ++Description: Information about CPU vulnerabilities ++ ++ The files are named after the code names of CPU ++ vulnerabilities. The output of those files reflects the ++ state of the CPUs in the system. 
Possible output values: ++ ++ "Not affected" CPU is not affected by the vulnerability ++ "Vulnerable" CPU is affected and no mitigation in effect ++ "Mitigation: $M" CPU is affetcted and mitigation $M is in effect +--- a/drivers/base/Kconfig ++++ b/drivers/base/Kconfig +@@ -235,6 +235,9 @@ config GENERIC_CPU_DEVICES + config GENERIC_CPU_AUTOPROBE + bool + ++config GENERIC_CPU_VULNERABILITIES ++ bool ++ + config SOC_BUS + bool + +--- a/drivers/base/cpu.c ++++ b/drivers/base/cpu.c +@@ -499,10 +499,58 @@ static void __init cpu_dev_register_gene + #endif + } + ++#ifdef CONFIG_GENERIC_CPU_VULNERABILITIES ++ ++ssize_t __weak cpu_show_meltdown(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "Not affected\n"); ++} ++ ++ssize_t __weak cpu_show_spectre_v1(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "Not affected\n"); ++} ++ ++ssize_t __weak cpu_show_spectre_v2(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "Not affected\n"); ++} ++ ++static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); ++static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); ++static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); ++ ++static struct attribute *cpu_root_vulnerabilities_attrs[] = { ++ &dev_attr_meltdown.attr, ++ &dev_attr_spectre_v1.attr, ++ &dev_attr_spectre_v2.attr, ++ NULL ++}; ++ ++static const struct attribute_group cpu_root_vulnerabilities_group = { ++ .name = "vulnerabilities", ++ .attrs = cpu_root_vulnerabilities_attrs, ++}; ++ ++static void __init cpu_register_vulnerabilities(void) ++{ ++ if (sysfs_create_group(&cpu_subsys.dev_root->kobj, ++ &cpu_root_vulnerabilities_group)) ++ pr_err("Unable to register CPU vulnerabilities\n"); ++} ++ ++#else ++static inline void cpu_register_vulnerabilities(void) { } ++#endif ++ + void __init cpu_dev_init(void) + { + if (subsys_system_register(&cpu_subsys, cpu_root_attr_groups)) + panic("Failed to register CPU subsystem"); + + cpu_dev_register_generic(); ++ cpu_register_vulnerabilities(); + } +--- a/include/linux/cpu.h ++++ b/include/linux/cpu.h +@@ -44,6 +44,13 @@ extern void cpu_remove_dev_attr(struct d + extern int cpu_add_dev_attr_group(struct attribute_group *attrs); + extern void cpu_remove_dev_attr_group(struct attribute_group *attrs); + ++extern ssize_t cpu_show_meltdown(struct device *dev, ++ struct device_attribute *attr, char *buf); ++extern ssize_t cpu_show_spectre_v1(struct device *dev, ++ struct device_attribute *attr, char *buf); ++extern ssize_t cpu_show_spectre_v2(struct device *dev, ++ struct device_attribute *attr, char *buf); ++ + extern __printf(4, 5) + struct device *cpu_device_create(struct device *parent, void *drvdata, + const struct attribute_group **groups, diff --git a/queue/sysfs-cpu-fix-typos-in-vulnerability-documentation.patch b/queue/sysfs-cpu-fix-typos-in-vulnerability-documentation.patch new file mode 100644 index 0000000..181d38d --- /dev/null +++ b/queue/sysfs-cpu-fix-typos-in-vulnerability-documentation.patch @@ -0,0 +1,35 @@ +From 9ecccfaa7cb5249bd31bdceb93fcf5bedb8a24d8 Mon Sep 17 00:00:00 2001 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Tue, 9 Jan 2018 15:02:51 +0000 +Subject: sysfs/cpu: Fix typos in vulnerability documentation + +From: David Woodhouse <dwmw@amazon.co.uk> + +commit 9ecccfaa7cb5249bd31bdceb93fcf5bedb8a24d8 upstream. 
+ +Fixes: 87590ce6e ("sysfs/cpu: Add vulnerability folder") +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + Documentation/ABI/testing/sysfs-devices-system-cpu | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/Documentation/ABI/testing/sysfs-devices-system-cpu ++++ b/Documentation/ABI/testing/sysfs-devices-system-cpu +@@ -355,7 +355,7 @@ What: /sys/devices/system/cpu/vulnerabi + /sys/devices/system/cpu/vulnerabilities/meltdown + /sys/devices/system/cpu/vulnerabilities/spectre_v1 + /sys/devices/system/cpu/vulnerabilities/spectre_v2 +-Date: Januar 2018 ++Date: January 2018 + Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org> + Description: Information about CPU vulnerabilities + +@@ -365,4 +365,4 @@ Description: Information about CPU vulne + + "Not affected" CPU is not affected by the vulnerability + "Vulnerable" CPU is affected and no mitigation in effect +- "Mitigation: $M" CPU is affetcted and mitigation $M is in effect ++ "Mitigation: $M" CPU is affected and mitigation $M is in effect diff --git a/queue/vfs-fdtable-prevent-bounds-check-bypass-via-speculative-execution.patch b/queue/vfs-fdtable-prevent-bounds-check-bypass-via-speculative-execution.patch new file mode 100644 index 0000000..6ed1c28 --- /dev/null +++ b/queue/vfs-fdtable-prevent-bounds-check-bypass-via-speculative-execution.patch @@ -0,0 +1,54 @@ +From foo@baz Thu Feb 8 03:32:24 CET 2018 +From: Dan Williams <dan.j.williams@intel.com> +Date: Mon, 29 Jan 2018 17:03:05 -0800 +Subject: vfs, fdtable: Prevent bounds-check bypass via speculative execution + +From: Dan Williams <dan.j.williams@intel.com> + + +(cherry picked from commit 56c30ba7b348b90484969054d561f711ba196507) + +'fd' is a user controlled value that is used as a data dependency to +read from the 'fdt->fd' array. In order to avoid potential leaks of +kernel memory values, block speculative execution of the instruction +stream that could issue reads based on an invalid 'file *' returned from +__fcheck_files. 
+ +Co-developed-by: Elena Reshetova <elena.reshetova@intel.com> +Signed-off-by: Dan Williams <dan.j.williams@intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: linux-arch@vger.kernel.org +Cc: kernel-hardening@lists.openwall.com +Cc: gregkh@linuxfoundation.org +Cc: Al Viro <viro@zeniv.linux.org.uk> +Cc: torvalds@linux-foundation.org +Cc: alan@linux.intel.com +Link: https://lkml.kernel.org/r/151727418500.33451.17392199002892248656.stgit@dwillia2-desk3.amr.corp.intel.com +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + include/linux/fdtable.h | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/include/linux/fdtable.h ++++ b/include/linux/fdtable.h +@@ -9,6 +9,7 @@ + #include <linux/compiler.h> + #include <linux/spinlock.h> + #include <linux/rcupdate.h> ++#include <linux/nospec.h> + #include <linux/types.h> + #include <linux/init.h> + #include <linux/fs.h> +@@ -81,8 +82,10 @@ static inline struct file *__fcheck_file + { + struct fdtable *fdt = rcu_dereference_raw(files->fdt); + +- if (fd < fdt->max_fds) ++ if (fd < fdt->max_fds) { ++ fd = array_index_nospec(fd, fdt->max_fds); + return rcu_dereference_raw(fdt->fd[fd]); ++ } + return NULL; + } + diff --git a/queue/vsyscall-fix-permissions-for-emulate-mode-with-kaiser-pti.patch b/queue/vsyscall-fix-permissions-for-emulate-mode-with-kaiser-pti.patch new file mode 100644 index 0000000..bce5e67 --- /dev/null +++ b/queue/vsyscall-fix-permissions-for-emulate-mode-with-kaiser-pti.patch @@ -0,0 +1,72 @@ +From ben.hutchings@codethink.co.uk Fri Jan 26 17:35:59 2018 +From: Ben Hutchings <ben.hutchings@codethink.co.uk> +Date: Fri, 26 Jan 2018 16:23:02 +0000 +Subject: vsyscall: Fix permissions for emulate mode with KAISER/PTI +To: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Cc: Borislav Petkov <bp@suse.de>, Hugh Dickins <hughd@google.com>, stable@vger.kernel.org +Message-ID: <20180126162302.ei4tmiltl73npmr6@xylophone.i.decadent.org.uk> + +From: Ben Hutchings <ben.hutchings@codethink.co.uk> + +The backport of KAISER to 4.4 turned vsyscall emulate mode into native +mode. Add a vsyscall_pgprot variable to hold the correct page +protections, like Borislav and Hugh did for 3.2 and 3.18. + +Cc: Borislav Petkov <bp@suse.de> +Cc: Hugh Dickins <hughd@google.com> +Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + + +--- + arch/x86/entry/vsyscall/vsyscall_64.c | 7 ++++--- + arch/x86/include/asm/vsyscall.h | 1 + + arch/x86/mm/kaiser.c | 2 +- + 3 files changed, 6 insertions(+), 4 deletions(-) + +--- a/arch/x86/entry/vsyscall/vsyscall_64.c ++++ b/arch/x86/entry/vsyscall/vsyscall_64.c +@@ -46,6 +46,7 @@ static enum { EMULATE, NATIVE, NONE } vs + #else + EMULATE; + #endif ++unsigned long vsyscall_pgprot = __PAGE_KERNEL_VSYSCALL; + + static int __init vsyscall_setup(char *str) + { +@@ -336,11 +337,11 @@ void __init map_vsyscall(void) + extern char __vsyscall_page; + unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); + ++ if (vsyscall_mode != NATIVE) ++ vsyscall_pgprot = __PAGE_KERNEL_VVAR; + if (vsyscall_mode != NONE) + __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, +- vsyscall_mode == NATIVE +- ? 
PAGE_KERNEL_VSYSCALL +- : PAGE_KERNEL_VVAR); ++ __pgprot(vsyscall_pgprot)); + + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != + (unsigned long)VSYSCALL_ADDR); +--- a/arch/x86/include/asm/vsyscall.h ++++ b/arch/x86/include/asm/vsyscall.h +@@ -13,6 +13,7 @@ extern void map_vsyscall(void); + */ + extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); + extern bool vsyscall_enabled(void); ++extern unsigned long vsyscall_pgprot; + #else + static inline void map_vsyscall(void) {} + static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) +--- a/arch/x86/mm/kaiser.c ++++ b/arch/x86/mm/kaiser.c +@@ -344,7 +344,7 @@ void __init kaiser_init(void) + if (vsyscall_enabled()) + kaiser_add_user_map_early((void *)VSYSCALL_ADDR, + PAGE_SIZE, +- __PAGE_KERNEL_VSYSCALL); ++ vsyscall_pgprot); + + for_each_possible_cpu(cpu) { + void *percpu_vaddr = __per_cpu_user_mapped_start + diff --git a/queue/x86-alternatives-add-missing-n-at-end-of-alternative-inline-asm.patch b/queue/x86-alternatives-add-missing-n-at-end-of-alternative-inline-asm.patch new file mode 100644 index 0000000..483e564 --- /dev/null +++ b/queue/x86-alternatives-add-missing-n-at-end-of-alternative-inline-asm.patch @@ -0,0 +1,56 @@ +From b9e705ef7cfaf22db0daab91ad3cd33b0fa32eb9 Mon Sep 17 00:00:00 2001 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Thu, 4 Jan 2018 14:37:05 +0000 +Subject: x86/alternatives: Add missing '\n' at end of ALTERNATIVE inline asm + +From: David Woodhouse <dwmw@amazon.co.uk> + +commit b9e705ef7cfaf22db0daab91ad3cd33b0fa32eb9 upstream. + +Where an ALTERNATIVE is used in the middle of an inline asm block, this +would otherwise lead to the following instruction being appended directly +to the trailing ".popsection", and a failed compile. + +Fixes: 9cebed423c84 ("x86, alternative: Use .pushsection/.popsection") +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: Rik van Riel <riel@redhat.com> +Cc: ak@linux.intel.com +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Paul Turner <pjt@google.com> +Cc: Jiri Kosina <jikos@kernel.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Kees Cook <keescook@google.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org> +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/20180104143710.8961-8-dwmw@amazon.co.uk +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/include/asm/alternative.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/x86/include/asm/alternative.h ++++ b/arch/x86/include/asm/alternative.h +@@ -139,7 +139,7 @@ static inline int alternatives_text_rese + ".popsection\n" \ + ".pushsection .altinstr_replacement, \"ax\"\n" \ + ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ +- ".popsection" ++ ".popsection\n" + + #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ + OLDINSTR_2(oldinstr, 1, 2) \ +@@ -150,7 +150,7 @@ static inline int alternatives_text_rese + ".pushsection .altinstr_replacement, \"ax\"\n" \ + ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ + ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ +- ".popsection" ++ ".popsection\n" + + /* + * Alternative instructions for different CPU types or capabilities. 
diff --git a/queue/x86-alternatives-fix-optimize_nops-checking.patch b/queue/x86-alternatives-fix-optimize_nops-checking.patch new file mode 100644 index 0000000..3574563 --- /dev/null +++ b/queue/x86-alternatives-fix-optimize_nops-checking.patch @@ -0,0 +1,53 @@ +From 612e8e9350fd19cae6900cf36ea0c6892d1a0dca Mon Sep 17 00:00:00 2001 +From: Borislav Petkov <bp@suse.de> +Date: Wed, 10 Jan 2018 12:28:16 +0100 +Subject: x86/alternatives: Fix optimize_nops() checking + +From: Borislav Petkov <bp@suse.de> + +commit 612e8e9350fd19cae6900cf36ea0c6892d1a0dca upstream. + +The alternatives code checks only the first byte whether it is a NOP, but +with NOPs in front of the payload and having actual instructions after it +breaks the "optimized' test. + +Make sure to scan all bytes before deciding to optimize the NOPs in there. + +Reported-by: David Woodhouse <dwmw2@infradead.org> +Signed-off-by: Borislav Petkov <bp@suse.de> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Tom Lendacky <thomas.lendacky@amd.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Jiri Kosina <jikos@kernel.org> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Andi Kleen <andi@firstfloor.org> +Cc: Andrew Lutomirski <luto@kernel.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org> +Cc: Paul Turner <pjt@google.com> +Link: https://lkml.kernel.org/r/20180110112815.mgciyf5acwacphkq@pd.tnic +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kernel/alternative.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/arch/x86/kernel/alternative.c ++++ b/arch/x86/kernel/alternative.c +@@ -340,9 +340,12 @@ done: + static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr) + { + unsigned long flags; ++ int i; + +- if (instr[0] != 0x90) +- return; ++ for (i = 0; i < a->padlen; i++) { ++ if (instr[i] != 0x90) ++ return; ++ } + + local_irq_save(flags); + add_nops(instr + (a->instrlen - a->padlen), a->padlen); diff --git a/queue/x86-asm-move-status-from-thread_struct-to-thread_info.patch b/queue/x86-asm-move-status-from-thread_struct-to-thread_info.patch new file mode 100644 index 0000000..b22fc0a --- /dev/null +++ b/queue/x86-asm-move-status-from-thread_struct-to-thread_info.patch @@ -0,0 +1,172 @@ +From foo@baz Thu Feb 8 03:32:24 CET 2018 +From: Andy Lutomirski <luto@kernel.org> +Date: Sun, 28 Jan 2018 10:38:50 -0800 +Subject: x86/asm: Move 'status' from thread_struct to thread_info + +From: Andy Lutomirski <luto@kernel.org> + + +(cherry picked from commit 37a8f7c38339b22b69876d6f5a0ab851565284e3) + +The TS_COMPAT bit is very hot and is accessed from code paths that mostly +also touch thread_info::flags. Move it into struct thread_info to improve +cache locality. + +The only reason it was in thread_struct is that there was a brief period +during which arch-specific fields were not allowed in struct thread_info. + +Linus suggested further changing: + + ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED); + +to: + + if (unlikely(ti->status & (TS_COMPAT|TS_I386_REGS_POKED))) + ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED); + +on the theory that frequently dirtying the cacheline even in pure 64-bit +code that never needs to modify status hurts performance. That could be a +reasonable followup patch, but I suspect it matters less on top of this +patch. 
+ +Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Ingo Molnar <mingo@kernel.org> +Acked-by: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Kernel Hardening <kernel-hardening@lists.openwall.com> +Link: https://lkml.kernel.org/r/03148bcc1b217100e6e8ecf6a5468c45cf4304b6.1517164461.git.luto@kernel.org +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/entry/common.c | 4 ++-- + arch/x86/include/asm/processor.h | 2 -- + arch/x86/include/asm/syscall.h | 6 +++--- + arch/x86/include/asm/thread_info.h | 3 ++- + arch/x86/kernel/process_64.c | 4 ++-- + arch/x86/kernel/ptrace.c | 2 +- + arch/x86/kernel/signal.c | 2 +- + 7 files changed, 11 insertions(+), 12 deletions(-) + +--- a/arch/x86/entry/common.c ++++ b/arch/x86/entry/common.c +@@ -201,7 +201,7 @@ __visible inline void prepare_exit_to_us + * special case only applies after poking regs and before the + * very next return to user mode. + */ +- current->thread.status &= ~(TS_COMPAT|TS_I386_REGS_POKED); ++ ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED); + #endif + + user_enter_irqoff(); +@@ -299,7 +299,7 @@ static __always_inline void do_syscall_3 + unsigned int nr = (unsigned int)regs->orig_ax; + + #ifdef CONFIG_IA32_EMULATION +- current->thread.status |= TS_COMPAT; ++ ti->status |= TS_COMPAT; + #endif + + if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) { +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -391,8 +391,6 @@ struct thread_struct { + unsigned short gsindex; + #endif + +- u32 status; /* thread synchronous flags */ +- + #ifdef CONFIG_X86_64 + unsigned long fsbase; + unsigned long gsbase; +--- a/arch/x86/include/asm/syscall.h ++++ b/arch/x86/include/asm/syscall.h +@@ -60,7 +60,7 @@ static inline long syscall_get_error(str + * TS_COMPAT is set for 32-bit syscall entries and then + * remains set until we return to user mode. + */ +- if (task->thread.status & (TS_COMPAT|TS_I386_REGS_POKED)) ++ if (task->thread_info.status & (TS_COMPAT|TS_I386_REGS_POKED)) + /* + * Sign-extend the value so (int)-EFOO becomes (long)-EFOO + * and will match correctly in comparisons. 
+@@ -116,7 +116,7 @@ static inline void syscall_get_arguments + unsigned long *args) + { + # ifdef CONFIG_IA32_EMULATION +- if (task->thread.status & TS_COMPAT) ++ if (task->thread_info.status & TS_COMPAT) + switch (i) { + case 0: + if (!n--) break; +@@ -177,7 +177,7 @@ static inline void syscall_set_arguments + const unsigned long *args) + { + # ifdef CONFIG_IA32_EMULATION +- if (task->thread.status & TS_COMPAT) ++ if (task->thread_info.status & TS_COMPAT) + switch (i) { + case 0: + if (!n--) break; +--- a/arch/x86/include/asm/thread_info.h ++++ b/arch/x86/include/asm/thread_info.h +@@ -54,6 +54,7 @@ struct task_struct; + + struct thread_info { + unsigned long flags; /* low level flags */ ++ u32 status; /* thread synchronous flags */ + }; + + #define INIT_THREAD_INFO(tsk) \ +@@ -213,7 +214,7 @@ static inline int arch_within_stack_fram + #define in_ia32_syscall() true + #else + #define in_ia32_syscall() (IS_ENABLED(CONFIG_IA32_EMULATION) && \ +- current->thread.status & TS_COMPAT) ++ current_thread_info()->status & TS_COMPAT) + #endif + + /* +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -538,7 +538,7 @@ void set_personality_ia32(bool x32) + current->personality &= ~READ_IMPLIES_EXEC; + /* in_compat_syscall() uses the presence of the x32 + syscall bit flag to determine compat status */ +- current->thread.status &= ~TS_COMPAT; ++ current_thread_info()->status &= ~TS_COMPAT; + } else { + set_thread_flag(TIF_IA32); + clear_thread_flag(TIF_X32); +@@ -546,7 +546,7 @@ void set_personality_ia32(bool x32) + current->mm->context.ia32_compat = TIF_IA32; + current->personality |= force_personality32; + /* Prepare the first "return" to user space */ +- current->thread.status |= TS_COMPAT; ++ current_thread_info()->status |= TS_COMPAT; + } + } + EXPORT_SYMBOL_GPL(set_personality_ia32); +--- a/arch/x86/kernel/ptrace.c ++++ b/arch/x86/kernel/ptrace.c +@@ -934,7 +934,7 @@ static int putreg32(struct task_struct * + */ + regs->orig_ax = value; + if (syscall_get_nr(child, regs) >= 0) +- child->thread.status |= TS_I386_REGS_POKED; ++ child->thread_info.status |= TS_I386_REGS_POKED; + break; + + case offsetof(struct user32, regs.eflags): +--- a/arch/x86/kernel/signal.c ++++ b/arch/x86/kernel/signal.c +@@ -785,7 +785,7 @@ static inline unsigned long get_nr_resta + * than the tracee. + */ + #ifdef CONFIG_IA32_EMULATION +- if (current->thread.status & (TS_COMPAT|TS_I386_REGS_POKED)) ++ if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED)) + return __NR_ia32_restart_syscall; + #endif + #ifdef CONFIG_X86_X32_ABI diff --git a/queue/x86-asm-use-register-variable-to-get-stack-pointer-value.patch b/queue/x86-asm-use-register-variable-to-get-stack-pointer-value.patch new file mode 100644 index 0000000..d1036be --- /dev/null +++ b/queue/x86-asm-use-register-variable-to-get-stack-pointer-value.patch @@ -0,0 +1,138 @@ +From 196bd485ee4f03ce4c690bfcf38138abfcd0a4bc Mon Sep 17 00:00:00 2001 +From: Andrey Ryabinin <aryabinin@virtuozzo.com> +Date: Fri, 29 Sep 2017 17:15:36 +0300 +Subject: x86/asm: Use register variable to get stack pointer value + +From: Andrey Ryabinin <aryabinin@virtuozzo.com> + +commit 196bd485ee4f03ce4c690bfcf38138abfcd0a4bc upstream. + +Currently we use current_stack_pointer() function to get the value +of the stack pointer register. Since commit: + + f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang") + +... we have a stack register variable declared. 
It can be used instead of +current_stack_pointer() function which allows to optimize away some +excessive "mov %rsp, %<dst>" instructions: + + -mov %rsp,%rdx + -sub %rdx,%rax + -cmp $0x3fff,%rax + -ja ffffffff810722fd <ist_begin_non_atomic+0x2d> + + +sub %rsp,%rax + +cmp $0x3fff,%rax + +ja ffffffff810722fa <ist_begin_non_atomic+0x2a> + +Remove current_stack_pointer(), rename __asm_call_sp to current_stack_pointer +and use it instead of the removed function. + +Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com> +Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/20170929141537.29167-1-aryabinin@virtuozzo.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> +[dwmw2: We want ASM_CALL_CONSTRAINT for retpoline] +Signed-off-by: David Woodhouse <dwmw@amazon.co.ku> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/asm.h | 11 +++++++++++ + arch/x86/include/asm/thread_info.h | 11 ----------- + arch/x86/kernel/irq_32.c | 6 +++--- + arch/x86/kernel/traps.c | 2 +- + arch/x86/mm/tlb.c | 2 +- + 5 files changed, 16 insertions(+), 16 deletions(-) + +--- a/arch/x86/include/asm/asm.h ++++ b/arch/x86/include/asm/asm.h +@@ -125,4 +125,15 @@ + /* For C file, we already have NOKPROBE_SYMBOL macro */ + #endif + ++#ifndef __ASSEMBLY__ ++/* ++ * This output constraint should be used for any inline asm which has a "call" ++ * instruction. Otherwise the asm may be inserted before the frame pointer ++ * gets set up by the containing function. If you forget to do this, objtool ++ * may print a "call without frame pointer save/setup" warning. ++ */ ++register unsigned long current_stack_pointer asm(_ASM_SP); ++#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) ++#endif ++ + #endif /* _ASM_X86_ASM_H */ +--- a/arch/x86/include/asm/thread_info.h ++++ b/arch/x86/include/asm/thread_info.h +@@ -152,17 +152,6 @@ struct thread_info { + */ + #ifndef __ASSEMBLY__ + +-static inline unsigned long current_stack_pointer(void) +-{ +- unsigned long sp; +-#ifdef CONFIG_X86_64 +- asm("mov %%rsp,%0" : "=g" (sp)); +-#else +- asm("mov %%esp,%0" : "=g" (sp)); +-#endif +- return sp; +-} +- + /* + * Walks up the stack frames to make sure that the specified object is + * entirely contained by a single stack frame. +--- a/arch/x86/kernel/irq_32.c ++++ b/arch/x86/kernel/irq_32.c +@@ -64,7 +64,7 @@ static void call_on_stack(void *func, vo + + static inline void *current_stack(void) + { +- return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1)); ++ return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); + } + + static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) +@@ -88,7 +88,7 @@ static inline int execute_on_irq_stack(i + + /* Save the next esp at the bottom of the stack */ + prev_esp = (u32 *)irqstk; +- *prev_esp = current_stack_pointer(); ++ *prev_esp = current_stack_pointer; + + if (unlikely(overflow)) + call_on_stack(print_stack_overflow, isp); +@@ -139,7 +139,7 @@ void do_softirq_own_stack(void) + + /* Push the previous esp onto the stack */ + prev_esp = (u32 *)irqstk; +- *prev_esp = current_stack_pointer(); ++ *prev_esp = current_stack_pointer; + + call_on_stack(__do_softirq, isp); + } +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -153,7 +153,7 @@ void ist_begin_non_atomic(struct pt_regs + * from double_fault. 
+ */ + BUG_ON((unsigned long)(current_top_of_stack() - +- current_stack_pointer()) >= THREAD_SIZE); ++ current_stack_pointer) >= THREAD_SIZE); + + preempt_enable_no_resched(); + } +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -110,7 +110,7 @@ void switch_mm_irqs_off(struct mm_struct + * mapped in the new pgd, we'll double-fault. Forcibly + * map it. + */ +- unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); ++ unsigned int stack_pgd_index = pgd_index(current_stack_pointer); + + pgd_t *pgd = next->pgd + stack_pgd_index; + diff --git a/queue/x86-boot-add-early-cmdline-parsing-for-options-with-arguments.patch b/queue/x86-boot-add-early-cmdline-parsing-for-options-with-arguments.patch new file mode 100644 index 0000000..bb7c7d3 --- /dev/null +++ b/queue/x86-boot-add-early-cmdline-parsing-for-options-with-arguments.patch @@ -0,0 +1,178 @@ +From e505371dd83963caae1a37ead9524e8d997341be Mon Sep 17 00:00:00 2001 +From: Tom Lendacky <thomas.lendacky@amd.com> +Date: Mon, 17 Jul 2017 16:10:33 -0500 +Subject: x86/boot: Add early cmdline parsing for options with arguments +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Tom Lendacky <thomas.lendacky@amd.com> + +commit e505371dd83963caae1a37ead9524e8d997341be upstream. + +Add a cmdline_find_option() function to look for cmdline options that +take arguments. The argument is returned in a supplied buffer and the +argument length (regardless of whether it fits in the supplied buffer) +is returned, with -1 indicating not found. + +Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Alexander Potapenko <glider@google.com> +Cc: Andrey Ryabinin <aryabinin@virtuozzo.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Arnd Bergmann <arnd@arndb.de> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brijesh Singh <brijesh.singh@amd.com> +Cc: Dave Young <dyoung@redhat.com> +Cc: Dmitry Vyukov <dvyukov@google.com> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> +Cc: Larry Woodman <lwoodman@redhat.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Matt Fleming <matt@codeblueprint.co.uk> +Cc: Michael S. 
Tsirkin <mst@redhat.com> +Cc: Paolo Bonzini <pbonzini@redhat.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Radim Krčmář <rkrcmar@redhat.com> +Cc: Rik van Riel <riel@redhat.com> +Cc: Toshimitsu Kani <toshi.kani@hpe.com> +Cc: kasan-dev@googlegroups.com +Cc: kvm@vger.kernel.org +Cc: linux-arch@vger.kernel.org +Cc: linux-doc@vger.kernel.org +Cc: linux-efi@vger.kernel.org +Cc: linux-mm@kvack.org +Link: http://lkml.kernel.org/r/36b5f97492a9745dce27682305f990fc20e5cf8a.1500319216.git.thomas.lendacky@amd.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/include/asm/cmdline.h | 2 + arch/x86/lib/cmdline.c | 105 +++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 107 insertions(+) + +--- a/arch/x86/include/asm/cmdline.h ++++ b/arch/x86/include/asm/cmdline.h +@@ -2,5 +2,7 @@ + #define _ASM_X86_CMDLINE_H + + int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); ++int cmdline_find_option(const char *cmdline_ptr, const char *option, ++ char *buffer, int bufsize); + + #endif /* _ASM_X86_CMDLINE_H */ +--- a/arch/x86/lib/cmdline.c ++++ b/arch/x86/lib/cmdline.c +@@ -104,7 +104,112 @@ __cmdline_find_option_bool(const char *c + return 0; /* Buffer overrun */ + } + ++/* ++ * Find a non-boolean option (i.e. option=argument). In accordance with ++ * standard Linux practice, if this option is repeated, this returns the ++ * last instance on the command line. ++ * ++ * @cmdline: the cmdline string ++ * @max_cmdline_size: the maximum size of cmdline ++ * @option: option string to look for ++ * @buffer: memory buffer to return the option argument ++ * @bufsize: size of the supplied memory buffer ++ * ++ * Returns the length of the argument (regardless of if it was ++ * truncated to fit in the buffer), or -1 on not found. ++ */ ++static int ++__cmdline_find_option(const char *cmdline, int max_cmdline_size, ++ const char *option, char *buffer, int bufsize) ++{ ++ char c; ++ int pos = 0, len = -1; ++ const char *opptr = NULL; ++ char *bufptr = buffer; ++ enum { ++ st_wordstart = 0, /* Start of word/after whitespace */ ++ st_wordcmp, /* Comparing this word */ ++ st_wordskip, /* Miscompare, skip */ ++ st_bufcpy, /* Copying this to buffer */ ++ } state = st_wordstart; ++ ++ if (!cmdline) ++ return -1; /* No command line */ ++ ++ /* ++ * This 'pos' check ensures we do not overrun ++ * a non-NULL-terminated 'cmdline' ++ */ ++ while (pos++ < max_cmdline_size) { ++ c = *(char *)cmdline++; ++ if (!c) ++ break; ++ ++ switch (state) { ++ case st_wordstart: ++ if (myisspace(c)) ++ break; ++ ++ state = st_wordcmp; ++ opptr = option; ++ /* fall through */ ++ ++ case st_wordcmp: ++ if ((c == '=') && !*opptr) { ++ /* ++ * We matched all the way to the end of the ++ * option we were looking for, prepare to ++ * copy the argument. ++ */ ++ len = 0; ++ bufptr = buffer; ++ state = st_bufcpy; ++ break; ++ } else if (c == *opptr++) { ++ /* ++ * We are currently matching, so continue ++ * to the next character on the cmdline. ++ */ ++ break; ++ } ++ state = st_wordskip; ++ /* fall through */ ++ ++ case st_wordskip: ++ if (myisspace(c)) ++ state = st_wordstart; ++ break; ++ ++ case st_bufcpy: ++ if (myisspace(c)) { ++ state = st_wordstart; ++ } else { ++ /* ++ * Increment len, but don't overrun the ++ * supplied buffer and leave room for the ++ * NULL terminator. 
++ */ ++ if (++len < bufsize) ++ *bufptr++ = c; ++ } ++ break; ++ } ++ } ++ ++ if (bufsize) ++ *bufptr = '\0'; ++ ++ return len; ++} ++ + int cmdline_find_option_bool(const char *cmdline, const char *option) + { + return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); + } ++ ++int cmdline_find_option(const char *cmdline, const char *option, char *buffer, ++ int bufsize) ++{ ++ return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, ++ buffer, bufsize); ++} diff --git a/queue/x86-bugs-drop-one-mitigation-from-dmesg.patch b/queue/x86-bugs-drop-one-mitigation-from-dmesg.patch new file mode 100644 index 0000000..d9380c1 --- /dev/null +++ b/queue/x86-bugs-drop-one-mitigation-from-dmesg.patch @@ -0,0 +1,52 @@ +From foo@baz Thu Feb 8 03:30:27 CET 2018 +From: Borislav Petkov <bp@suse.de> +Date: Fri, 26 Jan 2018 13:11:39 +0100 +Subject: x86/bugs: Drop one "mitigation" from dmesg + +From: Borislav Petkov <bp@suse.de> + +(cherry picked from commit 55fa19d3e51f33d9cd4056d25836d93abf9438db) + +Make + +[ 0.031118] Spectre V2 mitigation: Mitigation: Full generic retpoline + +into + +[ 0.031118] Spectre V2: Mitigation: Full generic retpoline + +to reduce the mitigation mitigations strings. + +Signed-off-by: Borislav Petkov <bp@suse.de> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Cc: riel@redhat.com +Cc: ak@linux.intel.com +Cc: peterz@infradead.org +Cc: David Woodhouse <dwmw2@infradead.org> +Cc: jikos@kernel.org +Cc: luto@amacapital.net +Cc: dave.hansen@intel.com +Cc: torvalds@linux-foundation.org +Cc: keescook@google.com +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: tim.c.chen@linux.intel.com +Cc: pjt@google.com +Link: https://lkml.kernel.org/r/20180126121139.31959-5-bp@alien8.de +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kernel/cpu/bugs.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -90,7 +90,7 @@ static const char *spectre_v2_strings[] + }; + + #undef pr_fmt +-#define pr_fmt(fmt) "Spectre V2 mitigation: " fmt ++#define pr_fmt(fmt) "Spectre V2 : " fmt + + static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; + static bool spectre_v2_bad_module; diff --git a/queue/x86-cpu-amd-make-lfence-a-serializing-instruction.patch b/queue/x86-cpu-amd-make-lfence-a-serializing-instruction.patch new file mode 100644 index 0000000..db8ec7a --- /dev/null +++ b/queue/x86-cpu-amd-make-lfence-a-serializing-instruction.patch @@ -0,0 +1,66 @@ +From e4d0e84e490790798691aaa0f2e598637f1867ec Mon Sep 17 00:00:00 2001 +From: Tom Lendacky <thomas.lendacky@amd.com> +Date: Mon, 8 Jan 2018 16:09:21 -0600 +Subject: x86/cpu/AMD: Make LFENCE a serializing instruction + +From: Tom Lendacky <thomas.lendacky@amd.com> + +commit e4d0e84e490790798691aaa0f2e598637f1867ec upstream. + +To aid in speculation control, make LFENCE a serializing instruction +since it has less overhead than MFENCE. This is done by setting bit 1 +of MSR 0xc0011029 (DE_CFG). Some families that support LFENCE do not +have this MSR. For these families, the LFENCE instruction is already +serializing. 
+ +Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org> +Cc: David Woodhouse <dwmw@amazon.co.uk> +Cc: Paul Turner <pjt@google.com> +Link: https://lkml.kernel.org/r/20180108220921.12580.71694.stgit@tlendack-t1.amdoffice.net +Signed-off-by: Razvan Ghitulete <rga@amazon.de> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/include/asm/msr-index.h | 2 ++ + arch/x86/kernel/cpu/amd.c | 10 ++++++++++ + 2 files changed, 12 insertions(+) + +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -330,6 +330,8 @@ + #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL + #define FAM10H_MMIO_CONF_BASE_SHIFT 20 + #define MSR_FAM10H_NODE_ID 0xc001100c ++#define MSR_F10H_DECFG 0xc0011029 ++#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 + + /* K8 MSRs */ + #define MSR_K8_TOP_MEM1 0xc001001a +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -782,6 +782,16 @@ static void init_amd(struct cpuinfo_x86 + set_cpu_cap(c, X86_FEATURE_K8); + + if (cpu_has(c, X86_FEATURE_XMM2)) { ++ /* ++ * A serializing LFENCE has less overhead than MFENCE, so ++ * use it for execution serialization. On families which ++ * don't have that MSR, LFENCE is already serializing. ++ * msr_set_bit() uses the safe accessors, too, even if the MSR ++ * is not present. ++ */ ++ msr_set_bit(MSR_F10H_DECFG, ++ MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); ++ + /* MFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + } diff --git a/queue/x86-cpu-amd-use-lfence_rdtsc-in-preference-to-mfence_rdtsc.patch b/queue/x86-cpu-amd-use-lfence_rdtsc-in-preference-to-mfence_rdtsc.patch new file mode 100644 index 0000000..8c8a0ce --- /dev/null +++ b/queue/x86-cpu-amd-use-lfence_rdtsc-in-preference-to-mfence_rdtsc.patch @@ -0,0 +1,81 @@ +From 9c6a73c75864ad9fa49e5fa6513e4c4071c0e29f Mon Sep 17 00:00:00 2001 +From: Tom Lendacky <thomas.lendacky@amd.com> +Date: Mon, 8 Jan 2018 16:09:32 -0600 +Subject: x86/cpu/AMD: Use LFENCE_RDTSC in preference to MFENCE_RDTSC + +From: Tom Lendacky <thomas.lendacky@amd.com> + +commit 9c6a73c75864ad9fa49e5fa6513e4c4071c0e29f upstream. + +With LFENCE now a serializing instruction, use LFENCE_RDTSC in preference +to MFENCE_RDTSC. However, since the kernel could be running under a +hypervisor that does not support writing that MSR, read the MSR back and +verify that the bit has been set successfully. If the MSR can be read +and the bit is set, then set the LFENCE_RDTSC feature, otherwise set the +MFENCE_RDTSC feature. 
+ +Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org> +Cc: David Woodhouse <dwmw@amazon.co.uk> +Cc: Paul Turner <pjt@google.com> +Link: https://lkml.kernel.org/r/20180108220932.12580.52458.stgit@tlendack-t1.amdoffice.net +Signed-off-by: Razvan Ghitulete <rga@amazon.de> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/include/asm/msr-index.h | 1 + + arch/x86/kernel/cpu/amd.c | 18 ++++++++++++++++-- + 2 files changed, 17 insertions(+), 2 deletions(-) + +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -332,6 +332,7 @@ + #define MSR_FAM10H_NODE_ID 0xc001100c + #define MSR_F10H_DECFG 0xc0011029 + #define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 ++#define MSR_F10H_DECFG_LFENCE_SERIALIZE BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT) + + /* K8 MSRs */ + #define MSR_K8_TOP_MEM1 0xc001001a +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -782,6 +782,9 @@ static void init_amd(struct cpuinfo_x86 + set_cpu_cap(c, X86_FEATURE_K8); + + if (cpu_has(c, X86_FEATURE_XMM2)) { ++ unsigned long long val; ++ int ret; ++ + /* + * A serializing LFENCE has less overhead than MFENCE, so + * use it for execution serialization. On families which +@@ -792,8 +795,19 @@ static void init_amd(struct cpuinfo_x86 + msr_set_bit(MSR_F10H_DECFG, + MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); + +- /* MFENCE stops RDTSC speculation */ +- set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); ++ /* ++ * Verify that the MSR write was successful (could be running ++ * under a hypervisor) and only then assume that LFENCE is ++ * serializing. ++ */ ++ ret = rdmsrl_safe(MSR_F10H_DECFG, &val); ++ if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) { ++ /* A serializing LFENCE stops RDTSC speculation */ ++ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); ++ } else { ++ /* MFENCE stops RDTSC speculation */ ++ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); ++ } + } + + /* diff --git a/queue/x86-cpu-bugs-make-retpoline-module-warning-conditional.patch b/queue/x86-cpu-bugs-make-retpoline-module-warning-conditional.patch new file mode 100644 index 0000000..f8a1204 --- /dev/null +++ b/queue/x86-cpu-bugs-make-retpoline-module-warning-conditional.patch @@ -0,0 +1,66 @@ +From foo@baz Thu Feb 8 03:30:27 CET 2018 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Sat, 27 Jan 2018 15:45:14 +0100 +Subject: x86/cpu/bugs: Make retpoline module warning conditional + +From: Thomas Gleixner <tglx@linutronix.de> + +(cherry picked from commit e383095c7fe8d218e00ec0f83e4b95ed4e627b02) + +If sysfs is disabled and RETPOLINE not defined: + +arch/x86/kernel/cpu/bugs.c:97:13: warning: ‘spectre_v2_bad_module’ defined but not used +[-Wunused-variable] + static bool spectre_v2_bad_module; + +Hide it. 
+ +Fixes: caf7501a1b4e ("module/retpoline: Warn about missing retpoline in module") +Reported-by: Borislav Petkov <bp@alien8.de> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: David Woodhouse <dwmw2@infradead.org> +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kernel/cpu/bugs.c | 14 +++++++++++--- + 1 file changed, 11 insertions(+), 3 deletions(-) + +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -93,9 +93,10 @@ static const char *spectre_v2_strings[] + #define pr_fmt(fmt) "Spectre V2 : " fmt + + static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; +-static bool spectre_v2_bad_module; + + #ifdef RETPOLINE ++static bool spectre_v2_bad_module; ++ + bool retpoline_module_ok(bool has_retpoline) + { + if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline) +@@ -105,6 +106,13 @@ bool retpoline_module_ok(bool has_retpol + spectre_v2_bad_module = true; + return false; + } ++ ++static inline const char *spectre_v2_module_string(void) ++{ ++ return spectre_v2_bad_module ? " - vulnerable module loaded" : ""; ++} ++#else ++static inline const char *spectre_v2_module_string(void) { return ""; } + #endif + + static void __init spec2_print_if_insecure(const char *reason) +@@ -299,7 +307,7 @@ ssize_t cpu_show_spectre_v2(struct devic + return sprintf(buf, "Not affected\n"); + + return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], +- boot_cpu_has(X86_FEATURE_IBPB) ? ", IPBP" : "", +- spectre_v2_bad_module ? " - vulnerable module loaded" : ""); ++ boot_cpu_has(X86_FEATURE_IBPB) ? ", IBPB" : "", ++ spectre_v2_module_string()); + } + #endif diff --git a/queue/x86-cpu-factor-out-application-of-forced-cpu-caps.patch b/queue/x86-cpu-factor-out-application-of-forced-cpu-caps.patch new file mode 100644 index 0000000..8c79c8f --- /dev/null +++ b/queue/x86-cpu-factor-out-application-of-forced-cpu-caps.patch @@ -0,0 +1,79 @@ +From 8bf1ebca215c262e48c15a4a15f175991776f57f Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Wed, 18 Jan 2017 11:15:38 -0800 +Subject: x86/cpu: Factor out application of forced CPU caps + +From: Andy Lutomirski <luto@kernel.org> + +commit 8bf1ebca215c262e48c15a4a15f175991776f57f upstream. + +There are multiple call sites that apply forced CPU caps. Factor +them into a helper. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: Fenghua Yu <fenghua.yu@intel.com> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Matthew Whitehead <tedheadster@gmail.com> +Cc: Oleg Nesterov <oleg@redhat.com> +Cc: One Thousand Gnomes <gnomes@lxorguk.ukuu.org.uk> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Thomas Gleixner <tglx@linutronix.de> +Cc: Yu-cheng Yu <yu-cheng.yu@intel.com> +Link: http://lkml.kernel.org/r/623ff7555488122143e4417de09b18be2085ad06.1484705016.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/kernel/cpu/common.c | 20 ++++++++++++-------- + 1 file changed, 12 insertions(+), 8 deletions(-) + +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -706,6 +706,16 @@ void cpu_detect(struct cpuinfo_x86 *c) + } + } + ++static void apply_forced_caps(struct cpuinfo_x86 *c) ++{ ++ int i; ++ ++ for (i = 0; i < NCAPINTS; i++) { ++ c->x86_capability[i] &= ~cpu_caps_cleared[i]; ++ c->x86_capability[i] |= cpu_caps_set[i]; ++ } ++} ++ + void get_cpu_cap(struct cpuinfo_x86 *c) + { + u32 eax, ebx, ecx, edx; +@@ -1086,10 +1096,7 @@ static void identify_cpu(struct cpuinfo_ + this_cpu->c_identify(c); + + /* Clear/Set all flags overridden by options, after probe */ +- for (i = 0; i < NCAPINTS; i++) { +- c->x86_capability[i] &= ~cpu_caps_cleared[i]; +- c->x86_capability[i] |= cpu_caps_set[i]; +- } ++ apply_forced_caps(c); + + #ifdef CONFIG_X86_64 + c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); +@@ -1151,10 +1158,7 @@ static void identify_cpu(struct cpuinfo_ + * Clear/Set all flags overridden by options, need do it + * before following smp all cpus cap AND. + */ +- for (i = 0; i < NCAPINTS; i++) { +- c->x86_capability[i] &= ~cpu_caps_cleared[i]; +- c->x86_capability[i] |= cpu_caps_set[i]; +- } ++ apply_forced_caps(c); + + /* + * On SMP, boot_cpu_data holds the common feature set between diff --git a/queue/x86-cpu-implement-cpu-vulnerabilites-sysfs-functions.patch b/queue/x86-cpu-implement-cpu-vulnerabilites-sysfs-functions.patch new file mode 100644 index 0000000..3d30088 --- /dev/null +++ b/queue/x86-cpu-implement-cpu-vulnerabilites-sysfs-functions.patch @@ -0,0 +1,82 @@ +From 61dc0f555b5c761cdafb0ba5bd41ecf22d68a4c4 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Sun, 7 Jan 2018 22:48:01 +0100 +Subject: x86/cpu: Implement CPU vulnerabilites sysfs functions + +From: Thomas Gleixner <tglx@linutronix.de> + +commit 61dc0f555b5c761cdafb0ba5bd41ecf22d68a4c4 upstream. + +Implement the CPU vulnerabilty show functions for meltdown, spectre_v1 and +spectre_v2. 
+ +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Will Deacon <will.deacon@arm.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linuxfoundation.org> +Cc: Borislav Petkov <bp@alien8.de> +Cc: David Woodhouse <dwmw@amazon.co.uk> +Link: https://lkml.kernel.org/r/20180107214913.177414879@linutronix.de +Signed-off-by: Razvan Ghitulete <rga@amazon.de> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/Kconfig | 1 + + arch/x86/kernel/cpu/bugs.c | 29 +++++++++++++++++++++++++++++ + 2 files changed, 30 insertions(+) + +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -64,6 +64,7 @@ config X86 + select GENERIC_CLOCKEVENTS_MIN_ADJUST + select GENERIC_CMOS_UPDATE + select GENERIC_CPU_AUTOPROBE ++ select GENERIC_CPU_VULNERABILITIES + select GENERIC_EARLY_IOREMAP + select GENERIC_FIND_FIRST_BIT + select GENERIC_IOMAP +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -9,6 +9,7 @@ + */ + #include <linux/init.h> + #include <linux/utsname.h> ++#include <linux/cpu.h> + #include <asm/bugs.h> + #include <asm/processor.h> + #include <asm/processor-flags.h> +@@ -67,3 +68,31 @@ void __init check_bugs(void) + set_memory_4k((unsigned long)__va(0), 1); + #endif + } ++ ++#ifdef CONFIG_SYSFS ++ssize_t cpu_show_meltdown(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) ++ return sprintf(buf, "Not affected\n"); ++ if (boot_cpu_has(X86_FEATURE_KAISER)) ++ return sprintf(buf, "Mitigation: PTI\n"); ++ return sprintf(buf, "Vulnerable\n"); ++} ++ ++ssize_t cpu_show_spectre_v1(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1)) ++ return sprintf(buf, "Not affected\n"); ++ return sprintf(buf, "Vulnerable\n"); ++} ++ ++ssize_t cpu_show_spectre_v2(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) ++ return sprintf(buf, "Not affected\n"); ++ return sprintf(buf, "Vulnerable\n"); ++} ++#endif diff --git a/queue/x86-cpu-merge-bugs.c-and-bugs_64.c.patch b/queue/x86-cpu-merge-bugs.c-and-bugs_64.c.patch new file mode 100644 index 0000000..cceebca --- /dev/null +++ b/queue/x86-cpu-merge-bugs.c-and-bugs_64.c.patch @@ -0,0 +1,136 @@ +From 62a67e123e058a67db58bc6a14354dd037bafd0a Mon Sep 17 00:00:00 2001 +From: Borislav Petkov <bp@suse.de> +Date: Mon, 24 Oct 2016 19:38:43 +0200 +Subject: x86/cpu: Merge bugs.c and bugs_64.c + +From: Borislav Petkov <bp@suse.de> + +commit 62a67e123e058a67db58bc6a14354dd037bafd0a upstream. + +Should be easier when following boot paths. It probably is a left over +from the x86 unification eons ago. + +No functionality change. + +Signed-off-by: Borislav Petkov <bp@suse.de> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/20161024173844.23038-3-bp@alien8.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Signed-off-by: Razvan Ghitulete <rga@amazon.de> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kernel/cpu/Makefile | 4 +--- + arch/x86/kernel/cpu/bugs.c | 26 ++++++++++++++++++++++---- + arch/x86/kernel/cpu/bugs_64.c | 33 --------------------------------- + 3 files changed, 23 insertions(+), 40 deletions(-) + delete mode 100644 arch/x86/kernel/cpu/bugs_64.c + +--- a/arch/x86/kernel/cpu/Makefile ++++ b/arch/x86/kernel/cpu/Makefile +@@ -20,13 +20,11 @@ obj-y := intel_cacheinfo.o scattered.o + obj-y += common.o + obj-y += rdrand.o + obj-y += match.o ++obj-y += bugs.o + + obj-$(CONFIG_PROC_FS) += proc.o + obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o + +-obj-$(CONFIG_X86_32) += bugs.o +-obj-$(CONFIG_X86_64) += bugs_64.o +- + obj-$(CONFIG_CPU_SUP_INTEL) += intel.o + obj-$(CONFIG_CPU_SUP_AMD) += amd.o + obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -16,6 +16,8 @@ + #include <asm/msr.h> + #include <asm/paravirt.h> + #include <asm/alternative.h> ++#include <asm/pgtable.h> ++#include <asm/cacheflush.h> + + void __init check_bugs(void) + { +@@ -28,11 +30,13 @@ void __init check_bugs(void) + #endif + + identify_boot_cpu(); +-#ifndef CONFIG_SMP +- pr_info("CPU: "); +- print_cpu_info(&boot_cpu_data); +-#endif + ++ if (!IS_ENABLED(CONFIG_SMP)) { ++ pr_info("CPU: "); ++ print_cpu_info(&boot_cpu_data); ++ } ++ ++#ifdef CONFIG_X86_32 + /* + * Check whether we are able to run this kernel safely on SMP. + * +@@ -48,4 +52,18 @@ void __init check_bugs(void) + alternative_instructions(); + + fpu__init_check_bugs(); ++#else /* CONFIG_X86_64 */ ++ alternative_instructions(); ++ ++ /* ++ * Make sure the first 2MB area is not mapped by huge pages ++ * There are typically fixed size MTRRs in there and overlapping ++ * MTRRs into large pages causes slow downs. ++ * ++ * Right now we don't do that with gbpages because there seems ++ * very little benefit for that case. ++ */ ++ if (!direct_gbpages) ++ set_memory_4k((unsigned long)__va(0), 1); ++#endif + } +--- a/arch/x86/kernel/cpu/bugs_64.c ++++ /dev/null +@@ -1,33 +0,0 @@ +-/* +- * Copyright (C) 1994 Linus Torvalds +- * Copyright (C) 2000 SuSE +- */ +- +-#include <linux/kernel.h> +-#include <linux/init.h> +-#include <asm/alternative.h> +-#include <asm/bugs.h> +-#include <asm/processor.h> +-#include <asm/mtrr.h> +-#include <asm/cacheflush.h> +- +-void __init check_bugs(void) +-{ +- identify_boot_cpu(); +-#if !defined(CONFIG_SMP) +- pr_info("CPU: "); +- print_cpu_info(&boot_cpu_data); +-#endif +- alternative_instructions(); +- +- /* +- * Make sure the first 2MB area is not mapped by huge pages +- * There are typically fixed size MTRRs in there and overlapping +- * MTRRs into large pages causes slow downs. +- * +- * Right now we don't do that with gbpages because there seems +- * very little benefit for that case. 
+- */ +- if (!direct_gbpages) +- set_memory_4k((unsigned long)__va(0), 1); +-} diff --git a/queue/x86-cpu-x86-pti-do-not-enable-pti-on-amd-processors.patch b/queue/x86-cpu-x86-pti-do-not-enable-pti-on-amd-processors.patch new file mode 100644 index 0000000..7a7cbec --- /dev/null +++ b/queue/x86-cpu-x86-pti-do-not-enable-pti-on-amd-processors.patch @@ -0,0 +1,46 @@ +From 694d99d40972f12e59a3696effee8a376b79d7c8 Mon Sep 17 00:00:00 2001 +From: Tom Lendacky <thomas.lendacky@amd.com> +Date: Tue, 26 Dec 2017 23:43:54 -0600 +Subject: x86/cpu, x86/pti: Do not enable PTI on AMD processors + +From: Tom Lendacky <thomas.lendacky@amd.com> + +commit 694d99d40972f12e59a3696effee8a376b79d7c8 upstream. + +AMD processors are not subject to the types of attacks that the kernel +page table isolation feature protects against. The AMD microarchitecture +does not allow memory references, including speculative references, that +access higher privileged data when running in a lesser privileged mode +when that access would result in a page fault. + +Disable page table isolation by default on AMD processors by not setting +the X86_BUG_CPU_INSECURE feature, which controls whether X86_FEATURE_PTI +is set. + +Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/20171227054354.20369.94587.stgit@tlendack-t1.amdoffice.net +Cc: Nick Lowe <nick.lowe@gmail.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/kernel/cpu/common.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -883,8 +883,8 @@ static void __init early_identify_cpu(st + + setup_force_cpu_cap(X86_FEATURE_ALWAYS); + +- /* Assume for now that ALL x86 CPUs are insecure */ +- setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); ++ if (c->x86_vendor != X86_VENDOR_AMD) ++ setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + + setup_force_cpu_bug(X86_BUG_SPECTRE_V1); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); diff --git a/queue/x86-cpufeature-blacklist-spec_ctrl-pred_cmd-on-early-spectre-v2-microcodes.patch b/queue/x86-cpufeature-blacklist-spec_ctrl-pred_cmd-on-early-spectre-v2-microcodes.patch new file mode 100644 index 0000000..6397ee8 --- /dev/null +++ b/queue/x86-cpufeature-blacklist-spec_ctrl-pred_cmd-on-early-spectre-v2-microcodes.patch @@ -0,0 +1,167 @@ +From foo@baz Wed Feb 7 19:38:23 CST 2018 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Thu, 25 Jan 2018 16:14:14 +0000 +Subject: x86/cpufeature: Blacklist SPEC_CTRL/PRED_CMD on early Spectre v2 microcodes + +From: David Woodhouse <dwmw@amazon.co.uk> + +(cherry picked from commit a5b2966364538a0e68c9fa29bc0a3a1651799035) + +This doesn't refuse to load the affected microcodes; it just refuses to +use the Spectre v2 mitigation features if they're detected, by clearing +the appropriate feature bits. + +The AMD CPUID bits are handled here too, because hypervisors *may* have +been exposing those bits even on Intel chips, for fine-grained control +of what's available. + +It is non-trivial to use x86_match_cpu() for this table because that +doesn't handle steppings. And the approach taken in commit bd9240a18 +almost made me lose my lunch. 
+ +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: ak@linux.intel.com +Cc: ashok.raj@intel.com +Cc: dave.hansen@intel.com +Cc: karahmed@amazon.de +Cc: arjan@linux.intel.com +Cc: torvalds@linux-foundation.org +Cc: peterz@infradead.org +Cc: bp@alien8.de +Cc: pbonzini@redhat.com +Cc: tim.c.chen@linux.intel.com +Cc: gregkh@linux-foundation.org +Link: https://lkml.kernel.org/r/1516896855-7642-7-git-send-email-dwmw@amazon.co.uk +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/intel-family.h | 7 ++- + arch/x86/kernel/cpu/intel.c | 66 ++++++++++++++++++++++++++++++++++++ + 2 files changed, 71 insertions(+), 2 deletions(-) + +--- a/arch/x86/include/asm/intel-family.h ++++ b/arch/x86/include/asm/intel-family.h +@@ -12,6 +12,7 @@ + */ + + #define INTEL_FAM6_CORE_YONAH 0x0E ++ + #define INTEL_FAM6_CORE2_MEROM 0x0F + #define INTEL_FAM6_CORE2_MEROM_L 0x16 + #define INTEL_FAM6_CORE2_PENRYN 0x17 +@@ -21,6 +22,7 @@ + #define INTEL_FAM6_NEHALEM_G 0x1F /* Auburndale / Havendale */ + #define INTEL_FAM6_NEHALEM_EP 0x1A + #define INTEL_FAM6_NEHALEM_EX 0x2E ++ + #define INTEL_FAM6_WESTMERE 0x25 + #define INTEL_FAM6_WESTMERE_EP 0x2C + #define INTEL_FAM6_WESTMERE_EX 0x2F +@@ -36,9 +38,9 @@ + #define INTEL_FAM6_HASWELL_GT3E 0x46 + + #define INTEL_FAM6_BROADWELL_CORE 0x3D +-#define INTEL_FAM6_BROADWELL_XEON_D 0x56 + #define INTEL_FAM6_BROADWELL_GT3E 0x47 + #define INTEL_FAM6_BROADWELL_X 0x4F ++#define INTEL_FAM6_BROADWELL_XEON_D 0x56 + + #define INTEL_FAM6_SKYLAKE_MOBILE 0x4E + #define INTEL_FAM6_SKYLAKE_DESKTOP 0x5E +@@ -57,9 +59,10 @@ + #define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */ + #define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */ + #define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */ +-#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Annidale */ ++#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Anniedale */ + #define INTEL_FAM6_ATOM_GOLDMONT 0x5C + #define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */ ++#define INTEL_FAM6_ATOM_GEMINI_LAKE 0x7A + + /* Xeon Phi */ + +--- a/arch/x86/kernel/cpu/intel.c ++++ b/arch/x86/kernel/cpu/intel.c +@@ -61,6 +61,59 @@ void check_mpx_erratum(struct cpuinfo_x8 + } + } + ++/* ++ * Early microcode releases for the Spectre v2 mitigation were broken. 
++ * Information taken from; ++ * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/01/microcode-update-guidance.pdf ++ * - https://kb.vmware.com/s/article/52345 ++ * - Microcode revisions observed in the wild ++ * - Release note from 20180108 microcode release ++ */ ++struct sku_microcode { ++ u8 model; ++ u8 stepping; ++ u32 microcode; ++}; ++static const struct sku_microcode spectre_bad_microcodes[] = { ++ { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0B, 0x84 }, ++ { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0A, 0x84 }, ++ { INTEL_FAM6_KABYLAKE_DESKTOP, 0x09, 0x84 }, ++ { INTEL_FAM6_KABYLAKE_MOBILE, 0x0A, 0x84 }, ++ { INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x84 }, ++ { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e }, ++ { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c }, ++ { INTEL_FAM6_SKYLAKE_MOBILE, 0x03, 0xc2 }, ++ { INTEL_FAM6_SKYLAKE_DESKTOP, 0x03, 0xc2 }, ++ { INTEL_FAM6_BROADWELL_CORE, 0x04, 0x28 }, ++ { INTEL_FAM6_BROADWELL_GT3E, 0x01, 0x1b }, ++ { INTEL_FAM6_BROADWELL_XEON_D, 0x02, 0x14 }, ++ { INTEL_FAM6_BROADWELL_XEON_D, 0x03, 0x07000011 }, ++ { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 }, ++ { INTEL_FAM6_HASWELL_ULT, 0x01, 0x21 }, ++ { INTEL_FAM6_HASWELL_GT3E, 0x01, 0x18 }, ++ { INTEL_FAM6_HASWELL_CORE, 0x03, 0x23 }, ++ { INTEL_FAM6_HASWELL_X, 0x02, 0x3b }, ++ { INTEL_FAM6_HASWELL_X, 0x04, 0x10 }, ++ { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a }, ++ /* Updated in the 20180108 release; blacklist until we know otherwise */ ++ { INTEL_FAM6_ATOM_GEMINI_LAKE, 0x01, 0x22 }, ++ /* Observed in the wild */ ++ { INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b }, ++ { INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 }, ++}; ++ ++static bool bad_spectre_microcode(struct cpuinfo_x86 *c) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) { ++ if (c->x86_model == spectre_bad_microcodes[i].model && ++ c->x86_mask == spectre_bad_microcodes[i].stepping) ++ return (c->microcode <= spectre_bad_microcodes[i].microcode); ++ } ++ return false; ++} ++ + static void early_init_intel(struct cpuinfo_x86 *c) + { + u64 misc_enable; +@@ -87,6 +140,19 @@ static void early_init_intel(struct cpui + rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode); + } + ++ if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) || ++ cpu_has(c, X86_FEATURE_STIBP) || ++ cpu_has(c, X86_FEATURE_AMD_SPEC_CTRL) || ++ cpu_has(c, X86_FEATURE_AMD_PRED_CMD) || ++ cpu_has(c, X86_FEATURE_AMD_STIBP)) && bad_spectre_microcode(c)) { ++ pr_warn("Intel Spectre v2 broken microcode detected; disabling SPEC_CTRL\n"); ++ clear_cpu_cap(c, X86_FEATURE_SPEC_CTRL); ++ clear_cpu_cap(c, X86_FEATURE_STIBP); ++ clear_cpu_cap(c, X86_FEATURE_AMD_SPEC_CTRL); ++ clear_cpu_cap(c, X86_FEATURE_AMD_PRED_CMD); ++ clear_cpu_cap(c, X86_FEATURE_AMD_STIBP); ++ } ++ + /* + * Atom erratum AAE44/AAF40/AAG38/AAH41: + * diff --git a/queue/x86-cpufeature-move-processor-tracing-out-of-scattered-features.patch b/queue/x86-cpufeature-move-processor-tracing-out-of-scattered-features.patch new file mode 100644 index 0000000..37baee4 --- /dev/null +++ b/queue/x86-cpufeature-move-processor-tracing-out-of-scattered-features.patch @@ -0,0 +1,68 @@ +From 4fdec2034b7540dda461c6ba33325dfcff345c64 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini <pbonzini@redhat.com> +Date: Tue, 16 Jan 2018 16:42:25 +0100 +Subject: x86/cpufeature: Move processor tracing out of scattered features +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Paolo Bonzini <pbonzini@redhat.com> + +commit 4fdec2034b7540dda461c6ba33325dfcff345c64 upstream. 
+ +Processor tracing is already enumerated in word 9 (CPUID[7,0].EBX), +so do not duplicate it in the scattered features word. + +Besides being more tidy, this will be useful for KVM when it presents +processor tracing to the guests. KVM selects host features that are +supported by both the host kernel (depending on command line options, +CPU errata, or whatever) and KVM. Whenever a full feature word exists, +KVM's code is written in the expectation that the CPUID bit number +matches the X86_FEATURE_* bit number, but this is not the case for +X86_FEATURE_INTEL_PT. + +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +Cc: Borislav Petkov <bp@suse.de> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Luwei Kang <luwei.kang@intel.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Radim Krčmář <rkrcmar@redhat.com> +Cc: Thomas Gleixner <tglx@linutronix.de> +Cc: kvm@vger.kernel.org +Link: http://lkml.kernel.org/r/1516117345-34561-1-git-send-email-pbonzini@redhat.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/include/asm/cpufeatures.h | 2 +- + arch/x86/kernel/cpu/scattered.c | 1 - + 2 files changed, 1 insertion(+), 2 deletions(-) + +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -197,7 +197,6 @@ + #define X86_FEATURE_RETPOLINE ( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */ + #define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */ + +-#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ + #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ + #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ + #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ +@@ -236,6 +235,7 @@ + #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ + #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ + #define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ ++#define X86_FEATURE_INTEL_PT ( 9*32+25) /* Intel Processor Trace */ + #define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ + #define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ + #define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ +--- a/arch/x86/kernel/cpu/scattered.c ++++ b/arch/x86/kernel/cpu/scattered.c +@@ -31,7 +31,6 @@ void init_scattered_cpuid_features(struc + const struct cpuid_bit *cb; + + static const struct cpuid_bit cpuid_bits[] = { +- { X86_FEATURE_INTEL_PT, CR_EBX,25, 0x00000007, 0 }, + { X86_FEATURE_AVX512_4VNNIW, CR_EDX, 2, 0x00000007, 0 }, + { X86_FEATURE_AVX512_4FMAPS, CR_EDX, 3, 0x00000007, 0 }, + { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, diff --git a/queue/x86-cpufeatures-add-amd-feature-bits-for-speculation-control.patch b/queue/x86-cpufeatures-add-amd-feature-bits-for-speculation-control.patch new file mode 100644 index 0000000..7b48105 --- /dev/null +++ b/queue/x86-cpufeatures-add-amd-feature-bits-for-speculation-control.patch @@ -0,0 +1,47 @@ +From foo@baz Wed Feb 7 19:38:23 CST 2018 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Thu, 25 Jan 2018 16:14:11 +0000 +Subject: x86/cpufeatures: Add AMD feature bits for Speculation Control + +From: David Woodhouse <dwmw@amazon.co.uk> + +(cherry picked from commit 5d10cbc91d9eb5537998b65608441b592eec65e7) + +AMD exposes the PRED_CMD/SPEC_CTRL MSRs slightly differently to 
Intel. +See http://lkml.kernel.org/r/2b3e25cc-286d-8bd0-aeaf-9ac4aae39de8@amd.com + +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Cc: Tom Lendacky <thomas.lendacky@amd.com> +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: ak@linux.intel.com +Cc: ashok.raj@intel.com +Cc: dave.hansen@intel.com +Cc: karahmed@amazon.de +Cc: arjan@linux.intel.com +Cc: torvalds@linux-foundation.org +Cc: peterz@infradead.org +Cc: bp@alien8.de +Cc: pbonzini@redhat.com +Cc: tim.c.chen@linux.intel.com +Cc: gregkh@linux-foundation.org +Link: https://lkml.kernel.org/r/1516896855-7642-4-git-send-email-dwmw@amazon.co.uk +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/cpufeatures.h | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -258,6 +258,9 @@ + /* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ + #define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ + #define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ ++#define X86_FEATURE_AMD_PRED_CMD (13*32+12) /* Prediction Command MSR (AMD) */ ++#define X86_FEATURE_AMD_SPEC_CTRL (13*32+14) /* Speculation Control MSR only (AMD) */ ++#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors (AMD) */ + + /* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ + #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ diff --git a/queue/x86-cpufeatures-add-cpuid_7_edx-cpuid-leaf.patch b/queue/x86-cpufeatures-add-cpuid_7_edx-cpuid-leaf.patch new file mode 100644 index 0000000..a47b816 --- /dev/null +++ b/queue/x86-cpufeatures-add-cpuid_7_edx-cpuid-leaf.patch @@ -0,0 +1,149 @@ +From foo@baz Wed Feb 7 19:38:23 CST 2018 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Thu, 25 Jan 2018 16:14:09 +0000 +Subject: x86/cpufeatures: Add CPUID_7_EDX CPUID leaf + +From: David Woodhouse <dwmw@amazon.co.uk> + +(cherry picked from commit 95ca0ee8636059ea2800dfbac9ecac6212d6b38f) + +This is a pure feature bits leaf. There are two AVX512 feature bits in it +already which were handled as scattered bits, and three more from this leaf +are going to be added for speculation control features. 
+ +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: ak@linux.intel.com +Cc: ashok.raj@intel.com +Cc: dave.hansen@intel.com +Cc: karahmed@amazon.de +Cc: arjan@linux.intel.com +Cc: torvalds@linux-foundation.org +Cc: peterz@infradead.org +Cc: bp@alien8.de +Cc: pbonzini@redhat.com +Cc: tim.c.chen@linux.intel.com +Cc: gregkh@linux-foundation.org +Link: https://lkml.kernel.org/r/1516896855-7642-2-git-send-email-dwmw@amazon.co.uk +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/cpufeature.h | 7 +++++-- + arch/x86/include/asm/cpufeatures.h | 10 ++++++---- + arch/x86/include/asm/disabled-features.h | 3 ++- + arch/x86/include/asm/required-features.h | 3 ++- + arch/x86/kernel/cpu/common.c | 1 + + arch/x86/kernel/cpu/scattered.c | 2 -- + 6 files changed, 16 insertions(+), 10 deletions(-) + +--- a/arch/x86/include/asm/cpufeature.h ++++ b/arch/x86/include/asm/cpufeature.h +@@ -28,6 +28,7 @@ enum cpuid_leafs + CPUID_8000_000A_EDX, + CPUID_7_ECX, + CPUID_8000_0007_EBX, ++ CPUID_7_EDX, + }; + + #ifdef CONFIG_X86_FEATURE_NAMES +@@ -78,8 +79,9 @@ extern const char * const x86_bug_flags[ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \ ++ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \ + REQUIRED_MASK_CHECK || \ +- BUILD_BUG_ON_ZERO(NCAPINTS != 18)) ++ BUILD_BUG_ON_ZERO(NCAPINTS != 19)) + + #define DISABLED_MASK_BIT_SET(feature_bit) \ + ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \ +@@ -100,8 +102,9 @@ extern const char * const x86_bug_flags[ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \ ++ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \ + DISABLED_MASK_CHECK || \ +- BUILD_BUG_ON_ZERO(NCAPINTS != 18)) ++ BUILD_BUG_ON_ZERO(NCAPINTS != 19)) + + #define cpu_has(c, bit) \ + (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -12,7 +12,7 @@ + /* + * Defines x86 CPU feature bits + */ +-#define NCAPINTS 18 /* N 32-bit words worth of info */ ++#define NCAPINTS 19 /* N 32-bit words worth of info */ + #define NBUGINTS 1 /* N 32-bit bug flags */ + + /* +@@ -197,9 +197,7 @@ + #define X86_FEATURE_RETPOLINE ( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */ + #define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */ + +-#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ +-#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ +-#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ ++#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ + + /* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... 
*/ + #define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ +@@ -295,6 +293,10 @@ + #define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */ + #define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */ + ++/* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ ++#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ ++#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ ++ + /* + * BUG word(s) + */ +--- a/arch/x86/include/asm/disabled-features.h ++++ b/arch/x86/include/asm/disabled-features.h +@@ -59,6 +59,7 @@ + #define DISABLED_MASK15 0 + #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE) + #define DISABLED_MASK17 0 +-#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) ++#define DISABLED_MASK18 0 ++#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) + + #endif /* _ASM_X86_DISABLED_FEATURES_H */ +--- a/arch/x86/include/asm/required-features.h ++++ b/arch/x86/include/asm/required-features.h +@@ -100,6 +100,7 @@ + #define REQUIRED_MASK15 0 + #define REQUIRED_MASK16 0 + #define REQUIRED_MASK17 0 +-#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) ++#define REQUIRED_MASK18 0 ++#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) + + #endif /* _ASM_X86_REQUIRED_FEATURES_H */ +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -737,6 +737,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) + cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); + c->x86_capability[CPUID_7_0_EBX] = ebx; + c->x86_capability[CPUID_7_ECX] = ecx; ++ c->x86_capability[CPUID_7_EDX] = edx; + } + + /* Extended state features: level 0x0000000d */ +--- a/arch/x86/kernel/cpu/scattered.c ++++ b/arch/x86/kernel/cpu/scattered.c +@@ -31,8 +31,6 @@ void init_scattered_cpuid_features(struc + const struct cpuid_bit *cb; + + static const struct cpuid_bit cpuid_bits[] = { +- { X86_FEATURE_AVX512_4VNNIW, CR_EDX, 2, 0x00000007, 0 }, +- { X86_FEATURE_AVX512_4FMAPS, CR_EDX, 3, 0x00000007, 0 }, + { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, diff --git a/queue/x86-cpufeatures-add-intel-feature-bits-for-speculation-control.patch b/queue/x86-cpufeatures-add-intel-feature-bits-for-speculation-control.patch new file mode 100644 index 0000000..568f3bf --- /dev/null +++ b/queue/x86-cpufeatures-add-intel-feature-bits-for-speculation-control.patch @@ -0,0 +1,47 @@ +From foo@baz Wed Feb 7 19:38:23 CST 2018 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Thu, 25 Jan 2018 16:14:10 +0000 +Subject: x86/cpufeatures: Add Intel feature bits for Speculation Control + +From: David Woodhouse <dwmw@amazon.co.uk> + +(cherry picked from commit fc67dd70adb711a45d2ef34e12d1a8be75edde61) + +Add three feature bits exposed by new microcode on Intel CPUs for +speculation control. 
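
The new bits land in word 18, i.e. CPUID.(EAX=7,ECX=0):EDX, at bits 26 (SPEC_CTRL), 27 (STIBP) and 29 (ARCH_CAPABILITIES), as the hunk below shows. Purely as an illustration and not part of the patch, the same leaf can be queried from userspace with GCC's <cpuid.h>:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Make sure leaf 7 exists before asking for it. */
	if (__get_cpuid_max(0, NULL) < 7) {
		puts("CPUID leaf 7 not supported");
		return 1;
	}
	__cpuid_count(7, 0, eax, ebx, ecx, edx);

	/* Bit numbers match the X86_FEATURE_* definitions added here. */
	printf("SPEC_CTRL (IBRS+IBPB) : %s\n", (edx >> 26) & 1 ? "yes" : "no");
	printf("STIBP                 : %s\n", (edx >> 27) & 1 ? "yes" : "no");
	printf("ARCH_CAPABILITIES MSR : %s\n", (edx >> 29) & 1 ? "yes" : "no");
	return 0;
}
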
+ +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: ak@linux.intel.com +Cc: ashok.raj@intel.com +Cc: dave.hansen@intel.com +Cc: karahmed@amazon.de +Cc: arjan@linux.intel.com +Cc: torvalds@linux-foundation.org +Cc: peterz@infradead.org +Cc: bp@alien8.de +Cc: pbonzini@redhat.com +Cc: tim.c.chen@linux.intel.com +Cc: gregkh@linux-foundation.org +Link: https://lkml.kernel.org/r/1516896855-7642-3-git-send-email-dwmw@amazon.co.uk +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/cpufeatures.h | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -296,6 +296,9 @@ + /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ + #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ + #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ ++#define X86_FEATURE_SPEC_CTRL (18*32+26) /* Speculation Control (IBRS + IBPB) */ ++#define X86_FEATURE_STIBP (18*32+27) /* Single Thread Indirect Branch Predictors */ ++#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ + + /* + * BUG word(s) diff --git a/queue/x86-cpufeatures-add-x86_bug_cpu_insecure.patch b/queue/x86-cpufeatures-add-x86_bug_cpu_insecure.patch new file mode 100644 index 0000000..0f6fc2c --- /dev/null +++ b/queue/x86-cpufeatures-add-x86_bug_cpu_insecure.patch @@ -0,0 +1,73 @@ +From a89f040fa34ec9cd682aed98b8f04e3c47d998bd Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 4 Dec 2017 15:07:33 +0100 +Subject: x86/cpufeatures: Add X86_BUG_CPU_INSECURE + +From: Thomas Gleixner <tglx@linutronix.de> + +commit a89f040fa34ec9cd682aed98b8f04e3c47d998bd upstream. + +Many x86 CPUs leak information to user space due to missing isolation of +user space and kernel space page tables. There are many well documented +ways to exploit that. + +The upcoming software migitation of isolating the user and kernel space +page tables needs a misfeature flag so code can be made runtime +conditional. + +Add the BUG bits which indicates that the CPU is affected and add a feature +bit which indicates that the software migitation is enabled. + +Assume for now that _ALL_ x86 CPUs are affected by this. Exceptions can be +made later. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/cpufeatures.h | 1 + + arch/x86/kernel/cpu/common.c | 4 ++++ + 2 files changed, 5 insertions(+) + +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -316,5 +316,6 @@ + #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ + #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ + #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ ++#define X86_BUG_CPU_INSECURE X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */ + + #endif /* _ASM_X86_CPUFEATURES_H */ +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -882,6 +882,10 @@ static void __init early_identify_cpu(st + } + + setup_force_cpu_cap(X86_FEATURE_ALWAYS); ++ ++ /* Assume for now that ALL x86 CPUs are insecure */ ++ setup_force_cpu_bug(X86_BUG_CPU_INSECURE); ++ + fpu__init_system(c); + } + diff --git a/queue/x86-cpufeatures-add-x86_bug_spectre_v.patch b/queue/x86-cpufeatures-add-x86_bug_spectre_v.patch new file mode 100644 index 0000000..e59cfe5 --- /dev/null +++ b/queue/x86-cpufeatures-add-x86_bug_spectre_v.patch @@ -0,0 +1,58 @@ +From 99c6fa2511d8a683e61468be91b83f85452115fa Mon Sep 17 00:00:00 2001 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Sat, 6 Jan 2018 11:49:23 +0000 +Subject: x86/cpufeatures: Add X86_BUG_SPECTRE_V[12] + +From: David Woodhouse <dwmw@amazon.co.uk> + +commit 99c6fa2511d8a683e61468be91b83f85452115fa upstream. + +Add the bug bits for spectre v1/2 and force them unconditionally for all +cpus. 
+ +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: Rik van Riel <riel@redhat.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Jiri Kosina <jikos@kernel.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Kees Cook <keescook@google.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org> +Cc: Paul Turner <pjt@google.com> +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/1515239374-23361-2-git-send-email-dwmw@amazon.co.uk +Signed-off-by: Razvan Ghitulete <rga@amazon.de> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/include/asm/cpufeatures.h | 2 ++ + arch/x86/kernel/cpu/common.c | 3 +++ + 2 files changed, 5 insertions(+) + +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -317,5 +317,7 @@ + #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ + #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ + #define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ ++#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */ ++#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ + + #endif /* _ASM_X86_CPUFEATURES_H */ +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -886,6 +886,9 @@ static void __init early_identify_cpu(st + /* Assume for now that ALL x86 CPUs are insecure */ + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + ++ setup_force_cpu_bug(X86_BUG_SPECTRE_V1); ++ setup_force_cpu_bug(X86_BUG_SPECTRE_V2); ++ + fpu__init_system(c); + } + diff --git a/queue/x86-cpufeatures-clean-up-spectre-v2-related-cpuid-flags.patch b/queue/x86-cpufeatures-clean-up-spectre-v2-related-cpuid-flags.patch new file mode 100644 index 0000000..96900c0 --- /dev/null +++ b/queue/x86-cpufeatures-clean-up-spectre-v2-related-cpuid-flags.patch @@ -0,0 +1,171 @@ +From foo@baz Thu Feb 8 03:30:27 CET 2018 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Sat, 27 Jan 2018 16:24:32 +0000 +Subject: x86/cpufeatures: Clean up Spectre v2 related CPUID flags + +From: David Woodhouse <dwmw@amazon.co.uk> + +(cherry picked from commit 2961298efe1ea1b6fc0d7ee8b76018fa6c0bcef2) + +We want to expose the hardware features simply in /proc/cpuinfo as "ibrs", +"ibpb" and "stibp". Since AMD has separate CPUID bits for those, use them +as the user-visible bits. + +When the Intel SPEC_CTRL bit is set which indicates both IBRS and IBPB +capability, set those (AMD) bits accordingly. Likewise if the Intel STIBP +bit is set, set the AMD STIBP that's used for the generic hardware +capability. + +Hide the rest from /proc/cpuinfo by putting "" in the comments. Including +RETPOLINE and RETPOLINE_AMD which shouldn't be visible there. There are +patches to make the sysfs vulnerabilities information non-readable by +non-root, and the same should apply to all information about which +mitigations are actually in use. Those *shouldn't* appear in /proc/cpuinfo. + +The feature bit for whether IBPB is actually used, which is needed for +ALTERNATIVEs, is renamed to X86_FEATURE_USE_IBPB. 
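
For reference only, and not part of the patch: once "ibrs", "ibpb" and "stibp" are the user-visible names, a small userspace sketch can report which of them a given kernel exposes in /proc/cpuinfo. The word-boundary check below is deliberately crude and exists purely for illustration.

#include <stdio.h>
#include <string.h>

static int cpuinfo_has_flag(const char *flag)
{
	char line[4096];
	size_t len = strlen(flag);
	FILE *f = fopen("/proc/cpuinfo", "r");
	int found = 0;

	if (!f)
		return -1;
	while (!found && fgets(line, sizeof(line), f)) {
		char *p;

		if (strncmp(line, "flags", 5) != 0)
			continue;
		/* Match whole, space-separated flag names only. */
		for (p = strstr(line, flag); p; p = strstr(p + 1, flag)) {
			if (p[-1] == ' ' &&
			    (p[len] == ' ' || p[len] == '\n' || p[len] == '\0')) {
				found = 1;
				break;
			}
		}
	}
	fclose(f);
	return found;
}

int main(void)
{
	static const char *flags[] = { "ibpb", "ibrs", "stibp" };
	unsigned int i;

	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++)
		printf("%-6s: %s\n", flags[i],
		       cpuinfo_has_flag(flags[i]) > 0 ? "present" : "absent");
	return 0;
}
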
+ +Originally-by: Borislav Petkov <bp@suse.de> +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: ak@linux.intel.com +Cc: dave.hansen@intel.com +Cc: karahmed@amazon.de +Cc: arjan@linux.intel.com +Cc: torvalds@linux-foundation.org +Cc: peterz@infradead.org +Cc: bp@alien8.de +Cc: pbonzini@redhat.com +Cc: tim.c.chen@linux.intel.com +Cc: gregkh@linux-foundation.org +Link: https://lkml.kernel.org/r/1517070274-12128-2-git-send-email-dwmw@amazon.co.uk +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/cpufeatures.h | 18 +++++++++--------- + arch/x86/include/asm/nospec-branch.h | 2 +- + arch/x86/kernel/cpu/bugs.c | 7 +++---- + arch/x86/kernel/cpu/intel.c | 31 +++++++++++++++++++++---------- + 4 files changed, 34 insertions(+), 24 deletions(-) + +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -194,15 +194,15 @@ + #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ + #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ + +-#define X86_FEATURE_RETPOLINE ( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */ +-#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */ ++#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ ++#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */ + +-#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ ++#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ + + /* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... 
*/ + #define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ + +-#define X86_FEATURE_IBPB ( 7*32+21) /* Indirect Branch Prediction Barrier enabled*/ ++#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ + + /* Virtualization flags: Linux defined, word 8 */ + #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ +@@ -260,9 +260,9 @@ + /* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ + #define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ + #define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ +-#define X86_FEATURE_AMD_PRED_CMD (13*32+12) /* Prediction Command MSR (AMD) */ +-#define X86_FEATURE_AMD_SPEC_CTRL (13*32+14) /* Speculation Control MSR only (AMD) */ +-#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors (AMD) */ ++#define X86_FEATURE_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ ++#define X86_FEATURE_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ ++#define X86_FEATURE_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ + + /* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ + #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ +@@ -301,8 +301,8 @@ + /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ + #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ + #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ +-#define X86_FEATURE_SPEC_CTRL (18*32+26) /* Speculation Control (IBRS + IBPB) */ +-#define X86_FEATURE_STIBP (18*32+27) /* Single Thread Indirect Branch Predictors */ ++#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ ++#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ + #define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ + + /* +--- a/arch/x86/include/asm/nospec-branch.h ++++ b/arch/x86/include/asm/nospec-branch.h +@@ -225,7 +225,7 @@ static inline void indirect_branch_predi + "movl %[val], %%eax\n\t" + "movl $0, %%edx\n\t" + "wrmsr", +- X86_FEATURE_IBPB) ++ X86_FEATURE_USE_IBPB) + : : [msr] "i" (MSR_IA32_PRED_CMD), + [val] "i" (PRED_CMD_IBPB) + : "eax", "ecx", "edx", "memory"); +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -272,9 +272,8 @@ retpoline_auto: + } + + /* Initialize Indirect Branch Prediction Barrier if supported */ +- if (boot_cpu_has(X86_FEATURE_SPEC_CTRL) || +- boot_cpu_has(X86_FEATURE_AMD_PRED_CMD)) { +- setup_force_cpu_cap(X86_FEATURE_IBPB); ++ if (boot_cpu_has(X86_FEATURE_IBPB)) { ++ setup_force_cpu_cap(X86_FEATURE_USE_IBPB); + pr_info("Enabling Indirect Branch Prediction Barrier\n"); + } + } +@@ -307,7 +306,7 @@ ssize_t cpu_show_spectre_v2(struct devic + return sprintf(buf, "Not affected\n"); + + return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], +- boot_cpu_has(X86_FEATURE_IBPB) ? ", IBPB" : "", ++ boot_cpu_has(X86_FEATURE_USE_IBPB) ? 
", IBPB" : "", + spectre_v2_module_string()); + } + #endif +--- a/arch/x86/kernel/cpu/intel.c ++++ b/arch/x86/kernel/cpu/intel.c +@@ -140,17 +140,28 @@ static void early_init_intel(struct cpui + rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode); + } + +- if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) || +- cpu_has(c, X86_FEATURE_STIBP) || +- cpu_has(c, X86_FEATURE_AMD_SPEC_CTRL) || +- cpu_has(c, X86_FEATURE_AMD_PRED_CMD) || +- cpu_has(c, X86_FEATURE_AMD_STIBP)) && bad_spectre_microcode(c)) { +- pr_warn("Intel Spectre v2 broken microcode detected; disabling SPEC_CTRL\n"); +- clear_cpu_cap(c, X86_FEATURE_SPEC_CTRL); ++ /* ++ * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support, ++ * and they also have a different bit for STIBP support. Also, ++ * a hypervisor might have set the individual AMD bits even on ++ * Intel CPUs, for finer-grained selection of what's available. ++ */ ++ if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) { ++ set_cpu_cap(c, X86_FEATURE_IBRS); ++ set_cpu_cap(c, X86_FEATURE_IBPB); ++ } ++ if (cpu_has(c, X86_FEATURE_INTEL_STIBP)) ++ set_cpu_cap(c, X86_FEATURE_STIBP); ++ ++ /* Now if any of them are set, check the blacklist and clear the lot */ ++ if ((cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) || ++ cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) { ++ pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n"); ++ clear_cpu_cap(c, X86_FEATURE_IBRS); ++ clear_cpu_cap(c, X86_FEATURE_IBPB); + clear_cpu_cap(c, X86_FEATURE_STIBP); +- clear_cpu_cap(c, X86_FEATURE_AMD_SPEC_CTRL); +- clear_cpu_cap(c, X86_FEATURE_AMD_PRED_CMD); +- clear_cpu_cap(c, X86_FEATURE_AMD_STIBP); ++ clear_cpu_cap(c, X86_FEATURE_SPEC_CTRL); ++ clear_cpu_cap(c, X86_FEATURE_INTEL_STIBP); + } + + /* diff --git a/queue/x86-cpufeatures-make-cpu-bugs-sticky.patch b/queue/x86-cpufeatures-make-cpu-bugs-sticky.patch new file mode 100644 index 0000000..aa4b9e0 --- /dev/null +++ b/queue/x86-cpufeatures-make-cpu-bugs-sticky.patch @@ -0,0 +1,96 @@ +From 6cbd2171e89b13377261d15e64384df60ecb530e Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 4 Dec 2017 15:07:32 +0100 +Subject: x86/cpufeatures: Make CPU bugs sticky + +From: Thomas Gleixner <tglx@linutronix.de> + +commit 6cbd2171e89b13377261d15e64384df60ecb530e upstream. + +There is currently no way to force CPU bug bits like CPU feature bits. That +makes it impossible to set a bug bit once at boot and have it stick for all +upcoming CPUs. + +Extend the force set/clear arrays to handle bug bits as well. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Link: https://lkml.kernel.org/r/20171204150606.992156574@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/include/asm/cpufeature.h | 2 ++ + arch/x86/include/asm/processor.h | 4 ++-- + arch/x86/kernel/cpu/common.c | 6 +++--- + 3 files changed, 7 insertions(+), 5 deletions(-) + +--- a/arch/x86/include/asm/cpufeature.h ++++ b/arch/x86/include/asm/cpufeature.h +@@ -135,6 +135,8 @@ extern const char * const x86_bug_flags[ + set_bit(bit, (unsigned long *)cpu_caps_set); \ + } while (0) + ++#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) ++ + #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS) + /* + * Static testing of CPU features. Used the same as boot_cpu_has(). +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -156,8 +156,8 @@ extern struct cpuinfo_x86 boot_cpu_data; + extern struct cpuinfo_x86 new_cpu_data; + + extern struct tss_struct doublefault_tss; +-extern __u32 cpu_caps_cleared[NCAPINTS]; +-extern __u32 cpu_caps_set[NCAPINTS]; ++extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; ++extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS]; + + #ifdef CONFIG_SMP + DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -480,8 +480,8 @@ static const char *table_lookup_model(st + return NULL; /* Not found */ + } + +-__u32 cpu_caps_cleared[NCAPINTS]; +-__u32 cpu_caps_set[NCAPINTS]; ++__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; ++__u32 cpu_caps_set[NCAPINTS + NBUGINTS]; + + void load_percpu_segment(int cpu) + { +@@ -710,7 +710,7 @@ static void apply_forced_caps(struct cpu + { + int i; + +- for (i = 0; i < NCAPINTS; i++) { ++ for (i = 0; i < NCAPINTS + NBUGINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } diff --git a/queue/x86-cpuid-fix-up-virtual-ibrs-ibpb-stibp-feature-bits-on-intel.patch b/queue/x86-cpuid-fix-up-virtual-ibrs-ibpb-stibp-feature-bits-on-intel.patch new file mode 100644 index 0000000..7e942e5 --- /dev/null +++ b/queue/x86-cpuid-fix-up-virtual-ibrs-ibpb-stibp-feature-bits-on-intel.patch @@ -0,0 +1,122 @@ +From foo@baz Thu Feb 8 03:32:24 CET 2018 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Tue, 30 Jan 2018 14:30:23 +0000 +Subject: x86/cpuid: Fix up "virtual" IBRS/IBPB/STIBP feature bits on Intel + +From: David Woodhouse <dwmw@amazon.co.uk> + + +(cherry picked from commit 7fcae1118f5fd44a862aa5c3525248e35ee67c3b) + +Despite the fact that all the other code there seems to be doing it, just +using set_cpu_cap() in early_intel_init() doesn't actually work. + +For CPUs with PKU support, setup_pku() calls get_cpu_cap() after +c->c_init() has set those feature bits. That resets those bits back to what +was queried from the hardware. + +Turning the bits off for bad microcode is easy to fix. That can just use +setup_clear_cpu_cap() to force them off for all CPUs. + +I was less keen on forcing the feature bits *on* that way, just in case +of inconsistencies. 
I appreciate that the kernel is going to get this +utterly wrong if CPU features are not consistent, because it has already +applied alternatives by the time secondary CPUs are brought up. + +But at least if setup_force_cpu_cap() isn't being used, we might have a +chance of *detecting* the lack of the corresponding bit and either +panicking or refusing to bring the offending CPU online. + +So ensure that the appropriate feature bits are set within get_cpu_cap() +regardless of how many extra times it's called. + +Fixes: 2961298e ("x86/cpufeatures: Clean up Spectre v2 related CPUID flags") +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: karahmed@amazon.de +Cc: peterz@infradead.org +Cc: bp@alien8.de +Link: https://lkml.kernel.org/r/1517322623-15261-1-git-send-email-dwmw@amazon.co.uk +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kernel/cpu/common.c | 21 +++++++++++++++++++++ + arch/x86/kernel/cpu/intel.c | 27 ++++++++------------------- + 2 files changed, 29 insertions(+), 19 deletions(-) + +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -718,6 +718,26 @@ static void apply_forced_caps(struct cpu + } + } + ++static void init_speculation_control(struct cpuinfo_x86 *c) ++{ ++ /* ++ * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support, ++ * and they also have a different bit for STIBP support. Also, ++ * a hypervisor might have set the individual AMD bits even on ++ * Intel CPUs, for finer-grained selection of what's available. ++ * ++ * We use the AMD bits in 0x8000_0008 EBX as the generic hardware ++ * features, which are visible in /proc/cpuinfo and used by the ++ * kernel. So set those accordingly from the Intel bits. ++ */ ++ if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) { ++ set_cpu_cap(c, X86_FEATURE_IBRS); ++ set_cpu_cap(c, X86_FEATURE_IBPB); ++ } ++ if (cpu_has(c, X86_FEATURE_INTEL_STIBP)) ++ set_cpu_cap(c, X86_FEATURE_STIBP); ++} ++ + void get_cpu_cap(struct cpuinfo_x86 *c) + { + u32 eax, ebx, ecx, edx; +@@ -812,6 +832,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) + c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); + + init_scattered_cpuid_features(c); ++ init_speculation_control(c); + } + + static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) +--- a/arch/x86/kernel/cpu/intel.c ++++ b/arch/x86/kernel/cpu/intel.c +@@ -140,28 +140,17 @@ static void early_init_intel(struct cpui + rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode); + } + +- /* +- * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support, +- * and they also have a different bit for STIBP support. Also, +- * a hypervisor might have set the individual AMD bits even on +- * Intel CPUs, for finer-grained selection of what's available. 
+- */ +- if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) { +- set_cpu_cap(c, X86_FEATURE_IBRS); +- set_cpu_cap(c, X86_FEATURE_IBPB); +- } +- if (cpu_has(c, X86_FEATURE_INTEL_STIBP)) +- set_cpu_cap(c, X86_FEATURE_STIBP); +- + /* Now if any of them are set, check the blacklist and clear the lot */ +- if ((cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) || ++ if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) || ++ cpu_has(c, X86_FEATURE_INTEL_STIBP) || ++ cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) || + cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) { + pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n"); +- clear_cpu_cap(c, X86_FEATURE_IBRS); +- clear_cpu_cap(c, X86_FEATURE_IBPB); +- clear_cpu_cap(c, X86_FEATURE_STIBP); +- clear_cpu_cap(c, X86_FEATURE_SPEC_CTRL); +- clear_cpu_cap(c, X86_FEATURE_INTEL_STIBP); ++ setup_clear_cpu_cap(X86_FEATURE_IBRS); ++ setup_clear_cpu_cap(X86_FEATURE_IBPB); ++ setup_clear_cpu_cap(X86_FEATURE_STIBP); ++ setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL); ++ setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP); + } + + /* diff --git a/queue/x86-documentation-add-pti-description.patch b/queue/x86-documentation-add-pti-description.patch new file mode 100644 index 0000000..78b4ebd --- /dev/null +++ b/queue/x86-documentation-add-pti-description.patch @@ -0,0 +1,261 @@ +From 01c9b17bf673b05bb401b76ec763e9730ccf1376 Mon Sep 17 00:00:00 2001 +From: Dave Hansen <dave.hansen@linux.intel.com> +Date: Fri, 5 Jan 2018 09:44:36 -0800 +Subject: x86/Documentation: Add PTI description + +From: Dave Hansen <dave.hansen@linux.intel.com> + +commit 01c9b17bf673b05bb401b76ec763e9730ccf1376 upstream. + +Add some details about how PTI works, what some of the downsides +are, and how to debug it when things go wrong. + +Also document the kernel parameter: 'pti/nopti'. + +Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Randy Dunlap <rdunlap@infradead.org> +Reviewed-by: Kees Cook <keescook@chromium.org> +Cc: Moritz Lipp <moritz.lipp@iaik.tugraz.at> +Cc: Daniel Gruss <daniel.gruss@iaik.tugraz.at> +Cc: Michael Schwarz <michael.schwarz@iaik.tugraz.at> +Cc: Richard Fellner <richard.fellner@student.tugraz.at> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Hugh Dickins <hughd@google.com> +Cc: Andi Lutomirsky <luto@kernel.org> +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/20180105174436.1BC6FA2B@viggo.jf.intel.com +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + Documentation/kernel-parameters.txt | 21 ++-- + Documentation/x86/pti.txt | 186 ++++++++++++++++++++++++++++++++++++ + 2 files changed, 200 insertions(+), 7 deletions(-) + +--- a/Documentation/kernel-parameters.txt ++++ b/Documentation/kernel-parameters.txt +@@ -2763,8 +2763,6 @@ bytes respectively. Such letter suffixes + + nojitter [IA-64] Disables jitter checking for ITC timers. + +- nopti [X86-64] Disable KAISER isolation of kernel from user. +- + no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver + + no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page +@@ -3327,11 +3325,20 @@ bytes respectively. Such letter suffixes + pt. [PARIDE] + See Documentation/blockdev/paride.txt. + +- pti= [X86_64] +- Control KAISER user/kernel address space isolation: +- on - enable +- off - disable +- auto - default setting ++ pti= [X86_64] Control Page Table Isolation of user and ++ kernel address spaces. 
Disabling this feature ++ removes hardening, but improves performance of ++ system calls and interrupts. ++ ++ on - unconditionally enable ++ off - unconditionally disable ++ auto - kernel detects whether your CPU model is ++ vulnerable to issues that PTI mitigates ++ ++ Not specifying this option is equivalent to pti=auto. ++ ++ nopti [X86_64] ++ Equivalent to pti=off + + pty.legacy_count= + [KNL] Number of legacy pty's. Overwrites compiled-in +--- /dev/null ++++ b/Documentation/x86/pti.txt +@@ -0,0 +1,186 @@ ++Overview ++======== ++ ++Page Table Isolation (pti, previously known as KAISER[1]) is a ++countermeasure against attacks on the shared user/kernel address ++space such as the "Meltdown" approach[2]. ++ ++To mitigate this class of attacks, we create an independent set of ++page tables for use only when running userspace applications. When ++the kernel is entered via syscalls, interrupts or exceptions, the ++page tables are switched to the full "kernel" copy. When the system ++switches back to user mode, the user copy is used again. ++ ++The userspace page tables contain only a minimal amount of kernel ++data: only what is needed to enter/exit the kernel such as the ++entry/exit functions themselves and the interrupt descriptor table ++(IDT). There are a few strictly unnecessary things that get mapped ++such as the first C function when entering an interrupt (see ++comments in pti.c). ++ ++This approach helps to ensure that side-channel attacks leveraging ++the paging structures do not function when PTI is enabled. It can be ++enabled by setting CONFIG_PAGE_TABLE_ISOLATION=y at compile time. ++Once enabled at compile-time, it can be disabled at boot with the ++'nopti' or 'pti=' kernel parameters (see kernel-parameters.txt). ++ ++Page Table Management ++===================== ++ ++When PTI is enabled, the kernel manages two sets of page tables. ++The first set is very similar to the single set which is present in ++kernels without PTI. This includes a complete mapping of userspace ++that the kernel can use for things like copy_to_user(). ++ ++Although _complete_, the user portion of the kernel page tables is ++crippled by setting the NX bit in the top level. This ensures ++that any missed kernel->user CR3 switch will immediately crash ++userspace upon executing its first instruction. ++ ++The userspace page tables map only the kernel data needed to enter ++and exit the kernel. This data is entirely contained in the 'struct ++cpu_entry_area' structure which is placed in the fixmap which gives ++each CPU's copy of the area a compile-time-fixed virtual address. ++ ++For new userspace mappings, the kernel makes the entries in its ++page tables like normal. The only difference is when the kernel ++makes entries in the top (PGD) level. In addition to setting the ++entry in the main kernel PGD, a copy of the entry is made in the ++userspace page tables' PGD. ++ ++This sharing at the PGD level also inherently shares all the lower ++layers of the page tables. This leaves a single, shared set of ++userspace page tables to manage. One PTE to lock, one set of ++accessed bits, dirty bits, etc... ++ ++Overhead ++======== ++ ++Protection against side-channel attacks is important. But, ++this protection comes at a cost: ++ ++1. Increased Memory Use ++ a. Each process now needs an order-1 PGD instead of order-0. ++ (Consumes an additional 4k per process). ++ b. The 'cpu_entry_area' structure must be 2MB in size and 2MB ++ aligned so that it can be mapped by setting a single PMD ++ entry. 
This consumes nearly 2MB of RAM once the kernel ++ is decompressed, but no space in the kernel image itself. ++ ++2. Runtime Cost ++ a. CR3 manipulation to switch between the page table copies ++ must be done at interrupt, syscall, and exception entry ++ and exit (it can be skipped when the kernel is interrupted, ++ though.) Moves to CR3 are on the order of a hundred ++ cycles, and are required at every entry and exit. ++ b. A "trampoline" must be used for SYSCALL entry. This ++ trampoline depends on a smaller set of resources than the ++ non-PTI SYSCALL entry code, so requires mapping fewer ++ things into the userspace page tables. The downside is ++ that stacks must be switched at entry time. ++ d. Global pages are disabled for all kernel structures not ++ mapped into both kernel and userspace page tables. This ++ feature of the MMU allows different processes to share TLB ++ entries mapping the kernel. Losing the feature means more ++ TLB misses after a context switch. The actual loss of ++ performance is very small, however, never exceeding 1%. ++ d. Process Context IDentifiers (PCID) is a CPU feature that ++ allows us to skip flushing the entire TLB when switching page ++ tables by setting a special bit in CR3 when the page tables ++ are changed. This makes switching the page tables (at context ++ switch, or kernel entry/exit) cheaper. But, on systems with ++ PCID support, the context switch code must flush both the user ++ and kernel entries out of the TLB. The user PCID TLB flush is ++ deferred until the exit to userspace, minimizing the cost. ++ See intel.com/sdm for the gory PCID/INVPCID details. ++ e. The userspace page tables must be populated for each new ++ process. Even without PTI, the shared kernel mappings ++ are created by copying top-level (PGD) entries into each ++ new process. But, with PTI, there are now *two* kernel ++ mappings: one in the kernel page tables that maps everything ++ and one for the entry/exit structures. At fork(), we need to ++ copy both. ++ f. In addition to the fork()-time copying, there must also ++ be an update to the userspace PGD any time a set_pgd() is done ++ on a PGD used to map userspace. This ensures that the kernel ++ and userspace copies always map the same userspace ++ memory. ++ g. On systems without PCID support, each CR3 write flushes ++ the entire TLB. That means that each syscall, interrupt ++ or exception flushes the TLB. ++ h. INVPCID is a TLB-flushing instruction which allows flushing ++ of TLB entries for non-current PCIDs. Some systems support ++ PCIDs, but do not support INVPCID. On these systems, addresses ++ can only be flushed from the TLB for the current PCID. When ++ flushing a kernel address, we need to flush all PCIDs, so a ++ single kernel address flush will require a TLB-flushing CR3 ++ write upon the next use of every PCID. ++ ++Possible Future Work ++==================== ++1. We can be more careful about not actually writing to CR3 ++ unless its value is actually changed. ++2. Allow PTI to be enabled/disabled at runtime in addition to the ++ boot-time switching. ++ ++Testing ++======== ++ ++To test stability of PTI, the following test procedure is recommended, ++ideally doing all of these in parallel: ++ ++1. Set CONFIG_DEBUG_ENTRY=y ++2. Run several copies of all of the tools/testing/selftests/x86/ tests ++ (excluding MPX and protection_keys) in a loop on multiple CPUs for ++ several minutes. These tests frequently uncover corner cases in the ++ kernel entry code. 
In general, old kernels might cause these tests ++ themselves to crash, but they should never crash the kernel. ++3. Run the 'perf' tool in a mode (top or record) that generates many ++ frequent performance monitoring non-maskable interrupts (see "NMI" ++ in /proc/interrupts). This exercises the NMI entry/exit code which ++ is known to trigger bugs in code paths that did not expect to be ++ interrupted, including nested NMIs. Using "-c" boosts the rate of ++ NMIs, and using two -c with separate counters encourages nested NMIs ++ and less deterministic behavior. ++ ++ while true; do perf record -c 10000 -e instructions,cycles -a sleep 10; done ++ ++4. Launch a KVM virtual machine. ++5. Run 32-bit binaries on systems supporting the SYSCALL instruction. ++ This has been a lightly-tested code path and needs extra scrutiny. ++ ++Debugging ++========= ++ ++Bugs in PTI cause a few different signatures of crashes ++that are worth noting here. ++ ++ * Failures of the selftests/x86 code. Usually a bug in one of the ++ more obscure corners of entry_64.S ++ * Crashes in early boot, especially around CPU bringup. Bugs ++ in the trampoline code or mappings cause these. ++ * Crashes at the first interrupt. Caused by bugs in entry_64.S, ++ like screwing up a page table switch. Also caused by ++ incorrectly mapping the IRQ handler entry code. ++ * Crashes at the first NMI. The NMI code is separate from main ++ interrupt handlers and can have bugs that do not affect ++ normal interrupts. Also caused by incorrectly mapping NMI ++ code. NMIs that interrupt the entry code must be very ++ careful and can be the cause of crashes that show up when ++ running perf. ++ * Kernel crashes at the first exit to userspace. entry_64.S ++ bugs, or failing to map some of the exit code. ++ * Crashes at first interrupt that interrupts userspace. The paths ++ in entry_64.S that return to userspace are sometimes separate ++ from the ones that return to the kernel. ++ * Double faults: overflowing the kernel stack because of page ++ faults upon page faults. Caused by touching non-pti-mapped ++ data in the entry code, or forgetting to switch to kernel ++ CR3 before calling into C functions which are not pti-mapped. ++ * Userspace segfaults early in boot, sometimes manifesting ++ as mount(8) failing to mount the rootfs. These have ++ tended to be TLB invalidation issues. Usually invalidating ++ the wrong PCID, or otherwise missing an invalidation. ++ ++1. https://gruss.cc/files/kaiser.pdf ++2. https://meltdownattack.com/meltdown.pdf diff --git a/queue/x86-entry-64-push-extra-regs-right-away.patch b/queue/x86-entry-64-push-extra-regs-right-away.patch new file mode 100644 index 0000000..e80f751 --- /dev/null +++ b/queue/x86-entry-64-push-extra-regs-right-away.patch @@ -0,0 +1,46 @@ +From foo@baz Thu Feb 8 03:30:27 CET 2018 +From: Andy Lutomirski <luto@kernel.org> +Date: Sun, 28 Jan 2018 10:38:49 -0800 +Subject: x86/entry/64: Push extra regs right away + +From: Andy Lutomirski <luto@kernel.org> + +(cherry picked from commit d1f7732009e0549eedf8ea1db948dc37be77fd46) + +With the fast path removed there is no point in splitting the push of the +normal and the extra register set. Just push the extra regs right away. 
+ +[ tglx: Split out from 'x86/entry/64: Remove the SYSCALL64 fast path' ] + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Acked-by: Ingo Molnar <mingo@kernel.org> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Kernel Hardening <kernel-hardening@lists.openwall.com> +Link: https://lkml.kernel.org/r/462dff8d4d64dfbfc851fbf3130641809d980ecd.1517164461.git.luto@kernel.org +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/entry/entry_64.S | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -177,10 +177,14 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) + pushq %r9 /* pt_regs->r9 */ + pushq %r10 /* pt_regs->r10 */ + pushq %r11 /* pt_regs->r11 */ +- sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ ++ pushq %rbx /* pt_regs->rbx */ ++ pushq %rbp /* pt_regs->rbp */ ++ pushq %r12 /* pt_regs->r12 */ ++ pushq %r13 /* pt_regs->r13 */ ++ pushq %r14 /* pt_regs->r14 */ ++ pushq %r15 /* pt_regs->r15 */ + + /* IRQs are off. */ +- SAVE_EXTRA_REGS + movq %rsp, %rdi + call do_syscall_64 /* returns with IRQs disabled */ + diff --git a/queue/x86-entry-64-remove-the-syscall64-fast-path.patch b/queue/x86-entry-64-remove-the-syscall64-fast-path.patch new file mode 100644 index 0000000..daa62bd --- /dev/null +++ b/queue/x86-entry-64-remove-the-syscall64-fast-path.patch @@ -0,0 +1,202 @@ +From foo@baz Thu Feb 8 03:30:27 CET 2018 +From: Andy Lutomirski <luto@kernel.org> +Date: Sun, 28 Jan 2018 10:38:49 -0800 +Subject: x86/entry/64: Remove the SYSCALL64 fast path + +From: Andy Lutomirski <luto@kernel.org> + +(cherry picked from commit 21d375b6b34ff511a507de27bf316b3dde6938d9) + +The SYCALLL64 fast path was a nice, if small, optimization back in the good +old days when syscalls were actually reasonably fast. Now there is PTI to +slow everything down, and indirect branches are verboten, making everything +messier. The retpoline code in the fast path is particularly nasty. + +Just get rid of the fast path. The slow path is barely slower. + +[ tglx: Split out the 'push all extra regs' part ] + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Acked-by: Ingo Molnar <mingo@kernel.org> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Kernel Hardening <kernel-hardening@lists.openwall.com> +Link: https://lkml.kernel.org/r/462dff8d4d64dfbfc851fbf3130641809d980ecd.1517164461.git.luto@kernel.org +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/entry/entry_64.S | 123 -------------------------------------------- + arch/x86/entry/syscall_64.c | 7 -- + 2 files changed, 3 insertions(+), 127 deletions(-) + +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -179,94 +179,11 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) + pushq %r11 /* pt_regs->r11 */ + sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ + +- /* +- * If we need to do entry work or if we guess we'll need to do +- * exit work, go straight to the slow path. +- */ +- movq PER_CPU_VAR(current_task), %r11 +- testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11) +- jnz entry_SYSCALL64_slow_path +- +-entry_SYSCALL_64_fastpath: +- /* +- * Easy case: enable interrupts and issue the syscall. 
If the syscall +- * needs pt_regs, we'll call a stub that disables interrupts again +- * and jumps to the slow path. +- */ +- TRACE_IRQS_ON +- ENABLE_INTERRUPTS(CLBR_NONE) +-#if __SYSCALL_MASK == ~0 +- cmpq $__NR_syscall_max, %rax +-#else +- andl $__SYSCALL_MASK, %eax +- cmpl $__NR_syscall_max, %eax +-#endif +- ja 1f /* return -ENOSYS (already in pt_regs->ax) */ +- movq %r10, %rcx +- +- /* +- * This call instruction is handled specially in stub_ptregs_64. +- * It might end up jumping to the slow path. If it jumps, RAX +- * and all argument registers are clobbered. +- */ +-#ifdef CONFIG_RETPOLINE +- movq sys_call_table(, %rax, 8), %rax +- call __x86_indirect_thunk_rax +-#else +- call *sys_call_table(, %rax, 8) +-#endif +-.Lentry_SYSCALL_64_after_fastpath_call: +- +- movq %rax, RAX(%rsp) +-1: +- +- /* +- * If we get here, then we know that pt_regs is clean for SYSRET64. +- * If we see that no exit work is required (which we are required +- * to check with IRQs off), then we can go straight to SYSRET64. +- */ +- DISABLE_INTERRUPTS(CLBR_NONE) +- TRACE_IRQS_OFF +- movq PER_CPU_VAR(current_task), %r11 +- testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11) +- jnz 1f +- +- LOCKDEP_SYS_EXIT +- TRACE_IRQS_ON /* user mode is traced as IRQs on */ +- movq RIP(%rsp), %rcx +- movq EFLAGS(%rsp), %r11 +- RESTORE_C_REGS_EXCEPT_RCX_R11 +- /* +- * This opens a window where we have a user CR3, but are +- * running in the kernel. This makes using the CS +- * register useless for telling whether or not we need to +- * switch CR3 in NMIs. Normal interrupts are OK because +- * they are off here. +- */ +- SWITCH_USER_CR3 +- movq RSP(%rsp), %rsp +- USERGS_SYSRET64 +- +-1: +- /* +- * The fast path looked good when we started, but something changed +- * along the way and we need to switch to the slow path. Calling +- * raise(3) will trigger this, for example. IRQs are off. +- */ +- TRACE_IRQS_ON +- ENABLE_INTERRUPTS(CLBR_NONE) +- SAVE_EXTRA_REGS +- movq %rsp, %rdi +- call syscall_return_slowpath /* returns with IRQs disabled */ +- jmp return_from_SYSCALL_64 +- +-entry_SYSCALL64_slow_path: + /* IRQs are off. */ + SAVE_EXTRA_REGS + movq %rsp, %rdi + call do_syscall_64 /* returns with IRQs disabled */ + +-return_from_SYSCALL_64: + RESTORE_EXTRA_REGS + TRACE_IRQS_IRETQ /* we're about to change IF */ + +@@ -339,6 +256,7 @@ return_from_SYSCALL_64: + syscall_return_via_sysret: + /* rcx and r11 are already restored (see code above) */ + RESTORE_C_REGS_EXCEPT_RCX_R11 ++ + /* + * This opens a window where we have a user CR3, but are + * running in the kernel. This makes using the CS +@@ -363,45 +281,6 @@ opportunistic_sysret_failed: + jmp restore_c_regs_and_iret + END(entry_SYSCALL_64) + +-ENTRY(stub_ptregs_64) +- /* +- * Syscalls marked as needing ptregs land here. +- * If we are on the fast path, we need to save the extra regs, +- * which we achieve by trying again on the slow path. If we are on +- * the slow path, the extra regs are already saved. +- * +- * RAX stores a pointer to the C function implementing the syscall. +- * IRQs are on. 
+- */ +- cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp) +- jne 1f +- +- /* +- * Called from fast path -- disable IRQs again, pop return address +- * and jump to slow path +- */ +- DISABLE_INTERRUPTS(CLBR_NONE) +- TRACE_IRQS_OFF +- popq %rax +- jmp entry_SYSCALL64_slow_path +- +-1: +- JMP_NOSPEC %rax /* Called from C */ +-END(stub_ptregs_64) +- +-.macro ptregs_stub func +-ENTRY(ptregs_\func) +- leaq \func(%rip), %rax +- jmp stub_ptregs_64 +-END(ptregs_\func) +-.endm +- +-/* Instantiate ptregs_stub for each ptregs-using syscall */ +-#define __SYSCALL_64_QUAL_(sym) +-#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym +-#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym) +-#include <asm/syscalls_64.h> +- + /* + * %rdi: prev task + * %rsi: next task +--- a/arch/x86/entry/syscall_64.c ++++ b/arch/x86/entry/syscall_64.c +@@ -6,14 +6,11 @@ + #include <asm/asm-offsets.h> + #include <asm/syscall.h> + +-#define __SYSCALL_64_QUAL_(sym) sym +-#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym +- +-#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); ++#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); + #include <asm/syscalls_64.h> + #undef __SYSCALL_64 + +-#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym), ++#define __SYSCALL_64(nr, sym, qual) [nr] = sym, + + extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); + diff --git a/queue/x86-get_user-use-pointer-masking-to-limit-speculation.patch b/queue/x86-get_user-use-pointer-masking-to-limit-speculation.patch new file mode 100644 index 0000000..e81910d --- /dev/null +++ b/queue/x86-get_user-use-pointer-masking-to-limit-speculation.patch @@ -0,0 +1,98 @@ +From foo@baz Thu Feb 8 03:32:24 CET 2018 +From: Dan Williams <dan.j.williams@intel.com> +Date: Mon, 29 Jan 2018 17:02:54 -0800 +Subject: x86/get_user: Use pointer masking to limit speculation + +From: Dan Williams <dan.j.williams@intel.com> + + +(cherry picked from commit c7f631cb07e7da06ac1d231ca178452339e32a94) + +Quoting Linus: + + I do think that it would be a good idea to very expressly document + the fact that it's not that the user access itself is unsafe. I do + agree that things like "get_user()" want to be protected, but not + because of any direct bugs or problems with get_user() and friends, + but simply because get_user() is an excellent source of a pointer + that is obviously controlled from a potentially attacking user + space. So it's a prime candidate for then finding _subsequent_ + accesses that can then be used to perturb the cache. + +Unlike the __get_user() case get_user() includes the address limit check +near the pointer de-reference. With that locality the speculation can be +mitigated with pointer narrowing rather than a barrier, i.e. +array_index_nospec(). Where the narrowing is performed by: + + cmp %limit, %ptr + sbb %mask, %mask + and %mask, %ptr + +With respect to speculation the value of %ptr is either less than %limit +or NULL. 
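For illustration only, a minimal C rendition of that narrowing sequence (a sketch, not part of the patch; the helper name is invented here):

        /*
         * Keep the pointer when it is below the limit, otherwise collapse
         * it to NULL so a speculated dereference cannot reach an
         * attacker-chosen kernel address.  The real code uses cmp/sbb/and
         * so that no conditional branch is involved at all.
         */
        static inline unsigned long mask_user_ptr(unsigned long ptr,
                                                  unsigned long limit)
        {
                unsigned long mask = 0UL - (unsigned long)(ptr < limit);

                return ptr & mask;      /* ptr when in range, NULL otherwise */
        }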
+ +Co-developed-by: Linus Torvalds <torvalds@linux-foundation.org> +Signed-off-by: Dan Williams <dan.j.williams@intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: linux-arch@vger.kernel.org +Cc: Kees Cook <keescook@chromium.org> +Cc: kernel-hardening@lists.openwall.com +Cc: gregkh@linuxfoundation.org +Cc: Al Viro <viro@zeniv.linux.org.uk> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: torvalds@linux-foundation.org +Cc: alan@linux.intel.com +Link: https://lkml.kernel.org/r/151727417469.33451.11804043010080838495.stgit@dwillia2-desk3.amr.corp.intel.com +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/lib/getuser.S | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +--- a/arch/x86/lib/getuser.S ++++ b/arch/x86/lib/getuser.S +@@ -39,6 +39,8 @@ ENTRY(__get_user_1) + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX + jae bad_get_user ++ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ ++ and %_ASM_DX, %_ASM_AX + ASM_STAC + 1: movzbl (%_ASM_AX),%edx + xor %eax,%eax +@@ -53,6 +55,8 @@ ENTRY(__get_user_2) + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX + jae bad_get_user ++ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ ++ and %_ASM_DX, %_ASM_AX + ASM_STAC + 2: movzwl -1(%_ASM_AX),%edx + xor %eax,%eax +@@ -67,6 +71,8 @@ ENTRY(__get_user_4) + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX + jae bad_get_user ++ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ ++ and %_ASM_DX, %_ASM_AX + ASM_STAC + 3: movl -3(%_ASM_AX),%edx + xor %eax,%eax +@@ -82,6 +88,8 @@ ENTRY(__get_user_8) + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX + jae bad_get_user ++ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ ++ and %_ASM_DX, %_ASM_AX + ASM_STAC + 4: movq -7(%_ASM_AX),%rdx + xor %eax,%eax +@@ -93,6 +101,8 @@ ENTRY(__get_user_8) + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX + jae bad_get_user_8 ++ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ ++ and %_ASM_DX, %_ASM_AX + ASM_STAC + 4: movl -7(%_ASM_AX),%edx + 5: movl -3(%_ASM_AX),%ecx diff --git a/queue/x86-implement-array_index_mask_nospec.patch b/queue/x86-implement-array_index_mask_nospec.patch new file mode 100644 index 0000000..23b2cba --- /dev/null +++ b/queue/x86-implement-array_index_mask_nospec.patch @@ -0,0 +1,66 @@ +From foo@baz Thu Feb 8 03:32:24 CET 2018 +From: Dan Williams <dan.j.williams@intel.com> +Date: Mon, 29 Jan 2018 17:02:28 -0800 +Subject: x86: Implement array_index_mask_nospec + +From: Dan Williams <dan.j.williams@intel.com> + + +(cherry picked from commit babdde2698d482b6c0de1eab4f697cf5856c5859) + +array_index_nospec() uses a mask to sanitize user controllable array +indexes, i.e. generate a 0 mask if 'index' >= 'size', and a ~0 mask +otherwise. While the default array_index_mask_nospec() handles the +carry-bit from the (index - size) result in software. + +The x86 array_index_mask_nospec() does the same, but the carry-bit is +handled in the processor CF flag without conditional instructions in the +control flow. 
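As a hedged usage sketch (illustrative, not taken from this series): the generic array_index_nospec() from <linux/nospec.h> is applied right after a successful bounds check, and on x86 it picks up the CF-flag based mask added by this patch. The table and lookup function below are made up for the example.

        #include <linux/errno.h>
        #include <linux/kernel.h>       /* ARRAY_SIZE() */
        #include <linux/nospec.h>

        static const int table[16];

        static int lookup(unsigned long idx)
        {
                if (idx >= ARRAY_SIZE(table))
                        return -EINVAL;

                /* idx stays below ARRAY_SIZE(table) even under speculation */
                idx = array_index_nospec(idx, ARRAY_SIZE(table));
                return table[idx];
        }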
+ +Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> +Signed-off-by: Dan Williams <dan.j.williams@intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: linux-arch@vger.kernel.org +Cc: kernel-hardening@lists.openwall.com +Cc: gregkh@linuxfoundation.org +Cc: alan@linux.intel.com +Link: https://lkml.kernel.org/r/151727414808.33451.1873237130672785331.stgit@dwillia2-desk3.amr.corp.intel.com +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/barrier.h | 24 ++++++++++++++++++++++++ + 1 file changed, 24 insertions(+) + +--- a/arch/x86/include/asm/barrier.h ++++ b/arch/x86/include/asm/barrier.h +@@ -23,6 +23,30 @@ + #define wmb() asm volatile("sfence" ::: "memory") + #endif + ++/** ++ * array_index_mask_nospec() - generate a mask that is ~0UL when the ++ * bounds check succeeds and 0 otherwise ++ * @index: array element index ++ * @size: number of elements in array ++ * ++ * Returns: ++ * 0 - (index < size) ++ */ ++static inline unsigned long array_index_mask_nospec(unsigned long index, ++ unsigned long size) ++{ ++ unsigned long mask; ++ ++ asm ("cmp %1,%2; sbb %0,%0;" ++ :"=r" (mask) ++ :"r"(size),"r" (index) ++ :"cc"); ++ return mask; ++} ++ ++/* Override the default implementation from linux/nospec.h. */ ++#define array_index_mask_nospec array_index_mask_nospec ++ + #ifdef CONFIG_X86_PPRO_FENCE + #define dma_rmb() rmb() + #else diff --git a/queue/x86-introduce-__uaccess_begin_nospec-and-uaccess_try_nospec.patch b/queue/x86-introduce-__uaccess_begin_nospec-and-uaccess_try_nospec.patch new file mode 100644 index 0000000..885922d --- /dev/null +++ b/queue/x86-introduce-__uaccess_begin_nospec-and-uaccess_try_nospec.patch @@ -0,0 +1,80 @@ +From foo@baz Thu Feb 8 03:32:24 CET 2018 +From: Dan Williams <dan.j.williams@intel.com> +Date: Mon, 29 Jan 2018 17:02:39 -0800 +Subject: x86: Introduce __uaccess_begin_nospec() and uaccess_try_nospec + +From: Dan Williams <dan.j.williams@intel.com> + + +(cherry picked from commit b3bbfb3fb5d25776b8e3f361d2eedaabb0b496cd) + +For __get_user() paths, do not allow the kernel to speculate on the value +of a user controlled pointer. In addition to the 'stac' instruction for +Supervisor Mode Access Protection (SMAP), a barrier_nospec() causes the +access_ok() result to resolve in the pipeline before the CPU might take any +speculative action on the pointer value. Given the cost of 'stac' the +speculation barrier is placed after 'stac' to hopefully overlap the cost of +disabling SMAP with the cost of flushing the instruction pipeline. + +Since __get_user is a major kernel interface that deals with user +controlled pointers, the __uaccess_begin_nospec() mechanism will prevent +speculative execution past an access_ok() permission check. While +speculative execution past access_ok() is not enough to lead to a kernel +memory leak, it is a necessary precondition. + +To be clear, __uaccess_begin_nospec() is addressing a class of potential +problems near __get_user() usages. + +Note, that while the barrier_nospec() in __uaccess_begin_nospec() is used +to protect __get_user(), pointer masking similar to array_index_nospec() +will be used for get_user() since it incorporates a bounds check near the +usage. + +uaccess_try_nospec provides the same mechanism for get_user_try. + +No functional changes. 
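A hedged sketch of the expected call pattern (not code from this patch): the barrier-carrying begin pairs with the existing __uaccess_end(), after the usual access_ok() check. The wrapper function below is invented, and the three-argument access_ok() matches this kernel generation.

        static int read_user_int_nospec(const int __user *uptr, int *val)
        {
                int err = 0;

                if (!access_ok(VERIFY_READ, uptr, sizeof(*uptr)))
                        return -EFAULT;

                __uaccess_begin_nospec();       /* stac() + barrier_nospec() */
                /* ... __get_user_asm()-style access of *uptr into *val ... */
                __uaccess_end();                /* clac() */

                return err;
        }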
+ +Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> +Suggested-by: Andi Kleen <ak@linux.intel.com> +Suggested-by: Ingo Molnar <mingo@redhat.com> +Signed-off-by: Dan Williams <dan.j.williams@intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: linux-arch@vger.kernel.org +Cc: Tom Lendacky <thomas.lendacky@amd.com> +Cc: Kees Cook <keescook@chromium.org> +Cc: kernel-hardening@lists.openwall.com +Cc: gregkh@linuxfoundation.org +Cc: Al Viro <viro@zeniv.linux.org.uk> +Cc: alan@linux.intel.com +Link: https://lkml.kernel.org/r/151727415922.33451.5796614273104346583.stgit@dwillia2-desk3.amr.corp.intel.com +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/uaccess.h | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/arch/x86/include/asm/uaccess.h ++++ b/arch/x86/include/asm/uaccess.h +@@ -123,6 +123,11 @@ extern int __get_user_bad(void); + + #define __uaccess_begin() stac() + #define __uaccess_end() clac() ++#define __uaccess_begin_nospec() \ ++({ \ ++ stac(); \ ++ barrier_nospec(); \ ++}) + + /* + * This is a type: either unsigned long, if the argument fits into +@@ -474,6 +479,10 @@ struct __large_struct { unsigned long bu + __uaccess_begin(); \ + barrier(); + ++#define uaccess_try_nospec do { \ ++ current->thread.uaccess_err = 0; \ ++ __uaccess_begin_nospec(); \ ++ + #define uaccess_catch(err) \ + __uaccess_end(); \ + (err) |= (current->thread.uaccess_err ? -EFAULT : 0); \ diff --git a/queue/x86-introduce-barrier_nospec.patch b/queue/x86-introduce-barrier_nospec.patch new file mode 100644 index 0000000..ae6aa42 --- /dev/null +++ b/queue/x86-introduce-barrier_nospec.patch @@ -0,0 +1,66 @@ +From foo@baz Thu Feb 8 03:32:24 CET 2018 +From: Dan Williams <dan.j.williams@intel.com> +Date: Mon, 29 Jan 2018 17:02:33 -0800 +Subject: x86: Introduce barrier_nospec + +From: Dan Williams <dan.j.williams@intel.com> + + +(cherry picked from commit b3d7ad85b80bbc404635dca80f5b129f6242bc7a) + +Rename the open coded form of this instruction sequence from +rdtsc_ordered() into a generic barrier primitive, barrier_nospec(). + +One of the mitigations for Spectre variant1 vulnerabilities is to fence +speculative execution after successfully validating a bounds check. I.e. +force the result of a bounds check to resolve in the instruction pipeline +to ensure speculative execution honors that result before potentially +operating on out-of-bounds data. + +No functional changes. 
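An illustrative pattern (not from this patch) of the bounds-check-then-fence usage described above, assuming barrier_nospec() from the modified <asm/barrier.h> is in scope; the function and its arguments are invented for the example:

        static int table_read_nospec(unsigned long idx, unsigned long size,
                                     const int *table, int *val)
        {
                if (idx >= size)
                        return -EINVAL;

                barrier_nospec();       /* bounds check resolves before the load */
                *val = table[idx];      /* so no speculative out-of-bounds access */
                return 0;
        }

Compared with the array_index_nospec() masking approach, this trades an MFENCE/LFENCE serialization for simplicity, which is why masking is generally preferred on hot paths.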
+ +Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> +Suggested-by: Andi Kleen <ak@linux.intel.com> +Suggested-by: Ingo Molnar <mingo@redhat.com> +Signed-off-by: Dan Williams <dan.j.williams@intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: linux-arch@vger.kernel.org +Cc: Tom Lendacky <thomas.lendacky@amd.com> +Cc: Kees Cook <keescook@chromium.org> +Cc: kernel-hardening@lists.openwall.com +Cc: gregkh@linuxfoundation.org +Cc: Al Viro <viro@zeniv.linux.org.uk> +Cc: alan@linux.intel.com +Link: https://lkml.kernel.org/r/151727415361.33451.9049453007262764675.stgit@dwillia2-desk3.amr.corp.intel.com +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/barrier.h | 4 ++++ + arch/x86/include/asm/msr.h | 3 +-- + 2 files changed, 5 insertions(+), 2 deletions(-) + +--- a/arch/x86/include/asm/barrier.h ++++ b/arch/x86/include/asm/barrier.h +@@ -47,6 +47,10 @@ static inline unsigned long array_index_ + /* Override the default implementation from linux/nospec.h. */ + #define array_index_mask_nospec array_index_mask_nospec + ++/* Prevent speculative execution past this barrier. */ ++#define barrier_nospec() alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, \ ++ "lfence", X86_FEATURE_LFENCE_RDTSC) ++ + #ifdef CONFIG_X86_PPRO_FENCE + #define dma_rmb() rmb() + #else +--- a/arch/x86/include/asm/msr.h ++++ b/arch/x86/include/asm/msr.h +@@ -188,8 +188,7 @@ static __always_inline unsigned long lon + * that some other imaginary CPU is updating continuously with a + * time stamp. + */ +- alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, +- "lfence", X86_FEATURE_LFENCE_RDTSC); ++ barrier_nospec(); + return rdtsc(); + } + diff --git a/queue/x86-kaiser-check-boottime-cmdline-params.patch b/queue/x86-kaiser-check-boottime-cmdline-params.patch new file mode 100644 index 0000000..3476e16 --- /dev/null +++ b/queue/x86-kaiser-check-boottime-cmdline-params.patch @@ -0,0 +1,123 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Borislav Petkov <bp@suse.de> +Date: Tue, 2 Jan 2018 14:19:48 +0100 +Subject: x86/kaiser: Check boottime cmdline params + +From: Borislav Petkov <bp@suse.de> + + +AMD (and possibly other vendors) are not affected by the leak +KAISER is protecting against. + +Keep the "nopti" for traditional reasons and add pti=<on|off|auto> +like upstream. + +Signed-off-by: Borislav Petkov <bp@suse.de> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + Documentation/kernel-parameters.txt | 6 +++ + arch/x86/mm/kaiser.c | 59 +++++++++++++++++++++++++----------- + 2 files changed, 47 insertions(+), 18 deletions(-) + +--- a/Documentation/kernel-parameters.txt ++++ b/Documentation/kernel-parameters.txt +@@ -3327,6 +3327,12 @@ bytes respectively. Such letter suffixes + pt. [PARIDE] + See Documentation/blockdev/paride.txt. + ++ pti= [X86_64] ++ Control KAISER user/kernel address space isolation: ++ on - enable ++ off - disable ++ auto - default setting ++ + pty.legacy_count= + [KNL] Number of legacy pty's. Overwrites compiled-in + default number. 
+--- a/arch/x86/mm/kaiser.c ++++ b/arch/x86/mm/kaiser.c +@@ -15,6 +15,7 @@ + #include <asm/pgtable.h> + #include <asm/pgalloc.h> + #include <asm/desc.h> ++#include <asm/cmdline.h> + + int kaiser_enabled __read_mostly = 1; + EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */ +@@ -263,6 +264,43 @@ static void __init kaiser_init_all_pgds( + WARN_ON(__ret); \ + } while (0) + ++void __init kaiser_check_boottime_disable(void) ++{ ++ bool enable = true; ++ char arg[5]; ++ int ret; ++ ++ ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); ++ if (ret > 0) { ++ if (!strncmp(arg, "on", 2)) ++ goto enable; ++ ++ if (!strncmp(arg, "off", 3)) ++ goto disable; ++ ++ if (!strncmp(arg, "auto", 4)) ++ goto skip; ++ } ++ ++ if (cmdline_find_option_bool(boot_command_line, "nopti")) ++ goto disable; ++ ++skip: ++ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) ++ goto disable; ++ ++enable: ++ if (enable) ++ setup_force_cpu_cap(X86_FEATURE_KAISER); ++ ++ return; ++ ++disable: ++ pr_info("Kernel/User page tables isolation: disabled\n"); ++ kaiser_enabled = 0; ++ setup_clear_cpu_cap(X86_FEATURE_KAISER); ++} ++ + /* + * If anything in here fails, we will likely die on one of the + * first kernel->user transitions and init will die. But, we +@@ -274,12 +312,10 @@ void __init kaiser_init(void) + { + int cpu; + +- if (!kaiser_enabled) { +- setup_clear_cpu_cap(X86_FEATURE_KAISER); +- return; +- } ++ kaiser_check_boottime_disable(); + +- setup_force_cpu_cap(X86_FEATURE_KAISER); ++ if (!kaiser_enabled) ++ return; + + kaiser_init_all_pgds(); + +@@ -423,16 +459,3 @@ void kaiser_flush_tlb_on_return_to_user( + X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); + } + EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); +- +-static int __init x86_nokaiser_setup(char *s) +-{ +- /* nopti doesn't accept parameters */ +- if (s) +- return -EINVAL; +- +- kaiser_enabled = 0; +- pr_info("Kernel/User page tables isolation: disabled\n"); +- +- return 0; +-} +-early_param("nopti", x86_nokaiser_setup); diff --git a/queue/x86-kaiser-move-feature-detection-up.patch b/queue/x86-kaiser-move-feature-detection-up.patch new file mode 100644 index 0000000..5d61f21 --- /dev/null +++ b/queue/x86-kaiser-move-feature-detection-up.patch @@ -0,0 +1,79 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Borislav Petkov <bp@suse.de> +Date: Mon, 25 Dec 2017 13:57:16 +0100 +Subject: x86/kaiser: Move feature detection up + +From: Borislav Petkov <bp@suse.de> + + +... before the first use of kaiser_enabled as otherwise funky +things happen: + + about to get started... 
+ (XEN) d0v0 Unhandled page fault fault/trap [#14, ec=0000] + (XEN) Pagetable walk from ffff88022a449090: + (XEN) L4[0x110] = 0000000229e0e067 0000000000001e0e + (XEN) L3[0x008] = 0000000000000000 ffffffffffffffff + (XEN) domain_crash_sync called from entry.S: fault at ffff82d08033fd08 + entry.o#create_bounce_frame+0x135/0x14d + (XEN) Domain 0 (vcpu#0) crashed on cpu#0: + (XEN) ----[ Xen-4.9.1_02-3.21 x86_64 debug=n Not tainted ]---- + (XEN) CPU: 0 + (XEN) RIP: e033:[<ffffffff81007460>] + (XEN) RFLAGS: 0000000000000286 EM: 1 CONTEXT: pv guest (d0v0) + +Signed-off-by: Borislav Petkov <bp@suse.de> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/kaiser.h | 2 ++ + arch/x86/kernel/setup.c | 7 +++++++ + arch/x86/mm/kaiser.c | 2 -- + 3 files changed, 9 insertions(+), 2 deletions(-) + +--- a/arch/x86/include/asm/kaiser.h ++++ b/arch/x86/include/asm/kaiser.h +@@ -96,8 +96,10 @@ DECLARE_PER_CPU(unsigned long, x86_cr3_p + extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; + + extern int kaiser_enabled; ++extern void __init kaiser_check_boottime_disable(void); + #else + #define kaiser_enabled 0 ++static inline void __init kaiser_check_boottime_disable(void) {} + #endif /* CONFIG_KAISER */ + + /* +--- a/arch/x86/kernel/setup.c ++++ b/arch/x86/kernel/setup.c +@@ -114,6 +114,7 @@ + #include <asm/microcode.h> + #include <asm/mmu_context.h> + #include <asm/kaslr.h> ++#include <asm/kaiser.h> + + /* + * max_low_pfn_mapped: highest direct mapped pfn under 4GB +@@ -1019,6 +1020,12 @@ void __init setup_arch(char **cmdline_p) + */ + init_hypervisor_platform(); + ++ /* ++ * This needs to happen right after XENPV is set on xen and ++ * kaiser_enabled is checked below in cleanup_highmap(). ++ */ ++ kaiser_check_boottime_disable(); ++ + x86_init.resources.probe_roms(); + + /* after parse_early_param, so could debug it */ +--- a/arch/x86/mm/kaiser.c ++++ b/arch/x86/mm/kaiser.c +@@ -310,8 +310,6 @@ void __init kaiser_init(void) + { + int cpu; + +- kaiser_check_boottime_disable(); +- + if (!kaiser_enabled) + return; + diff --git a/queue/x86-kaiser-reenable-paravirt.patch b/queue/x86-kaiser-reenable-paravirt.patch new file mode 100644 index 0000000..d081b61 --- /dev/null +++ b/queue/x86-kaiser-reenable-paravirt.patch @@ -0,0 +1,28 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Borislav Petkov <bp@suse.de> +Date: Tue, 2 Jan 2018 14:19:49 +0100 +Subject: x86/kaiser: Reenable PARAVIRT + +From: Borislav Petkov <bp@suse.de> + + +Now that the required bits have been addressed, reenable +PARAVIRT. + +Signed-off-by: Borislav Petkov <bp@suse.de> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + security/Kconfig | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/security/Kconfig ++++ b/security/Kconfig +@@ -34,7 +34,7 @@ config SECURITY + config KAISER + bool "Remove the kernel mapping in user mode" + default y +- depends on X86_64 && SMP && !PARAVIRT ++ depends on X86_64 && SMP + help + This enforces a strict kernel and user space isolation, in order + to close hardware side channels on kernel address information. 
diff --git a/queue/x86-kaiser-rename-and-simplify-x86_feature_kaiser-handling.patch b/queue/x86-kaiser-rename-and-simplify-x86_feature_kaiser-handling.patch new file mode 100644 index 0000000..804ffac --- /dev/null +++ b/queue/x86-kaiser-rename-and-simplify-x86_feature_kaiser-handling.patch @@ -0,0 +1,97 @@ +From foo@baz Wed Jan 3 20:37:21 CET 2018 +From: Borislav Petkov <bp@suse.de> +Date: Tue, 2 Jan 2018 14:19:48 +0100 +Subject: x86/kaiser: Rename and simplify X86_FEATURE_KAISER handling + +From: Borislav Petkov <bp@suse.de> + + +Concentrate it in arch/x86/mm/kaiser.c and use the upstream string "nopti". + +Signed-off-by: Borislav Petkov <bp@suse.de> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + Documentation/kernel-parameters.txt | 2 +- + arch/x86/kernel/cpu/common.c | 18 ------------------ + arch/x86/mm/kaiser.c | 20 +++++++++++++++++++- + 3 files changed, 20 insertions(+), 20 deletions(-) + +--- a/Documentation/kernel-parameters.txt ++++ b/Documentation/kernel-parameters.txt +@@ -2763,7 +2763,7 @@ bytes respectively. Such letter suffixes + + nojitter [IA-64] Disables jitter checking for ITC timers. + +- nokaiser [X86-64] Disable KAISER isolation of kernel from user. ++ nopti [X86-64] Disable KAISER isolation of kernel from user. + + no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver + +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -179,20 +179,6 @@ static int __init x86_pcid_setup(char *s + return 1; + } + __setup("nopcid", x86_pcid_setup); +- +-static int __init x86_nokaiser_setup(char *s) +-{ +- /* nokaiser doesn't accept parameters */ +- if (s) +- return -EINVAL; +-#ifdef CONFIG_KAISER +- kaiser_enabled = 0; +- setup_clear_cpu_cap(X86_FEATURE_KAISER); +- pr_info("nokaiser: KAISER feature disabled\n"); +-#endif +- return 0; +-} +-early_param("nokaiser", x86_nokaiser_setup); + #endif + + static int __init x86_noinvpcid_setup(char *s) +@@ -813,10 +799,6 @@ void get_cpu_cap(struct cpuinfo_x86 *c) + c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); + + init_scattered_cpuid_features(c); +-#ifdef CONFIG_KAISER +- if (kaiser_enabled) +- set_cpu_cap(c, X86_FEATURE_KAISER); +-#endif + } + + static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) +--- a/arch/x86/mm/kaiser.c ++++ b/arch/x86/mm/kaiser.c +@@ -274,8 +274,13 @@ void __init kaiser_init(void) + { + int cpu; + +- if (!kaiser_enabled) ++ if (!kaiser_enabled) { ++ setup_clear_cpu_cap(X86_FEATURE_KAISER); + return; ++ } ++ ++ setup_force_cpu_cap(X86_FEATURE_KAISER); ++ + kaiser_init_all_pgds(); + + for_each_possible_cpu(cpu) { +@@ -418,3 +423,16 @@ void kaiser_flush_tlb_on_return_to_user( + X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); + } + EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); ++ ++static int __init x86_nokaiser_setup(char *s) ++{ ++ /* nopti doesn't accept parameters */ ++ if (s) ++ return -EINVAL; ++ ++ kaiser_enabled = 0; ++ pr_info("Kernel/User page tables isolation: disabled\n"); ++ ++ return 0; ++} ++early_param("nopti", x86_nokaiser_setup); diff --git a/queue/x86-mm-32-move-setup_clear_cpu_cap-x86_feature_pcid-earlier.patch b/queue/x86-mm-32-move-setup_clear_cpu_cap-x86_feature_pcid-earlier.patch new file mode 100644 index 0000000..f7b9b05 --- /dev/null +++ b/queue/x86-mm-32-move-setup_clear_cpu_cap-x86_feature_pcid-earlier.patch @@ -0,0 +1,62 @@ +From b8b7abaed7a49b350f8ba659ddc264b04931d581 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Sun, 17 Sep 2017 09:03:50 -0700 +Subject: x86/mm/32: Move 
setup_clear_cpu_cap(X86_FEATURE_PCID) earlier + +From: Andy Lutomirski <luto@kernel.org> + +commit b8b7abaed7a49b350f8ba659ddc264b04931d581 upstream. + +Otherwise we might have the PCID feature bit set during cpu_init(). + +This is just for robustness. I haven't seen any actual bugs here. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Fixes: cba4671af755 ("x86/mm: Disable PCID on 32-bit kernels") +Link: http://lkml.kernel.org/r/b16dae9d6b0db5d9801ddbebbfd83384097c61f3.1505663533.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kernel/cpu/bugs.c | 8 -------- + arch/x86/kernel/cpu/common.c | 8 ++++++++ + 2 files changed, 8 insertions(+), 8 deletions(-) + +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -22,14 +22,6 @@ + + void __init check_bugs(void) + { +-#ifdef CONFIG_X86_32 +- /* +- * Regardless of whether PCID is enumerated, the SDM says +- * that it can't be enabled in 32-bit mode. +- */ +- setup_clear_cpu_cap(X86_FEATURE_PCID); +-#endif +- + identify_boot_cpu(); + + if (!IS_ENABLED(CONFIG_SMP)) { +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -890,6 +890,14 @@ static void __init early_identify_cpu(st + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + + fpu__init_system(c); ++ ++#ifdef CONFIG_X86_32 ++ /* ++ * Regardless of whether PCID is enumerated, the SDM says ++ * that it can't be enabled in 32-bit mode. ++ */ ++ setup_clear_cpu_cap(X86_FEATURE_PCID); ++#endif + } + + void __init early_cpu_init(void) diff --git a/queue/x86-mm-64-fix-reboot-interaction-with-cr4.pcide.patch b/queue/x86-mm-64-fix-reboot-interaction-with-cr4.pcide.patch new file mode 100644 index 0000000..d110708 --- /dev/null +++ b/queue/x86-mm-64-fix-reboot-interaction-with-cr4.pcide.patch @@ -0,0 +1,43 @@ +From 924c6b900cfdf376b07bccfd80e62b21914f8a5a Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Sun, 8 Oct 2017 21:53:05 -0700 +Subject: x86/mm/64: Fix reboot interaction with CR4.PCIDE + +From: Andy Lutomirski <luto@kernel.org> + +commit 924c6b900cfdf376b07bccfd80e62b21914f8a5a upstream. + +Trying to reboot via real mode fails with PCID on: long mode cannot +be exited while CR4.PCIDE is set. (No, I have no idea why, but the +SDM and actual CPUs are in agreement here.) The result is a GPF and +a hang instead of a reboot. + +I didn't catch this in testing because neither my computer nor my VM +reboots this way. I can trigger it with reboot=bios, though. 
+ +Fixes: 660da7c9228f ("x86/mm: Enable CR4.PCIDE on supported systems") +Reported-and-tested-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Borislav Petkov <bp@alien8.de> +Link: https://lkml.kernel.org/r/f1e7d965998018450a7a70c2823873686a8b21c0.1507524746.git.luto@kernel.org +Cc: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/kernel/reboot.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/arch/x86/kernel/reboot.c ++++ b/arch/x86/kernel/reboot.c +@@ -106,6 +106,10 @@ void __noreturn machine_real_restart(uns + load_cr3(initial_page_table); + #else + write_cr3(real_mode_header->trampoline_pgd); ++ ++ /* Exiting long mode will fail if CR4.PCIDE is set. */ ++ if (static_cpu_has(X86_FEATURE_PCID)) ++ cr4_clear_bits(X86_CR4_PCIDE); + #endif + + /* Jump to the identity-mapped low memory code */ diff --git a/queue/x86-mm-add-the-nopcid-boot-option-to-turn-off-pcid.patch b/queue/x86-mm-add-the-nopcid-boot-option-to-turn-off-pcid.patch new file mode 100644 index 0000000..4bbaccd --- /dev/null +++ b/queue/x86-mm-add-the-nopcid-boot-option-to-turn-off-pcid.patch @@ -0,0 +1,74 @@ +From 0790c9aad84901ca1bdc14746175549c8b5da215 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Thu, 29 Jun 2017 08:53:20 -0700 +Subject: x86/mm: Add the 'nopcid' boot option to turn off PCID + +From: Andy Lutomirski <luto@kernel.org> + +commit 0790c9aad84901ca1bdc14746175549c8b5da215 upstream. + +The parameter is only present on x86_64 systems to save a few bytes, +as PCID is always disabled on x86_32. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Reviewed-by: Nadav Amit <nadav.amit@gmail.com> +Reviewed-by: Borislav Petkov <bp@suse.de> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: Arjan van de Ven <arjan@linux.intel.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Mel Gorman <mgorman@suse.de> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: linux-mm@kvack.org +Link: http://lkml.kernel.org/r/8bbb2e65bcd249a5f18bfb8128b4689f08ac2b60.1498751203.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Cc: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + + +--- + Documentation/kernel-parameters.txt | 2 ++ + arch/x86/kernel/cpu/common.c | 18 ++++++++++++++++++ + 2 files changed, 20 insertions(+) + +--- a/Documentation/kernel-parameters.txt ++++ b/Documentation/kernel-parameters.txt +@@ -2795,6 +2795,8 @@ bytes respectively. Such letter suffixes + nopat [X86] Disable PAT (page attribute table extension of + pagetables) support. + ++ nopcid [X86-64] Disable the PCID cpu feature. ++ + norandmaps Don't use address space randomization. 
Equivalent to + echo 0 > /proc/sys/kernel/randomize_va_space + +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -163,6 +163,24 @@ static int __init x86_mpx_setup(char *s) + } + __setup("nompx", x86_mpx_setup); + ++#ifdef CONFIG_X86_64 ++static int __init x86_pcid_setup(char *s) ++{ ++ /* require an exact match without trailing characters */ ++ if (strlen(s)) ++ return 0; ++ ++ /* do not emit a message if the feature is not present */ ++ if (!boot_cpu_has(X86_FEATURE_PCID)) ++ return 1; ++ ++ setup_clear_cpu_cap(X86_FEATURE_PCID); ++ pr_info("nopcid: PCID feature disabled\n"); ++ return 1; ++} ++__setup("nopcid", x86_pcid_setup); ++#endif ++ + static int __init x86_noinvpcid_setup(char *s) + { + /* noinvpcid doesn't accept parameters */ diff --git a/queue/x86-mm-disable-pcid-on-32-bit-kernels.patch b/queue/x86-mm-disable-pcid-on-32-bit-kernels.patch new file mode 100644 index 0000000..58abb0a --- /dev/null +++ b/queue/x86-mm-disable-pcid-on-32-bit-kernels.patch @@ -0,0 +1,78 @@ +From cba4671af7550e008f7a7835f06df0763825bf3e Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Thu, 29 Jun 2017 08:53:19 -0700 +Subject: x86/mm: Disable PCID on 32-bit kernels + +From: Andy Lutomirski <luto@kernel.org> + +commit cba4671af7550e008f7a7835f06df0763825bf3e upstream. + +32-bit kernels on new hardware will see PCID in CPUID, but PCID can +only be used in 64-bit mode. Rather than making all PCID code +conditional, just disable the feature on 32-bit builds. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Reviewed-by: Nadav Amit <nadav.amit@gmail.com> +Reviewed-by: Borislav Petkov <bp@suse.de> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: Arjan van de Ven <arjan@linux.intel.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Mel Gorman <mgorman@suse.de> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: linux-mm@kvack.org +Link: http://lkml.kernel.org/r/2e391769192a4d31b808410c383c6bf0734bc6ea.1498751203.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Cc: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/include/asm/disabled-features.h | 4 +++- + arch/x86/kernel/cpu/bugs.c | 8 ++++++++ + 2 files changed, 11 insertions(+), 1 deletion(-) + +--- a/arch/x86/include/asm/disabled-features.h ++++ b/arch/x86/include/asm/disabled-features.h +@@ -21,11 +21,13 @@ + # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) + # define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) + # define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) ++# define DISABLE_PCID 0 + #else + # define DISABLE_VME 0 + # define DISABLE_K6_MTRR 0 + # define DISABLE_CYRIX_ARR 0 + # define DISABLE_CENTAUR_MCR 0 ++# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31)) + #endif /* CONFIG_X86_64 */ + + #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +@@ -43,7 +45,7 @@ + #define DISABLED_MASK1 0 + #define DISABLED_MASK2 0 + #define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) +-#define DISABLED_MASK4 0 ++#define DISABLED_MASK4 (DISABLE_PCID) + #define DISABLED_MASK5 0 + #define DISABLED_MASK6 0 + #define DISABLED_MASK7 0 +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -19,6 +19,14 @@ + + void __init check_bugs(void) + { ++#ifdef CONFIG_X86_32 ++ /* ++ * Regardless of whether PCID 
is enumerated, the SDM says ++ * that it can't be enabled in 32-bit mode. ++ */ ++ setup_clear_cpu_cap(X86_FEATURE_PCID); ++#endif ++ + identify_boot_cpu(); + #ifndef CONFIG_SMP + pr_info("CPU: "); diff --git a/queue/x86-mm-enable-cr4.pcide-on-supported-systems.patch b/queue/x86-mm-enable-cr4.pcide-on-supported-systems.patch new file mode 100644 index 0000000..8722495 --- /dev/null +++ b/queue/x86-mm-enable-cr4.pcide-on-supported-systems.patch @@ -0,0 +1,108 @@ +From 660da7c9228f685b2ebe664f9fd69aaddcc420b5 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Thu, 29 Jun 2017 08:53:21 -0700 +Subject: x86/mm: Enable CR4.PCIDE on supported systems + +From: Andy Lutomirski <luto@kernel.org> + +commit 660da7c9228f685b2ebe664f9fd69aaddcc420b5 upstream. + +We can use PCID if the CPU has PCID and PGE and we're not on Xen. + +By itself, this has no effect. A followup patch will start using PCID. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Reviewed-by: Nadav Amit <nadav.amit@gmail.com> +Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Reviewed-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: Arjan van de Ven <arjan@linux.intel.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Mel Gorman <mgorman@suse.de> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: linux-mm@kvack.org +Link: http://lkml.kernel.org/r/6327ecd907b32f79d5aa0d466f04503bbec5df88.1498751203.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Cc: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/include/asm/tlbflush.h | 8 ++++++++ + arch/x86/kernel/cpu/common.c | 22 ++++++++++++++++++++++ + arch/x86/xen/enlighten.c | 6 ++++++ + 3 files changed, 36 insertions(+) + +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -191,6 +191,14 @@ static inline void __flush_tlb_all(void) + __flush_tlb_global(); + else + __flush_tlb(); ++ ++ /* ++ * Note: if we somehow had PCID but not PGE, then this wouldn't work -- ++ * we'd end up flushing kernel translations for the current ASID but ++ * we might fail to flush kernel translations for other cached ASIDs. ++ * ++ * To avoid this issue, we force PCID off if PGE is off. ++ */ + } + + static inline void __flush_tlb_one(unsigned long addr) +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -324,6 +324,25 @@ static __always_inline void setup_smap(s + } + } + ++static void setup_pcid(struct cpuinfo_x86 *c) ++{ ++ if (cpu_has(c, X86_FEATURE_PCID)) { ++ if (cpu_has(c, X86_FEATURE_PGE)) { ++ cr4_set_bits(X86_CR4_PCIDE); ++ } else { ++ /* ++ * flush_tlb_all(), as currently implemented, won't ++ * work if PCID is on but PGE is not. Since that ++ * combination doesn't exist on real hardware, there's ++ * no reason to try to fully support it, but it's ++ * polite to avoid corrupting data if we're on ++ * an improperly configured VM. ++ */ ++ clear_cpu_cap(c, X86_FEATURE_PCID); ++ } ++ } ++} ++ + /* + * Protection Keys are not available in 32-bit mode. + */ +@@ -1082,6 +1101,9 @@ static void identify_cpu(struct cpuinfo_ + setup_smep(c); + setup_smap(c); + ++ /* Set up PCID */ ++ setup_pcid(c); ++ + /* + * The vendor-specific functions might have changed features. + * Now we do "generic changes." 
+--- a/arch/x86/xen/enlighten.c ++++ b/arch/x86/xen/enlighten.c +@@ -444,6 +444,12 @@ static void __init xen_init_cpuid_mask(v + ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */ + (1 << X86_FEATURE_ACC)); /* thermal monitoring */ + ++ /* ++ * Xen PV would need some work to support PCID: CR3 handling as well ++ * as xen_flush_tlb_others() would need updating. ++ */ ++ cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_PCID % 32)); /* disable PCID */ ++ + if (!xen_initial_domain()) + cpuid_leaf1_edx_mask &= + ~((1 << X86_FEATURE_ACPI)); /* disable ACPI */ diff --git a/queue/x86-mm-make-flush_tlb_mm_range-more-predictable.patch b/queue/x86-mm-make-flush_tlb_mm_range-more-predictable.patch new file mode 100644 index 0000000..4f7f58e --- /dev/null +++ b/queue/x86-mm-make-flush_tlb_mm_range-more-predictable.patch @@ -0,0 +1,81 @@ +From ce27374fabf553153c3f53efcaa9bfab9216bd8c Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Sat, 22 Apr 2017 00:01:21 -0700 +Subject: x86/mm: Make flush_tlb_mm_range() more predictable + +From: Andy Lutomirski <luto@kernel.org> + +commit ce27374fabf553153c3f53efcaa9bfab9216bd8c upstream. + +I'm about to rewrite the function almost completely, but first I +want to get a functional change out of the way. Currently, if +flush_tlb_mm_range() does not flush the local TLB at all, it will +never do individual page flushes on remote CPUs. This seems to be +an accident, and preserving it will be awkward. Let's change it +first so that any regressions in the rewrite will be easier to +bisect and so that the rewrite can attempt to change no visible +behavior at all. + +The fix is simple: we can simply avoid short-circuiting the +calculation of base_pages_to_flush. + +As a side effect, this also eliminates a potential corner case: if +tlb_single_page_flush_ceiling == TLB_FLUSH_ALL, flush_tlb_mm_range() +could have ended up flushing the entire address space one page at a +time. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Acked-by: Dave Hansen <dave.hansen@intel.com> +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Michal Hocko <mhocko@suse.com> +Cc: Nadav Amit <namit@vmware.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/4b29b771d9975aad7154c314534fec235618175a.1492844372.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Cc: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/mm/tlb.c | 12 +++++++----- + 1 file changed, 7 insertions(+), 5 deletions(-) + +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -307,6 +307,12 @@ void flush_tlb_mm_range(struct mm_struct + unsigned long base_pages_to_flush = TLB_FLUSH_ALL; + + preempt_disable(); ++ ++ if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) ++ base_pages_to_flush = (end - start) >> PAGE_SHIFT; ++ if (base_pages_to_flush > tlb_single_page_flush_ceiling) ++ base_pages_to_flush = TLB_FLUSH_ALL; ++ + if (current->active_mm != mm) { + /* Synchronize with switch_mm. 
*/ + smp_mb(); +@@ -323,15 +329,11 @@ void flush_tlb_mm_range(struct mm_struct + goto out; + } + +- if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) +- base_pages_to_flush = (end - start) >> PAGE_SHIFT; +- + /* + * Both branches below are implicit full barriers (MOV to CR or + * INVLPG) that synchronize with switch_mm. + */ +- if (base_pages_to_flush > tlb_single_page_flush_ceiling) { +- base_pages_to_flush = TLB_FLUSH_ALL; ++ if (base_pages_to_flush == TLB_FLUSH_ALL) { + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + local_flush_tlb(); + } else { diff --git a/queue/x86-mm-reimplement-flush_tlb_page-using-flush_tlb_mm_range.patch b/queue/x86-mm-reimplement-flush_tlb_page-using-flush_tlb_mm_range.patch new file mode 100644 index 0000000..2153f10 --- /dev/null +++ b/queue/x86-mm-reimplement-flush_tlb_page-using-flush_tlb_mm_range.patch @@ -0,0 +1,105 @@ +From d698c90a07e8c70354dad23e61434edf7de2c91c Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Mon, 22 May 2017 15:30:01 -0700 +Subject: [PATCH] x86/mm: Reimplement flush_tlb_page() using + flush_tlb_mm_range() + +commit ca6c99c0794875c6d1db6e22f246699691ab7e6b upstream. + +flush_tlb_page() was very similar to flush_tlb_mm_range() except that +it had a couple of issues: + + - It was missing an smp_mb() in the case where + current->active_mm != mm. (This is a longstanding bug reported by Nadav Amit) + + - It was missing tracepoints and vm counter updates. + +The only reason that I can see for keeping it at as a separate +function is that it could avoid a few branches that +flush_tlb_mm_range() needs to decide to flush just one page. This +hardly seems worthwhile. If we decide we want to get rid of those +branches again, a better way would be to introduce an +__flush_tlb_mm_range() helper and make both flush_tlb_page() and +flush_tlb_mm_range() use it. 
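To make that suggestion concrete, a rough skeleton of such a (hypothetical, not implemented here) helper split; every identifier except flush_tlb_page() and flush_tlb_mm_range() is invented for the sketch:

        /* common body: 'pages' is the already-decided number of single-page
         * flushes, or TLB_FLUSH_ALL */
        static void __flush_tlb_mm_range(struct mm_struct *mm,
                                         unsigned long start, unsigned long end,
                                         unsigned long pages);

        void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)
        {
                /* skips the range-size branches: always exactly one page */
                __flush_tlb_mm_range(vma->vm_mm, addr, addr + PAGE_SIZE, 1);
        }

        void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
                                unsigned long end, unsigned long vmflag)
        {
                unsigned long pages = compute_pages_to_flush(start, end, vmflag);

                __flush_tlb_mm_range(mm, start, end, pages);
        }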
+ +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Acked-by: Kees Cook <keescook@chromium.org> +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Mel Gorman <mgorman@suse.de> +Cc: Michal Hocko <mhocko@suse.com> +Cc: Nadav Amit <nadav.amit@gmail.com> +Cc: Nadav Amit <namit@vmware.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Thomas Gleixner <tglx@linutronix.de> +Cc: linux-mm@kvack.org +Link: http://lkml.kernel.org/r/3cc3847cf888d8907577569b8bac3f01992ef8f9.1495492063.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Cc: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 8e7ae9e6c59a..abcd615ea27e 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -297,11 +297,15 @@ static inline void flush_tlb_kernel_range(unsigned long start, + flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) + + extern void flush_tlb_all(void); +-extern void flush_tlb_page(struct vm_area_struct *, unsigned long); + extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, + unsigned long end, unsigned long vmflag); + extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); + ++static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) ++{ ++ flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE); ++} ++ + void native_flush_tlb_others(const struct cpumask *cpumask, + struct mm_struct *mm, + unsigned long start, unsigned long end); +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 9db9260a5e9f..38f6e37959af 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -356,33 +356,6 @@ out: + preempt_enable(); + } + +-void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) +-{ +- struct mm_struct *mm = vma->vm_mm; +- +- preempt_disable(); +- +- if (current->active_mm == mm) { +- if (current->mm) { +- /* +- * Implicit full barrier (INVLPG) that synchronizes +- * with switch_mm. +- */ +- __flush_tlb_one(start); +- } else { +- leave_mm(smp_processor_id()); +- +- /* Synchronize with switch_mm. */ +- smp_mb(); +- } +- } +- +- if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) +- flush_tlb_others(mm_cpumask(mm), mm, start, 0UL); +- +- preempt_enable(); +-} +- + static void do_flush_tlb_all(void *info) + { + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); +-- +2.15.0 + diff --git a/queue/x86-mm-remove-flush_tlb-and-flush_tlb_current_task.patch b/queue/x86-mm-remove-flush_tlb-and-flush_tlb_current_task.patch new file mode 100644 index 0000000..1f63ed5 --- /dev/null +++ b/queue/x86-mm-remove-flush_tlb-and-flush_tlb_current_task.patch @@ -0,0 +1,101 @@ +From 29961b59a51f8c6838a26a45e871a7ed6771809b Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Sat, 22 Apr 2017 00:01:20 -0700 +Subject: x86/mm: Remove flush_tlb() and flush_tlb_current_task() + +From: Andy Lutomirski <luto@kernel.org> + +commit 29961b59a51f8c6838a26a45e871a7ed6771809b upstream. + +I was trying to figure out what how flush_tlb_current_task() would +possibly work correctly if current->mm != current->active_mm, but I +realized I could spare myself the effort: it has no callers except +the unused flush_tlb() macro. 
+ +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Michal Hocko <mhocko@suse.com> +Cc: Nadav Amit <namit@vmware.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/e52d64c11690f85e9f1d69d7b48cc2269cd2e94b.1492844372.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Cc: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/include/asm/tlbflush.h | 9 --------- + arch/x86/mm/tlb.c | 17 ----------------- + 2 files changed, 26 deletions(-) + +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -205,7 +205,6 @@ static inline void __flush_tlb_one(unsig + /* + * TLB flushing: + * +- * - flush_tlb() flushes the current mm struct TLBs + * - flush_tlb_all() flushes all processes TLBs + * - flush_tlb_mm(mm) flushes the specified mm context TLB's + * - flush_tlb_page(vma, vmaddr) flushes one page +@@ -237,11 +236,6 @@ static inline void flush_tlb_all(void) + __flush_tlb_all(); + } + +-static inline void flush_tlb(void) +-{ +- __flush_tlb_up(); +-} +- + static inline void local_flush_tlb(void) + { + __flush_tlb_up(); +@@ -303,14 +297,11 @@ static inline void flush_tlb_kernel_rang + flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) + + extern void flush_tlb_all(void); +-extern void flush_tlb_current_task(void); + extern void flush_tlb_page(struct vm_area_struct *, unsigned long); + extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, + unsigned long end, unsigned long vmflag); + extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); + +-#define flush_tlb() flush_tlb_current_task() +- + void native_flush_tlb_others(const struct cpumask *cpumask, + struct mm_struct *mm, + unsigned long start, unsigned long end); +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -287,23 +287,6 @@ void native_flush_tlb_others(const struc + smp_call_function_many(cpumask, flush_tlb_func, &info, 1); + } + +-void flush_tlb_current_task(void) +-{ +- struct mm_struct *mm = current->mm; +- +- preempt_disable(); +- +- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); +- +- /* This is an implicit full barrier that synchronizes with switch_mm. */ +- local_flush_tlb(); +- +- trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); +- if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) +- flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); +- preempt_enable(); +-} +- + /* + * See Documentation/x86/tlb.txt for details. 
We choose 33 + * because it is large enough to cover the vast majority (at diff --git a/queue/x86-mm-remove-the-up-asm-tlbflush.h-code-always-use-the-formerly-smp-code.patch b/queue/x86-mm-remove-the-up-asm-tlbflush.h-code-always-use-the-formerly-smp-code.patch new file mode 100644 index 0000000..649fa38 --- /dev/null +++ b/queue/x86-mm-remove-the-up-asm-tlbflush.h-code-always-use-the-formerly-smp-code.patch @@ -0,0 +1,305 @@ +From fd56dcc62b454fbbc7d9d6822b55953e5e945976 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Sun, 28 May 2017 10:00:14 -0700 +Subject: [PATCH] x86/mm: Remove the UP asm/tlbflush.h code, always use the + (formerly) SMP code + +commit ce4a4e565f5264909a18c733b864c3f74467f69e upstream. + +The UP asm/tlbflush.h generates somewhat nicer code than the SMP version. +Aside from that, it's fallen quite a bit behind the SMP code: + + - flush_tlb_mm_range() didn't flush individual pages if the range + was small. + + - The lazy TLB code was much weaker. This usually wouldn't matter, + but, if a kernel thread flushed its lazy "active_mm" more than + once (due to reclaim or similar), it wouldn't be unlazied and + would instead pointlessly flush repeatedly. + + - Tracepoints were missing. + +Aside from that, simply having the UP code around was a maintanence +burden, since it means that any change to the TLB flush code had to +make sure not to break it. + +Simplify everything by deleting the UP code. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: Arjan van de Ven <arjan@linux.intel.com> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Mel Gorman <mgorman@suse.de> +Cc: Michal Hocko <mhocko@suse.com> +Cc: Nadav Amit <nadav.amit@gmail.com> +Cc: Nadav Amit <namit@vmware.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Thomas Gleixner <tglx@linutronix.de> +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Cc: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 951dc26b1a5e..63e83fe8987c 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -45,7 +45,7 @@ config X86 + select ARCH_USE_CMPXCHG_LOCKREF if X86_64 + select ARCH_USE_QUEUED_RWLOCKS + select ARCH_USE_QUEUED_SPINLOCKS +- select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP ++ select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + select ARCH_WANTS_DYNAMIC_TASK_STRUCT + select ARCH_WANT_FRAME_POINTERS + select ARCH_WANT_IPC_PARSE_VERSION if X86_32 +diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h +index 59405a248fc2..9b76cd331990 100644 +--- a/arch/x86/include/asm/hardirq.h ++++ b/arch/x86/include/asm/hardirq.h +@@ -22,8 +22,8 @@ typedef struct { + #ifdef CONFIG_SMP + unsigned int irq_resched_count; + unsigned int irq_call_count; +- unsigned int irq_tlb_count; + #endif ++ unsigned int irq_tlb_count; + #ifdef CONFIG_X86_THERMAL_VECTOR + unsigned int irq_thermal_count; + #endif +diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h +index 1ea0baef1175..6deccb456060 100644 +--- a/arch/x86/include/asm/mmu.h ++++ b/arch/x86/include/asm/mmu.h +@@ -25,12 +25,6 @@ typedef struct { + atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */ + } mm_context_t; + +-#ifdef CONFIG_SMP + void leave_mm(int cpu); +-#else +-static inline void leave_mm(int cpu) +-{ +-} +-#endif + + 
#endif /* _ASM_X86_MMU_H */ +diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h +index d8abfcf524d1..d15f740111c9 100644 +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -98,10 +98,8 @@ static inline void load_mm_ldt(struct mm_struct *mm) + + static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) + { +-#ifdef CONFIG_SMP + if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) + this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); +-#endif + } + + static inline int init_new_context(struct task_struct *tsk, +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index abcd615ea27e..12dedd6c9e42 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -7,6 +7,7 @@ + #include <asm/processor.h> + #include <asm/cpufeature.h> + #include <asm/special_insns.h> ++#include <asm/smp.h> + + static inline void __invpcid(unsigned long pcid, unsigned long addr, + unsigned long type) +@@ -65,10 +66,8 @@ static inline void invpcid_flush_all_nonglobals(void) + #endif + + struct tlb_state { +-#ifdef CONFIG_SMP + struct mm_struct *active_mm; + int state; +-#endif + + /* + * Access to this CR4 shadow and to H/W CR4 is protected by +@@ -216,79 +215,6 @@ static inline void __flush_tlb_one(unsigned long addr) + * and page-granular flushes are available only on i486 and up. + */ + +-#ifndef CONFIG_SMP +- +-/* "_up" is for UniProcessor. +- * +- * This is a helper for other header functions. *Not* intended to be called +- * directly. All global TLB flushes need to either call this, or to bump the +- * vm statistics themselves. +- */ +-static inline void __flush_tlb_up(void) +-{ +- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); +- __flush_tlb(); +-} +- +-static inline void flush_tlb_all(void) +-{ +- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); +- __flush_tlb_all(); +-} +- +-static inline void local_flush_tlb(void) +-{ +- __flush_tlb_up(); +-} +- +-static inline void flush_tlb_mm(struct mm_struct *mm) +-{ +- if (mm == current->active_mm) +- __flush_tlb_up(); +-} +- +-static inline void flush_tlb_page(struct vm_area_struct *vma, +- unsigned long addr) +-{ +- if (vma->vm_mm == current->active_mm) +- __flush_tlb_one(addr); +-} +- +-static inline void flush_tlb_range(struct vm_area_struct *vma, +- unsigned long start, unsigned long end) +-{ +- if (vma->vm_mm == current->active_mm) +- __flush_tlb_up(); +-} +- +-static inline void flush_tlb_mm_range(struct mm_struct *mm, +- unsigned long start, unsigned long end, unsigned long vmflag) +-{ +- if (mm == current->active_mm) +- __flush_tlb_up(); +-} +- +-static inline void native_flush_tlb_others(const struct cpumask *cpumask, +- struct mm_struct *mm, +- unsigned long start, +- unsigned long end) +-{ +-} +- +-static inline void reset_lazy_tlbstate(void) +-{ +-} +- +-static inline void flush_tlb_kernel_range(unsigned long start, +- unsigned long end) +-{ +- flush_tlb_all(); +-} +- +-#else /* SMP */ +- +-#include <asm/smp.h> +- + #define local_flush_tlb() __flush_tlb() + + #define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL) +@@ -319,8 +245,6 @@ static inline void reset_lazy_tlbstate(void) + this_cpu_write(cpu_tlbstate.active_mm, &init_mm); + } + +-#endif /* SMP */ +- + #ifndef CONFIG_PARAVIRT + #define flush_tlb_others(mask, mm, start, end) \ + native_flush_tlb_others(mask, mm, start, end) +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c +index 8a427715f541..0381e949de17 100644 +--- a/arch/x86/mm/init.c ++++ 
b/arch/x86/mm/init.c +@@ -762,10 +762,8 @@ void __init zone_sizes_init(void) + } + + DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { +-#ifdef CONFIG_SMP + .active_mm = &init_mm, + .state = 0, +-#endif + .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ + }; + EXPORT_SYMBOL_GPL(cpu_tlbstate); +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 38f6e37959af..7882e4e3c113 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -15,7 +15,7 @@ + #include <linux/debugfs.h> + + /* +- * Smarter SMP flushing macros. ++ * TLB flushing, formerly SMP-only + * c/o Linus Torvalds. + * + * These mean you can really definitely utterly forget about +@@ -28,8 +28,6 @@ + * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi + */ + +-#ifdef CONFIG_SMP +- + struct flush_tlb_info { + struct mm_struct *flush_mm; + unsigned long flush_start; +@@ -59,8 +57,6 @@ void leave_mm(int cpu) + } + EXPORT_SYMBOL_GPL(leave_mm); + +-#endif /* CONFIG_SMP */ +- + void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) + { +@@ -91,10 +87,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + set_pgd(pgd, init_mm.pgd[stack_pgd_index]); + } + +-#ifdef CONFIG_SMP + this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + this_cpu_write(cpu_tlbstate.active_mm, next); +-#endif + + cpumask_set_cpu(cpu, mm_cpumask(next)); + +@@ -152,9 +146,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + if (unlikely(prev->context.ldt != next->context.ldt)) + load_mm_ldt(next); + #endif +- } +-#ifdef CONFIG_SMP +- else { ++ } else { + this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); + +@@ -181,11 +173,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + load_mm_ldt(next); + } + } +-#endif + } + +-#ifdef CONFIG_SMP +- + /* + * The flush IPI assumes that a thread switch happens in this order: + * [cpu0: the cpu that switches] +@@ -440,5 +429,3 @@ static int __init create_tlb_single_page_flush_ceiling(void) + return 0; + } + late_initcall(create_tlb_single_page_flush_ceiling); +- +-#endif /* CONFIG_SMP */ +-- +2.15.0 + diff --git a/queue/x86-msr-add-definitions-for-new-speculation-control-msrs.patch b/queue/x86-msr-add-definitions-for-new-speculation-control-msrs.patch new file mode 100644 index 0000000..884d448 --- /dev/null +++ b/queue/x86-msr-add-definitions-for-new-speculation-control-msrs.patch @@ -0,0 +1,63 @@ +From foo@baz Wed Feb 7 19:38:23 CST 2018 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Thu, 25 Jan 2018 16:14:12 +0000 +Subject: x86/msr: Add definitions for new speculation control MSRs + +From: David Woodhouse <dwmw@amazon.co.uk> + +(cherry picked from commit 1e340c60d0dd3ae07b5bedc16a0469c14b9f3410) + +Add MSR and bit definitions for SPEC_CTRL, PRED_CMD and ARCH_CAPABILITIES. 
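[Editorial aside, a rough usage sketch and not code added by this patch: the new constants are consumed with the standard rdmsrl()/wrmsrl() helpers from <asm/msr.h>. The function name below is made up for illustration, and real code must check the relevant CPUID/feature bits before touching either MSR.

	/* Hypothetical usage sketch only -- not part of this patch. */
	#include <linux/printk.h>
	#include <asm/msr.h>

	static void speculation_msr_example(void)
	{
		u64 ia32_cap;

		/* RDCL_NO and IBRS_ALL are reported via ARCH_CAPABILITIES. */
		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
		if (ia32_cap & ARCH_CAP_RDCL_NO)
			pr_info("CPU reports it is not susceptible to Meltdown\n");

		/* An indirect branch prediction barrier is a write to PRED_CMD. */
		wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
	}
]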
+ +See Intel's 336996-Speculative-Execution-Side-Channel-Mitigations.pdf + +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: ak@linux.intel.com +Cc: ashok.raj@intel.com +Cc: dave.hansen@intel.com +Cc: karahmed@amazon.de +Cc: arjan@linux.intel.com +Cc: torvalds@linux-foundation.org +Cc: peterz@infradead.org +Cc: bp@alien8.de +Cc: pbonzini@redhat.com +Cc: tim.c.chen@linux.intel.com +Cc: gregkh@linux-foundation.org +Link: https://lkml.kernel.org/r/1516896855-7642-5-git-send-email-dwmw@amazon.co.uk +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/msr-index.h | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -37,6 +37,13 @@ + #define EFER_FFXSR (1<<_EFER_FFXSR) + + /* Intel MSRs. Some also available on other CPUs */ ++#define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */ ++#define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */ ++#define SPEC_CTRL_STIBP (1 << 1) /* Single Thread Indirect Branch Predictors */ ++ ++#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ ++#define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */ ++ + #define MSR_IA32_PERFCTR0 0x000000c1 + #define MSR_IA32_PERFCTR1 0x000000c2 + #define MSR_FSB_FREQ 0x000000cd +@@ -50,6 +57,11 @@ + #define SNB_C3_AUTO_UNDEMOTE (1UL << 28) + + #define MSR_MTRRcap 0x000000fe ++ ++#define MSR_IA32_ARCH_CAPABILITIES 0x0000010a ++#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */ ++#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */ ++ + #define MSR_IA32_BBL_CR_CTL 0x00000119 + #define MSR_IA32_BBL_CR_CTL3 0x0000011e + diff --git a/queue/x86-nospec-fix-header-guards-names.patch b/queue/x86-nospec-fix-header-guards-names.patch new file mode 100644 index 0000000..a9bcc71 --- /dev/null +++ b/queue/x86-nospec-fix-header-guards-names.patch @@ -0,0 +1,53 @@ +From foo@baz Thu Feb 8 03:30:27 CET 2018 +From: Borislav Petkov <bp@suse.de> +Date: Fri, 26 Jan 2018 13:11:37 +0100 +Subject: x86/nospec: Fix header guards names + +From: Borislav Petkov <bp@suse.de> + +(cherry picked from commit 7a32fc51ca938e67974cbb9db31e1a43f98345a9) + +... to adhere to the _ASM_X86_ naming scheme. + +No functional change. 
+ +Signed-off-by: Borislav Petkov <bp@suse.de> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: riel@redhat.com +Cc: ak@linux.intel.com +Cc: peterz@infradead.org +Cc: David Woodhouse <dwmw2@infradead.org> +Cc: jikos@kernel.org +Cc: luto@amacapital.net +Cc: dave.hansen@intel.com +Cc: torvalds@linux-foundation.org +Cc: keescook@google.com +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: tim.c.chen@linux.intel.com +Cc: gregkh@linux-foundation.org +Cc: pjt@google.com +Link: https://lkml.kernel.org/r/20180126121139.31959-3-bp@alien8.de +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/nospec-branch.h | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/arch/x86/include/asm/nospec-branch.h ++++ b/arch/x86/include/asm/nospec-branch.h +@@ -1,7 +1,7 @@ + /* SPDX-License-Identifier: GPL-2.0 */ + +-#ifndef __NOSPEC_BRANCH_H__ +-#define __NOSPEC_BRANCH_H__ ++#ifndef _ASM_X86_NOSPEC_BRANCH_H_ ++#define _ASM_X86_NOSPEC_BRANCH_H_ + + #include <asm/alternative.h> + #include <asm/alternative-asm.h> +@@ -232,4 +232,4 @@ static inline void indirect_branch_predi + } + + #endif /* __ASSEMBLY__ */ +-#endif /* __NOSPEC_BRANCH_H__ */ ++#endif /* _ASM_X86_NOSPEC_BRANCH_H_ */ diff --git a/queue/x86-paravirt-dont-patch-flush_tlb_single.patch b/queue/x86-paravirt-dont-patch-flush_tlb_single.patch new file mode 100644 index 0000000..4783222 --- /dev/null +++ b/queue/x86-paravirt-dont-patch-flush_tlb_single.patch @@ -0,0 +1,68 @@ +From 7f6999b379b7f1c378345e436be46df760668145 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 4 Dec 2017 15:07:30 +0100 +Subject: [PATCH] x86/paravirt: Dont patch flush_tlb_single + +commit a035795499ca1c2bd1928808d1a156eda1420383 upstream + +native_flush_tlb_single() will be changed with the upcoming +PAGE_TABLE_ISOLATION feature. This requires to have more code in +there than INVLPG. + +Remove the paravirt patching for it. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com> +Reviewed-by: Juergen Gross <jgross@suse.com> +Acked-by: Peter Zijlstra <peterz@infradead.org> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Cc: michael.schwarz@iaik.tugraz.at +Cc: moritz.lipp@iaik.tugraz.at +Cc: richard.fellner@student.tugraz.at +Link: https://lkml.kernel.org/r/20171204150606.828111617@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Acked-by: Borislav Petkov <bp@suse.de> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c +index e70087a04cc8..68f8273b152e 100644 +--- a/arch/x86/kernel/paravirt_patch_64.c ++++ b/arch/x86/kernel/paravirt_patch_64.c +@@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); + DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); + DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); + DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); +-DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); + DEF_NATIVE(pv_cpu_ops, clts, "clts"); + DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); + +@@ -59,7 +58,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, + PATCH_SITE(pv_mmu_ops, read_cr3); + PATCH_SITE(pv_mmu_ops, write_cr3); + PATCH_SITE(pv_cpu_ops, clts); +- PATCH_SITE(pv_mmu_ops, flush_tlb_single); + PATCH_SITE(pv_cpu_ops, wbinvd); + #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) + case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): +-- +2.15.0 + diff --git a/queue/x86-paravirt-remove-noreplace-paravirt-cmdline-option.patch b/queue/x86-paravirt-remove-noreplace-paravirt-cmdline-option.patch new file mode 100644 index 0000000..a670a21 --- /dev/null +++ b/queue/x86-paravirt-remove-noreplace-paravirt-cmdline-option.patch @@ -0,0 +1,91 @@ +From foo@baz Thu Feb 8 03:32:24 CET 2018 +From: Josh Poimboeuf <jpoimboe@redhat.com> +Date: Tue, 30 Jan 2018 22:13:33 -0600 +Subject: x86/paravirt: Remove 'noreplace-paravirt' cmdline option + +From: Josh Poimboeuf <jpoimboe@redhat.com> + + +(cherry picked from commit 12c69f1e94c89d40696e83804dd2f0965b5250cd) + +The 'noreplace-paravirt' option disables paravirt patching, leaving the +original pv indirect calls in place. + +That's highly incompatible with retpolines, unless we want to uglify +paravirt even further and convert the paravirt calls to retpolines. + +As far as I can tell, the option doesn't seem to be useful for much +other than introducing surprising corner cases and making the kernel +vulnerable to Spectre v2. It was probably a debug option from the early +paravirt days. So just remove it. 
+ +Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Juergen Gross <jgross@suse.com> +Cc: Andrea Arcangeli <aarcange@redhat.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Ashok Raj <ashok.raj@intel.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: Jun Nakajima <jun.nakajima@intel.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Rusty Russell <rusty@rustcorp.com.au> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Asit Mallick <asit.k.mallick@intel.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Jason Baron <jbaron@akamai.com> +Cc: Paolo Bonzini <pbonzini@redhat.com> +Cc: Alok Kataria <akataria@vmware.com> +Cc: Arjan Van De Ven <arjan.van.de.ven@intel.com> +Cc: David Woodhouse <dwmw2@infradead.org> +Cc: Dan Williams <dan.j.williams@intel.com> +Link: https://lkml.kernel.org/r/20180131041333.2x6blhxirc2kclrq@treble +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + Documentation/kernel-parameters.txt | 2 -- + arch/x86/kernel/alternative.c | 14 -------------- + 2 files changed, 16 deletions(-) + +--- a/Documentation/kernel-parameters.txt ++++ b/Documentation/kernel-parameters.txt +@@ -2805,8 +2805,6 @@ bytes respectively. Such letter suffixes + norandmaps Don't use address space randomization. Equivalent to + echo 0 > /proc/sys/kernel/randomize_va_space + +- noreplace-paravirt [X86,IA-64,PV_OPS] Don't patch paravirt_ops +- + noreplace-smp [X86-32,SMP] Don't replace SMP instructions + with UP alternatives + +--- a/arch/x86/kernel/alternative.c ++++ b/arch/x86/kernel/alternative.c +@@ -46,17 +46,6 @@ static int __init setup_noreplace_smp(ch + } + __setup("noreplace-smp", setup_noreplace_smp); + +-#ifdef CONFIG_PARAVIRT +-static int __initdata_or_module noreplace_paravirt = 0; +- +-static int __init setup_noreplace_paravirt(char *str) +-{ +- noreplace_paravirt = 1; +- return 1; +-} +-__setup("noreplace-paravirt", setup_noreplace_paravirt); +-#endif +- + #define DPRINTK(fmt, args...) \ + do { \ + if (debug_alternative) \ +@@ -588,9 +577,6 @@ void __init_or_module apply_paravirt(str + struct paravirt_patch_site *p; + char insnbuf[MAX_PATCH_LEN]; + +- if (noreplace_paravirt) +- return; +- + for (p = start; p < end; p++) { + unsigned int used; + diff --git a/queue/x86-pti-do-not-enable-pti-on-cpus-which-are-not-vulnerable-to-meltdown.patch b/queue/x86-pti-do-not-enable-pti-on-cpus-which-are-not-vulnerable-to-meltdown.patch new file mode 100644 index 0000000..ffbcbc7 --- /dev/null +++ b/queue/x86-pti-do-not-enable-pti-on-cpus-which-are-not-vulnerable-to-meltdown.patch @@ -0,0 +1,112 @@ +From foo@baz Wed Feb 7 19:38:23 CST 2018 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Thu, 25 Jan 2018 16:14:13 +0000 +Subject: x86/pti: Do not enable PTI on CPUs which are not vulnerable to Meltdown + +From: David Woodhouse <dwmw@amazon.co.uk> + +(cherry picked from commit fec9434a12f38d3aeafeb75711b71d8a1fdef621) + +Also, for CPUs which don't speculate at all, don't report that they're +vulnerable to the Spectre variants either. + +Leave the cpu_no_meltdown[] match table with just X86_VENDOR_AMD in it +for now, even though that could be done with a simple comparison, on the +assumption that we'll have more to add. + +Based on suggestions from Dave Hansen and Alan Cox. 
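[Editorial aside on the trade-off described above, illustrative only and assuming the usual <asm/processor.h> definitions: with a single entry, the cpu_no_meltdown[] lookup is equivalent to a plain vendor comparison; the match table is kept purely so more entries can be added later without restructuring the check.

	/* Illustrative equivalent of the single-entry cpu_no_meltdown[] lookup. */
	static bool cpu_matches_no_meltdown(struct cpuinfo_x86 *c)
	{
		return c->x86_vendor == X86_VENDOR_AMD;
	}
]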
+ +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Reviewed-by: Borislav Petkov <bp@suse.de> +Acked-by: Dave Hansen <dave.hansen@intel.com> +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: ak@linux.intel.com +Cc: ashok.raj@intel.com +Cc: karahmed@amazon.de +Cc: arjan@linux.intel.com +Cc: torvalds@linux-foundation.org +Cc: peterz@infradead.org +Cc: bp@alien8.de +Cc: pbonzini@redhat.com +Cc: tim.c.chen@linux.intel.com +Cc: gregkh@linux-foundation.org +Link: https://lkml.kernel.org/r/1516896855-7642-6-git-send-email-dwmw@amazon.co.uk +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kernel/cpu/common.c | 48 ++++++++++++++++++++++++++++++++++++++----- + 1 file changed, 43 insertions(+), 5 deletions(-) + +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -44,6 +44,8 @@ + #include <asm/pat.h> + #include <asm/microcode.h> + #include <asm/microcode_intel.h> ++#include <asm/intel-family.h> ++#include <asm/cpu_device_id.h> + + #ifdef CONFIG_X86_LOCAL_APIC + #include <asm/uv/uv.h> +@@ -838,6 +840,41 @@ static void identify_cpu_without_cpuid(s + #endif + } + ++static const __initdata struct x86_cpu_id cpu_no_speculation[] = { ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY }, ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY }, ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY }, ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY }, ++ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY }, ++ { X86_VENDOR_CENTAUR, 5 }, ++ { X86_VENDOR_INTEL, 5 }, ++ { X86_VENDOR_NSC, 5 }, ++ { X86_VENDOR_ANY, 4 }, ++ {} ++}; ++ ++static const __initdata struct x86_cpu_id cpu_no_meltdown[] = { ++ { X86_VENDOR_AMD }, ++ {} ++}; ++ ++static bool __init cpu_vulnerable_to_meltdown(struct cpuinfo_x86 *c) ++{ ++ u64 ia32_cap = 0; ++ ++ if (x86_match_cpu(cpu_no_meltdown)) ++ return false; ++ ++ if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) ++ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); ++ ++ /* Rogue Data Cache Load? No! */ ++ if (ia32_cap & ARCH_CAP_RDCL_NO) ++ return false; ++ ++ return true; ++} ++ + /* + * Do minimum CPU detection early. + * Fields really needed: vendor, cpuid_level, family, model, mask, +@@ -884,11 +921,12 @@ static void __init early_identify_cpu(st + + setup_force_cpu_cap(X86_FEATURE_ALWAYS); + +- if (c->x86_vendor != X86_VENDOR_AMD) +- setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); +- +- setup_force_cpu_bug(X86_BUG_SPECTRE_V1); +- setup_force_cpu_bug(X86_BUG_SPECTRE_V2); ++ if (!x86_match_cpu(cpu_no_speculation)) { ++ if (cpu_vulnerable_to_meltdown(c)) ++ setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); ++ setup_force_cpu_bug(X86_BUG_SPECTRE_V1); ++ setup_force_cpu_bug(X86_BUG_SPECTRE_V2); ++ } + + fpu__init_system(c); + diff --git a/queue/x86-pti-document-fix-wrong-index.patch b/queue/x86-pti-document-fix-wrong-index.patch new file mode 100644 index 0000000..cb7b93a --- /dev/null +++ b/queue/x86-pti-document-fix-wrong-index.patch @@ -0,0 +1,32 @@ +From 98f0fceec7f84d80bc053e49e596088573086421 Mon Sep 17 00:00:00 2001 +From: "zhenwei.pi" <zhenwei.pi@youruncloud.com> +Date: Thu, 18 Jan 2018 09:04:52 +0800 +Subject: x86/pti: Document fix wrong index + +From: zhenwei.pi <zhenwei.pi@youruncloud.com> + +commit 98f0fceec7f84d80bc053e49e596088573086421 upstream. + +In section <2. Runtime Cost>, fix wrong index. 
+ +Signed-off-by: zhenwei.pi <zhenwei.pi@youruncloud.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: dave.hansen@linux.intel.com +Link: https://lkml.kernel.org/r/1516237492-27739-1-git-send-email-zhenwei.pi@youruncloud.com +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + Documentation/x86/pti.txt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/Documentation/x86/pti.txt ++++ b/Documentation/x86/pti.txt +@@ -78,7 +78,7 @@ this protection comes at a cost: + non-PTI SYSCALL entry code, so requires mapping fewer + things into the userspace page tables. The downside is + that stacks must be switched at entry time. +- d. Global pages are disabled for all kernel structures not ++ c. Global pages are disabled for all kernel structures not + mapped into both kernel and userspace page tables. This + feature of the MMU allows different processes to share TLB + entries mapping the kernel. Losing the feature means more diff --git a/queue/x86-pti-efi-broken-conversion-from-efi-to-kernel-page-table.patch b/queue/x86-pti-efi-broken-conversion-from-efi-to-kernel-page-table.patch new file mode 100644 index 0000000..b38321d --- /dev/null +++ b/queue/x86-pti-efi-broken-conversion-from-efi-to-kernel-page-table.patch @@ -0,0 +1,76 @@ +From pasha.tatashin@oracle.com Mon Jan 15 18:48:49 2018 +From: Pavel Tatashin <pasha.tatashin@oracle.com> +Date: Mon, 15 Jan 2018 11:44:14 -0500 +Subject: x86/pti/efi: broken conversion from efi to kernel page table +To: steven.sistare@oracle.com, linux-kernel@vger.kernel.org, tglx@linutronix.de, mingo@redhat.com, hpa@zytor.com, x86@kernel.org, gregkh@linuxfoundation.org, jkosina@suse.cz, hughd@google.com, dave.hansen@linux.intel.com, luto@kernel.org, stable@vger.kernel.org +Message-ID: <20180115164414.19778-1-pasha.tatashin@oracle.com> + +From: Pavel Tatashin <pasha.tatashin@oracle.com> + +The page table order must be increased for EFI table in order to avoid a +bug where NMI tries to change the page table to kernel page table, while +efi page table is active. + +For more disccussion about this bug, see this thread: +http://lkml.iu.edu/hypermail/linux/kernel/1801.1/00951.html + +Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com> +Reviewed-by: Steven Sistare <steven.sistare@oracle.com> +Acked-by: Jiri Kosina <jkosina@suse.cz> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/pgalloc.h | 11 +++++++++++ + arch/x86/mm/pgtable.c | 7 ------- + arch/x86/platform/efi/efi_64.c | 2 +- + 3 files changed, 12 insertions(+), 8 deletions(-) + +Changelog: + v1 - v2: Fixed compiling warning + +--- a/arch/x86/include/asm/pgalloc.h ++++ b/arch/x86/include/asm/pgalloc.h +@@ -27,6 +27,17 @@ static inline void paravirt_release_pud( + */ + extern gfp_t __userpte_alloc_gfp; + ++#ifdef CONFIG_PAGE_TABLE_ISOLATION ++/* ++ * Instead of one PGD, we acquire two PGDs. Being order-1, it is ++ * both 8k in size and 8k-aligned. That lets us just flip bit 12 ++ * in a pointer to swap between the two 4k halves. ++ */ ++#define PGD_ALLOCATION_ORDER 1 ++#else ++#define PGD_ALLOCATION_ORDER 0 ++#endif ++ + /* + * Allocate and free page tables. + */ +--- a/arch/x86/mm/pgtable.c ++++ b/arch/x86/mm/pgtable.c +@@ -345,13 +345,6 @@ static inline void _pgd_free(pgd_t *pgd) + } + #else + +-/* +- * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is +- * both 8k in size and 8k-aligned. That lets us just flip bit 12 +- * in a pointer to swap between the two 4k halves. 
+- */ +-#define PGD_ALLOCATION_ORDER kaiser_enabled +- + static inline pgd_t *_pgd_alloc(void) + { + return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); +--- a/arch/x86/platform/efi/efi_64.c ++++ b/arch/x86/platform/efi/efi_64.c +@@ -142,7 +142,7 @@ int __init efi_alloc_page_tables(void) + return 0; + + gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO; +- efi_pgd = (pgd_t *)__get_free_page(gfp_mask); ++ efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER); + if (!efi_pgd) + return -ENOMEM; + diff --git a/queue/x86-pti-mark-constant-arrays-as-__initconst.patch b/queue/x86-pti-mark-constant-arrays-as-__initconst.patch new file mode 100644 index 0000000..d9843e5 --- /dev/null +++ b/queue/x86-pti-mark-constant-arrays-as-__initconst.patch @@ -0,0 +1,53 @@ +From foo@baz Thu Feb 8 03:33:09 CET 2018 +From: Arnd Bergmann <arnd@arndb.de> +Date: Fri, 2 Feb 2018 22:39:23 +0100 +Subject: x86/pti: Mark constant arrays as __initconst + +From: Arnd Bergmann <arnd@arndb.de> + + +(cherry picked from commit 4bf5d56d429cbc96c23d809a08f63cd29e1a702e) + +I'm seeing build failures from the two newly introduced arrays that +are marked 'const' and '__initdata', which are mutually exclusive: + +arch/x86/kernel/cpu/common.c:882:43: error: 'cpu_no_speculation' causes a section type conflict with 'e820_table_firmware_init' +arch/x86/kernel/cpu/common.c:895:43: error: 'cpu_no_meltdown' causes a section type conflict with 'e820_table_firmware_init' + +The correct annotation is __initconst. + +Fixes: fec9434a12f3 ("x86/pti: Do not enable PTI on CPUs which are not vulnerable to Meltdown") +Signed-off-by: Arnd Bergmann <arnd@arndb.de> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Ricardo Neri <ricardo.neri-calderon@linux.intel.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bp@suse.de> +Cc: Thomas Garnier <thgarnie@google.com> +Cc: David Woodhouse <dwmw@amazon.co.uk> +Link: https://lkml.kernel.org/r/20180202213959.611210-1-arnd@arndb.de +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kernel/cpu/common.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -861,7 +861,7 @@ static void identify_cpu_without_cpuid(s + #endif + } + +-static const __initdata struct x86_cpu_id cpu_no_speculation[] = { ++static const __initconst struct x86_cpu_id cpu_no_speculation[] = { + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY }, +@@ -874,7 +874,7 @@ static const __initdata struct x86_cpu_i + {} + }; + +-static const __initdata struct x86_cpu_id cpu_no_meltdown[] = { ++static const __initconst struct x86_cpu_id cpu_no_meltdown[] = { + { X86_VENDOR_AMD }, + {} + }; diff --git a/queue/x86-pti-rename-bug_cpu_insecure-to-bug_cpu_meltdown.patch b/queue/x86-pti-rename-bug_cpu_insecure-to-bug_cpu_meltdown.patch new file mode 100644 index 0000000..5dce2f6 --- /dev/null +++ b/queue/x86-pti-rename-bug_cpu_insecure-to-bug_cpu_meltdown.patch @@ -0,0 +1,56 @@ +From de791821c295cc61419a06fe5562288417d1bc58 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Fri, 5 Jan 2018 15:27:34 +0100 +Subject: x86/pti: Rename BUG_CPU_INSECURE to BUG_CPU_MELTDOWN + +From: Thomas Gleixner <tglx@linutronix.de> + +commit 
de791821c295cc61419a06fe5562288417d1bc58 upstream. + +Use the name associated with the particular attack which needs page table +isolation for mitigation. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Acked-by: David Woodhouse <dwmw@amazon.co.uk> +Cc: Alan Cox <gnomes@lxorguk.ukuu.org.uk> +Cc: Jiri Koshina <jikos@kernel.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Andi Lutomirski <luto@amacapital.net> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Paul Turner <pjt@google.com> +Cc: Tom Lendacky <thomas.lendacky@amd.com> +Cc: Greg KH <gregkh@linux-foundation.org> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Kees Cook <keescook@google.com> +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801051525300.1724@nanos +Signed-off-by: Razvan Ghitulete <rga@amazon.de> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/cpufeatures.h | 2 +- + arch/x86/kernel/cpu/common.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -316,6 +316,6 @@ + #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ + #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ + #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ +-#define X86_BUG_CPU_INSECURE X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */ ++#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ + + #endif /* _ASM_X86_CPUFEATURES_H */ +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -884,7 +884,7 @@ static void __init early_identify_cpu(st + setup_force_cpu_cap(X86_FEATURE_ALWAYS); + + /* Assume for now that ALL x86 CPUs are insecure */ +- setup_force_cpu_bug(X86_BUG_CPU_INSECURE); ++ setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + + fpu__init_system(c); + } diff --git a/queue/x86-retpoline-add-initial-retpoline-support.patch b/queue/x86-retpoline-add-initial-retpoline-support.patch new file mode 100644 index 0000000..7ea928b --- /dev/null +++ b/queue/x86-retpoline-add-initial-retpoline-support.patch @@ -0,0 +1,359 @@ +From 76b043848fd22dbf7f8bf3a1452f8c70d557b860 Mon Sep 17 00:00:00 2001 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Thu, 11 Jan 2018 21:46:25 +0000 +Subject: x86/retpoline: Add initial retpoline support + +From: David Woodhouse <dwmw@amazon.co.uk> + +commit 76b043848fd22dbf7f8bf3a1452f8c70d557b860 upstream. + +Enable the use of -mindirect-branch=thunk-extern in newer GCC, and provide +the corresponding thunks. Provide assembler macros for invoking the thunks +in the same way that GCC does, from native and inline assembler. + +This adds X86_FEATURE_RETPOLINE and sets it by default on all CPUs. In +some circumstances, IBRS microcode features may be used instead, and the +retpoline can be disabled. + +On AMD CPUs if lfence is serialising, the retpoline can be dramatically +simplified to a simple "lfence; jmp *\reg". A future patch, after it has +been verified that lfence really is serialising in all circumstances, can +enable this by setting the X86_FEATURE_RETPOLINE_AMD feature bit in addition +to X86_FEATURE_RETPOLINE. 
+ +Do not align the retpoline in the altinstr section, because there is no +guarantee that it stays aligned when it's copied over the oldinstr during +alternative patching. + +[ Andi Kleen: Rename the macros, add CONFIG_RETPOLINE option, export thunks] +[ tglx: Put actual function CALL/JMP in front of the macros, convert to + symbolic labels ] +[ dwmw2: Convert back to numeric labels, merge objtool fixes ] + +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Acked-by: Arjan van de Ven <arjan@linux.intel.com> +Acked-by: Ingo Molnar <mingo@kernel.org> +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: Rik van Riel <riel@redhat.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: thomas.lendacky@amd.com +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Jiri Kosina <jikos@kernel.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Kees Cook <keescook@google.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org> +Cc: Paul Turner <pjt@google.com> +Link: https://lkml.kernel.org/r/1515707194-20531-4-git-send-email-dwmw@amazon.co.uk +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/Kconfig | 13 +++ + arch/x86/Makefile | 10 ++ + arch/x86/include/asm/asm-prototypes.h | 25 ++++++ + arch/x86/include/asm/cpufeatures.h | 3 + arch/x86/include/asm/nospec-branch.h | 128 ++++++++++++++++++++++++++++++++++ + arch/x86/kernel/cpu/common.c | 4 + + arch/x86/lib/Makefile | 1 + arch/x86/lib/retpoline.S | 48 ++++++++++++ + 8 files changed, 232 insertions(+) + create mode 100644 arch/x86/include/asm/nospec-branch.h + create mode 100644 arch/x86/lib/retpoline.S + +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -408,6 +408,19 @@ config GOLDFISH + def_bool y + depends on X86_GOLDFISH + ++config RETPOLINE ++ bool "Avoid speculative indirect branches in kernel" ++ default y ++ ---help--- ++ Compile kernel with the retpoline compiler options to guard against ++ kernel-to-user data leaks by avoiding speculative indirect ++ branches. Requires a compiler with -mindirect-branch=thunk-extern ++ support for full protection. The kernel may run slower. ++ ++ Without compiler support, at least indirect branches in assembler ++ code are eliminated. Since this includes the syscall entry path, ++ it is not entirely pointless. ++ + if X86_32 + config X86_EXTENDED_PLATFORM + bool "Support for extended (non-PC) x86 platforms" +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -182,6 +182,16 @@ KBUILD_CFLAGS += -fno-asynchronous-unwin + KBUILD_CFLAGS += $(mflags-y) + KBUILD_AFLAGS += $(mflags-y) + ++# Avoid indirect branches in kernel to deal with Spectre ++ifdef CONFIG_RETPOLINE ++ RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register) ++ ifneq ($(RETPOLINE_CFLAGS),) ++ KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE ++ else ++ $(warning CONFIG_RETPOLINE=y, but not supported by the compiler. Toolchain update recommended.) 
++ endif ++endif ++ + archscripts: scripts_basic + $(Q)$(MAKE) $(build)=arch/x86/tools relocs + +--- a/arch/x86/include/asm/asm-prototypes.h ++++ b/arch/x86/include/asm/asm-prototypes.h +@@ -10,7 +10,32 @@ + #include <asm/pgtable.h> + #include <asm/special_insns.h> + #include <asm/preempt.h> ++#include <asm/asm.h> + + #ifndef CONFIG_X86_CMPXCHG64 + extern void cmpxchg8b_emu(void); + #endif ++ ++#ifdef CONFIG_RETPOLINE ++#ifdef CONFIG_X86_32 ++#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_e ## reg(void); ++#else ++#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_r ## reg(void); ++INDIRECT_THUNK(8) ++INDIRECT_THUNK(9) ++INDIRECT_THUNK(10) ++INDIRECT_THUNK(11) ++INDIRECT_THUNK(12) ++INDIRECT_THUNK(13) ++INDIRECT_THUNK(14) ++INDIRECT_THUNK(15) ++#endif ++INDIRECT_THUNK(ax) ++INDIRECT_THUNK(bx) ++INDIRECT_THUNK(cx) ++INDIRECT_THUNK(dx) ++INDIRECT_THUNK(si) ++INDIRECT_THUNK(di) ++INDIRECT_THUNK(bp) ++INDIRECT_THUNK(sp) ++#endif /* CONFIG_RETPOLINE */ +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -194,6 +194,9 @@ + #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ + #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ + ++#define X86_FEATURE_RETPOLINE ( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */ ++#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */ ++ + #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ + #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ + #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ +--- /dev/null ++++ b/arch/x86/include/asm/nospec-branch.h +@@ -0,0 +1,128 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef __NOSPEC_BRANCH_H__ ++#define __NOSPEC_BRANCH_H__ ++ ++#include <asm/alternative.h> ++#include <asm/alternative-asm.h> ++#include <asm/cpufeatures.h> ++ ++#ifdef __ASSEMBLY__ ++ ++/* ++ * This should be used immediately before a retpoline alternative. It tells ++ * objtool where the retpolines are so that it can make sense of the control ++ * flow by just reading the original instruction(s) and ignoring the ++ * alternatives. ++ */ ++.macro ANNOTATE_NOSPEC_ALTERNATIVE ++ .Lannotate_\@: ++ .pushsection .discard.nospec ++ .long .Lannotate_\@ - . ++ .popsection ++.endm ++ ++/* ++ * These are the bare retpoline primitives for indirect jmp and call. ++ * Do not use these directly; they only exist to make the ALTERNATIVE ++ * invocation below less ugly. ++ */ ++.macro RETPOLINE_JMP reg:req ++ call .Ldo_rop_\@ ++.Lspec_trap_\@: ++ pause ++ jmp .Lspec_trap_\@ ++.Ldo_rop_\@: ++ mov \reg, (%_ASM_SP) ++ ret ++.endm ++ ++/* ++ * This is a wrapper around RETPOLINE_JMP so the called function in reg ++ * returns to the instruction after the macro. ++ */ ++.macro RETPOLINE_CALL reg:req ++ jmp .Ldo_call_\@ ++.Ldo_retpoline_jmp_\@: ++ RETPOLINE_JMP \reg ++.Ldo_call_\@: ++ call .Ldo_retpoline_jmp_\@ ++.endm ++ ++/* ++ * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple ++ * indirect jmp/call which may be susceptible to the Spectre variant 2 ++ * attack. 
++ */ ++.macro JMP_NOSPEC reg:req ++#ifdef CONFIG_RETPOLINE ++ ANNOTATE_NOSPEC_ALTERNATIVE ++ ALTERNATIVE_2 __stringify(jmp *\reg), \ ++ __stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE, \ ++ __stringify(lfence; jmp *\reg), X86_FEATURE_RETPOLINE_AMD ++#else ++ jmp *\reg ++#endif ++.endm ++ ++.macro CALL_NOSPEC reg:req ++#ifdef CONFIG_RETPOLINE ++ ANNOTATE_NOSPEC_ALTERNATIVE ++ ALTERNATIVE_2 __stringify(call *\reg), \ ++ __stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\ ++ __stringify(lfence; call *\reg), X86_FEATURE_RETPOLINE_AMD ++#else ++ call *\reg ++#endif ++.endm ++ ++#else /* __ASSEMBLY__ */ ++ ++#define ANNOTATE_NOSPEC_ALTERNATIVE \ ++ "999:\n\t" \ ++ ".pushsection .discard.nospec\n\t" \ ++ ".long 999b - .\n\t" \ ++ ".popsection\n\t" ++ ++#if defined(CONFIG_X86_64) && defined(RETPOLINE) ++ ++/* ++ * Since the inline asm uses the %V modifier which is only in newer GCC, ++ * the 64-bit one is dependent on RETPOLINE not CONFIG_RETPOLINE. ++ */ ++# define CALL_NOSPEC \ ++ ANNOTATE_NOSPEC_ALTERNATIVE \ ++ ALTERNATIVE( \ ++ "call *%[thunk_target]\n", \ ++ "call __x86_indirect_thunk_%V[thunk_target]\n", \ ++ X86_FEATURE_RETPOLINE) ++# define THUNK_TARGET(addr) [thunk_target] "r" (addr) ++ ++#elif defined(CONFIG_X86_32) && defined(CONFIG_RETPOLINE) ++/* ++ * For i386 we use the original ret-equivalent retpoline, because ++ * otherwise we'll run out of registers. We don't care about CET ++ * here, anyway. ++ */ ++# define CALL_NOSPEC ALTERNATIVE("call *%[thunk_target]\n", \ ++ " jmp 904f;\n" \ ++ " .align 16\n" \ ++ "901: call 903f;\n" \ ++ "902: pause;\n" \ ++ " jmp 902b;\n" \ ++ " .align 16\n" \ ++ "903: addl $4, %%esp;\n" \ ++ " pushl %[thunk_target];\n" \ ++ " ret;\n" \ ++ " .align 16\n" \ ++ "904: call 901b;\n", \ ++ X86_FEATURE_RETPOLINE) ++ ++# define THUNK_TARGET(addr) [thunk_target] "rm" (addr) ++#else /* No retpoline */ ++# define CALL_NOSPEC "call *%[thunk_target]\n" ++# define THUNK_TARGET(addr) [thunk_target] "rm" (addr) ++#endif ++ ++#endif /* __ASSEMBLY__ */ ++#endif /* __NOSPEC_BRANCH_H__ */ +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -889,6 +889,10 @@ static void __init early_identify_cpu(st + setup_force_cpu_bug(X86_BUG_SPECTRE_V1); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + ++#ifdef CONFIG_RETPOLINE ++ setup_force_cpu_cap(X86_FEATURE_RETPOLINE); ++#endif ++ + fpu__init_system(c); + + #ifdef CONFIG_X86_32 +--- a/arch/x86/lib/Makefile ++++ b/arch/x86/lib/Makefile +@@ -25,6 +25,7 @@ lib-y += memcpy_$(BITS).o + lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o + lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o + lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o ++lib-$(CONFIG_RETPOLINE) += retpoline.o + + obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o + +--- /dev/null ++++ b/arch/x86/lib/retpoline.S +@@ -0,0 +1,48 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#include <linux/stringify.h> ++#include <linux/linkage.h> ++#include <asm/dwarf2.h> ++#include <asm/cpufeatures.h> ++#include <asm/alternative-asm.h> ++#include <asm/export.h> ++#include <asm/nospec-branch.h> ++ ++.macro THUNK reg ++ .section .text.__x86.indirect_thunk.\reg ++ ++ENTRY(__x86_indirect_thunk_\reg) ++ CFI_STARTPROC ++ JMP_NOSPEC %\reg ++ CFI_ENDPROC ++ENDPROC(__x86_indirect_thunk_\reg) ++.endm ++ ++/* ++ * Despite being an assembler file we can't just use .irp here ++ * because __KSYM_DEPS__ only uses the C preprocessor and would ++ * only see one instance of "__x86_indirect_thunk_\reg" rather ++ * than one per register with the correct names. 
So we do it ++ * the simple and nasty way... ++ */ ++#define EXPORT_THUNK(reg) EXPORT_SYMBOL(__x86_indirect_thunk_ ## reg) ++#define GENERATE_THUNK(reg) THUNK reg ; EXPORT_THUNK(reg) ++ ++GENERATE_THUNK(_ASM_AX) ++GENERATE_THUNK(_ASM_BX) ++GENERATE_THUNK(_ASM_CX) ++GENERATE_THUNK(_ASM_DX) ++GENERATE_THUNK(_ASM_SI) ++GENERATE_THUNK(_ASM_DI) ++GENERATE_THUNK(_ASM_BP) ++GENERATE_THUNK(_ASM_SP) ++#ifdef CONFIG_64BIT ++GENERATE_THUNK(r8) ++GENERATE_THUNK(r9) ++GENERATE_THUNK(r10) ++GENERATE_THUNK(r11) ++GENERATE_THUNK(r12) ++GENERATE_THUNK(r13) ++GENERATE_THUNK(r14) ++GENERATE_THUNK(r15) ++#endif diff --git a/queue/x86-retpoline-add-lfence-to-the-retpoline-rsb-filling-rsb-macros.patch b/queue/x86-retpoline-add-lfence-to-the-retpoline-rsb-filling-rsb-macros.patch new file mode 100644 index 0000000..0f624d5 --- /dev/null +++ b/queue/x86-retpoline-add-lfence-to-the-retpoline-rsb-filling-rsb-macros.patch @@ -0,0 +1,91 @@ +From 28d437d550e1e39f805d99f9f8ac399c778827b7 Mon Sep 17 00:00:00 2001 +From: Tom Lendacky <thomas.lendacky@amd.com> +Date: Sat, 13 Jan 2018 17:27:30 -0600 +Subject: x86/retpoline: Add LFENCE to the retpoline/RSB filling RSB macros + +From: Tom Lendacky <thomas.lendacky@amd.com> + +commit 28d437d550e1e39f805d99f9f8ac399c778827b7 upstream. + +The PAUSE instruction is currently used in the retpoline and RSB filling +macros as a speculation trap. The use of PAUSE was originally suggested +because it showed a very, very small difference in the amount of +cycles/time used to execute the retpoline as compared to LFENCE. On AMD, +the PAUSE instruction is not a serializing instruction, so the pause/jmp +loop will use excess power as it is speculated over waiting for return +to mispredict to the correct target. + +The RSB filling macro is applicable to AMD, and, if software is unable to +verify that LFENCE is serializing on AMD (possible when running under a +hypervisor), the generic retpoline support will be used and, so, is also +applicable to AMD. Keep the current usage of PAUSE for Intel, but add an +LFENCE instruction to the speculation trap for AMD. + +The same sequence has been adopted by GCC for the GCC generated retpolines. + +Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@alien8.de> +Acked-by: David Woodhouse <dwmw@amazon.co.uk> +Acked-by: Arjan van de Ven <arjan@linux.intel.com> +Cc: Rik van Riel <riel@redhat.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Paul Turner <pjt@google.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Jiri Kosina <jikos@kernel.org> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org> +Cc: Kees Cook <keescook@google.com> +Link: https://lkml.kernel.org/r/20180113232730.31060.36287.stgit@tlendack-t1.amdoffice.net +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/include/asm/nospec-branch.h | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/arch/x86/include/asm/nospec-branch.h ++++ b/arch/x86/include/asm/nospec-branch.h +@@ -11,7 +11,7 @@ + * Fill the CPU return stack buffer. + * + * Each entry in the RSB, if used for a speculative 'ret', contains an +- * infinite 'pause; jmp' loop to capture speculative execution. 
++ * infinite 'pause; lfence; jmp' loop to capture speculative execution. + * + * This is required in various cases for retpoline and IBRS-based + * mitigations for the Spectre variant 2 vulnerability. Sometimes to +@@ -38,11 +38,13 @@ + call 772f; \ + 773: /* speculation trap */ \ + pause; \ ++ lfence; \ + jmp 773b; \ + 772: \ + call 774f; \ + 775: /* speculation trap */ \ + pause; \ ++ lfence; \ + jmp 775b; \ + 774: \ + dec reg; \ +@@ -73,6 +75,7 @@ + call .Ldo_rop_\@ + .Lspec_trap_\@: + pause ++ lfence + jmp .Lspec_trap_\@ + .Ldo_rop_\@: + mov \reg, (%_ASM_SP) +@@ -165,6 +168,7 @@ + " .align 16\n" \ + "901: call 903f;\n" \ + "902: pause;\n" \ ++ " lfence;\n" \ + " jmp 902b;\n" \ + " .align 16\n" \ + "903: addl $4, %%esp;\n" \ diff --git a/queue/x86-retpoline-avoid-retpolines-for-built-in-__init-functions.patch b/queue/x86-retpoline-avoid-retpolines-for-built-in-__init-functions.patch new file mode 100644 index 0000000..c810eec --- /dev/null +++ b/queue/x86-retpoline-avoid-retpolines-for-built-in-__init-functions.patch @@ -0,0 +1,50 @@ +From ee4aac311ae4f66980ec4e1accc83740909c130a Mon Sep 17 00:00:00 2001 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Thu, 1 Feb 2018 11:27:20 +0000 +Subject: [PATCH] x86/retpoline: Avoid retpolines for built-in __init functions + +(cherry picked from commit 66f793099a636862a71c59d4a6ba91387b155e0c) + +There's no point in building init code with retpolines, since it runs before +any potentially hostile userspace does. And before the retpoline is actually +ALTERNATIVEd into place, for much of it. + +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: karahmed@amazon.de +Cc: peterz@infradead.org +Cc: bp@alien8.de +Link: https://lkml.kernel.org/r/1517484441-1420-2-git-send-email-dwmw@amazon.co.uk +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +diff --git a/include/linux/init.h b/include/linux/init.h +index 6935d02474aa..fe1e5981c43b 100644 +--- a/include/linux/init.h ++++ b/include/linux/init.h +@@ -4,6 +4,13 @@ + #include <linux/compiler.h> + #include <linux/types.h> + ++/* Built-in __init functions needn't be compiled with retpoline */ ++#if defined(RETPOLINE) && !defined(MODULE) ++#define __noretpoline __attribute__((indirect_branch("keep"))) ++#else ++#define __noretpoline ++#endif ++ + /* These macros are used to mark some functions or + * initialized data (doesn't apply to uninitialized data) + * as `initialization' functions. 
The kernel can take this +@@ -39,7 +46,7 @@ + + /* These are for everybody (although not all archs will actually + discard it in modules) */ +-#define __init __section(.init.text) __cold notrace ++#define __init __section(.init.text) __cold notrace __noretpoline + #define __initdata __section(.init.data) + #define __initconst __constsection(.init.rodata) + #define __exitdata __section(.exit.data) +-- +2.15.0 + diff --git a/queue/x86-retpoline-checksum32-convert-assembler-indirect-jumps.patch b/queue/x86-retpoline-checksum32-convert-assembler-indirect-jumps.patch new file mode 100644 index 0000000..c40ee9e --- /dev/null +++ b/queue/x86-retpoline-checksum32-convert-assembler-indirect-jumps.patch @@ -0,0 +1,67 @@ +From 5096732f6f695001fa2d6f1335a2680b37912c69 Mon Sep 17 00:00:00 2001 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Thu, 11 Jan 2018 21:46:32 +0000 +Subject: x86/retpoline/checksum32: Convert assembler indirect jumps + +From: David Woodhouse <dwmw@amazon.co.uk> + +commit 5096732f6f695001fa2d6f1335a2680b37912c69 upstream. + +Convert all indirect jumps in 32bit checksum assembler code to use +non-speculative sequences when CONFIG_RETPOLINE is enabled. + +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Acked-by: Arjan van de Ven <arjan@linux.intel.com> +Acked-by: Ingo Molnar <mingo@kernel.org> +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: Rik van Riel <riel@redhat.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: thomas.lendacky@amd.com +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Jiri Kosina <jikos@kernel.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Kees Cook <keescook@google.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org> +Cc: Paul Turner <pjt@google.com> +Link: https://lkml.kernel.org/r/1515707194-20531-11-git-send-email-dwmw@amazon.co.uk +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/lib/checksum_32.S | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +--- a/arch/x86/lib/checksum_32.S ++++ b/arch/x86/lib/checksum_32.S +@@ -29,7 +29,8 @@ + #include <asm/errno.h> + #include <asm/asm.h> + #include <asm/export.h> +- ++#include <asm/nospec-branch.h> ++ + /* + * computes a partial checksum, e.g. for TCP/UDP fragments + */ +@@ -156,7 +157,7 @@ ENTRY(csum_partial) + negl %ebx + lea 45f(%ebx,%ebx,2), %ebx + testl %esi, %esi +- jmp *%ebx ++ JMP_NOSPEC %ebx + + # Handle 2-byte-aligned regions + 20: addw (%esi), %ax +@@ -439,7 +440,7 @@ ENTRY(csum_partial_copy_generic) + andl $-32,%edx + lea 3f(%ebx,%ebx), %ebx + testl %esi, %esi +- jmp *%ebx ++ JMP_NOSPEC %ebx + 1: addl $64,%esi + addl $64,%edi + SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) diff --git a/queue/x86-retpoline-crypto-convert-crypto-assembler-indirect-jumps.patch b/queue/x86-retpoline-crypto-convert-crypto-assembler-indirect-jumps.patch new file mode 100644 index 0000000..e9b7176 --- /dev/null +++ b/queue/x86-retpoline-crypto-convert-crypto-assembler-indirect-jumps.patch @@ -0,0 +1,125 @@ +From 9697fa39efd3fc3692f2949d4045f393ec58450b Mon Sep 17 00:00:00 2001 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Thu, 11 Jan 2018 21:46:27 +0000 +Subject: x86/retpoline/crypto: Convert crypto assembler indirect jumps + +From: David Woodhouse <dwmw@amazon.co.uk> + +commit 9697fa39efd3fc3692f2949d4045f393ec58450b upstream. 
+ +Convert all indirect jumps in crypto assembler code to use non-speculative +sequences when CONFIG_RETPOLINE is enabled. + +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Acked-by: Arjan van de Ven <arjan@linux.intel.com> +Acked-by: Ingo Molnar <mingo@kernel.org> +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: Rik van Riel <riel@redhat.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: thomas.lendacky@amd.com +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Jiri Kosina <jikos@kernel.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Kees Cook <keescook@google.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org> +Cc: Paul Turner <pjt@google.com> +Link: https://lkml.kernel.org/r/1515707194-20531-6-git-send-email-dwmw@amazon.co.uk +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/crypto/aesni-intel_asm.S | 5 +++-- + arch/x86/crypto/camellia-aesni-avx-asm_64.S | 3 ++- + arch/x86/crypto/camellia-aesni-avx2-asm_64.S | 3 ++- + arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 3 ++- + 4 files changed, 9 insertions(+), 5 deletions(-) + +--- a/arch/x86/crypto/aesni-intel_asm.S ++++ b/arch/x86/crypto/aesni-intel_asm.S +@@ -32,6 +32,7 @@ + #include <linux/linkage.h> + #include <asm/inst.h> + #include <asm/frame.h> ++#include <asm/nospec-branch.h> + + /* + * The following macros are used to move an (un)aligned 16 byte value to/from +@@ -2734,7 +2735,7 @@ ENTRY(aesni_xts_crypt8) + pxor INC, STATE4 + movdqu IV, 0x30(OUTP) + +- call *%r11 ++ CALL_NOSPEC %r11 + + movdqu 0x00(OUTP), INC + pxor INC, STATE1 +@@ -2779,7 +2780,7 @@ ENTRY(aesni_xts_crypt8) + _aesni_gf128mul_x_ble() + movups IV, (IVP) + +- call *%r11 ++ CALL_NOSPEC %r11 + + movdqu 0x40(OUTP), INC + pxor INC, STATE1 +--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S ++++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S +@@ -17,6 +17,7 @@ + + #include <linux/linkage.h> + #include <asm/frame.h> ++#include <asm/nospec-branch.h> + + #define CAMELLIA_TABLE_BYTE_LEN 272 + +@@ -1224,7 +1225,7 @@ camellia_xts_crypt_16way: + vpxor 14 * 16(%rax), %xmm15, %xmm14; + vpxor 15 * 16(%rax), %xmm15, %xmm15; + +- call *%r9; ++ CALL_NOSPEC %r9; + + addq $(16 * 16), %rsp; + +--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S ++++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +@@ -12,6 +12,7 @@ + + #include <linux/linkage.h> + #include <asm/frame.h> ++#include <asm/nospec-branch.h> + + #define CAMELLIA_TABLE_BYTE_LEN 272 + +@@ -1337,7 +1338,7 @@ camellia_xts_crypt_32way: + vpxor 14 * 32(%rax), %ymm15, %ymm14; + vpxor 15 * 32(%rax), %ymm15, %ymm15; + +- call *%r9; ++ CALL_NOSPEC %r9; + + addq $(16 * 32), %rsp; + +--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S ++++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +@@ -45,6 +45,7 @@ + + #include <asm/inst.h> + #include <linux/linkage.h> ++#include <asm/nospec-branch.h> + + ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction + +@@ -172,7 +173,7 @@ continue_block: + movzxw (bufp, %rax, 2), len + lea crc_array(%rip), bufp + lea (bufp, len, 1), bufp +- jmp *bufp ++ JMP_NOSPEC bufp + + ################################################################ + ## 2a) PROCESS FULL BLOCKS: diff --git a/queue/x86-retpoline-entry-convert-entry-assembler-indirect-jumps.patch 
b/queue/x86-retpoline-entry-convert-entry-assembler-indirect-jumps.patch new file mode 100644 index 0000000..ea6e7ec --- /dev/null +++ b/queue/x86-retpoline-entry-convert-entry-assembler-indirect-jumps.patch @@ -0,0 +1,117 @@ +From 2641f08bb7fc63a636a2b18173221d7040a3512e Mon Sep 17 00:00:00 2001 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Thu, 11 Jan 2018 21:46:28 +0000 +Subject: x86/retpoline/entry: Convert entry assembler indirect jumps + +From: David Woodhouse <dwmw@amazon.co.uk> + +commit 2641f08bb7fc63a636a2b18173221d7040a3512e upstream. + +Convert indirect jumps in core 32/64bit entry assembler code to use +non-speculative sequences when CONFIG_RETPOLINE is enabled. + +Don't use CALL_NOSPEC in entry_SYSCALL_64_fastpath because the return +address after the 'call' instruction must be *precisely* at the +.Lentry_SYSCALL_64_after_fastpath label for stub_ptregs_64 to work, +and the use of alternatives will mess that up unless we play horrid +games to prepend with NOPs and make the variants the same length. It's +not worth it; in the case where we ALTERNATIVE out the retpoline, the +first instruction at __x86.indirect_thunk.rax is going to be a bare +jmp *%rax anyway. + +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Acked-by: Ingo Molnar <mingo@kernel.org> +Acked-by: Arjan van de Ven <arjan@linux.intel.com> +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: Rik van Riel <riel@redhat.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: thomas.lendacky@amd.com +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Jiri Kosina <jikos@kernel.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Kees Cook <keescook@google.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org> +Cc: Paul Turner <pjt@google.com> +Link: https://lkml.kernel.org/r/1515707194-20531-7-git-send-email-dwmw@amazon.co.uk +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/entry/entry_32.S | 5 +++-- + arch/x86/entry/entry_64.S | 10 ++++++++-- + 2 files changed, 11 insertions(+), 4 deletions(-) + +--- a/arch/x86/entry/entry_32.S ++++ b/arch/x86/entry/entry_32.S +@@ -45,6 +45,7 @@ + #include <asm/asm.h> + #include <asm/smap.h> + #include <asm/export.h> ++#include <asm/nospec-branch.h> + + .section .entry.text, "ax" + +@@ -260,7 +261,7 @@ ENTRY(ret_from_fork) + + /* kernel thread */ + 1: movl %edi, %eax +- call *%ebx ++ CALL_NOSPEC %ebx + /* + * A kernel thread is allowed to return here after successfully + * calling do_execve(). Exit to userspace to complete the execve() +@@ -1062,7 +1063,7 @@ error_code: + movl %ecx, %es + TRACE_IRQS_OFF + movl %esp, %eax # pt_regs pointer +- call *%edi ++ CALL_NOSPEC %edi + jmp ret_from_exception + END(page_fault) + +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -37,6 +37,7 @@ + #include <asm/pgtable_types.h> + #include <asm/export.h> + #include <asm/kaiser.h> ++#include <asm/nospec-branch.h> + #include <linux/err.h> + + /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ +@@ -208,7 +209,12 @@ entry_SYSCALL_64_fastpath: + * It might end up jumping to the slow path. If it jumps, RAX + * and all argument registers are clobbered. 
+ */ ++#ifdef CONFIG_RETPOLINE ++ movq sys_call_table(, %rax, 8), %rax ++ call __x86_indirect_thunk_rax ++#else + call *sys_call_table(, %rax, 8) ++#endif + .Lentry_SYSCALL_64_after_fastpath_call: + + movq %rax, RAX(%rsp) +@@ -380,7 +386,7 @@ ENTRY(stub_ptregs_64) + jmp entry_SYSCALL64_slow_path + + 1: +- jmp *%rax /* Called from C */ ++ JMP_NOSPEC %rax /* Called from C */ + END(stub_ptregs_64) + + .macro ptregs_stub func +@@ -457,7 +463,7 @@ ENTRY(ret_from_fork) + 1: + /* kernel thread */ + movq %r12, %rdi +- call *%rbx ++ CALL_NOSPEC %rbx + /* + * A kernel thread is allowed to return here after successfully + * calling do_execve(). Exit to userspace to complete the execve() diff --git a/queue/x86-retpoline-fill-return-stack-buffer-on-vmexit.patch b/queue/x86-retpoline-fill-return-stack-buffer-on-vmexit.patch new file mode 100644 index 0000000..1fb8cfb --- /dev/null +++ b/queue/x86-retpoline-fill-return-stack-buffer-on-vmexit.patch @@ -0,0 +1,190 @@ +From 7762bf1d921deb32bebbf18e4f7f4a5b5ca16f84 Mon Sep 17 00:00:00 2001 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Fri, 12 Jan 2018 11:11:27 +0000 +Subject: [PATCH] x86/retpoline: Fill return stack buffer on vmexit + +commit 117cc7a908c83697b0b737d15ae1eb5943afe35b upstream. + +In accordance with the Intel and AMD documentation, we need to overwrite +all entries in the RSB on exiting a guest, to prevent malicious branch +target predictions from affecting the host kernel. This is needed both +for retpoline and for IBRS. + +[ak: numbers again for the RSB stuffing labels] + +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: Rik van Riel <riel@redhat.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: thomas.lendacky@amd.com +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Jiri Kosina <jikos@kernel.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Kees Cook <keescook@google.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org> +Cc: Paul Turner <pjt@google.com> +Link: https://lkml.kernel.org/r/1515755487-8524-1-git-send-email-dwmw@amazon.co.uk +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h +index ea034fa6e261..402a11c803c3 100644 +--- a/arch/x86/include/asm/nospec-branch.h ++++ b/arch/x86/include/asm/nospec-branch.h +@@ -7,6 +7,48 @@ + #include <asm/alternative-asm.h> + #include <asm/cpufeatures.h> + ++/* ++ * Fill the CPU return stack buffer. ++ * ++ * Each entry in the RSB, if used for a speculative 'ret', contains an ++ * infinite 'pause; jmp' loop to capture speculative execution. ++ * ++ * This is required in various cases for retpoline and IBRS-based ++ * mitigations for the Spectre variant 2 vulnerability. Sometimes to ++ * eliminate potentially bogus entries from the RSB, and sometimes ++ * purely to ensure that it doesn't get empty, which on some CPUs would ++ * allow predictions from other (unwanted!) sources to be used. ++ * ++ * We define a CPP macro such that it can be used from both .S files and ++ * inline assembly. It's possible to do a .macro and then include that ++ * from C via asm(".include <asm/nospec-branch.h>") but let's not go there. 
++ */ ++ ++#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ ++#define RSB_FILL_LOOPS 16 /* To avoid underflow */ ++ ++/* ++ * Google experimented with loop-unrolling and this turned out to be ++ * the optimal version — two calls, each with their own speculation ++ * trap should their return address end up getting used, in a loop. ++ */ ++#define __FILL_RETURN_BUFFER(reg, nr, sp) \ ++ mov $(nr/2), reg; \ ++771: \ ++ call 772f; \ ++773: /* speculation trap */ \ ++ pause; \ ++ jmp 773b; \ ++772: \ ++ call 774f; \ ++775: /* speculation trap */ \ ++ pause; \ ++ jmp 775b; \ ++774: \ ++ dec reg; \ ++ jnz 771b; \ ++ add $(BITS_PER_LONG/8) * nr, sp; ++ + #ifdef __ASSEMBLY__ + + /* +@@ -74,6 +116,20 @@ + #else + call *\reg + #endif ++.endm ++ ++ /* ++ * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP ++ * monstrosity above, manually. ++ */ ++.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ++#ifdef CONFIG_RETPOLINE ++ ANNOTATE_NOSPEC_ALTERNATIVE ++ ALTERNATIVE "jmp .Lskip_rsb_\@", \ ++ __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \ ++ \ftr ++.Lskip_rsb_\@: ++#endif + .endm + + #else /* __ASSEMBLY__ */ +@@ -119,7 +175,7 @@ + X86_FEATURE_RETPOLINE) + + # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) +-#else /* No retpoline */ ++#else /* No retpoline for C / inline asm */ + # define CALL_NOSPEC "call *%[thunk_target]\n" + # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) + #endif +@@ -134,5 +190,25 @@ enum spectre_v2_mitigation { + SPECTRE_V2_IBRS, + }; + ++/* ++ * On VMEXIT we must ensure that no RSB predictions learned in the guest ++ * can be followed in the host, by overwriting the RSB completely. Both ++ * retpoline and IBRS mitigations for Spectre v2 need this; only on future ++ * CPUs with IBRS_ATT *might* it be avoided. ++ */ ++static inline void vmexit_fill_RSB(void) ++{ ++#ifdef CONFIG_RETPOLINE ++ unsigned long loops = RSB_CLEAR_LOOPS / 2; ++ ++ asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE ++ ALTERNATIVE("jmp 910f", ++ __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)), ++ X86_FEATURE_RETPOLINE) ++ "910:" ++ : "=&r" (loops), ASM_CALL_CONSTRAINT ++ : "r" (loops) : "memory" ); ++#endif ++} + #endif /* __ASSEMBLY__ */ + #endif /* __NOSPEC_BRANCH_H__ */ +diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c +index af523d84d102..c94a5fdfb26c 100644 +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -41,6 +41,7 @@ + #include <asm/desc.h> + #include <asm/debugreg.h> + #include <asm/kvm_para.h> ++#include <asm/nospec-branch.h> + + #include <asm/virtext.h> + #include "trace.h" +@@ -4550,6 +4551,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) + #endif + ); + ++ /* Eliminate branch target predictions from guest mode */ ++ vmexit_fill_RSB(); ++ + #ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, svm->host.gs_base); + #else +diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c +index 9bef32504146..6890ba12205d 100644 +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -48,6 +48,7 @@ + #include <asm/kexec.h> + #include <asm/apic.h> + #include <asm/irq_remapping.h> ++#include <asm/nospec-branch.h> + + #include "trace.h" + #include "pmu.h" +@@ -8951,6 +8952,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) + #endif + ); + ++ /* Eliminate branch target predictions from guest mode */ ++ vmexit_fill_RSB(); ++ + /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. 
Restore it if needed */ + if (debugctlmsr) + update_debugctlmsr(debugctlmsr); +-- +2.15.0 + diff --git a/queue/x86-retpoline-fill-rsb-on-context-switch-for-affected-cpus.patch b/queue/x86-retpoline-fill-rsb-on-context-switch-for-affected-cpus.patch new file mode 100644 index 0000000..5f0b6b1 --- /dev/null +++ b/queue/x86-retpoline-fill-rsb-on-context-switch-for-affected-cpus.patch @@ -0,0 +1,170 @@ +From c995efd5a740d9cbafbf58bde4973e8b50b4d761 Mon Sep 17 00:00:00 2001 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Fri, 12 Jan 2018 17:49:25 +0000 +Subject: x86/retpoline: Fill RSB on context switch for affected CPUs + +From: David Woodhouse <dwmw@amazon.co.uk> + +commit c995efd5a740d9cbafbf58bde4973e8b50b4d761 upstream. + +On context switch from a shallow call stack to a deeper one, as the CPU +does 'ret' up the deeper side it may encounter RSB entries (predictions for +where the 'ret' goes to) which were populated in userspace. + +This is problematic if neither SMEP nor KPTI (the latter of which marks +userspace pages as NX for the kernel) are active, as malicious code in +userspace may then be executed speculatively. + +Overwrite the CPU's return prediction stack with calls which are predicted +to return to an infinite loop, to "capture" speculation if this +happens. This is required both for retpoline, and also in conjunction with +IBRS for !SMEP && !KPTI. + +On Skylake+ the problem is slightly different, and an *underflow* of the +RSB may cause errant branch predictions to occur. So there it's not so much +overwrite, as *filling* the RSB to attempt to prevent it getting +empty. This is only a partial solution for Skylake+ since there are many +other conditions which may result in the RSB becoming empty. The full +solution on Skylake+ is to use IBRS, which will prevent the problem even +when the RSB becomes empty. With IBRS, the RSB-stuffing will not be +required on context switch. + +[ tglx: Added missing vendor check and slighty massaged comments and + changelog ] + +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Acked-by: Arjan van de Ven <arjan@linux.intel.com> +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: Rik van Riel <riel@redhat.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: thomas.lendacky@amd.com +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Jiri Kosina <jikos@kernel.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Kees Cook <keescook@google.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org> +Cc: Paul Turner <pjt@google.com> +Link: https://lkml.kernel.org/r/1515779365-9032-1-git-send-email-dwmw@amazon.co.uk +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/entry/entry_32.S | 11 +++++++++++ + arch/x86/entry/entry_64.S | 11 +++++++++++ + arch/x86/include/asm/cpufeatures.h | 1 + + arch/x86/kernel/cpu/bugs.c | 36 ++++++++++++++++++++++++++++++++++++ + 4 files changed, 59 insertions(+) + +--- a/arch/x86/entry/entry_32.S ++++ b/arch/x86/entry/entry_32.S +@@ -229,6 +229,17 @@ ENTRY(__switch_to_asm) + movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset + #endif + ++#ifdef CONFIG_RETPOLINE ++ /* ++ * When switching from a shallower to a deeper call stack ++ * the RSB may either underflow or use entries populated ++ * with userspace addresses. 
On CPUs where those concerns ++ * exist, overwrite the RSB with entries which capture ++ * speculative execution to prevent attack. ++ */ ++ FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW ++#endif ++ + /* restore callee-saved registers */ + popl %esi + popl %edi +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -427,6 +427,17 @@ ENTRY(__switch_to_asm) + movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset + #endif + ++#ifdef CONFIG_RETPOLINE ++ /* ++ * When switching from a shallower to a deeper call stack ++ * the RSB may either underflow or use entries populated ++ * with userspace addresses. On CPUs where those concerns ++ * exist, overwrite the RSB with entries which capture ++ * speculative execution to prevent attack. ++ */ ++ FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW ++#endif ++ + /* restore callee-saved registers */ + popq %r15 + popq %r14 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -200,6 +200,7 @@ + #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ + #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ + #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ ++#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ + + /* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ + #define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -22,6 +22,7 @@ + #include <asm/alternative.h> + #include <asm/pgtable.h> + #include <asm/cacheflush.h> ++#include <asm/intel-family.h> + + static void __init spectre_v2_select_mitigation(void); + +@@ -154,6 +155,23 @@ disable: + return SPECTRE_V2_CMD_NONE; + } + ++/* Check for Skylake-like CPUs (for RSB handling) */ ++static bool __init is_skylake_era(void) ++{ ++ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && ++ boot_cpu_data.x86 == 6) { ++ switch (boot_cpu_data.x86_model) { ++ case INTEL_FAM6_SKYLAKE_MOBILE: ++ case INTEL_FAM6_SKYLAKE_DESKTOP: ++ case INTEL_FAM6_SKYLAKE_X: ++ case INTEL_FAM6_KABYLAKE_MOBILE: ++ case INTEL_FAM6_KABYLAKE_DESKTOP: ++ return true; ++ } ++ } ++ return false; ++} ++ + static void __init spectre_v2_select_mitigation(void) + { + enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); +@@ -212,6 +230,24 @@ retpoline_auto: + + spectre_v2_enabled = mode; + pr_info("%s\n", spectre_v2_strings[mode]); ++ ++ /* ++ * If neither SMEP or KPTI are available, there is a risk of ++ * hitting userspace addresses in the RSB after a context switch ++ * from a shallow call stack to a deeper one. To prevent this fill ++ * the entire RSB, even when using IBRS. ++ * ++ * Skylake era CPUs have a separate issue with *underflow* of the ++ * RSB, when they will predict 'ret' targets from the generic BTB. ++ * The proper mitigation for this is IBRS. If IBRS is not supported ++ * or deactivated in favour of retpolines the RSB fill on context ++ * switch is required. 
++ */ ++ if ((!boot_cpu_has(X86_FEATURE_KAISER) && ++ !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) { ++ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); ++ pr_info("Filling RSB on context switch\n"); ++ } + } + + #undef pr_fmt diff --git a/queue/x86-retpoline-ftrace-convert-ftrace-assembler-indirect-jumps.patch b/queue/x86-retpoline-ftrace-convert-ftrace-assembler-indirect-jumps.patch new file mode 100644 index 0000000..17aeb29 --- /dev/null +++ b/queue/x86-retpoline-ftrace-convert-ftrace-assembler-indirect-jumps.patch @@ -0,0 +1,88 @@ +From 9351803bd803cdbeb9b5a7850b7b6f464806e3db Mon Sep 17 00:00:00 2001 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Thu, 11 Jan 2018 21:46:29 +0000 +Subject: x86/retpoline/ftrace: Convert ftrace assembler indirect jumps + +From: David Woodhouse <dwmw@amazon.co.uk> + +commit 9351803bd803cdbeb9b5a7850b7b6f464806e3db upstream. + +Convert all indirect jumps in ftrace assembler code to use non-speculative +sequences when CONFIG_RETPOLINE is enabled. + +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Acked-by: Arjan van de Ven <arjan@linux.intel.com> +Acked-by: Ingo Molnar <mingo@kernel.org> +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: Rik van Riel <riel@redhat.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: thomas.lendacky@amd.com +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Jiri Kosina <jikos@kernel.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Kees Cook <keescook@google.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org> +Cc: Paul Turner <pjt@google.com> +Link: https://lkml.kernel.org/r/1515707194-20531-8-git-send-email-dwmw@amazon.co.uk +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/entry/entry_32.S | 5 +++-- + arch/x86/kernel/mcount_64.S | 7 ++++--- + 2 files changed, 7 insertions(+), 5 deletions(-) + +--- a/arch/x86/entry/entry_32.S ++++ b/arch/x86/entry/entry_32.S +@@ -985,7 +985,8 @@ trace: + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + +- call *ftrace_trace_function ++ movl ftrace_trace_function, %ecx ++ CALL_NOSPEC %ecx + + popl %edx + popl %ecx +@@ -1021,7 +1022,7 @@ return_to_handler: + movl %eax, %ecx + popl %edx + popl %eax +- jmp *%ecx ++ JMP_NOSPEC %ecx + #endif + + #ifdef CONFIG_TRACING +--- a/arch/x86/kernel/mcount_64.S ++++ b/arch/x86/kernel/mcount_64.S +@@ -8,7 +8,7 @@ + #include <asm/ptrace.h> + #include <asm/ftrace.h> + #include <asm/export.h> +- ++#include <asm/nospec-branch.h> + + .code64 + .section .entry.text, "ax" +@@ -290,8 +290,9 @@ trace: + * ip and parent ip are used and the list function is called when + * function tracing is enabled. 
+ */ +- call *ftrace_trace_function + ++ movq ftrace_trace_function, %r8 ++ CALL_NOSPEC %r8 + restore_mcount_regs + + jmp fgraph_trace +@@ -334,5 +335,5 @@ GLOBAL(return_to_handler) + movq 8(%rsp), %rdx + movq (%rsp), %rax + addq $24, %rsp +- jmp *%rdi ++ JMP_NOSPEC %rdi + #endif diff --git a/queue/x86-retpoline-hyperv-convert-assembler-indirect-jumps.patch b/queue/x86-retpoline-hyperv-convert-assembler-indirect-jumps.patch new file mode 100644 index 0000000..4114df9 --- /dev/null +++ b/queue/x86-retpoline-hyperv-convert-assembler-indirect-jumps.patch @@ -0,0 +1,76 @@ +From e70e5892b28c18f517f29ab6e83bd57705104b31 Mon Sep 17 00:00:00 2001 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Thu, 11 Jan 2018 21:46:30 +0000 +Subject: x86/retpoline/hyperv: Convert assembler indirect jumps + +From: David Woodhouse <dwmw@amazon.co.uk> + +commit e70e5892b28c18f517f29ab6e83bd57705104b31 upstream. + +Convert all indirect jumps in hyperv inline asm code to use non-speculative +sequences when CONFIG_RETPOLINE is enabled. + +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Acked-by: Arjan van de Ven <arjan@linux.intel.com> +Acked-by: Ingo Molnar <mingo@kernel.org> +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: Rik van Riel <riel@redhat.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: thomas.lendacky@amd.com +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Jiri Kosina <jikos@kernel.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Kees Cook <keescook@google.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org> +Cc: Paul Turner <pjt@google.com> +Link: https://lkml.kernel.org/r/1515707194-20531-9-git-send-email-dwmw@amazon.co.uk +[ backport to 4.9, hopefully correct, not tested... 
- gregkh ] +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + drivers/hv/hv.c | 11 +++++++---- + 1 file changed, 7 insertions(+), 4 deletions(-) + +--- a/drivers/hv/hv.c ++++ b/drivers/hv/hv.c +@@ -31,6 +31,7 @@ + #include <linux/clockchips.h> + #include <asm/hyperv.h> + #include <asm/mshyperv.h> ++#include <asm/nospec-branch.h> + #include "hyperv_vmbus.h" + + /* The one and only */ +@@ -103,9 +104,10 @@ u64 hv_do_hypercall(u64 control, void *i + return (u64)ULLONG_MAX; + + __asm__ __volatile__("mov %0, %%r8" : : "r" (output_address) : "r8"); +- __asm__ __volatile__("call *%3" : "=a" (hv_status) : ++ __asm__ __volatile__(CALL_NOSPEC : ++ "=a" (hv_status) : + "c" (control), "d" (input_address), +- "m" (hypercall_page)); ++ THUNK_TARGET(hypercall_page)); + + return hv_status; + +@@ -123,11 +125,12 @@ u64 hv_do_hypercall(u64 control, void *i + if (!hypercall_page) + return (u64)ULLONG_MAX; + +- __asm__ __volatile__ ("call *%8" : "=d"(hv_status_hi), ++ __asm__ __volatile__ (CALL_NOSPEC : "=d"(hv_status_hi), + "=a"(hv_status_lo) : "d" (control_hi), + "a" (control_lo), "b" (input_address_hi), + "c" (input_address_lo), "D"(output_address_hi), +- "S"(output_address_lo), "m" (hypercall_page)); ++ "S"(output_address_lo), ++ THUNK_TARGET(hypercall_page)); + + return hv_status_lo | ((u64)hv_status_hi << 32); + #endif /* !x86_64 */ diff --git a/queue/x86-retpoline-irq32-convert-assembler-indirect-jumps.patch b/queue/x86-retpoline-irq32-convert-assembler-indirect-jumps.patch new file mode 100644 index 0000000..029afab --- /dev/null +++ b/queue/x86-retpoline-irq32-convert-assembler-indirect-jumps.patch @@ -0,0 +1,74 @@ +From 7614e913db1f40fff819b36216484dc3808995d4 Mon Sep 17 00:00:00 2001 +From: Andi Kleen <ak@linux.intel.com> +Date: Thu, 11 Jan 2018 21:46:33 +0000 +Subject: x86/retpoline/irq32: Convert assembler indirect jumps + +From: Andi Kleen <ak@linux.intel.com> + +commit 7614e913db1f40fff819b36216484dc3808995d4 upstream. + +Convert all indirect jumps in 32bit irq inline asm code to use non +speculative sequences. 
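[ Illustration only, not part of the upstream patch: after the conversion, the
  call_on_stack() helper touched by the hunk below reads roughly as follows.
  The indirect call goes through CALL_NOSPEC and the target is passed as the
  named [thunk_target] operand instead of a bare "D" constraint. ]

	#include <asm/nospec-branch.h>

	static void call_on_stack(void *func, void *stack)
	{
		asm volatile("xchgl	%%ebx,%%esp	\n"	/* switch to the irq stack */
			     CALL_NOSPEC			/* was: "call *%%edi" */
			     "movl	%%ebx,%%esp	\n"	/* switch back */
			     : "=b" (stack)
			     : "0" (stack),
			       [thunk_target] "D" (func)	/* consumed by CALL_NOSPEC */
			     : "memory", "cc", "edx", "ecx", "eax");
	}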
+ +Signed-off-by: Andi Kleen <ak@linux.intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Acked-by: Arjan van de Ven <arjan@linux.intel.com> +Acked-by: Ingo Molnar <mingo@kernel.org> +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: Rik van Riel <riel@redhat.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: thomas.lendacky@amd.com +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Jiri Kosina <jikos@kernel.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Kees Cook <keescook@google.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org> +Cc: Paul Turner <pjt@google.com> +Link: https://lkml.kernel.org/r/1515707194-20531-12-git-send-email-dwmw@amazon.co.uk +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kernel/irq_32.c | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +--- a/arch/x86/kernel/irq_32.c ++++ b/arch/x86/kernel/irq_32.c +@@ -19,6 +19,7 @@ + #include <linux/mm.h> + + #include <asm/apic.h> ++#include <asm/nospec-branch.h> + + #ifdef CONFIG_DEBUG_STACKOVERFLOW + +@@ -54,11 +55,11 @@ DEFINE_PER_CPU(struct irq_stack *, softi + static void call_on_stack(void *func, void *stack) + { + asm volatile("xchgl %%ebx,%%esp \n" +- "call *%%edi \n" ++ CALL_NOSPEC + "movl %%ebx,%%esp \n" + : "=b" (stack) + : "0" (stack), +- "D"(func) ++ [thunk_target] "D"(func) + : "memory", "cc", "edx", "ecx", "eax"); + } + +@@ -94,11 +95,11 @@ static inline int execute_on_irq_stack(i + call_on_stack(print_stack_overflow, isp); + + asm volatile("xchgl %%ebx,%%esp \n" +- "call *%%edi \n" ++ CALL_NOSPEC + "movl %%ebx,%%esp \n" + : "=a" (arg1), "=b" (isp) + : "0" (desc), "1" (isp), +- "D" (desc->handle_irq) ++ [thunk_target] "D" (desc->handle_irq) + : "memory", "cc", "ecx"); + return 1; + } diff --git a/queue/x86-retpoline-optimize-inline-assembler-for-vmexit_fill_rsb.patch b/queue/x86-retpoline-optimize-inline-assembler-for-vmexit_fill_rsb.patch new file mode 100644 index 0000000..963e73a --- /dev/null +++ b/queue/x86-retpoline-optimize-inline-assembler-for-vmexit_fill_rsb.patch @@ -0,0 +1,58 @@ +From 3f7d875566d8e79c5e0b2c9a413e91b2c29e0854 Mon Sep 17 00:00:00 2001 +From: Andi Kleen <ak@linux.intel.com> +Date: Wed, 17 Jan 2018 14:53:28 -0800 +Subject: x86/retpoline: Optimize inline assembler for vmexit_fill_RSB + +From: Andi Kleen <ak@linux.intel.com> + +commit 3f7d875566d8e79c5e0b2c9a413e91b2c29e0854 upstream. + +The generated assembler for the C fill RSB inline asm operations has +several issues: + +- The C code sets up the loop register, which is then immediately + overwritten in __FILL_RETURN_BUFFER with the same value again. + +- The C code also passes in the iteration count in another register, which + is not used at all. + +Remove these two unnecessary operations. Just rely on the single constant +passed to the macro for the iterations. 
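[ Illustration only, not part of the upstream patch: with the two redundant
  operations removed, vmexit_fill_RSB() ends up roughly as below. 'loops' is
  now a pure output which the 'mov $(nr/2), reg' inside __FILL_RETURN_BUFFER
  initialises itself, and no input operand carries the iteration count; the
  count comes solely from the RSB_CLEAR_LOOPS constant pasted into the macro. ]

	static inline void vmexit_fill_RSB(void)
	{
	#ifdef CONFIG_RETPOLINE
		unsigned long loops;	/* written by __FILL_RETURN_BUFFER, never pre-loaded */

		asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE
			      ALTERNATIVE("jmp 910f",
					  __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)),
					  X86_FEATURE_RETPOLINE)
			      "910:"
			      : "=r" (loops), ASM_CALL_CONSTRAINT	/* no "r" (loops) input */
			      : : "memory");
	#endif
	}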
+ +Signed-off-by: Andi Kleen <ak@linux.intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Acked-by: David Woodhouse <dwmw@amazon.co.uk> +Cc: dave.hansen@intel.com +Cc: gregkh@linuxfoundation.org +Cc: torvalds@linux-foundation.org +Cc: arjan@linux.intel.com +Link: https://lkml.kernel.org/r/20180117225328.15414-1-andi@firstfloor.org +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/include/asm/nospec-branch.h | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +--- a/arch/x86/include/asm/nospec-branch.h ++++ b/arch/x86/include/asm/nospec-branch.h +@@ -206,16 +206,17 @@ extern char __indirect_thunk_end[]; + static inline void vmexit_fill_RSB(void) + { + #ifdef CONFIG_RETPOLINE +- unsigned long loops = RSB_CLEAR_LOOPS / 2; ++ unsigned long loops; + + asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE + ALTERNATIVE("jmp 910f", + __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)), + X86_FEATURE_RETPOLINE) + "910:" +- : "=&r" (loops), ASM_CALL_CONSTRAINT +- : "r" (loops) : "memory" ); ++ : "=r" (loops), ASM_CALL_CONSTRAINT ++ : : "memory" ); + #endif + } ++ + #endif /* __ASSEMBLY__ */ + #endif /* __NOSPEC_BRANCH_H__ */ diff --git a/queue/x86-retpoline-remove-compile-time-warning.patch b/queue/x86-retpoline-remove-compile-time-warning.patch new file mode 100644 index 0000000..24dd1c7 --- /dev/null +++ b/queue/x86-retpoline-remove-compile-time-warning.patch @@ -0,0 +1,60 @@ +From b8b9ce4b5aec8de9e23cabb0a26b78641f9ab1d6 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Sun, 14 Jan 2018 22:13:29 +0100 +Subject: x86/retpoline: Remove compile time warning + +From: Thomas Gleixner <tglx@linutronix.de> + +commit b8b9ce4b5aec8de9e23cabb0a26b78641f9ab1d6 upstream. + +Remove the compile time warning when CONFIG_RETPOLINE=y and the compiler +does not have retpoline support. Linus rationale for this is: + + It's wrong because it will just make people turn off RETPOLINE, and the + asm updates - and return stack clearing - that are independent of the + compiler are likely the most important parts because they are likely the + ones easiest to target. + + And it's annoying because most people won't be able to do anything about + it. The number of people building their own compiler? Very small. So if + their distro hasn't got a compiler yet (and pretty much nobody does), the + warning is just annoying crap. + + It is already properly reported as part of the sysfs interface. The + compile-time warning only encourages bad things. 
+ +Fixes: 76b043848fd2 ("x86/retpoline: Add initial retpoline support") +Requested-by: Linus Torvalds <torvalds@linux-foundation.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: David Woodhouse <dwmw@amazon.co.uk> +Cc: Peter Zijlstra (Intel) <peterz@infradead.org> +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: Rik van Riel <riel@redhat.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: thomas.lendacky@amd.com +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Jiri Kosina <jikos@kernel.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Kees Cook <keescook@google.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org> +Link: https://lkml.kernel.org/r/CA+55aFzWgquv4i6Mab6bASqYXg3ErV3XDFEYf=GEcCDQg5uAtw@mail.gmail.com +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/Makefile | 2 -- + 1 file changed, 2 deletions(-) + +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -187,8 +187,6 @@ ifdef CONFIG_RETPOLINE + RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register) + ifneq ($(RETPOLINE_CFLAGS),) + KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE +- else +- $(warning CONFIG_RETPOLINE=y, but not supported by the compiler. Toolchain update recommended.) + endif + endif + diff --git a/queue/x86-retpoline-remove-the-esp-rsp-thunk.patch b/queue/x86-retpoline-remove-the-esp-rsp-thunk.patch new file mode 100644 index 0000000..f0d2444 --- /dev/null +++ b/queue/x86-retpoline-remove-the-esp-rsp-thunk.patch @@ -0,0 +1,58 @@ +From foo@baz Wed Feb 7 19:38:23 CST 2018 +From: Waiman Long <longman@redhat.com> +Date: Mon, 22 Jan 2018 17:09:34 -0500 +Subject: x86/retpoline: Remove the esp/rsp thunk + +From: Waiman Long <longman@redhat.com> + +(cherry picked from commit 1df37383a8aeabb9b418698f0bcdffea01f4b1b2) + +It doesn't make sense to have an indirect call thunk with esp/rsp as +retpoline code won't work correctly with the stack pointer register. +Removing it will help compiler writers to catch error in case such +a thunk call is emitted incorrectly. 
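[ Illustration only, not part of the upstream patch: the thunk body generated
  by this series (the .Lspec_trap_\@ / .Ldo_rop_\@ pattern seen earlier in the
  queue) has roughly the following shape, shown here for the %rax variant. ]

	call	2f			/* push &1f (the trap) as return address, jump to 2f */
	1:	pause			/* speculation trap: a speculative 'ret' lands here   */
		lfence
		jmp	1b
	2:	mov	%rax, (%rsp)	/* replace the pushed return address with the target  */
		ret			/* architecturally branches to *%rax                  */

Because the sequence both stores through and returns via the stack pointer, a
thunk parameterised on %esp/%rsp itself can never be expressed this way;
dropping it means a mistakenly emitted call to __x86_indirect_thunk_rsp now
fails at link time instead of being silently broken.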
+ +Fixes: 76b043848fd2 ("x86/retpoline: Add initial retpoline support") +Suggested-by: Jeff Law <law@redhat.com> +Signed-off-by: Waiman Long <longman@redhat.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Acked-by: David Woodhouse <dwmw@amazon.co.uk> +Cc: Tom Lendacky <thomas.lendacky@amd.com> +Cc: Kees Cook <keescook@google.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Jiri Kosina <jikos@kernel.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Arjan van de Ven <arjan@linux.intel.com> +Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org> +Cc: Paul Turner <pjt@google.com> +Link: https://lkml.kernel.org/r/1516658974-27852-1-git-send-email-longman@redhat.com +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/asm-prototypes.h | 1 - + arch/x86/lib/retpoline.S | 1 - + 2 files changed, 2 deletions(-) + +--- a/arch/x86/include/asm/asm-prototypes.h ++++ b/arch/x86/include/asm/asm-prototypes.h +@@ -37,5 +37,4 @@ INDIRECT_THUNK(dx) + INDIRECT_THUNK(si) + INDIRECT_THUNK(di) + INDIRECT_THUNK(bp) +-INDIRECT_THUNK(sp) + #endif /* CONFIG_RETPOLINE */ +--- a/arch/x86/lib/retpoline.S ++++ b/arch/x86/lib/retpoline.S +@@ -36,7 +36,6 @@ GENERATE_THUNK(_ASM_DX) + GENERATE_THUNK(_ASM_SI) + GENERATE_THUNK(_ASM_DI) + GENERATE_THUNK(_ASM_BP) +-GENERATE_THUNK(_ASM_SP) + #ifdef CONFIG_64BIT + GENERATE_THUNK(r8) + GENERATE_THUNK(r9) diff --git a/queue/x86-retpoline-simplify-vmexit_fill_rsb.patch b/queue/x86-retpoline-simplify-vmexit_fill_rsb.patch new file mode 100644 index 0000000..f524811 --- /dev/null +++ b/queue/x86-retpoline-simplify-vmexit_fill_rsb.patch @@ -0,0 +1,248 @@ +From foo@baz Thu Feb 8 03:30:27 CET 2018 +From: Borislav Petkov <bp@alien8.de> +Date: Sat, 27 Jan 2018 16:24:33 +0000 +Subject: x86/retpoline: Simplify vmexit_fill_RSB() + +From: Borislav Petkov <bp@alien8.de> + +(cherry picked from commit 1dde7415e99933bb7293d6b2843752cbdb43ec11) + +Simplify it to call an asm-function instead of pasting 41 insn bytes at +every call site. 
Also, add alignment to the macro as suggested here: + + https://support.google.com/faqs/answer/7625886 + +[dwmw2: Clean up comments, let it clobber %ebx and just tell the compiler] + +Signed-off-by: Borislav Petkov <bp@suse.de> +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: ak@linux.intel.com +Cc: dave.hansen@intel.com +Cc: karahmed@amazon.de +Cc: arjan@linux.intel.com +Cc: torvalds@linux-foundation.org +Cc: peterz@infradead.org +Cc: bp@alien8.de +Cc: pbonzini@redhat.com +Cc: tim.c.chen@linux.intel.com +Cc: gregkh@linux-foundation.org +Link: https://lkml.kernel.org/r/1517070274-12128-3-git-send-email-dwmw@amazon.co.uk +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/entry/entry_32.S | 3 - + arch/x86/entry/entry_64.S | 3 - + arch/x86/include/asm/asm-prototypes.h | 3 + + arch/x86/include/asm/nospec-branch.h | 70 +++------------------------------- + arch/x86/lib/Makefile | 1 + arch/x86/lib/retpoline.S | 56 +++++++++++++++++++++++++++ + 6 files changed, 71 insertions(+), 65 deletions(-) + +--- a/arch/x86/entry/entry_32.S ++++ b/arch/x86/entry/entry_32.S +@@ -237,7 +237,8 @@ ENTRY(__switch_to_asm) + * exist, overwrite the RSB with entries which capture + * speculative execution to prevent attack. + */ +- FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW ++ /* Clobbers %ebx */ ++ FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW + #endif + + /* restore callee-saved registers */ +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -435,7 +435,8 @@ ENTRY(__switch_to_asm) + * exist, overwrite the RSB with entries which capture + * speculative execution to prevent attack. + */ +- FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW ++ /* Clobbers %rbx */ ++ FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW + #endif + + /* restore callee-saved registers */ +--- a/arch/x86/include/asm/asm-prototypes.h ++++ b/arch/x86/include/asm/asm-prototypes.h +@@ -37,4 +37,7 @@ INDIRECT_THUNK(dx) + INDIRECT_THUNK(si) + INDIRECT_THUNK(di) + INDIRECT_THUNK(bp) ++asmlinkage void __fill_rsb(void); ++asmlinkage void __clear_rsb(void); ++ + #endif /* CONFIG_RETPOLINE */ +--- a/arch/x86/include/asm/nospec-branch.h ++++ b/arch/x86/include/asm/nospec-branch.h +@@ -7,50 +7,6 @@ + #include <asm/alternative-asm.h> + #include <asm/cpufeatures.h> + +-/* +- * Fill the CPU return stack buffer. +- * +- * Each entry in the RSB, if used for a speculative 'ret', contains an +- * infinite 'pause; lfence; jmp' loop to capture speculative execution. +- * +- * This is required in various cases for retpoline and IBRS-based +- * mitigations for the Spectre variant 2 vulnerability. Sometimes to +- * eliminate potentially bogus entries from the RSB, and sometimes +- * purely to ensure that it doesn't get empty, which on some CPUs would +- * allow predictions from other (unwanted!) sources to be used. +- * +- * We define a CPP macro such that it can be used from both .S files and +- * inline assembly. It's possible to do a .macro and then include that +- * from C via asm(".include <asm/nospec-branch.h>") but let's not go there. 
+- */ +- +-#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ +-#define RSB_FILL_LOOPS 16 /* To avoid underflow */ +- +-/* +- * Google experimented with loop-unrolling and this turned out to be +- * the optimal version — two calls, each with their own speculation +- * trap should their return address end up getting used, in a loop. +- */ +-#define __FILL_RETURN_BUFFER(reg, nr, sp) \ +- mov $(nr/2), reg; \ +-771: \ +- call 772f; \ +-773: /* speculation trap */ \ +- pause; \ +- lfence; \ +- jmp 773b; \ +-772: \ +- call 774f; \ +-775: /* speculation trap */ \ +- pause; \ +- lfence; \ +- jmp 775b; \ +-774: \ +- dec reg; \ +- jnz 771b; \ +- add $(BITS_PER_LONG/8) * nr, sp; +- + #ifdef __ASSEMBLY__ + + /* +@@ -121,17 +77,10 @@ + #endif + .endm + +- /* +- * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP +- * monstrosity above, manually. +- */ +-.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ++/* This clobbers the BX register */ ++.macro FILL_RETURN_BUFFER nr:req ftr:req + #ifdef CONFIG_RETPOLINE +- ANNOTATE_NOSPEC_ALTERNATIVE +- ALTERNATIVE "jmp .Lskip_rsb_\@", \ +- __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \ +- \ftr +-.Lskip_rsb_\@: ++ ALTERNATIVE "", "call __clear_rsb", \ftr + #endif + .endm + +@@ -206,15 +155,10 @@ extern char __indirect_thunk_end[]; + static inline void vmexit_fill_RSB(void) + { + #ifdef CONFIG_RETPOLINE +- unsigned long loops; +- +- asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE +- ALTERNATIVE("jmp 910f", +- __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)), +- X86_FEATURE_RETPOLINE) +- "910:" +- : "=r" (loops), ASM_CALL_CONSTRAINT +- : : "memory" ); ++ alternative_input("", ++ "call __fill_rsb", ++ X86_FEATURE_RETPOLINE, ++ ASM_NO_INPUT_CLOBBER(_ASM_BX, "memory")); + #endif + } + +--- a/arch/x86/lib/Makefile ++++ b/arch/x86/lib/Makefile +@@ -26,6 +26,7 @@ lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += + lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o + lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o + lib-$(CONFIG_RETPOLINE) += retpoline.o ++OBJECT_FILES_NON_STANDARD_retpoline.o :=y + + obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o + +--- a/arch/x86/lib/retpoline.S ++++ b/arch/x86/lib/retpoline.S +@@ -7,6 +7,7 @@ + #include <asm/alternative-asm.h> + #include <asm/export.h> + #include <asm/nospec-branch.h> ++#include <asm/bitsperlong.h> + + .macro THUNK reg + .section .text.__x86.indirect_thunk +@@ -46,3 +47,58 @@ GENERATE_THUNK(r13) + GENERATE_THUNK(r14) + GENERATE_THUNK(r15) + #endif ++ ++/* ++ * Fill the CPU return stack buffer. ++ * ++ * Each entry in the RSB, if used for a speculative 'ret', contains an ++ * infinite 'pause; lfence; jmp' loop to capture speculative execution. ++ * ++ * This is required in various cases for retpoline and IBRS-based ++ * mitigations for the Spectre variant 2 vulnerability. Sometimes to ++ * eliminate potentially bogus entries from the RSB, and sometimes ++ * purely to ensure that it doesn't get empty, which on some CPUs would ++ * allow predictions from other (unwanted!) sources to be used. ++ * ++ * Google experimented with loop-unrolling and this turned out to be ++ * the optimal version - two calls, each with their own speculation ++ * trap should their return address end up getting used, in a loop. 
++ */ ++.macro STUFF_RSB nr:req sp:req ++ mov $(\nr / 2), %_ASM_BX ++ .align 16 ++771: ++ call 772f ++773: /* speculation trap */ ++ pause ++ lfence ++ jmp 773b ++ .align 16 ++772: ++ call 774f ++775: /* speculation trap */ ++ pause ++ lfence ++ jmp 775b ++ .align 16 ++774: ++ dec %_ASM_BX ++ jnz 771b ++ add $((BITS_PER_LONG/8) * \nr), \sp ++.endm ++ ++#define RSB_FILL_LOOPS 16 /* To avoid underflow */ ++ ++ENTRY(__fill_rsb) ++ STUFF_RSB RSB_FILL_LOOPS, %_ASM_SP ++ ret ++END(__fill_rsb) ++EXPORT_SYMBOL_GPL(__fill_rsb) ++ ++#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ ++ ++ENTRY(__clear_rsb) ++ STUFF_RSB RSB_CLEAR_LOOPS, %_ASM_SP ++ ret ++END(__clear_rsb) ++EXPORT_SYMBOL_GPL(__clear_rsb) diff --git a/queue/x86-retpoline-xen-convert-xen-hypercall-indirect-jumps.patch b/queue/x86-retpoline-xen-convert-xen-hypercall-indirect-jumps.patch new file mode 100644 index 0000000..0893be0 --- /dev/null +++ b/queue/x86-retpoline-xen-convert-xen-hypercall-indirect-jumps.patch @@ -0,0 +1,60 @@ +From 4c7f165396cddc6e5fc484743e259fc7f92cffcc Mon Sep 17 00:00:00 2001 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Thu, 11 Jan 2018 21:46:31 +0000 +Subject: [PATCH] x86/retpoline/xen: Convert Xen hypercall indirect jumps + +commit ea08816d5b185ab3d09e95e393f265af54560350 upstream. + +Convert indirect call in Xen hypercall to use non-speculative sequence, +when CONFIG_RETPOLINE is enabled. + +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Acked-by: Arjan van de Ven <arjan@linux.intel.com> +Acked-by: Ingo Molnar <mingo@kernel.org> +Reviewed-by: Juergen Gross <jgross@suse.com> +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: Rik van Riel <riel@redhat.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: thomas.lendacky@amd.com +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Jiri Kosina <jikos@kernel.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Kees Cook <keescook@google.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org> +Cc: Paul Turner <pjt@google.com> +Link: https://lkml.kernel.org/r/1515707194-20531-10-git-send-email-dwmw@amazon.co.uk +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h +index a12a047184ee..8b1f91f59f0a 100644 +--- a/arch/x86/include/asm/xen/hypercall.h ++++ b/arch/x86/include/asm/xen/hypercall.h +@@ -43,6 +43,7 @@ + + #include <asm/page.h> + #include <asm/pgtable.h> ++#include <asm/nospec-branch.h> + + #include <xen/interface/xen.h> + #include <xen/interface/sched.h> +@@ -214,9 +215,9 @@ privcmd_call(unsigned call, + __HYPERCALL_DECLS; + __HYPERCALL_5ARG(a1, a2, a3, a4, a5); + +- asm volatile("call *%[call]" ++ asm volatile(CALL_NOSPEC + : __HYPERCALL_5PARAM +- : [call] "a" (&hypercall_page[call]) ++ : [thunk_target] "a" (&hypercall_page[call]) + : __HYPERCALL_CLOBBER5); + + return (long)__res; +-- +2.15.0 + diff --git a/queue/x86-smpboot-remove-stale-tlb-flush-invocations.patch b/queue/x86-smpboot-remove-stale-tlb-flush-invocations.patch new file mode 100644 index 0000000..cdf81a5 --- /dev/null +++ b/queue/x86-smpboot-remove-stale-tlb-flush-invocations.patch @@ -0,0 +1,65 @@ +From 322f8b8b340c824aef891342b0f5795d15e11562 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> 
+Date: Sat, 30 Dec 2017 22:13:53 +0100 +Subject: x86/smpboot: Remove stale TLB flush invocations + +From: Thomas Gleixner <tglx@linutronix.de> + +commit 322f8b8b340c824aef891342b0f5795d15e11562 upstream. + +smpboot_setup_warm_reset_vector() and smpboot_restore_warm_reset_vector() +invoke local_flush_tlb() for no obvious reason. + +Digging in history revealed that the original code in the 2.1 era added +those because the code manipulated a swapper_pg_dir pagetable entry. The +pagetable manipulation was removed long ago in the 2.3 timeframe, but the +TLB flush invocations stayed around forever. + +Remove them along with the pointless pr_debug()s which come from the same 2.1 +change. + +Reported-by: Dominik Brodowski <linux@dominikbrodowski.net> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Linus Torvalds <torvalds@linuxfoundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Link: http://lkml.kernel.org/r/20171230211829.586548655@linutronix.de +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/kernel/smpboot.c | 9 --------- + 1 file changed, 9 deletions(-) + +--- a/arch/x86/kernel/smpboot.c ++++ b/arch/x86/kernel/smpboot.c +@@ -115,14 +115,10 @@ static inline void smpboot_setup_warm_re + spin_lock_irqsave(&rtc_lock, flags); + CMOS_WRITE(0xa, 0xf); + spin_unlock_irqrestore(&rtc_lock, flags); +- local_flush_tlb(); +- pr_debug("1.\n"); + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = + start_eip >> 4; +- pr_debug("2.\n"); + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = + start_eip & 0xf; +- pr_debug("3.\n"); + } + + static inline void smpboot_restore_warm_reset_vector(void) +@@ -130,11 +126,6 @@ static inline void smpboot_restore_warm_ + unsigned long flags; + + /* +- * Install writable page 0 entry to set BIOS data area. +- */ +- local_flush_tlb(); +- +- /* + * Paranoid: Set warm reset code and vector here back + * to default values. + */ diff --git a/queue/x86-spectre-add-boot-time-option-to-select-spectre-v2-mitigation.patch b/queue/x86-spectre-add-boot-time-option-to-select-spectre-v2-mitigation.patch new file mode 100644 index 0000000..475414c --- /dev/null +++ b/queue/x86-spectre-add-boot-time-option-to-select-spectre-v2-mitigation.patch @@ -0,0 +1,317 @@ +From da285121560e769cc31797bba6422eea71d473e0 Mon Sep 17 00:00:00 2001 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Thu, 11 Jan 2018 21:46:26 +0000 +Subject: x86/spectre: Add boot time option to select Spectre v2 mitigation + +From: David Woodhouse <dwmw@amazon.co.uk> + +commit da285121560e769cc31797bba6422eea71d473e0 upstream. + +Add a spectre_v2= option to select the mitigation used for the indirect +branch speculation vulnerability. + +Currently, the only option available is retpoline, in its various forms. +This will be expanded to cover the new IBRS/IBPB microcode features. + +The RETPOLINE_AMD feature relies on a serializing LFENCE for speculation +control. For AMD hardware, only set RETPOLINE_AMD if LFENCE is a +serializing instruction, which is indicated by the LFENCE_RDTSC feature. 
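[ Illustration only, not part of the upstream patch: the new parameter, as
  documented in the kernel-parameters.txt hunk below, is given on the kernel
  command line, e.g.: ]

	spectre_v2=auto			# default: pick a mitigation for the detected CPU
	spectre_v2=off			# same effect as nospectre_v2: no Spectre v2 mitigation
	spectre_v2=retpoline,generic	# force google's original retpoline
	spectre_v2=retpoline,amd	# force the AMD-specific minimal thunk (needs serializing LFENCE)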
+ +[ tglx: Folded back the LFENCE/AMD fixes and reworked it so IBRS + integration becomes simple ] + +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: Rik van Riel <riel@redhat.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: thomas.lendacky@amd.com +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Jiri Kosina <jikos@kernel.org> +Cc: Andy Lutomirski <luto@amacapital.net> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Kees Cook <keescook@google.com> +Cc: Tim Chen <tim.c.chen@linux.intel.com> +Cc: Greg Kroah-Hartman <gregkh@linux-foundation.org> +Cc: Paul Turner <pjt@google.com> +Link: https://lkml.kernel.org/r/1515707194-20531-5-git-send-email-dwmw@amazon.co.uk +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + Documentation/kernel-parameters.txt | 28 ++++++ + arch/x86/include/asm/nospec-branch.h | 10 ++ + arch/x86/kernel/cpu/bugs.c | 158 ++++++++++++++++++++++++++++++++++- + arch/x86/kernel/cpu/common.c | 4 + 4 files changed, 195 insertions(+), 5 deletions(-) + +--- a/Documentation/kernel-parameters.txt ++++ b/Documentation/kernel-parameters.txt +@@ -2691,6 +2691,11 @@ bytes respectively. Such letter suffixes + nosmt [KNL,S390] Disable symmetric multithreading (SMT). + Equivalent to smt=1. + ++ nospectre_v2 [X86] Disable all mitigations for the Spectre variant 2 ++ (indirect branch prediction) vulnerability. System may ++ allow data leaks with this option, which is equivalent ++ to spectre_v2=off. ++ + noxsave [BUGS=X86] Disables x86 extended register state save + and restore using xsave. The kernel will fallback to + enabling legacy floating-point and sse state. +@@ -3944,6 +3949,29 @@ bytes respectively. Such letter suffixes + sonypi.*= [HW] Sony Programmable I/O Control Device driver + See Documentation/laptops/sonypi.txt + ++ spectre_v2= [X86] Control mitigation of Spectre variant 2 ++ (indirect branch speculation) vulnerability. ++ ++ on - unconditionally enable ++ off - unconditionally disable ++ auto - kernel detects whether your CPU model is ++ vulnerable ++ ++ Selecting 'on' will, and 'auto' may, choose a ++ mitigation method at run time according to the ++ CPU, the available microcode, the setting of the ++ CONFIG_RETPOLINE configuration option, and the ++ compiler with which the kernel was built. ++ ++ Specific mitigations can also be selected manually: ++ ++ retpoline - replace indirect branches ++ retpoline,generic - google's original retpoline ++ retpoline,amd - AMD-specific minimal thunk ++ ++ Not specifying this option is equivalent to ++ spectre_v2=auto. 
++ + spia_io_base= [HW,MTD] + spia_fio_base= + spia_pedr= +--- a/arch/x86/include/asm/nospec-branch.h ++++ b/arch/x86/include/asm/nospec-branch.h +@@ -124,5 +124,15 @@ + # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) + #endif + ++/* The Spectre V2 mitigation variants */ ++enum spectre_v2_mitigation { ++ SPECTRE_V2_NONE, ++ SPECTRE_V2_RETPOLINE_MINIMAL, ++ SPECTRE_V2_RETPOLINE_MINIMAL_AMD, ++ SPECTRE_V2_RETPOLINE_GENERIC, ++ SPECTRE_V2_RETPOLINE_AMD, ++ SPECTRE_V2_IBRS, ++}; ++ + #endif /* __ASSEMBLY__ */ + #endif /* __NOSPEC_BRANCH_H__ */ +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -10,6 +10,9 @@ + #include <linux/init.h> + #include <linux/utsname.h> + #include <linux/cpu.h> ++ ++#include <asm/nospec-branch.h> ++#include <asm/cmdline.h> + #include <asm/bugs.h> + #include <asm/processor.h> + #include <asm/processor-flags.h> +@@ -20,6 +23,8 @@ + #include <asm/pgtable.h> + #include <asm/cacheflush.h> + ++static void __init spectre_v2_select_mitigation(void); ++ + void __init check_bugs(void) + { + identify_boot_cpu(); +@@ -29,6 +34,9 @@ void __init check_bugs(void) + print_cpu_info(&boot_cpu_data); + } + ++ /* Select the proper spectre mitigation before patching alternatives */ ++ spectre_v2_select_mitigation(); ++ + #ifdef CONFIG_X86_32 + /* + * Check whether we are able to run this kernel safely on SMP. +@@ -61,6 +69,153 @@ void __init check_bugs(void) + #endif + } + ++/* The kernel command line selection */ ++enum spectre_v2_mitigation_cmd { ++ SPECTRE_V2_CMD_NONE, ++ SPECTRE_V2_CMD_AUTO, ++ SPECTRE_V2_CMD_FORCE, ++ SPECTRE_V2_CMD_RETPOLINE, ++ SPECTRE_V2_CMD_RETPOLINE_GENERIC, ++ SPECTRE_V2_CMD_RETPOLINE_AMD, ++}; ++ ++static const char *spectre_v2_strings[] = { ++ [SPECTRE_V2_NONE] = "Vulnerable", ++ [SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline", ++ [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline", ++ [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", ++ [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", ++}; ++ ++#undef pr_fmt ++#define pr_fmt(fmt) "Spectre V2 mitigation: " fmt ++ ++static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; ++ ++static void __init spec2_print_if_insecure(const char *reason) ++{ ++ if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) ++ pr_info("%s\n", reason); ++} ++ ++static void __init spec2_print_if_secure(const char *reason) ++{ ++ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) ++ pr_info("%s\n", reason); ++} ++ ++static inline bool retp_compiler(void) ++{ ++ return __is_defined(RETPOLINE); ++} ++ ++static inline bool match_option(const char *arg, int arglen, const char *opt) ++{ ++ int len = strlen(opt); ++ ++ return len == arglen && !strncmp(arg, opt, len); ++} ++ ++static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) ++{ ++ char arg[20]; ++ int ret; ++ ++ ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, ++ sizeof(arg)); ++ if (ret > 0) { ++ if (match_option(arg, ret, "off")) { ++ goto disable; ++ } else if (match_option(arg, ret, "on")) { ++ spec2_print_if_secure("force enabled on command line."); ++ return SPECTRE_V2_CMD_FORCE; ++ } else if (match_option(arg, ret, "retpoline")) { ++ spec2_print_if_insecure("retpoline selected on command line."); ++ return SPECTRE_V2_CMD_RETPOLINE; ++ } else if (match_option(arg, ret, "retpoline,amd")) { ++ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { ++ pr_err("retpoline,amd selected but CPU is not AMD. 
Switching to AUTO select\n"); ++ return SPECTRE_V2_CMD_AUTO; ++ } ++ spec2_print_if_insecure("AMD retpoline selected on command line."); ++ return SPECTRE_V2_CMD_RETPOLINE_AMD; ++ } else if (match_option(arg, ret, "retpoline,generic")) { ++ spec2_print_if_insecure("generic retpoline selected on command line."); ++ return SPECTRE_V2_CMD_RETPOLINE_GENERIC; ++ } else if (match_option(arg, ret, "auto")) { ++ return SPECTRE_V2_CMD_AUTO; ++ } ++ } ++ ++ if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2")) ++ return SPECTRE_V2_CMD_AUTO; ++disable: ++ spec2_print_if_insecure("disabled on command line."); ++ return SPECTRE_V2_CMD_NONE; ++} ++ ++static void __init spectre_v2_select_mitigation(void) ++{ ++ enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); ++ enum spectre_v2_mitigation mode = SPECTRE_V2_NONE; ++ ++ /* ++ * If the CPU is not affected and the command line mode is NONE or AUTO ++ * then nothing to do. ++ */ ++ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && ++ (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO)) ++ return; ++ ++ switch (cmd) { ++ case SPECTRE_V2_CMD_NONE: ++ return; ++ ++ case SPECTRE_V2_CMD_FORCE: ++ /* FALLTRHU */ ++ case SPECTRE_V2_CMD_AUTO: ++ goto retpoline_auto; ++ ++ case SPECTRE_V2_CMD_RETPOLINE_AMD: ++ if (IS_ENABLED(CONFIG_RETPOLINE)) ++ goto retpoline_amd; ++ break; ++ case SPECTRE_V2_CMD_RETPOLINE_GENERIC: ++ if (IS_ENABLED(CONFIG_RETPOLINE)) ++ goto retpoline_generic; ++ break; ++ case SPECTRE_V2_CMD_RETPOLINE: ++ if (IS_ENABLED(CONFIG_RETPOLINE)) ++ goto retpoline_auto; ++ break; ++ } ++ pr_err("kernel not compiled with retpoline; no mitigation available!"); ++ return; ++ ++retpoline_auto: ++ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { ++ retpoline_amd: ++ if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { ++ pr_err("LFENCE not serializing. Switching to generic retpoline\n"); ++ goto retpoline_generic; ++ } ++ mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_AMD : ++ SPECTRE_V2_RETPOLINE_MINIMAL_AMD; ++ setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD); ++ setup_force_cpu_cap(X86_FEATURE_RETPOLINE); ++ } else { ++ retpoline_generic: ++ mode = retp_compiler() ? 
SPECTRE_V2_RETPOLINE_GENERIC : ++ SPECTRE_V2_RETPOLINE_MINIMAL; ++ setup_force_cpu_cap(X86_FEATURE_RETPOLINE); ++ } ++ ++ spectre_v2_enabled = mode; ++ pr_info("%s\n", spectre_v2_strings[mode]); ++} ++ ++#undef pr_fmt ++ + #ifdef CONFIG_SYSFS + ssize_t cpu_show_meltdown(struct device *dev, + struct device_attribute *attr, char *buf) +@@ -85,6 +240,7 @@ ssize_t cpu_show_spectre_v2(struct devic + { + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + return sprintf(buf, "Not affected\n"); +- return sprintf(buf, "Vulnerable\n"); ++ ++ return sprintf(buf, "%s\n", spectre_v2_strings[spectre_v2_enabled]); + } + #endif +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -889,10 +889,6 @@ static void __init early_identify_cpu(st + setup_force_cpu_bug(X86_BUG_SPECTRE_V1); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + +-#ifdef CONFIG_RETPOLINE +- setup_force_cpu_cap(X86_FEATURE_RETPOLINE); +-#endif +- + fpu__init_system(c); + + #ifdef CONFIG_X86_32 diff --git a/queue/x86-spectre-check-config_retpoline-in-command-line-parser.patch b/queue/x86-spectre-check-config_retpoline-in-command-line-parser.patch new file mode 100644 index 0000000..c2d046a --- /dev/null +++ b/queue/x86-spectre-check-config_retpoline-in-command-line-parser.patch @@ -0,0 +1,49 @@ +From foo@baz Thu Feb 8 03:30:27 CET 2018 +From: Dou Liyang <douly.fnst@cn.fujitsu.com> +Date: Tue, 30 Jan 2018 14:13:50 +0800 +Subject: x86/spectre: Check CONFIG_RETPOLINE in command line parser + +From: Dou Liyang <douly.fnst@cn.fujitsu.com> + +(cherry picked from commit 9471eee9186a46893726e22ebb54cade3f9bc043) + +The spectre_v2 option 'auto' does not check whether CONFIG_RETPOLINE is +enabled. As a consequence it fails to emit the appropriate warning and sets +feature flags which have no effect at all. + +Add the missing IS_ENABLED() check. 
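A minimal user-space sketch of the flow being fixed may help here (the HAVE_RETPOLINE macro stands in for IS_ENABLED(CONFIG_RETPOLINE), and the enum and function names are simplified stand-ins, not the kernel code): it shows why the FORCE/AUTO leg needs the same compile-time guard as the explicit retpoline selections.

    #include <stdio.h>

    /* Stand-in for IS_ENABLED(CONFIG_RETPOLINE); build with -DHAVE_RETPOLINE=1 */
    #ifndef HAVE_RETPOLINE
    #define HAVE_RETPOLINE 0
    #endif

    enum cmd { CMD_NONE, CMD_AUTO, CMD_FORCE, CMD_RETPOLINE };

    static const char *select_mitigation(enum cmd cmd)
    {
        switch (cmd) {
        case CMD_NONE:
            return "off";
        case CMD_FORCE:
        case CMD_AUTO:
            /* the fix: honour the compile-time option here as well */
            if (HAVE_RETPOLINE)
                return "retpoline (auto)";
            break;
        case CMD_RETPOLINE:
            if (HAVE_RETPOLINE)
                return "retpoline (explicit)";
            break;
        }
        return "not compiled with retpoline; no mitigation available";
    }

    int main(void)
    {
        printf("auto  -> %s\n", select_mitigation(CMD_AUTO));
        printf("force -> %s\n", select_mitigation(CMD_FORCE));
        return 0;
    }

Without the guard on the AUTO/FORCE leg, a kernel built without retpoline support would still claim a mitigation was selected.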
+ +Fixes: da285121560e ("x86/spectre: Add boot time option to select Spectre v2 mitigation") +Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: ak@linux.intel.com +Cc: peterz@infradead.org +Cc: Tomohiro <misono.tomohiro@jp.fujitsu.com> +Cc: dave.hansen@intel.com +Cc: bp@alien8.de +Cc: arjan@linux.intel.com +Cc: dwmw@amazon.co.uk +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/f5892721-7528-3647-08fb-f8d10e65ad87@cn.fujitsu.com +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kernel/cpu/bugs.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -212,10 +212,10 @@ static void __init spectre_v2_select_mit + return; + + case SPECTRE_V2_CMD_FORCE: +- /* FALLTRHU */ + case SPECTRE_V2_CMD_AUTO: +- goto retpoline_auto; +- ++ if (IS_ENABLED(CONFIG_RETPOLINE)) ++ goto retpoline_auto; ++ break; + case SPECTRE_V2_CMD_RETPOLINE_AMD: + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_amd; diff --git a/queue/x86-spectre-fix-spelling-mistake-vunerable-vulnerable.patch b/queue/x86-spectre-fix-spelling-mistake-vunerable-vulnerable.patch new file mode 100644 index 0000000..9c55ad7 --- /dev/null +++ b/queue/x86-spectre-fix-spelling-mistake-vunerable-vulnerable.patch @@ -0,0 +1,38 @@ +From foo@baz Thu Feb 8 03:32:24 CET 2018 +From: Colin Ian King <colin.king@canonical.com> +Date: Tue, 30 Jan 2018 19:32:18 +0000 +Subject: x86/spectre: Fix spelling mistake: "vunerable"-> "vulnerable" + +From: Colin Ian King <colin.king@canonical.com> + + +(cherry picked from commit e698dcdfcda41efd0984de539767b4cddd235f1e) + +Trivial fix to spelling mistake in pr_err error message text. + +Signed-off-by: Colin Ian King <colin.king@canonical.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Cc: kernel-janitors@vger.kernel.org +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bp@suse.de> +Cc: David Woodhouse <dwmw@amazon.co.uk> +Link: https://lkml.kernel.org/r/20180130193218.9271-1-colin.king@canonical.com +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kernel/cpu/bugs.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -102,7 +102,7 @@ bool retpoline_module_ok(bool has_retpol + if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline) + return true; + +- pr_err("System may be vunerable to spectre v2\n"); ++ pr_err("System may be vulnerable to spectre v2\n"); + spectre_v2_bad_module = true; + return false; + } diff --git a/queue/x86-spectre-report-get_user-mitigation-for-spectre_v1.patch b/queue/x86-spectre-report-get_user-mitigation-for-spectre_v1.patch new file mode 100644 index 0000000..1388be9 --- /dev/null +++ b/queue/x86-spectre-report-get_user-mitigation-for-spectre_v1.patch @@ -0,0 +1,41 @@ +From foo@baz Thu Feb 8 03:32:24 CET 2018 +From: Dan Williams <dan.j.williams@intel.com> +Date: Mon, 29 Jan 2018 17:03:21 -0800 +Subject: x86/spectre: Report get_user mitigation for spectre_v1 + +From: Dan Williams <dan.j.williams@intel.com> + + +(cherry picked from commit edfbae53dab8348fca778531be9f4855d2ca0360) + +Reflect the presence of get_user(), __get_user(), and 'syscall' protections +in sysfs. 
The expectation is that new and better tooling will allow the +kernel to grow more usages of array_index_nospec(), for now, only claim +mitigation for __user pointer de-references. + +Reported-by: Jiri Slaby <jslaby@suse.cz> +Signed-off-by: Dan Williams <dan.j.williams@intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: linux-arch@vger.kernel.org +Cc: kernel-hardening@lists.openwall.com +Cc: gregkh@linuxfoundation.org +Cc: torvalds@linux-foundation.org +Cc: alan@linux.intel.com +Link: https://lkml.kernel.org/r/151727420158.33451.11658324346540434635.stgit@dwillia2-desk3.amr.corp.intel.com +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kernel/cpu/bugs.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -296,7 +296,7 @@ ssize_t cpu_show_spectre_v1(struct devic + { + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1)) + return sprintf(buf, "Not affected\n"); +- return sprintf(buf, "Vulnerable\n"); ++ return sprintf(buf, "Mitigation: __user pointer sanitization\n"); + } + + ssize_t cpu_show_spectre_v2(struct device *dev, diff --git a/queue/x86-spectre-simplify-spectre_v2-command-line-parsing.patch b/queue/x86-spectre-simplify-spectre_v2-command-line-parsing.patch new file mode 100644 index 0000000..bef7737 --- /dev/null +++ b/queue/x86-spectre-simplify-spectre_v2-command-line-parsing.patch @@ -0,0 +1,138 @@ +From foo@baz Thu Feb 8 03:32:24 CET 2018 +From: KarimAllah Ahmed <karahmed@amazon.de> +Date: Thu, 1 Feb 2018 11:27:21 +0000 +Subject: x86/spectre: Simplify spectre_v2 command line parsing + +From: KarimAllah Ahmed <karahmed@amazon.de> + + +(cherry picked from commit 9005c6834c0ffdfe46afa76656bd9276cca864f6) + +[dwmw2: Use ARRAY_SIZE] + +Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de> +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: peterz@infradead.org +Cc: bp@alien8.de +Link: https://lkml.kernel.org/r/1517484441-1420-3-git-send-email-dwmw@amazon.co.uk +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kernel/cpu/bugs.c | 84 +++++++++++++++++++++++++++++---------------- + 1 file changed, 55 insertions(+), 29 deletions(-) + +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -118,13 +118,13 @@ static inline const char *spectre_v2_mod + static void __init spec2_print_if_insecure(const char *reason) + { + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) +- pr_info("%s\n", reason); ++ pr_info("%s selected on command line.\n", reason); + } + + static void __init spec2_print_if_secure(const char *reason) + { + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) +- pr_info("%s\n", reason); ++ pr_info("%s selected on command line.\n", reason); + } + + static inline bool retp_compiler(void) +@@ -139,42 +139,68 @@ static inline bool match_option(const ch + return len == arglen && !strncmp(arg, opt, len); + } + ++static const struct { ++ const char *option; ++ enum spectre_v2_mitigation_cmd cmd; ++ bool secure; ++} mitigation_options[] = { ++ { "off", SPECTRE_V2_CMD_NONE, false }, ++ { "on", SPECTRE_V2_CMD_FORCE, true }, ++ { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, ++ { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false }, ++ { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, ++ { "auto", SPECTRE_V2_CMD_AUTO, false }, ++}; ++ + static enum 
spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) + { + char arg[20]; +- int ret; ++ int ret, i; ++ enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO; ++ ++ if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) ++ return SPECTRE_V2_CMD_NONE; ++ else { ++ ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, ++ sizeof(arg)); ++ if (ret < 0) ++ return SPECTRE_V2_CMD_AUTO; + +- ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, +- sizeof(arg)); +- if (ret > 0) { +- if (match_option(arg, ret, "off")) { +- goto disable; +- } else if (match_option(arg, ret, "on")) { +- spec2_print_if_secure("force enabled on command line."); +- return SPECTRE_V2_CMD_FORCE; +- } else if (match_option(arg, ret, "retpoline")) { +- spec2_print_if_insecure("retpoline selected on command line."); +- return SPECTRE_V2_CMD_RETPOLINE; +- } else if (match_option(arg, ret, "retpoline,amd")) { +- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { +- pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n"); +- return SPECTRE_V2_CMD_AUTO; +- } +- spec2_print_if_insecure("AMD retpoline selected on command line."); +- return SPECTRE_V2_CMD_RETPOLINE_AMD; +- } else if (match_option(arg, ret, "retpoline,generic")) { +- spec2_print_if_insecure("generic retpoline selected on command line."); +- return SPECTRE_V2_CMD_RETPOLINE_GENERIC; +- } else if (match_option(arg, ret, "auto")) { ++ for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { ++ if (!match_option(arg, ret, mitigation_options[i].option)) ++ continue; ++ cmd = mitigation_options[i].cmd; ++ break; ++ } ++ ++ if (i >= ARRAY_SIZE(mitigation_options)) { ++ pr_err("unknown option (%s). Switching to AUTO select\n", ++ mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + } + +- if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2")) ++ if ((cmd == SPECTRE_V2_CMD_RETPOLINE || ++ cmd == SPECTRE_V2_CMD_RETPOLINE_AMD || ++ cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) && ++ !IS_ENABLED(CONFIG_RETPOLINE)) { ++ pr_err("%s selected but not compiled in. Switching to AUTO select\n", ++ mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; +-disable: +- spec2_print_if_insecure("disabled on command line."); +- return SPECTRE_V2_CMD_NONE; ++ } ++ ++ if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD && ++ boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { ++ pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n"); ++ return SPECTRE_V2_CMD_AUTO; ++ } ++ ++ if (mitigation_options[i].secure) ++ spec2_print_if_secure(mitigation_options[i].option); ++ else ++ spec2_print_if_insecure(mitigation_options[i].option); ++ ++ return cmd; + } + + /* Check for Skylake-like CPUs (for RSB handling) */ diff --git a/queue/x86-speculation-add-basic-ibpb-indirect-branch-prediction-barrier-support.patch b/queue/x86-speculation-add-basic-ibpb-indirect-branch-prediction-barrier-support.patch new file mode 100644 index 0000000..676d99d --- /dev/null +++ b/queue/x86-speculation-add-basic-ibpb-indirect-branch-prediction-barrier-support.patch @@ -0,0 +1,94 @@ +From foo@baz Thu Feb 8 03:30:27 CET 2018 +From: David Woodhouse <dwmw@amazon.co.uk> +Date: Thu, 25 Jan 2018 16:14:15 +0000 +Subject: x86/speculation: Add basic IBPB (Indirect Branch Prediction Barrier) support + +From: David Woodhouse <dwmw@amazon.co.uk> + +(cherry picked from commit 20ffa1caecca4db8f79fe665acdeaa5af815a24d) + +Expose indirect_branch_prediction_barrier() for use in subsequent patches. 
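As a rough illustration of what the new helper's WRMSR amounts to, here is a user-space sketch using the /dev/cpu/N/msr interface (it assumes msr.ko is loaded, root privileges, and a CPU/microcode combination that actually supports IBPB; the MSR number and bit match the kernel's MSR_IA32_PRED_CMD and PRED_CMD_IBPB definitions). The in-kernel helper issues the same write through a single alternatives-patched WRMSR.

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    #define MSR_IA32_PRED_CMD   0x49            /* same value the kernel uses */
    #define PRED_CMD_IBPB       (1ULL << 0)

    int main(void)
    {
        uint64_t val = PRED_CMD_IBPB;
        int fd = open("/dev/cpu/0/msr", O_WRONLY);

        if (fd < 0) {
            perror("open /dev/cpu/0/msr");
            return 1;
        }
        /* Writing PRED_CMD with bit 0 set discards indirect branch predictions */
        if (pwrite(fd, &val, sizeof(val), MSR_IA32_PRED_CMD) != sizeof(val)) {
            perror("wrmsr PRED_CMD");
            close(fd);
            return 1;
        }
        close(fd);
        puts("IBPB issued on CPU 0");
        return 0;
    }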
+ +[ tglx: Add IBPB status to spectre_v2 sysfs file ] + +Co-developed-by: KarimAllah Ahmed <karahmed@amazon.de> +Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de> +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Cc: gnomes@lxorguk.ukuu.org.uk +Cc: ak@linux.intel.com +Cc: ashok.raj@intel.com +Cc: dave.hansen@intel.com +Cc: arjan@linux.intel.com +Cc: torvalds@linux-foundation.org +Cc: peterz@infradead.org +Cc: bp@alien8.de +Cc: pbonzini@redhat.com +Cc: tim.c.chen@linux.intel.com +Cc: gregkh@linux-foundation.org +Link: https://lkml.kernel.org/r/1516896855-7642-8-git-send-email-dwmw@amazon.co.uk +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/cpufeatures.h | 2 ++ + arch/x86/include/asm/nospec-branch.h | 13 +++++++++++++ + arch/x86/kernel/cpu/bugs.c | 10 +++++++++- + 3 files changed, 24 insertions(+), 1 deletion(-) + +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -202,6 +202,8 @@ + /* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ + #define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ + ++#define X86_FEATURE_IBPB ( 7*32+21) /* Indirect Branch Prediction Barrier enabled*/ ++ + /* Virtualization flags: Linux defined, word 8 */ + #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ + #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ +--- a/arch/x86/include/asm/nospec-branch.h ++++ b/arch/x86/include/asm/nospec-branch.h +@@ -218,5 +218,18 @@ static inline void vmexit_fill_RSB(void) + #endif + } + ++static inline void indirect_branch_prediction_barrier(void) ++{ ++ asm volatile(ALTERNATIVE("", ++ "movl %[msr], %%ecx\n\t" ++ "movl %[val], %%eax\n\t" ++ "movl $0, %%edx\n\t" ++ "wrmsr", ++ X86_FEATURE_IBPB) ++ : : [msr] "i" (MSR_IA32_PRED_CMD), ++ [val] "i" (PRED_CMD_IBPB) ++ : "eax", "ecx", "edx", "memory"); ++} ++ + #endif /* __ASSEMBLY__ */ + #endif /* __NOSPEC_BRANCH_H__ */ +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -262,6 +262,13 @@ retpoline_auto: + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); + pr_info("Filling RSB on context switch\n"); + } ++ ++ /* Initialize Indirect Branch Prediction Barrier if supported */ ++ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL) || ++ boot_cpu_has(X86_FEATURE_AMD_PRED_CMD)) { ++ setup_force_cpu_cap(X86_FEATURE_IBPB); ++ pr_info("Enabling Indirect Branch Prediction Barrier\n"); ++ } + } + + #undef pr_fmt +@@ -291,7 +298,8 @@ ssize_t cpu_show_spectre_v2(struct devic + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + return sprintf(buf, "Not affected\n"); + +- return sprintf(buf, "%s%s\n", spectre_v2_strings[spectre_v2_enabled], ++ return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], ++ boot_cpu_has(X86_FEATURE_IBPB) ? ", IPBP" : "", + spectre_v2_bad_module ? 
" - vulnerable module loaded" : ""); + } + #endif diff --git a/queue/x86-speculation-fix-typo-ibrs_att-which-should-be-ibrs_all.patch b/queue/x86-speculation-fix-typo-ibrs_att-which-should-be-ibrs_all.patch new file mode 100644 index 0000000..2446b18 --- /dev/null +++ b/queue/x86-speculation-fix-typo-ibrs_att-which-should-be-ibrs_all.patch @@ -0,0 +1,38 @@ +From foo@baz Thu Feb 8 03:33:09 CET 2018 +From: Darren Kenny <darren.kenny@oracle.com> +Date: Fri, 2 Feb 2018 19:12:20 +0000 +Subject: x86/speculation: Fix typo IBRS_ATT, which should be IBRS_ALL + +From: Darren Kenny <darren.kenny@oracle.com> + + +(cherry picked from commit af189c95a371b59f493dbe0f50c0a09724868881) + +Fixes: 117cc7a908c83 ("x86/retpoline: Fill return stack buffer on vmexit") +Signed-off-by: Darren Kenny <darren.kenny@oracle.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> +Cc: Tom Lendacky <thomas.lendacky@amd.com> +Cc: Andi Kleen <ak@linux.intel.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Masami Hiramatsu <mhiramat@kernel.org> +Cc: Arjan van de Ven <arjan@linux.intel.com> +Cc: David Woodhouse <dwmw@amazon.co.uk> +Link: https://lkml.kernel.org/r/20180202191220.blvgkgutojecxr3b@starbug-vm.ie.oracle.com +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/nospec-branch.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/include/asm/nospec-branch.h ++++ b/arch/x86/include/asm/nospec-branch.h +@@ -150,7 +150,7 @@ extern char __indirect_thunk_end[]; + * On VMEXIT we must ensure that no RSB predictions learned in the guest + * can be followed in the host, by overwriting the RSB completely. Both + * retpoline and IBRS mitigations for Spectre v2 need this; only on future +- * CPUs with IBRS_ATT *might* it be avoided. ++ * CPUs with IBRS_ALL *might* it be avoided. + */ + static inline void vmexit_fill_RSB(void) + { diff --git a/queue/x86-syscall-sanitize-syscall-table-de-references-under-speculation.patch b/queue/x86-syscall-sanitize-syscall-table-de-references-under-speculation.patch new file mode 100644 index 0000000..cce824f --- /dev/null +++ b/queue/x86-syscall-sanitize-syscall-table-de-references-under-speculation.patch @@ -0,0 +1,61 @@ +From foo@baz Thu Feb 8 03:32:24 CET 2018 +From: Dan Williams <dan.j.williams@intel.com> +Date: Mon, 29 Jan 2018 17:02:59 -0800 +Subject: x86/syscall: Sanitize syscall table de-references under speculation + +From: Dan Williams <dan.j.williams@intel.com> + + +(cherry picked from commit 2fbd7af5af8665d18bcefae3e9700be07e22b681) + +The syscall table base is a user controlled function pointer in kernel +space. Use array_index_nospec() to prevent any out of bounds speculation. + +While retpoline prevents speculating into a userspace directed target it +does not stop the pointer de-reference, the concern is leaking memory +relative to the syscall table base, by observing instruction cache +behavior. 
+ +Reported-by: Linus Torvalds <torvalds@linux-foundation.org> +Signed-off-by: Dan Williams <dan.j.williams@intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: linux-arch@vger.kernel.org +Cc: kernel-hardening@lists.openwall.com +Cc: gregkh@linuxfoundation.org +Cc: Andy Lutomirski <luto@kernel.org> +Cc: alan@linux.intel.com +Link: https://lkml.kernel.org/r/151727417984.33451.1216731042505722161.stgit@dwillia2-desk3.amr.corp.intel.com +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/entry/common.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/arch/x86/entry/common.c ++++ b/arch/x86/entry/common.c +@@ -20,6 +20,7 @@ + #include <linux/export.h> + #include <linux/context_tracking.h> + #include <linux/user-return-notifier.h> ++#include <linux/nospec.h> + #include <linux/uprobes.h> + + #include <asm/desc.h> +@@ -277,7 +278,8 @@ __visible void do_syscall_64(struct pt_r + * regs->orig_ax, which changes the behavior of some syscalls. + */ + if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) { +- regs->ax = sys_call_table[nr & __SYSCALL_MASK]( ++ nr = array_index_nospec(nr & __SYSCALL_MASK, NR_syscalls); ++ regs->ax = sys_call_table[nr]( + regs->di, regs->si, regs->dx, + regs->r10, regs->r8, regs->r9); + } +@@ -313,6 +315,7 @@ static __always_inline void do_syscall_3 + } + + if (likely(nr < IA32_NR_syscalls)) { ++ nr = array_index_nospec(nr, IA32_NR_syscalls); + /* + * It's possible that a 32-bit syscall implementation + * takes a 64-bit parameter but nonetheless assumes that diff --git a/queue/x86-tlb-drop-the-_gpl-from-the-cpu_tlbstate-export.patch b/queue/x86-tlb-drop-the-_gpl-from-the-cpu_tlbstate-export.patch new file mode 100644 index 0000000..9d1ad97 --- /dev/null +++ b/queue/x86-tlb-drop-the-_gpl-from-the-cpu_tlbstate-export.patch @@ -0,0 +1,48 @@ +From 1e5476815fd7f98b888e01a0f9522b63085f96c9 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Thu, 4 Jan 2018 22:19:04 +0100 +Subject: x86/tlb: Drop the _GPL from the cpu_tlbstate export + +From: Thomas Gleixner <tglx@linutronix.de> + +commit 1e5476815fd7f98b888e01a0f9522b63085f96c9 upstream. + +The recent changes for PTI touch cpu_tlbstate from various tlb_flush +inlines. cpu_tlbstate is exported as GPL symbol, so this causes a +regression when building out of tree drivers for certain graphics cards. + +Aside of that the export was wrong since it was introduced as it should +have been EXPORT_PER_CPU_SYMBOL_GPL(). + +Use the correct PER_CPU export and drop the _GPL to restore the previous +state which allows users to utilize the cards they payed for. + +As always I'm really thrilled to make this kind of change to support the +#friends (or however the hot hashtag of today is spelled) from that closet +sauce graphics corp. 
+ +Fixes: 1e02ce4cccdc ("x86: Store a per-cpu shadow copy of CR4") +Fixes: 6fd166aae78c ("x86/mm: Use/Fix PCID to optimize user/kernel switches") +Reported-by: Kees Cook <keescook@google.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Thomas Backlund <tmb@mageia.org> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/mm/init.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -768,7 +768,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb + .state = 0, + .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ + }; +-EXPORT_SYMBOL_GPL(cpu_tlbstate); ++EXPORT_PER_CPU_SYMBOL(cpu_tlbstate); + + void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) + { diff --git a/queue/x86-uaccess-use-__uaccess_begin_nospec-and-uaccess_try_nospec.patch b/queue/x86-uaccess-use-__uaccess_begin_nospec-and-uaccess_try_nospec.patch new file mode 100644 index 0000000..4a3fa28 --- /dev/null +++ b/queue/x86-uaccess-use-__uaccess_begin_nospec-and-uaccess_try_nospec.patch @@ -0,0 +1,187 @@ +From foo@baz Thu Feb 8 03:32:24 CET 2018 +From: Dan Williams <dan.j.williams@intel.com> +Date: Mon, 29 Jan 2018 17:02:49 -0800 +Subject: x86/uaccess: Use __uaccess_begin_nospec() and uaccess_try_nospec + +From: Dan Williams <dan.j.williams@intel.com> + + +(cherry picked from commit 304ec1b050310548db33063e567123fae8fd0301) + +Quoting Linus: + + I do think that it would be a good idea to very expressly document + the fact that it's not that the user access itself is unsafe. I do + agree that things like "get_user()" want to be protected, but not + because of any direct bugs or problems with get_user() and friends, + but simply because get_user() is an excellent source of a pointer + that is obviously controlled from a potentially attacking user + space. So it's a prime candidate for then finding _subsequent_ + accesses that can then be used to perturb the cache. + +__uaccess_begin_nospec() covers __get_user() and copy_from_iter() where the +limit check is far away from the user pointer de-reference. In those cases +a barrier_nospec() prevents speculation with a potential pointer to +privileged memory. uaccess_try_nospec covers get_user_try. 
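To make the shape of the hazard concrete, a small user-space sketch (x86-64 assumed; the lfence stands in for the kernel's barrier_nospec(), and the buffer and length names are made up) shows a bounds check separated from the access that consumes the untrusted value, with the serializing barrier placed between the two, as __uaccess_begin_nospec() arranges for the real user-copy paths.

    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    #define BUF_MAX 64

    static char kernel_buf[BUF_MAX];

    static inline void barrier_nospec(void)
    {
        asm volatile("lfence" ::: "memory");    /* stop speculative loads */
    }

    /* len comes from an untrusted caller, like the value a get_user() returns */
    static int copy_in(const char *src, size_t len)
    {
        if (len > BUF_MAX)
            return -1;
        /*
         * Without the barrier the CPU may speculate past the check above
         * and issue loads based on an out-of-range len.
         */
        barrier_nospec();
        memcpy(kernel_buf, src, len);
        return 0;
    }

    int main(void)
    {
        const char msg[] = "hello";

        if (copy_in(msg, sizeof(msg)) == 0)
            printf("copied: %s\n", kernel_buf);
        if (copy_in(msg, 1024) < 0)
            puts("rejected oversized copy");
        return 0;
    }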
+ +Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> +Suggested-by: Andi Kleen <ak@linux.intel.com> +Signed-off-by: Dan Williams <dan.j.williams@intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: linux-arch@vger.kernel.org +Cc: Kees Cook <keescook@chromium.org> +Cc: kernel-hardening@lists.openwall.com +Cc: gregkh@linuxfoundation.org +Cc: Al Viro <viro@zeniv.linux.org.uk> +Cc: alan@linux.intel.com +Link: https://lkml.kernel.org/r/151727416953.33451.10508284228526170604.stgit@dwillia2-desk3.amr.corp.intel.com +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/include/asm/uaccess.h | 6 +++--- + arch/x86/include/asm/uaccess_32.h | 12 ++++++------ + arch/x86/include/asm/uaccess_64.h | 12 ++++++------ + arch/x86/lib/usercopy_32.c | 4 ++-- + 4 files changed, 17 insertions(+), 17 deletions(-) + +--- a/arch/x86/include/asm/uaccess.h ++++ b/arch/x86/include/asm/uaccess.h +@@ -437,7 +437,7 @@ do { \ + ({ \ + int __gu_err; \ + __inttype(*(ptr)) __gu_val; \ +- __uaccess_begin(); \ ++ __uaccess_begin_nospec(); \ + __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ + __uaccess_end(); \ + (x) = (__force __typeof__(*(ptr)))__gu_val; \ +@@ -547,7 +547,7 @@ struct __large_struct { unsigned long bu + * get_user_ex(...); + * } get_user_catch(err) + */ +-#define get_user_try uaccess_try ++#define get_user_try uaccess_try_nospec + #define get_user_catch(err) uaccess_catch(err) + + #define get_user_ex(x, ptr) do { \ +@@ -582,7 +582,7 @@ extern void __cmpxchg_wrong_size(void) + __typeof__(ptr) __uval = (uval); \ + __typeof__(*(ptr)) __old = (old); \ + __typeof__(*(ptr)) __new = (new); \ +- __uaccess_begin(); \ ++ __uaccess_begin_nospec(); \ + switch (size) { \ + case 1: \ + { \ +--- a/arch/x86/include/asm/uaccess_32.h ++++ b/arch/x86/include/asm/uaccess_32.h +@@ -102,17 +102,17 @@ __copy_from_user(void *to, const void __ + + switch (n) { + case 1: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_size(*(u8 *)to, from, 1, ret, 1); + __uaccess_end(); + return ret; + case 2: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_size(*(u16 *)to, from, 2, ret, 2); + __uaccess_end(); + return ret; + case 4: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_size(*(u32 *)to, from, 4, ret, 4); + __uaccess_end(); + return ret; +@@ -130,17 +130,17 @@ static __always_inline unsigned long __c + + switch (n) { + case 1: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_size(*(u8 *)to, from, 1, ret, 1); + __uaccess_end(); + return ret; + case 2: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_size(*(u16 *)to, from, 2, ret, 2); + __uaccess_end(); + return ret; + case 4: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_size(*(u32 *)to, from, 4, ret, 4); + __uaccess_end(); + return ret; +--- a/arch/x86/include/asm/uaccess_64.h ++++ b/arch/x86/include/asm/uaccess_64.h +@@ -59,31 +59,31 @@ int __copy_from_user_nocheck(void *dst, + return copy_user_generic(dst, (__force void *)src, size); + switch (size) { + case 1: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm(*(u8 *)dst, (u8 __user *)src, + ret, "b", "b", "=q", 1); + __uaccess_end(); + return ret; + case 2: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm(*(u16 *)dst, (u16 __user *)src, + ret, "w", "w", "=r", 2); + __uaccess_end(); + return ret; + case 4: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm(*(u32 *)dst, (u32 
__user *)src, + ret, "l", "k", "=r", 4); + __uaccess_end(); + return ret; + case 8: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm(*(u64 *)dst, (u64 __user *)src, + ret, "q", "", "=r", 8); + __uaccess_end(); + return ret; + case 10: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm(*(u64 *)dst, (u64 __user *)src, + ret, "q", "", "=r", 10); + if (likely(!ret)) +@@ -93,7 +93,7 @@ int __copy_from_user_nocheck(void *dst, + __uaccess_end(); + return ret; + case 16: +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + __get_user_asm(*(u64 *)dst, (u64 __user *)src, + ret, "q", "", "=r", 16); + if (likely(!ret)) +--- a/arch/x86/lib/usercopy_32.c ++++ b/arch/x86/lib/usercopy_32.c +@@ -570,7 +570,7 @@ do { \ + unsigned long __copy_to_user_ll(void __user *to, const void *from, + unsigned long n) + { +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + if (movsl_is_ok(to, from, n)) + __copy_user(to, from, n); + else +@@ -627,7 +627,7 @@ EXPORT_SYMBOL(__copy_from_user_ll_nocach + unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from, + unsigned long n) + { +- __uaccess_begin(); ++ __uaccess_begin_nospec(); + #ifdef CONFIG_X86_INTEL_USERCOPY + if (n > 64 && static_cpu_has(X86_FEATURE_XMM2)) + n = __copy_user_intel_nocache(to, from, n); diff --git a/queue/x86-usercopy-replace-open-coded-stac-clac-with-__uaccess_-begin-end.patch b/queue/x86-usercopy-replace-open-coded-stac-clac-with-__uaccess_-begin-end.patch new file mode 100644 index 0000000..bf11e7c --- /dev/null +++ b/queue/x86-usercopy-replace-open-coded-stac-clac-with-__uaccess_-begin-end.patch @@ -0,0 +1,70 @@ +From foo@baz Thu Feb 8 03:32:24 CET 2018 +From: Dan Williams <dan.j.williams@intel.com> +Date: Mon, 29 Jan 2018 17:02:44 -0800 +Subject: x86/usercopy: Replace open coded stac/clac with __uaccess_{begin, end} + +From: Dan Williams <dan.j.williams@intel.com> + + +(cherry picked from commit b5c4ae4f35325d520b230bab6eb3310613b72ac1) + +In preparation for converting some __uaccess_begin() instances to +__uacess_begin_nospec(), make sure all 'from user' uaccess paths are +using the _begin(), _end() helpers rather than open-coded stac() and +clac(). + +No functional changes. 
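A tiny user-space sketch of the motivation may be useful (the stac()/clac() stand-ins and the helper names below are hypothetical no-ops, not the real instructions): once every 'from user' path goes through a single begin/end pair, the later speculation fix becomes a helper-name change at the affected call sites rather than surgery on open-coded stac()/clac() sequences.

    #include <stdio.h>
    #include <string.h>

    static inline void stac(void) { }   /* would allow user access under SMAP */
    static inline void clac(void) { }   /* would disallow it again */
    static inline void barrier_nospec(void) { asm volatile("" ::: "memory"); }

    /* Step 1 (this patch): every copy path uses the helpers */
    #define uaccess_begin()         stac()
    #define uaccess_end()           clac()
    /* Step 2 (follow-up): affected sites switch to the _nospec variant */
    #define uaccess_begin_nospec()  do { stac(); barrier_nospec(); } while (0)

    static unsigned long copy_from_user_sketch(void *to, const void *from,
                                               unsigned long n)
    {
        uaccess_begin();            /* was: stac(); later: uaccess_begin_nospec(); */
        memcpy(to, from, n);
        uaccess_end();              /* was: clac(); */
        return 0;
    }

    int main(void)
    {
        char dst[8] = "";

        copy_from_user_sketch(dst, "abc", 4);
        printf("copied: %s\n", dst);
        return 0;
    }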
+ +Suggested-by: Ingo Molnar <mingo@redhat.com> +Signed-off-by: Dan Williams <dan.j.williams@intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: linux-arch@vger.kernel.org +Cc: Tom Lendacky <thomas.lendacky@amd.com> +Cc: Kees Cook <keescook@chromium.org> +Cc: kernel-hardening@lists.openwall.com +Cc: gregkh@linuxfoundation.org +Cc: Al Viro <viro@zeniv.linux.org.uk> +Cc: torvalds@linux-foundation.org +Cc: alan@linux.intel.com +Link: https://lkml.kernel.org/r/151727416438.33451.17309465232057176966.stgit@dwillia2-desk3.amr.corp.intel.com +Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/lib/usercopy_32.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/arch/x86/lib/usercopy_32.c ++++ b/arch/x86/lib/usercopy_32.c +@@ -570,12 +570,12 @@ do { \ + unsigned long __copy_to_user_ll(void __user *to, const void *from, + unsigned long n) + { +- stac(); ++ __uaccess_begin(); + if (movsl_is_ok(to, from, n)) + __copy_user(to, from, n); + else + n = __copy_user_intel(to, from, n); +- clac(); ++ __uaccess_end(); + return n; + } + EXPORT_SYMBOL(__copy_to_user_ll); +@@ -627,7 +627,7 @@ EXPORT_SYMBOL(__copy_from_user_ll_nocach + unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from, + unsigned long n) + { +- stac(); ++ __uaccess_begin(); + #ifdef CONFIG_X86_INTEL_USERCOPY + if (n > 64 && static_cpu_has(X86_FEATURE_XMM2)) + n = __copy_user_intel_nocache(to, from, n); +@@ -636,7 +636,7 @@ unsigned long __copy_from_user_ll_nocach + #else + __copy_user(to, from, n); + #endif +- clac(); ++ __uaccess_end(); + return n; + } + EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero); diff --git a/queue/x86-vm86-32-switch-to-flush_tlb_mm_range-in-mark_screen_rdonly.patch b/queue/x86-vm86-32-switch-to-flush_tlb_mm_range-in-mark_screen_rdonly.patch new file mode 100644 index 0000000..ba2096a --- /dev/null +++ b/queue/x86-vm86-32-switch-to-flush_tlb_mm_range-in-mark_screen_rdonly.patch @@ -0,0 +1,50 @@ +From 9ccee2373f0658f234727700e619df097ba57023 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Sat, 22 Apr 2017 00:01:19 -0700 +Subject: x86/vm86/32: Switch to flush_tlb_mm_range() in mark_screen_rdonly() + +From: Andy Lutomirski <luto@kernel.org> + +commit 9ccee2373f0658f234727700e619df097ba57023 upstream. + +mark_screen_rdonly() is the last remaining caller of flush_tlb(). +flush_tlb_mm_range() is potentially faster and isn't obsolete. + +Compile-tested only because I don't know whether software that uses +this mechanism even exists. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Michal Hocko <mhocko@suse.com> +Cc: Nadav Amit <namit@vmware.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Rik van Riel <riel@redhat.com> +Cc: Sasha Levin <sasha.levin@oracle.com> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/791a644076fc3577ba7f7b7cafd643cc089baa7d.1492844372.git.luto@kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Cc: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> + +--- + arch/x86/kernel/vm86_32.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/kernel/vm86_32.c ++++ b/arch/x86/kernel/vm86_32.c +@@ -191,7 +191,7 @@ static void mark_screen_rdonly(struct mm + pte_unmap_unlock(pte, ptl); + out: + up_write(&mm->mmap_sem); +- flush_tlb(); ++ flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL); + } + + |