author | Paul Gortmaker <paul.gortmaker@windriver.com> | 2018-08-01 11:25:06 -0400
committer | Paul Gortmaker <paul.gortmaker@windriver.com> | 2018-08-01 11:25:06 -0400
commit | c691029d42e0fba69676b12803bb1526d4a46ff9
tree | 564f15aeae92c5a7da186a1710e9332f0dbd9b4b
parent | d9d5980c21367e9e1d07a808cba8024c29d6250e
download | longterm-queue-4.12-c691029d42e0fba69676b12803bb1526d4a46ff9.tar.gz
queue more prospective patches to be audited
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
74 files changed, 7226 insertions(+), 0 deletions(-)
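For context on the first patch queued below (ACPI/APEI ERST): the bug it fixes is a classic signed-versus-unsigned comparison pitfall. Comparing a signed length against sizeof() promotes a negative error value to a huge unsigned number, so the error guard never fires. The following minimal userspace sketch reproduces the failure mode; struct rcd and the value -2 are illustrative stand-ins, not the kernel's own code.

#include <stdio.h>

struct rcd { char data[16]; };	/* stand-in for the real ERST record struct */

int main(void)
{
	long len = -2;	/* pretend the reader returned a negative error code */

	/*
	 * sizeof() has type size_t, so the usual arithmetic conversions
	 * make this an unsigned comparison: the negative len converts to
	 * a huge unsigned value and the "too short" guard never fires.
	 */
	if (len < sizeof(struct rcd))
		printf("short read caught\n");
	else
		printf("negative len slipped through\n");

	/* the queued fix adds an explicit signed check first */
	if (len < 0 || (size_t)len < sizeof(struct rcd))
		printf("error caught\n");

	return 0;
}

Compiled as-is, this prints "negative len slipped through" from the first test, which is exactly why the queued fix adds the explicit len < 0 check before the sizeof() comparison.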
diff --git a/queue/ACPI-APEI-ERST-Fix-missing-error-handling-in-erst_re.patch b/queue/ACPI-APEI-ERST-Fix-missing-error-handling-in-erst_re.patch new file mode 100644 index 0000000..31a252f --- /dev/null +++ b/queue/ACPI-APEI-ERST-Fix-missing-error-handling-in-erst_re.patch @@ -0,0 +1,53 @@ +From bb82e0b4a7e96494f0c1004ce50cec3d7b5fb3d1 Mon Sep 17 00:00:00 2001 +From: Takashi Iwai <tiwai@suse.de> +Date: Thu, 14 Dec 2017 13:31:16 +0100 +Subject: [PATCH] ACPI: APEI / ERST: Fix missing error handling in + erst_reader() + +commit bb82e0b4a7e96494f0c1004ce50cec3d7b5fb3d1 upstream. + +The commit f6f828513290 ("pstore: pass allocated memory region back to +caller") changed the check of the return value from erst_read() in +erst_reader() in the following way: + + if (len == -ENOENT) + goto skip; +- else if (len < 0) { +- rc = -1; ++ else if (len < sizeof(*rcd)) { ++ rc = -EIO; + goto out; + +This introduced another bug: since the comparison with sizeof() is +cast to unsigned, a negative len value doesn't hit any longer. +As a result, when an error is returned from erst_read(), the code +falls through, and it may eventually lead to some weird thing like +memory corruption. + +This patch adds the negative error value check more explicitly for +addressing the issue. + +Fixes: f6f828513290 (pstore: pass allocated memory region back to caller) +Cc: All applicable <stable@vger.kernel.org> +Tested-by: Jerry Tang <jtang@suse.com> +Signed-off-by: Takashi Iwai <tiwai@suse.de> +Acked-by: Kees Cook <keescook@chromium.org> +Reviewed-by: Borislav Petkov <bp@suse.de> +Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> + +diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c +index 6742f6c68034..9bff853e85f3 100644 +--- a/drivers/acpi/apei/erst.c ++++ b/drivers/acpi/apei/erst.c +@@ -1007,7 +1007,7 @@ static ssize_t erst_reader(struct pstore_record *record) + /* The record may be cleared by others, try read next record */ + if (len == -ENOENT) + goto skip; +- else if (len < sizeof(*rcd)) { ++ else if (len < 0 || len < sizeof(*rcd)) { + rc = -EIO; + goto out; + } +-- +2.15.0 + diff --git a/queue/ALSA-hda-Add-vendor-id-for-Cannonlake-HDMI-codec.patch b/queue/ALSA-hda-Add-vendor-id-for-Cannonlake-HDMI-codec.patch new file mode 100644 index 0000000..e2f33b1 --- /dev/null +++ b/queue/ALSA-hda-Add-vendor-id-for-Cannonlake-HDMI-codec.patch @@ -0,0 +1,43 @@ +From 2b4584d00a6bc02b63ab3c7213060d41a74bdff1 Mon Sep 17 00:00:00 2001 +From: Guneshwor Singh <guneshwor.o.singh@intel.com> +Date: Thu, 7 Dec 2017 18:06:20 +0530 +Subject: [PATCH] ALSA: hda - Add vendor id for Cannonlake HDMI codec + +commit 2b4584d00a6bc02b63ab3c7213060d41a74bdff1 upstream. + +Cannonlake HDMI codec has the same nid as Geminilake. This adds the +codec entry for it. 
+ +Signed-off-by: Guneshwor Singh <guneshwor.o.singh@intel.com> +Cc: <stable@vger.kernel.org> +Signed-off-by: Takashi Iwai <tiwai@suse.de> + +diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c +index c19c81d230bd..b4f1b6e88305 100644 +--- a/sound/pci/hda/patch_hdmi.c ++++ b/sound/pci/hda/patch_hdmi.c +@@ -55,10 +55,11 @@ MODULE_PARM_DESC(static_hdmi_pcm, "Don't restrict PCM parameters per ELD info"); + #define is_kabylake(codec) ((codec)->core.vendor_id == 0x8086280b) + #define is_geminilake(codec) (((codec)->core.vendor_id == 0x8086280d) || \ + ((codec)->core.vendor_id == 0x80862800)) ++#define is_cannonlake(codec) ((codec)->core.vendor_id == 0x8086280c) + #define is_haswell_plus(codec) (is_haswell(codec) || is_broadwell(codec) \ + || is_skylake(codec) || is_broxton(codec) \ +- || is_kabylake(codec)) || is_geminilake(codec) +- ++ || is_kabylake(codec)) || is_geminilake(codec) \ ++ || is_cannonlake(codec) + #define is_valleyview(codec) ((codec)->core.vendor_id == 0x80862882) + #define is_cherryview(codec) ((codec)->core.vendor_id == 0x80862883) + #define is_valleyview_plus(codec) (is_valleyview(codec) || is_cherryview(codec)) +@@ -3841,6 +3842,7 @@ HDA_CODEC_ENTRY(0x80862808, "Broadwell HDMI", patch_i915_hsw_hdmi), + HDA_CODEC_ENTRY(0x80862809, "Skylake HDMI", patch_i915_hsw_hdmi), + HDA_CODEC_ENTRY(0x8086280a, "Broxton HDMI", patch_i915_hsw_hdmi), + HDA_CODEC_ENTRY(0x8086280b, "Kabylake HDMI", patch_i915_hsw_hdmi), ++HDA_CODEC_ENTRY(0x8086280c, "Cannonlake HDMI", patch_i915_glk_hdmi), + HDA_CODEC_ENTRY(0x8086280d, "Geminilake HDMI", patch_i915_glk_hdmi), + HDA_CODEC_ENTRY(0x80862800, "Geminilake HDMI", patch_i915_glk_hdmi), + HDA_CODEC_ENTRY(0x80862880, "CedarTrail HDMI", patch_generic_hdmi), +-- +2.15.0 + diff --git a/queue/ALSA-hda-realtek-Fix-Dell-AIO-LineOut-issue.patch b/queue/ALSA-hda-realtek-Fix-Dell-AIO-LineOut-issue.patch new file mode 100644 index 0000000..175bf95 --- /dev/null +++ b/queue/ALSA-hda-realtek-Fix-Dell-AIO-LineOut-issue.patch @@ -0,0 +1,94 @@ +From 9226665159f0367ad08bc7d5dd194aeadb90316f Mon Sep 17 00:00:00 2001 +From: Kailang Yang <kailang@realtek.com> +Date: Thu, 14 Dec 2017 15:28:58 +0800 +Subject: [PATCH] ALSA: hda/realtek - Fix Dell AIO LineOut issue + +commit 9226665159f0367ad08bc7d5dd194aeadb90316f upstream. + +Dell AIO had LineOut jack. +Add LineOut verb into this patch. + +[ Additional notes: + the ALC274 codec seems requiring the fixed pin / DAC connections for + HP / line-out pins for enabling EQ for speakers; i.e. the HP / LO + pins expect to be connected with NID 0x03 while keeping the speaker + with NID 0x02. However, by adding a new line-out pin, the + auto-parser assigns the NID 0x02 for HP/LO pins as primary outputs. + As an easy workaround, we provide the preferred_pairs[] to map + forcibly for these pins. 
-- tiwai ] + +Fixes: 75ee94b20b46 ("ALSA: hda - fix headset mic problem for Dell machines with alc274") +Signed-off-by: Kailang Yang <kailang@realtek.com> +Cc: <stable@vger.kernel.org> +Signed-off-by: Takashi Iwai <tiwai@suse.de> + +diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c +index 4b21f71d685c..6a4db00511ab 100644 +--- a/sound/pci/hda/patch_realtek.c ++++ b/sound/pci/hda/patch_realtek.c +@@ -5185,6 +5185,22 @@ static void alc233_alc662_fixup_lenovo_dual_codecs(struct hda_codec *codec, + } + } + ++/* Forcibly assign NID 0x03 to HP/LO while NID 0x02 to SPK for EQ */ ++static void alc274_fixup_bind_dacs(struct hda_codec *codec, ++ const struct hda_fixup *fix, int action) ++{ ++ struct alc_spec *spec = codec->spec; ++ static hda_nid_t preferred_pairs[] = { ++ 0x21, 0x03, 0x1b, 0x03, 0x16, 0x02, ++ 0 ++ }; ++ ++ if (action != HDA_FIXUP_ACT_PRE_PROBE) ++ return; ++ ++ spec->gen.preferred_dacs = preferred_pairs; ++} ++ + /* for hda_fixup_thinkpad_acpi() */ + #include "thinkpad_helper.c" + +@@ -5302,6 +5318,8 @@ enum { + ALC233_FIXUP_LENOVO_MULTI_CODECS, + ALC294_FIXUP_LENOVO_MIC_LOCATION, + ALC700_FIXUP_INTEL_REFERENCE, ++ ALC274_FIXUP_DELL_BIND_DACS, ++ ALC274_FIXUP_DELL_AIO_LINEOUT_VERB, + }; + + static const struct hda_fixup alc269_fixups[] = { +@@ -6112,6 +6130,21 @@ static const struct hda_fixup alc269_fixups[] = { + {} + } + }, ++ [ALC274_FIXUP_DELL_BIND_DACS] = { ++ .type = HDA_FIXUP_FUNC, ++ .v.func = alc274_fixup_bind_dacs, ++ .chained = true, ++ .chain_id = ALC269_FIXUP_DELL1_MIC_NO_PRESENCE ++ }, ++ [ALC274_FIXUP_DELL_AIO_LINEOUT_VERB] = { ++ .type = HDA_FIXUP_PINS, ++ .v.pins = (const struct hda_pintbl[]) { ++ { 0x1b, 0x0401102f }, ++ { } ++ }, ++ .chained = true, ++ .chain_id = ALC274_FIXUP_DELL_BIND_DACS ++ }, + }; + + static const struct snd_pci_quirk alc269_fixup_tbl[] = { +@@ -6578,7 +6611,7 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = { + {0x14, 0x90170110}, + {0x1b, 0x90a70130}, + {0x21, 0x03211020}), +- SND_HDA_PIN_QUIRK(0x10ec0274, 0x1028, "Dell", ALC269_FIXUP_DELL1_MIC_NO_PRESENCE, ++ SND_HDA_PIN_QUIRK(0x10ec0274, 0x1028, "Dell", ALC274_FIXUP_DELL_AIO_LINEOUT_VERB, + {0x12, 0xb7a60130}, + {0x13, 0xb8a61140}, + {0x16, 0x90170110}, +-- +2.15.0 + diff --git a/queue/ALSA-rawmidi-Avoid-racy-info-ioctl-via-ctl-device.patch b/queue/ALSA-rawmidi-Avoid-racy-info-ioctl-via-ctl-device.patch new file mode 100644 index 0000000..a473442 --- /dev/null +++ b/queue/ALSA-rawmidi-Avoid-racy-info-ioctl-via-ctl-device.patch @@ -0,0 +1,65 @@ +From c1cfd9025cc394fd137a01159d74335c5ac978ce Mon Sep 17 00:00:00 2001 +From: Takashi Iwai <tiwai@suse.de> +Date: Thu, 14 Dec 2017 16:44:12 +0100 +Subject: [PATCH] ALSA: rawmidi: Avoid racy info ioctl via ctl device + +commit c1cfd9025cc394fd137a01159d74335c5ac978ce upstream. + +The rawmidi also allows to obtaining the information via ioctl of ctl +API. It means that user can issue an ioctl to the rawmidi device even +when it's being removed as long as the control device is present. +Although the code has some protection via the global register_mutex, +its range is limited to the search of the corresponding rawmidi +object, and the mutex is already unlocked at accessing the rawmidi +object. This may lead to a use-after-free. + +For avoiding it, this patch widens the application of register_mutex +to the whole snd_rawmidi_info_select() function. We have another +mutex per rawmidi object, but this operation isn't very hot path, so +it shouldn't matter from the performance POV. 
+ +Cc: <stable@vger.kernel.org> +Signed-off-by: Takashi Iwai <tiwai@suse.de> + +diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c +index b3b353d72527..f055ca10bbc1 100644 +--- a/sound/core/rawmidi.c ++++ b/sound/core/rawmidi.c +@@ -579,15 +579,14 @@ static int snd_rawmidi_info_user(struct snd_rawmidi_substream *substream, + return 0; + } + +-int snd_rawmidi_info_select(struct snd_card *card, struct snd_rawmidi_info *info) ++static int __snd_rawmidi_info_select(struct snd_card *card, ++ struct snd_rawmidi_info *info) + { + struct snd_rawmidi *rmidi; + struct snd_rawmidi_str *pstr; + struct snd_rawmidi_substream *substream; + +- mutex_lock(®ister_mutex); + rmidi = snd_rawmidi_search(card, info->device); +- mutex_unlock(®ister_mutex); + if (!rmidi) + return -ENXIO; + if (info->stream < 0 || info->stream > 1) +@@ -603,6 +602,16 @@ int snd_rawmidi_info_select(struct snd_card *card, struct snd_rawmidi_info *info + } + return -ENXIO; + } ++ ++int snd_rawmidi_info_select(struct snd_card *card, struct snd_rawmidi_info *info) ++{ ++ int ret; ++ ++ mutex_lock(®ister_mutex); ++ ret = __snd_rawmidi_info_select(card, info); ++ mutex_unlock(®ister_mutex); ++ return ret; ++} + EXPORT_SYMBOL(snd_rawmidi_info_select); + + static int snd_rawmidi_info_select_user(struct snd_card *card, +-- +2.15.0 + diff --git a/queue/ALSA-usb-audio-Add-native-DSD-support-for-Esoteric-D.patch b/queue/ALSA-usb-audio-Add-native-DSD-support-for-Esoteric-D.patch new file mode 100644 index 0000000..94a371e --- /dev/null +++ b/queue/ALSA-usb-audio-Add-native-DSD-support-for-Esoteric-D.patch @@ -0,0 +1,54 @@ +From 866f7ed7d67936dcdbcddc111c8af878c918fe7c Mon Sep 17 00:00:00 2001 +From: Jussi Laako <jussi@sonarnerd.net> +Date: Thu, 7 Dec 2017 12:58:33 +0200 +Subject: [PATCH] ALSA: usb-audio: Add native DSD support for Esoteric D-05X + +commit 866f7ed7d67936dcdbcddc111c8af878c918fe7c upstream. + +Adds VID:PID of Esoteric D-05X to the TEAC device id's. +Renames the is_teac_50X_dac() function to is_teac_dsd_dac() to cover +broader device family from the same corporation sharing the same USB +audio implementation. + +Signed-off-by: Jussi Laako <jussi@sonarnerd.net> +Cc: <stable@vger.kernel.org> +Signed-off-by: Takashi Iwai <tiwai@suse.de> + +diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c +index 77eecaa4db1f..a66ef5777887 100644 +--- a/sound/usb/quirks.c ++++ b/sound/usb/quirks.c +@@ -1166,10 +1166,11 @@ static bool is_marantz_denon_dac(unsigned int id) + /* TEAC UD-501/UD-503/NT-503 USB DACs need a vendor cmd to switch + * between PCM/DOP and native DSD mode + */ +-static bool is_teac_50X_dac(unsigned int id) ++static bool is_teac_dsd_dac(unsigned int id) + { + switch (id) { + case USB_ID(0x0644, 0x8043): /* TEAC UD-501/UD-503/NT-503 */ ++ case USB_ID(0x0644, 0x8044): /* Esoteric D-05X */ + return true; + } + return false; +@@ -1202,7 +1203,7 @@ int snd_usb_select_mode_quirk(struct snd_usb_substream *subs, + break; + } + mdelay(20); +- } else if (is_teac_50X_dac(subs->stream->chip->usb_id)) { ++ } else if (is_teac_dsd_dac(subs->stream->chip->usb_id)) { + /* Vendor mode switch cmd is required. 
*/ + switch (fmt->altsetting) { + case 3: /* DSD mode (DSD_U32) requested */ +@@ -1392,7 +1393,7 @@ u64 snd_usb_interface_dsd_format_quirks(struct snd_usb_audio *chip, + } + + /* TEAC devices with USB DAC functionality */ +- if (is_teac_50X_dac(chip->usb_id)) { ++ if (is_teac_dsd_dac(chip->usb_id)) { + if (fp->altsetting == 3) + return SNDRV_PCM_FMTBIT_DSD_U32_BE; + } +-- +2.15.0 + diff --git a/queue/ALSA-usb-audio-Fix-the-missing-ctl-name-suffix-at-pa.patch b/queue/ALSA-usb-audio-Fix-the-missing-ctl-name-suffix-at-pa.patch new file mode 100644 index 0000000..1c7b135 --- /dev/null +++ b/queue/ALSA-usb-audio-Fix-the-missing-ctl-name-suffix-at-pa.patch @@ -0,0 +1,75 @@ +From 5a15f289ee87eaf33f13f08a4909ec99d837ec5f Mon Sep 17 00:00:00 2001 +From: Takashi Iwai <tiwai@suse.de> +Date: Mon, 18 Dec 2017 23:36:57 +0100 +Subject: [PATCH] ALSA: usb-audio: Fix the missing ctl name suffix at parsing + SU + +commit 5a15f289ee87eaf33f13f08a4909ec99d837ec5f upstream. + +The commit 89b89d121ffc ("ALSA: usb-audio: Add check return value for +usb_string()") added the check of the return value from +snd_usb_copy_string_desc(), which is correct per se, but it introduced +a regression. In the original code, either the "Clock Source", +"Playback Source" or "Capture Source" suffix is added after the +terminal string, while the commit changed it to add the suffix only +when get_term_name() is failing. It ended up with an incorrect ctl +name like "PCM" instead of "PCM Capture Source". + +Also, even the original code has a similar bug: when the ctl name is +generated from snd_usb_copy_string_desc() for the given iSelector, it +also doesn't put the suffix. + +This patch addresses these issues: the suffix is added always when no +static mapping is found. Also the patch tries to put more comments +and cleans up the if/else block for better readability in order to +avoid the same pitfall again. + +Fixes: 89b89d121ffc ("ALSA: usb-audio: Add check return value for usb_string()") +Reported-and-tested-by: Mauro Santos <registo.mailling@gmail.com> +Cc: <stable@vger.kernel.org> +Signed-off-by: Takashi Iwai <tiwai@suse.de> + +diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c +index afc208e1c756..60ebc99ae323 100644 +--- a/sound/usb/mixer.c ++++ b/sound/usb/mixer.c +@@ -2173,20 +2173,25 @@ static int parse_audio_selector_unit(struct mixer_build *state, int unitid, + kctl->private_value = (unsigned long)namelist; + kctl->private_free = usb_mixer_selector_elem_free; + +- nameid = uac_selector_unit_iSelector(desc); ++ /* check the static mapping table at first */ + len = check_mapped_name(map, kctl->id.name, sizeof(kctl->id.name)); +- if (len) +- ; +- else if (nameid) +- len = snd_usb_copy_string_desc(state, nameid, kctl->id.name, +- sizeof(kctl->id.name)); +- else +- len = get_term_name(state, &state->oterm, +- kctl->id.name, sizeof(kctl->id.name), 0); +- + if (!len) { +- strlcpy(kctl->id.name, "USB", sizeof(kctl->id.name)); ++ /* no mapping ? */ ++ /* if iSelector is given, use it */ ++ nameid = uac_selector_unit_iSelector(desc); ++ if (nameid) ++ len = snd_usb_copy_string_desc(state, nameid, ++ kctl->id.name, ++ sizeof(kctl->id.name)); ++ /* ... or pick up the terminal name at next */ ++ if (!len) ++ len = get_term_name(state, &state->oterm, ++ kctl->id.name, sizeof(kctl->id.name), 0); ++ /* ... 
or use the fixed string "USB" as the last resort */ ++ if (!len) ++ strlcpy(kctl->id.name, "USB", sizeof(kctl->id.name)); + ++ /* and add the proper suffix */ + if (desc->bDescriptorSubtype == UAC2_CLOCK_SELECTOR) + append_ctl_name(kctl, " Clock Source"); + else if ((state->oterm.type & 0xff00) == 0x0100) +-- +2.15.0 + diff --git a/queue/KVM-MMU-Fix-infinite-loop-when-there-is-no-available.patch b/queue/KVM-MMU-Fix-infinite-loop-when-there-is-no-available.patch new file mode 100644 index 0000000..2e9b1db --- /dev/null +++ b/queue/KVM-MMU-Fix-infinite-loop-when-there-is-no-available.patch @@ -0,0 +1,97 @@ +From ed52870f4676489124d8697fd00e6ae6c504e586 Mon Sep 17 00:00:00 2001 +From: Wanpeng Li <wanpeng.li@hotmail.com> +Date: Mon, 4 Dec 2017 22:21:30 -0800 +Subject: [PATCH] KVM: MMU: Fix infinite loop when there is no available mmu + page +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit ed52870f4676489124d8697fd00e6ae6c504e586 upstream. + +The below test case can cause infinite loop in kvm when ept=0. + + #include <unistd.h> + #include <sys/syscall.h> + #include <string.h> + #include <stdint.h> + #include <linux/kvm.h> + #include <fcntl.h> + #include <sys/ioctl.h> + + long r[5]; + int main() + { + r[2] = open("/dev/kvm", O_RDONLY); + r[3] = ioctl(r[2], KVM_CREATE_VM, 0); + r[4] = ioctl(r[3], KVM_CREATE_VCPU, 7); + ioctl(r[4], KVM_RUN, 0); + } + +It doesn't setup the memory regions, mmu_alloc_shadow/direct_roots() in +kvm return 1 when kvm fails to allocate root page table which can result +in beblow infinite loop: + + vcpu_run() { + for (;;) { + r = vcpu_enter_guest()::kvm_mmu_reload() returns 1 + if (r <= 0) + break; + if (need_resched()) + cond_resched(); + } + } + +This patch fixes it by returning -ENOSPC when there is no available kvm mmu +page for root page table. 
+ +Cc: Paolo Bonzini <pbonzini@redhat.com> +Cc: Radim Krčmář <rkrcmar@redhat.com> +Cc: stable@vger.kernel.org +Fixes: 26eeb53cf0f (KVM: MMU: Bail out immediately if there is no available mmu page) +Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com> +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> + +diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c +index e5e66e5c6640..c4deb1f34faa 100644 +--- a/arch/x86/kvm/mmu.c ++++ b/arch/x86/kvm/mmu.c +@@ -3395,7 +3395,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) + spin_lock(&vcpu->kvm->mmu_lock); + if(make_mmu_pages_available(vcpu) < 0) { + spin_unlock(&vcpu->kvm->mmu_lock); +- return 1; ++ return -ENOSPC; + } + sp = kvm_mmu_get_page(vcpu, 0, 0, + vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL); +@@ -3410,7 +3410,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) + spin_lock(&vcpu->kvm->mmu_lock); + if (make_mmu_pages_available(vcpu) < 0) { + spin_unlock(&vcpu->kvm->mmu_lock); +- return 1; ++ return -ENOSPC; + } + sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), + i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL); +@@ -3450,7 +3450,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) + spin_lock(&vcpu->kvm->mmu_lock); + if (make_mmu_pages_available(vcpu) < 0) { + spin_unlock(&vcpu->kvm->mmu_lock); +- return 1; ++ return -ENOSPC; + } + sp = kvm_mmu_get_page(vcpu, root_gfn, 0, + vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL); +@@ -3487,7 +3487,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) + spin_lock(&vcpu->kvm->mmu_lock); + if (make_mmu_pages_available(vcpu) < 0) { + spin_unlock(&vcpu->kvm->mmu_lock); +- return 1; ++ return -ENOSPC; + } + sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, + 0, ACC_ALL); +-- +2.15.0 + diff --git a/queue/KVM-PPC-Book3S-HV-Fix-pending_pri-value-in-kvmppc_xi.patch b/queue/KVM-PPC-Book3S-HV-Fix-pending_pri-value-in-kvmppc_xi.patch new file mode 100644 index 0000000..0313eb6 --- /dev/null +++ b/queue/KVM-PPC-Book3S-HV-Fix-pending_pri-value-in-kvmppc_xi.patch @@ -0,0 +1,52 @@ +From 7333b5aca412d6ad02667b5a513485838a91b136 Mon Sep 17 00:00:00 2001 +From: Laurent Vivier <lvivier@redhat.com> +Date: Tue, 12 Dec 2017 18:23:56 +0100 +Subject: [PATCH] KVM: PPC: Book3S HV: Fix pending_pri value in + kvmppc_xive_get_icp() + +commit 7333b5aca412d6ad02667b5a513485838a91b136 upstream. + +When we migrate a VM from a POWER8 host (XICS) to a POWER9 host +(XICS-on-XIVE), we have an error: + +qemu-kvm: Unable to restore KVM interrupt controller state \ + (0xff000000) for CPU 0: Invalid argument + +This is because kvmppc_xics_set_icp() checks the new state +is internaly consistent, and especially: + +... + 1129 if (xisr == 0) { + 1130 if (pending_pri != 0xff) + 1131 return -EINVAL; +... + +On the other side, kvmppc_xive_get_icp() doesn't set +neither the pending_pri value, nor the xisr value (set to 0) +(and kvmppc_xive_set_icp() ignores the pending_pri value) + +As xisr is 0, pending_pri must be set to 0xff. 
+ +Fixes: 5af50993850a ("KVM: PPC: Book3S HV: Native usage of the XIVE interrupt controller") +Cc: stable@vger.kernel.org # v4.12+ +Signed-off-by: Laurent Vivier <lvivier@redhat.com> +Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> +Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> + +diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c +index b5e6d227a034..0d750d274c4e 100644 +--- a/arch/powerpc/kvm/book3s_xive.c ++++ b/arch/powerpc/kvm/book3s_xive.c +@@ -725,7 +725,8 @@ u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu) + + /* Return the per-cpu state for state saving/migration */ + return (u64)xc->cppr << KVM_REG_PPC_ICP_CPPR_SHIFT | +- (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT; ++ (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT | ++ (u64)0xff << KVM_REG_PPC_ICP_PPRI_SHIFT; + } + + int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) +-- +2.15.0 + diff --git a/queue/KVM-PPC-Book3S-fix-XIVE-migration-of-pending-interru.patch b/queue/KVM-PPC-Book3S-fix-XIVE-migration-of-pending-interru.patch new file mode 100644 index 0000000..f182f16 --- /dev/null +++ b/queue/KVM-PPC-Book3S-fix-XIVE-migration-of-pending-interru.patch @@ -0,0 +1,50 @@ +From dc1c4165d189350cb51bdd3057deb6ecd164beda Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@kaod.org> +Date: Tue, 12 Dec 2017 12:02:04 +0000 +Subject: [PATCH] KVM: PPC: Book3S: fix XIVE migration of pending interrupts +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit dc1c4165d189350cb51bdd3057deb6ecd164beda upstream. + +When restoring a pending interrupt, we are setting the Q bit to force +a retrigger in xive_finish_unmask(). But we also need to force an EOI +in this case to reach the same initial state : P=1, Q=0. + +This can be done by not setting 'old_p' for pending interrupts which +will inform xive_finish_unmask() that an EOI needs to be sent. + +Fixes: 5af50993850a ("KVM: PPC: Book3S HV: Native usage of the XIVE interrupt controller") +Cc: stable@vger.kernel.org # v4.12+ +Suggested-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> +Signed-off-by: Cédric Le Goater <clg@kaod.org> +Reviewed-by: Laurent Vivier <lvivier@redhat.com> +Tested-by: Laurent Vivier <lvivier@redhat.com> +Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> + +diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c +index bf457843e032..b5e6d227a034 100644 +--- a/arch/powerpc/kvm/book3s_xive.c ++++ b/arch/powerpc/kvm/book3s_xive.c +@@ -1558,7 +1558,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr) + + /* + * Restore P and Q. If the interrupt was pending, we +- * force both P and Q, which will trigger a resend. ++ * force Q and !P, which will trigger a resend. + * + * That means that a guest that had both an interrupt + * pending (queued) and Q set will restore with only +@@ -1566,7 +1566,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr) + * is perfectly fine as coalescing interrupts that haven't + * been presented yet is always allowed. 
+ */ +- if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING) ++ if (val & KVM_XICS_PRESENTED && !(val & KVM_XICS_PENDING)) + state->old_p = true; + if (val & KVM_XICS_QUEUED || val & KVM_XICS_PENDING) + state->old_q = true; +-- +2.15.0 + diff --git a/queue/KVM-X86-Fix-load-RFLAGS-w-o-the-fixed-bit.patch b/queue/KVM-X86-Fix-load-RFLAGS-w-o-the-fixed-bit.patch new file mode 100644 index 0000000..1945c60 --- /dev/null +++ b/queue/KVM-X86-Fix-load-RFLAGS-w-o-the-fixed-bit.patch @@ -0,0 +1,72 @@ +From d73235d17ba63b53dc0e1051dbc10a1f1be91b71 Mon Sep 17 00:00:00 2001 +From: Wanpeng Li <wanpeng.li@hotmail.com> +Date: Thu, 7 Dec 2017 00:30:08 -0800 +Subject: [PATCH] KVM: X86: Fix load RFLAGS w/o the fixed bit +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit d73235d17ba63b53dc0e1051dbc10a1f1be91b71 upstream. + + *** Guest State *** + CR0: actual=0x0000000000000030, shadow=0x0000000060000010, gh_mask=fffffffffffffff7 + CR4: actual=0x0000000000002050, shadow=0x0000000000000000, gh_mask=ffffffffffffe871 + CR3 = 0x00000000fffbc000 + RSP = 0x0000000000000000 RIP = 0x0000000000000000 + RFLAGS=0x00000000 DR7 = 0x0000000000000400 + ^^^^^^^^^^ + +The failed vmentry is triggered by the following testcase when ept=Y: + + #include <unistd.h> + #include <sys/syscall.h> + #include <string.h> + #include <stdint.h> + #include <linux/kvm.h> + #include <fcntl.h> + #include <sys/ioctl.h> + + long r[5]; + int main() + { + r[2] = open("/dev/kvm", O_RDONLY); + r[3] = ioctl(r[2], KVM_CREATE_VM, 0); + r[4] = ioctl(r[3], KVM_CREATE_VCPU, 7); + struct kvm_regs regs = { + .rflags = 0, + }; + ioctl(r[4], KVM_SET_REGS, ®s); + ioctl(r[4], KVM_RUN, 0); + } + +X86 RFLAGS bit 1 is fixed set, userspace can simply clearing bit 1 +of RFLAGS with KVM_SET_REGS ioctl which results in vmentry fails. +This patch fixes it by oring X86_EFLAGS_FIXED during ioctl. + +Cc: stable@vger.kernel.org +Suggested-by: Jim Mattson <jmattson@google.com> +Reviewed-by: David Hildenbrand <david@redhat.com> +Reviewed-by: Quan Xu <quan.xu0@gmail.com> +Cc: Paolo Bonzini <pbonzini@redhat.com> +Cc: Radim Krčmář <rkrcmar@redhat.com> +Cc: Jim Mattson <jmattson@google.com> +Cc: stable@vger.kernel.org +Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com> +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> + +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index faf843c9b916..154ea27746e9 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -7384,7 +7384,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) + #endif + + kvm_rip_write(vcpu, regs->rip); +- kvm_set_rflags(vcpu, regs->rflags); ++ kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED); + + vcpu->arch.exception.pending = false; + +-- +2.15.0 + diff --git a/queue/KVM-arm-arm64-Fix-HYP-unmapping-going-off-limits.patch b/queue/KVM-arm-arm64-Fix-HYP-unmapping-going-off-limits.patch new file mode 100644 index 0000000..733ecfc --- /dev/null +++ b/queue/KVM-arm-arm64-Fix-HYP-unmapping-going-off-limits.patch @@ -0,0 +1,62 @@ +From 7839c672e58bf62da8f2f0197fefb442c02ba1dd Mon Sep 17 00:00:00 2001 +From: Marc Zyngier <marc.zyngier@arm.com> +Date: Thu, 7 Dec 2017 11:45:45 +0000 +Subject: [PATCH] KVM: arm/arm64: Fix HYP unmapping going off limits + +commit 7839c672e58bf62da8f2f0197fefb442c02ba1dd upstream. + +When we unmap the HYP memory, we try to be clever and unmap one +PGD at a time. 
If we start with a non-PGD aligned address and try +to unmap a whole PGD, things go horribly wrong in unmap_hyp_range +(addr and end can never match, and it all goes really badly as we +keep incrementing pgd and parse random memory as page tables...). + +The obvious fix is to let unmap_hyp_range do what it does best, +which is to iterate over a range. + +The size of the linear mapping, which begins at PAGE_OFFSET, can be +easily calculated by subtracting PAGE_OFFSET form high_memory, because +high_memory is defined as the linear map address of the last byte of +DRAM, plus one. + +The size of the vmalloc region is given trivially by VMALLOC_END - +VMALLOC_START. + +Cc: stable@vger.kernel.org +Reported-by: Andre Przywara <andre.przywara@arm.com> +Tested-by: Andre Przywara <andre.przywara@arm.com> +Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org> +Signed-off-by: Marc Zyngier <marc.zyngier@arm.com> +Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org> + +diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c +index b36945d49986..b4b69c2d1012 100644 +--- a/virt/kvm/arm/mmu.c ++++ b/virt/kvm/arm/mmu.c +@@ -509,8 +509,6 @@ static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size) + */ + void free_hyp_pgds(void) + { +- unsigned long addr; +- + mutex_lock(&kvm_hyp_pgd_mutex); + + if (boot_hyp_pgd) { +@@ -521,10 +519,10 @@ void free_hyp_pgds(void) + + if (hyp_pgd) { + unmap_hyp_range(hyp_pgd, hyp_idmap_start, PAGE_SIZE); +- for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE) +- unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE); +- for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE) +- unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE); ++ unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET), ++ (uintptr_t)high_memory - PAGE_OFFSET); ++ unmap_hyp_range(hyp_pgd, kern_hyp_va(VMALLOC_START), ++ VMALLOC_END - VMALLOC_START); + + free_pages((unsigned long)hyp_pgd, hyp_pgd_order); + hyp_pgd = NULL; +-- +2.15.0 + diff --git a/queue/PCI-PM-Force-devices-to-D0-in-pci_pm_thaw_noirq.patch b/queue/PCI-PM-Force-devices-to-D0-in-pci_pm_thaw_noirq.patch new file mode 100644 index 0000000..91da6d3 --- /dev/null +++ b/queue/PCI-PM-Force-devices-to-D0-in-pci_pm_thaw_noirq.patch @@ -0,0 +1,46 @@ +From 5839ee7389e893a31e4e3c9cf17b50d14103c902 Mon Sep 17 00:00:00 2001 +From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com> +Date: Fri, 15 Dec 2017 03:07:18 +0100 +Subject: [PATCH] PCI / PM: Force devices to D0 in pci_pm_thaw_noirq() + +commit 5839ee7389e893a31e4e3c9cf17b50d14103c902 upstream. + +It is incorrect to call pci_restore_state() for devices in low-power +states (D1-D3), as that involves the restoration of MSI setup which +requires MMIO to be operational and that is only the case in D0. + +However, pci_pm_thaw_noirq() may do that if the driver's "freeze" +callbacks put the device into a low-power state, so fix it by making +it force devices into D0 via pci_set_power_state() instead of trying +to "update" their power state which is pointless. + +Fixes: e60514bd4485 (PCI/PM: Restore the status of PCI devices across hibernation) +Cc: 4.13+ <stable@vger.kernel.org> # 4.13+ +Reported-by: Thomas Gleixner <tglx@linutronix.de> +Reported-by: Maarten Lankhorst <dev@mblankhorst.nl> +Tested-by: Thomas Gleixner <tglx@linutronix.de> +Tested-by: Maarten Lankhorst <dev@mblankhorst.nl> +Signed-off-by: Rafael J. 
Wysocki <rafael.j.wysocki@intel.com> +Acked-by: Bjorn Helgaas <bhelgaas@google.com> + +diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c +index 945099d49f8f..14fd865a5120 100644 +--- a/drivers/pci/pci-driver.c ++++ b/drivers/pci/pci-driver.c +@@ -1012,7 +1012,12 @@ static int pci_pm_thaw_noirq(struct device *dev) + if (pci_has_legacy_pm_support(pci_dev)) + return pci_legacy_resume_early(dev); + +- pci_update_current_state(pci_dev, PCI_D0); ++ /* ++ * pci_restore_state() requires the device to be in D0 (because of MSI ++ * restoration among other things), so force it into D0 in case the ++ * driver's "freeze" callbacks put it into a low-power state directly. ++ */ ++ pci_set_power_state(pci_dev, PCI_D0); + pci_restore_state(pci_dev); + + if (drv && drv->pm && drv->pm->thaw_noirq) +-- +2.15.0 + diff --git a/queue/Revert-parisc-Re-enable-interrupts-early.patch b/queue/Revert-parisc-Re-enable-interrupts-early.patch new file mode 100644 index 0000000..52c08ea --- /dev/null +++ b/queue/Revert-parisc-Re-enable-interrupts-early.patch @@ -0,0 +1,78 @@ +From 9352aeada4d8d8753fc0e414fbfe8fdfcb68a12c Mon Sep 17 00:00:00 2001 +From: John David Anglin <dave.anglin@bell.net> +Date: Mon, 13 Nov 2017 19:35:33 -0500 +Subject: [PATCH] Revert "parisc: Re-enable interrupts early" + +commit 9352aeada4d8d8753fc0e414fbfe8fdfcb68a12c upstream. + +This reverts commit 5c38602d83e584047906b41b162ababd4db4106d. + +Interrupts can't be enabled early because the register saves are done on +the thread stack prior to switching to the IRQ stack. This caused stack +overflows and the thread stack needed increasing to 32k. Even then, +stack overflows still occasionally occurred. + +Background: +Even with a 32 kB thread stack, I have seen instances where the thread +stack overflowed on the mx3210 buildd. Detection of stack overflow only +occurs when we have an external interrupt. When an external interrupt +occurs, we switch to the thread stack if we are not already on a kernel +stack. Then, registers and specials are saved to the kernel stack. + +The bug occurs in intr_return where interrupts are reenabled prior to +returning from the interrupt. This was done incase we need to schedule +or deliver signals. However, it introduces the possibility that +multiple external interrupts may occur on the thread stack and cause a +stack overflow. These might not be detected and cause the kernel to +misbehave in random ways. + +This patch changes the code back to only reenable interrupts when we are +going to schedule or deliver signals. As a result, we generally return +from an interrupt before reenabling interrupts. This minimizes the +growth of the thread stack. + +Fixes: 5c38602d83e5 ("parisc: Re-enable interrupts early") +Signed-off-by: John David Anglin <dave.anglin@bell.net> +Cc: <stable@vger.kernel.org> # v4.10+ +Signed-off-by: Helge Deller <deller@gmx.de> + +diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S +index a4fd296c958e..f3cecf5117cf 100644 +--- a/arch/parisc/kernel/entry.S ++++ b/arch/parisc/kernel/entry.S +@@ -878,9 +878,6 @@ ENTRY_CFI(syscall_exit_rfi) + STREG %r19,PT_SR7(%r16) + + intr_return: +- /* NOTE: Need to enable interrupts incase we schedule. */ +- ssm PSW_SM_I, %r0 +- + /* check for reschedule */ + mfctl %cr30,%r1 + LDREG TI_FLAGS(%r1),%r19 /* sched.h: TIF_NEED_RESCHED */ +@@ -907,6 +904,11 @@ intr_check_sig: + LDREG PT_IASQ1(%r16), %r20 + cmpib,COND(=),n 0,%r20,intr_restore /* backward */ + ++ /* NOTE: We need to enable interrupts if we have to deliver ++ * signals. 
We used to do this earlier but it caused kernel ++ * stack overflows. */ ++ ssm PSW_SM_I, %r0 ++ + copy %r0, %r25 /* long in_syscall = 0 */ + #ifdef CONFIG_64BIT + ldo -16(%r30),%r29 /* Reference param save area */ +@@ -958,6 +960,10 @@ intr_do_resched: + cmpib,COND(=) 0, %r20, intr_do_preempt + nop + ++ /* NOTE: We need to enable interrupts if we schedule. We used ++ * to do this earlier but it caused kernel stack overflows. */ ++ ssm PSW_SM_I, %r0 ++ + #ifdef CONFIG_64BIT + ldo -16(%r30),%r29 /* Reference param save area */ + #endif +-- +2.15.0 + diff --git a/queue/acpi-nfit-fix-health-event-notification.patch b/queue/acpi-nfit-fix-health-event-notification.patch new file mode 100644 index 0000000..980b1b7 --- /dev/null +++ b/queue/acpi-nfit-fix-health-event-notification.patch @@ -0,0 +1,59 @@ +From adf6895754e2503d994a765535fd1813f8834674 Mon Sep 17 00:00:00 2001 +From: Dan Williams <dan.j.williams@intel.com> +Date: Thu, 30 Nov 2017 19:42:52 -0800 +Subject: [PATCH] acpi, nfit: fix health event notification + +commit adf6895754e2503d994a765535fd1813f8834674 upstream. + +Integration testing with a BIOS that generates injected health event +notifications fails to communicate those events to userspace. The nfit +driver neglects to link the ACPI DIMM device with the necessary driver +data so acpi_nvdimm_notify() fails this lookup: + + nfit_mem = dev_get_drvdata(dev); + if (nfit_mem && nfit_mem->flags_attr) + sysfs_notify_dirent(nfit_mem->flags_attr); + +Add the necessary linkage when installing the notification handler and +clean it up when the nfit driver instance is torn down. + +Cc: <stable@vger.kernel.org> +Cc: Toshi Kani <toshi.kani@hpe.com> +Cc: Vishal Verma <vishal.l.verma@intel.com> +Fixes: ba9c8dd3c222 ("acpi, nfit: add dimm device notification support") +Reported-by: Daniel Osawa <daniel.k.osawa@intel.com> +Tested-by: Daniel Osawa <daniel.k.osawa@intel.com> +Signed-off-by: Dan Williams <dan.j.williams@intel.com> + +diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c +index ff2580e7611d..abeb4df4f22e 100644 +--- a/drivers/acpi/nfit/core.c ++++ b/drivers/acpi/nfit/core.c +@@ -1670,6 +1670,11 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, + dev_name(&adev_dimm->dev)); + return -ENXIO; + } ++ /* ++ * Record nfit_mem for the notification path to track back to ++ * the nfit sysfs attributes for this dimm device object. ++ */ ++ dev_set_drvdata(&adev_dimm->dev, nfit_mem); + + /* + * Until standardization materializes we need to consider 4 +@@ -1752,9 +1757,11 @@ static void shutdown_dimm_notify(void *data) + sysfs_put(nfit_mem->flags_attr); + nfit_mem->flags_attr = NULL; + } +- if (adev_dimm) ++ if (adev_dimm) { + acpi_remove_notify_handler(adev_dimm->handle, + ACPI_DEVICE_NOTIFY, acpi_nvdimm_notify); ++ dev_set_drvdata(&adev_dimm->dev, NULL); ++ } + } + mutex_unlock(&acpi_desc->init_mutex); + } +-- +2.15.0 + diff --git a/queue/arch-mm-Allow-arch_dup_mmap-to-fail.patch b/queue/arch-mm-Allow-arch_dup_mmap-to-fail.patch new file mode 100644 index 0000000..fa051a1 --- /dev/null +++ b/queue/arch-mm-Allow-arch_dup_mmap-to-fail.patch @@ -0,0 +1,139 @@ +From c10e83f598d08046dd1ebc8360d4bb12d802d51b Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Thu, 14 Dec 2017 12:27:29 +0100 +Subject: [PATCH] arch, mm: Allow arch_dup_mmap() to fail + +commit c10e83f598d08046dd1ebc8360d4bb12d802d51b upstream. + +In order to sanitize the LDT initialization on x86 arch_dup_mmap() must be +allowed to fail. Fix up all instances. 
+ +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Andy Lutomirsky <luto@kernel.org> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: dan.j.williams@intel.com +Cc: hughd@google.com +Cc: keescook@google.com +Cc: kirill.shutemov@linux.intel.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h +index 492d8140a395..44fdf4786638 100644 +--- a/arch/powerpc/include/asm/mmu_context.h ++++ b/arch/powerpc/include/asm/mmu_context.h +@@ -114,9 +114,10 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, + #endif + } + +-static inline void arch_dup_mmap(struct mm_struct *oldmm, +- struct mm_struct *mm) ++static inline int arch_dup_mmap(struct mm_struct *oldmm, ++ struct mm_struct *mm) + { ++ return 0; + } + + static inline void arch_exit_mmap(struct mm_struct *mm) +diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h +index b668e351fd6c..fca34b2177e2 100644 +--- a/arch/um/include/asm/mmu_context.h ++++ b/arch/um/include/asm/mmu_context.h +@@ -15,9 +15,10 @@ extern void uml_setup_stubs(struct mm_struct *mm); + /* + * Needed since we do not use the asm-generic/mm_hooks.h: + */ +-static inline void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) ++static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) + { + uml_setup_stubs(mm); ++ return 0; + } + extern void arch_exit_mmap(struct mm_struct *mm); + static inline void arch_unmap(struct mm_struct *mm, +diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h +index 59b06b48f27d..5c205a9cb5a6 100644 +--- a/arch/unicore32/include/asm/mmu_context.h ++++ b/arch/unicore32/include/asm/mmu_context.h +@@ -81,9 +81,10 @@ do { \ + } \ + } while (0) + +-static inline void arch_dup_mmap(struct mm_struct *oldmm, +- struct mm_struct *mm) ++static inline int arch_dup_mmap(struct mm_struct *oldmm, ++ struct mm_struct *mm) + { ++ return 0; + } + + static inline void arch_unmap(struct mm_struct *mm, +diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h +index 6d16d15d09a0..c76162439c8a 100644 +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -176,10 +176,10 @@ do { \ + } while (0) + #endif + +-static inline void arch_dup_mmap(struct mm_struct *oldmm, +- struct mm_struct *mm) ++static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) + { + paravirt_arch_dup_mmap(oldmm, mm); ++ return 0; + } + + static inline void arch_exit_mmap(struct mm_struct *mm) +diff --git a/include/asm-generic/mm_hooks.h b/include/asm-generic/mm_hooks.h +index ea189d88a3cc..8ac4e68a12f0 100644 +--- a/include/asm-generic/mm_hooks.h ++++ 
b/include/asm-generic/mm_hooks.h +@@ -7,9 +7,10 @@ + #ifndef _ASM_GENERIC_MM_HOOKS_H + #define _ASM_GENERIC_MM_HOOKS_H + +-static inline void arch_dup_mmap(struct mm_struct *oldmm, +- struct mm_struct *mm) ++static inline int arch_dup_mmap(struct mm_struct *oldmm, ++ struct mm_struct *mm) + { ++ return 0; + } + + static inline void arch_exit_mmap(struct mm_struct *mm) +diff --git a/kernel/fork.c b/kernel/fork.c +index 07cc743698d3..500ce64517d9 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -721,8 +721,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, + goto out; + } + /* a new mm has just been created */ +- arch_dup_mmap(oldmm, mm); +- retval = 0; ++ retval = arch_dup_mmap(oldmm, mm); + out: + up_write(&mm->mmap_sem); + flush_tlb_mm(oldmm); +-- +2.15.0 + diff --git a/queue/arm64-kvm-Prevent-restoring-stale-PMSCR_EL1-for-vcpu.patch b/queue/arm64-kvm-Prevent-restoring-stale-PMSCR_EL1-for-vcpu.patch new file mode 100644 index 0000000..f99c371 --- /dev/null +++ b/queue/arm64-kvm-Prevent-restoring-stale-PMSCR_EL1-for-vcpu.patch @@ -0,0 +1,42 @@ +From bfe766cf65fb65e68c4764f76158718560bdcee5 Mon Sep 17 00:00:00 2001 +From: Julien Thierry <julien.thierry@arm.com> +Date: Wed, 6 Dec 2017 17:09:49 +0000 +Subject: [PATCH] arm64: kvm: Prevent restoring stale PMSCR_EL1 for vcpu + +commit bfe766cf65fb65e68c4764f76158718560bdcee5 upstream. + +When VHE is not present, KVM needs to save and restores PMSCR_EL1 when +possible. If SPE is used by the host, value of PMSCR_EL1 cannot be saved +for the guest. +If the host starts using SPE between two save+restore on the same vcpu, +restore will write the value of PMSCR_EL1 read during the first save. + +Make sure __debug_save_spe_nvhe clears the value of the saved PMSCR_EL1 +when the guest cannot use SPE. + +Signed-off-by: Julien Thierry <julien.thierry@arm.com> +Cc: Christoffer Dall <christoffer.dall@linaro.org> +Cc: Marc Zyngier <marc.zyngier@arm.com> +Cc: Catalin Marinas <catalin.marinas@arm.com> +Cc: <stable@vger.kernel.org> +Reviewed-by: Will Deacon <will.deacon@arm.com> +Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org> +Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org> + +diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c +index 321c9c05dd9e..f4363d40e2cd 100644 +--- a/arch/arm64/kvm/hyp/debug-sr.c ++++ b/arch/arm64/kvm/hyp/debug-sr.c +@@ -74,6 +74,9 @@ static void __hyp_text __debug_save_spe_nvhe(u64 *pmscr_el1) + { + u64 reg; + ++ /* Clear pmscr in case of early return */ ++ *pmscr_el1 = 0; ++ + /* SPE present on this CPU? */ + if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1), + ID_AA64DFR0_PMSVER_SHIFT)) +-- +2.15.0 + diff --git a/queue/block-throttle-avoid-double-charge.patch b/queue/block-throttle-avoid-double-charge.patch new file mode 100644 index 0000000..157cefa --- /dev/null +++ b/queue/block-throttle-avoid-double-charge.patch @@ -0,0 +1,113 @@ +From 111be883981748acc9a56e855c8336404a8e787c Mon Sep 17 00:00:00 2001 +From: Shaohua Li <shli@fb.com> +Date: Wed, 20 Dec 2017 11:10:17 -0700 +Subject: [PATCH] block-throttle: avoid double charge + +commit 111be883981748acc9a56e855c8336404a8e787c upstream. + +If a bio is throttled and split after throttling, the bio could be +resubmited and enters the throttling again. This will cause part of the +bio to be charged multiple times. If the cgroup has an IO limit, the +double charge will significantly harm the performance. The bio split +becomes quite common after arbitrary bio size change. 
+ +To fix this, we always set the BIO_THROTTLED flag if a bio is throttled. +If the bio is cloned/split, we copy the flag to new bio too to avoid a +double charge. However, cloned bio could be directed to a new disk, +keeping the flag be a problem. The observation is we always set new disk +for the bio in this case, so we can clear the flag in bio_set_dev(). + +This issue exists for a long time, arbitrary bio size change just makes +it worse, so this should go into stable at least since v4.2. + +V1-> V2: Not add extra field in bio based on discussion with Tejun + +Cc: Vivek Goyal <vgoyal@redhat.com> +Cc: stable@vger.kernel.org +Acked-by: Tejun Heo <tj@kernel.org> +Signed-off-by: Shaohua Li <shli@fb.com> +Signed-off-by: Jens Axboe <axboe@kernel.dk> + +diff --git a/block/bio.c b/block/bio.c +index 8bfdea58159b..9ef6cf3addb3 100644 +--- a/block/bio.c ++++ b/block/bio.c +@@ -599,6 +599,8 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) + bio->bi_disk = bio_src->bi_disk; + bio->bi_partno = bio_src->bi_partno; + bio_set_flag(bio, BIO_CLONED); ++ if (bio_flagged(bio_src, BIO_THROTTLED)) ++ bio_set_flag(bio, BIO_THROTTLED); + bio->bi_opf = bio_src->bi_opf; + bio->bi_write_hint = bio_src->bi_write_hint; + bio->bi_iter = bio_src->bi_iter; +diff --git a/block/blk-throttle.c b/block/blk-throttle.c +index 825bc29767e6..d19f416d6101 100644 +--- a/block/blk-throttle.c ++++ b/block/blk-throttle.c +@@ -2226,13 +2226,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, + out_unlock: + spin_unlock_irq(q->queue_lock); + out: +- /* +- * As multiple blk-throtls may stack in the same issue path, we +- * don't want bios to leave with the flag set. Clear the flag if +- * being issued. +- */ +- if (!throttled) +- bio_clear_flag(bio, BIO_THROTTLED); ++ bio_set_flag(bio, BIO_THROTTLED); + + #ifdef CONFIG_BLK_DEV_THROTTLING_LOW + if (throttled || !td->track_bio_latency) +diff --git a/include/linux/bio.h b/include/linux/bio.h +index 82f0c8fd7be8..23d29b39f71e 100644 +--- a/include/linux/bio.h ++++ b/include/linux/bio.h +@@ -492,6 +492,8 @@ extern unsigned int bvec_nr_vecs(unsigned short idx); + + #define bio_set_dev(bio, bdev) \ + do { \ ++ if ((bio)->bi_disk != (bdev)->bd_disk) \ ++ bio_clear_flag(bio, BIO_THROTTLED);\ + (bio)->bi_disk = (bdev)->bd_disk; \ + (bio)->bi_partno = (bdev)->bd_partno; \ + } while (0) +diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h +index a1e628e032da..9e7d8bd776d2 100644 +--- a/include/linux/blk_types.h ++++ b/include/linux/blk_types.h +@@ -50,8 +50,6 @@ struct blk_issue_stat { + struct bio { + struct bio *bi_next; /* request queue link */ + struct gendisk *bi_disk; +- u8 bi_partno; +- blk_status_t bi_status; + unsigned int bi_opf; /* bottom bits req flags, + * top bits REQ_OP. Use + * accessors. +@@ -59,8 +57,8 @@ struct bio { + unsigned short bi_flags; /* status, etc and bvec pool number */ + unsigned short bi_ioprio; + unsigned short bi_write_hint; +- +- struct bvec_iter bi_iter; ++ blk_status_t bi_status; ++ u8 bi_partno; + + /* Number of segments in this BIO after + * physical address coalescing is performed. 
+@@ -74,8 +72,9 @@ struct bio { + unsigned int bi_seg_front_size; + unsigned int bi_seg_back_size; + +- atomic_t __bi_remaining; ++ struct bvec_iter bi_iter; + ++ atomic_t __bi_remaining; + bio_end_io_t *bi_end_io; + + void *bi_private; +-- +2.15.0 + diff --git a/queue/block-unalign-call_single_data-in-struct-request.patch b/queue/block-unalign-call_single_data-in-struct-request.patch new file mode 100644 index 0000000..1aa8717 --- /dev/null +++ b/queue/block-unalign-call_single_data-in-struct-request.patch @@ -0,0 +1,33 @@ +From 4ccafe032005e9b96acbef2e389a4de5b1254add Mon Sep 17 00:00:00 2001 +From: Jens Axboe <axboe@kernel.dk> +Date: Wed, 20 Dec 2017 13:13:58 -0700 +Subject: [PATCH] block: unalign call_single_data in struct request + +commit 4ccafe032005e9b96acbef2e389a4de5b1254add upstream. + +A previous change blindly added massive alignment to the +call_single_data structure in struct request. This ballooned it in size +from 296 to 320 bytes on my setup, for no valid reason at all. + +Use the unaligned struct __call_single_data variant instead. + +Fixes: 966a967116e69 ("smp: Avoid using two cache lines for struct call_single_data") +Cc: stable@vger.kernel.org # v4.14 +Signed-off-by: Jens Axboe <axboe@kernel.dk> + +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 100d0df38026..0ce8a372d506 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -135,7 +135,7 @@ typedef __u32 __bitwise req_flags_t; + struct request { + struct list_head queuelist; + union { +- call_single_data_t csd; ++ struct __call_single_data csd; + u64 fifo_time; + }; + +-- +2.15.0 + diff --git a/queue/clk-sunxi-sun9i-mmc-Implement-reset-callback-for-res.patch b/queue/clk-sunxi-sun9i-mmc-Implement-reset-callback-for-res.patch new file mode 100644 index 0000000..db86027 --- /dev/null +++ b/queue/clk-sunxi-sun9i-mmc-Implement-reset-callback-for-res.patch @@ -0,0 +1,60 @@ +From 61d2f2a05765a5f57149efbd93e3e81a83cbc2c1 Mon Sep 17 00:00:00 2001 +From: Chen-Yu Tsai <wens@csie.org> +Date: Mon, 18 Dec 2017 11:57:51 +0800 +Subject: [PATCH] clk: sunxi: sun9i-mmc: Implement reset callback for reset + controls + +commit 61d2f2a05765a5f57149efbd93e3e81a83cbc2c1 upstream. + +Our MMC host driver now issues a reset, instead of just deasserting +the reset control, since commit c34eda69ad4c ("mmc: sunxi: Reset the +device at probe time"). The sun9i-mmc clock driver does not support +this, and will fail, which results in MMC not probing. + +This patch implements the reset callback by asserting the reset control, +then deasserting it after a small delay. 
+ +Fixes: 7a6fca879f59 ("clk: sunxi: Add driver for A80 MMC config clocks/resets") +Cc: <stable@vger.kernel.org> # 4.14.x +Signed-off-by: Chen-Yu Tsai <wens@csie.org> +Acked-by: Philipp Zabel <p.zabel@pengutronix.de> +Acked-by: Maxime Ripard <maxime.ripard@free-electrons.com> +Signed-off-by: Michael Turquette <mturquette@baylibre.com> +Link: lkml.kernel.org/r/20171218035751.20661-1-wens@csie.org + +diff --git a/drivers/clk/sunxi/clk-sun9i-mmc.c b/drivers/clk/sunxi/clk-sun9i-mmc.c +index a1a634253d6f..f00d8758ba24 100644 +--- a/drivers/clk/sunxi/clk-sun9i-mmc.c ++++ b/drivers/clk/sunxi/clk-sun9i-mmc.c +@@ -16,6 +16,7 @@ + + #include <linux/clk.h> + #include <linux/clk-provider.h> ++#include <linux/delay.h> + #include <linux/init.h> + #include <linux/of.h> + #include <linux/of_device.h> +@@ -83,9 +84,20 @@ static int sun9i_mmc_reset_deassert(struct reset_controller_dev *rcdev, + return 0; + } + ++static int sun9i_mmc_reset_reset(struct reset_controller_dev *rcdev, ++ unsigned long id) ++{ ++ sun9i_mmc_reset_assert(rcdev, id); ++ udelay(10); ++ sun9i_mmc_reset_deassert(rcdev, id); ++ ++ return 0; ++} ++ + static const struct reset_control_ops sun9i_mmc_reset_ops = { + .assert = sun9i_mmc_reset_assert, + .deassert = sun9i_mmc_reset_deassert, ++ .reset = sun9i_mmc_reset_reset, + }; + + static int sun9i_a80_mmc_config_clk_probe(struct platform_device *pdev) +-- +2.15.0 + diff --git a/queue/crypto-af_alg-fix-race-accessing-cipher-request.patch b/queue/crypto-af_alg-fix-race-accessing-cipher-request.patch new file mode 100644 index 0000000..1b4e0eb --- /dev/null +++ b/queue/crypto-af_alg-fix-race-accessing-cipher-request.patch @@ -0,0 +1,87 @@ +From d53c5135792319e095bb126bc43b2ee98586f7fe Mon Sep 17 00:00:00 2001 +From: Stephan Mueller <smueller@chronox.de> +Date: Fri, 8 Dec 2017 11:50:37 +0100 +Subject: [PATCH] crypto: af_alg - fix race accessing cipher request + +commit d53c5135792319e095bb126bc43b2ee98586f7fe upstream. + +When invoking an asynchronous cipher operation, the invocation of the +callback may be performed before the subsequent operations in the +initial code path are invoked. The callback deletes the cipher request +data structure which implies that after the invocation of the +asynchronous cipher operation, this data structure must not be accessed +any more. + +The setting of the return code size with the request data structure must +therefore be moved before the invocation of the asynchronous cipher +operation. + +Fixes: e870456d8e7c ("crypto: algif_skcipher - overhaul memory management") +Fixes: d887c52d6ae4 ("crypto: algif_aead - overhaul memory management") +Reported-by: syzbot <syzkaller@googlegroups.com> +Cc: <stable@vger.kernel.org> # v4.14+ +Signed-off-by: Stephan Mueller <smueller@chronox.de> +Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> +Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> + +diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c +index c8a32bef208a..b73db2b27656 100644 +--- a/crypto/algif_aead.c ++++ b/crypto/algif_aead.c +@@ -291,6 +291,10 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg, + /* AIO operation */ + sock_hold(sk); + areq->iocb = msg->msg_iocb; ++ ++ /* Remember output size that will be generated. 
*/ ++ areq->outlen = outlen; ++ + aead_request_set_callback(&areq->cra_u.aead_req, + CRYPTO_TFM_REQ_MAY_BACKLOG, + af_alg_async_cb, areq); +@@ -298,12 +302,8 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg, + crypto_aead_decrypt(&areq->cra_u.aead_req); + + /* AIO operation in progress */ +- if (err == -EINPROGRESS || err == -EBUSY) { +- /* Remember output size that will be generated. */ +- areq->outlen = outlen; +- ++ if (err == -EINPROGRESS || err == -EBUSY) + return -EIOCBQUEUED; +- } + + sock_put(sk); + } else { +diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c +index 6fb595cd63ac..baef9bfccdda 100644 +--- a/crypto/algif_skcipher.c ++++ b/crypto/algif_skcipher.c +@@ -125,6 +125,10 @@ static int _skcipher_recvmsg(struct socket *sock, struct msghdr *msg, + /* AIO operation */ + sock_hold(sk); + areq->iocb = msg->msg_iocb; ++ ++ /* Remember output size that will be generated. */ ++ areq->outlen = len; ++ + skcipher_request_set_callback(&areq->cra_u.skcipher_req, + CRYPTO_TFM_REQ_MAY_SLEEP, + af_alg_async_cb, areq); +@@ -133,12 +137,8 @@ static int _skcipher_recvmsg(struct socket *sock, struct msghdr *msg, + crypto_skcipher_decrypt(&areq->cra_u.skcipher_req); + + /* AIO operation in progress */ +- if (err == -EINPROGRESS || err == -EBUSY) { +- /* Remember output size that will be generated. */ +- areq->outlen = len; +- ++ if (err == -EINPROGRESS || err == -EBUSY) + return -EIOCBQUEUED; +- } + + sock_put(sk); + } else { +-- +2.15.0 + diff --git a/queue/crypto-af_alg-wait-for-data-at-beginning-of-recvmsg.patch b/queue/crypto-af_alg-wait-for-data-at-beginning-of-recvmsg.patch new file mode 100644 index 0000000..b732590 --- /dev/null +++ b/queue/crypto-af_alg-wait-for-data-at-beginning-of-recvmsg.patch @@ -0,0 +1,76 @@ +From 11edb555966ed2c66c533d17c604f9d7e580a829 Mon Sep 17 00:00:00 2001 +From: Stephan Mueller <smueller@chronox.de> +Date: Wed, 29 Nov 2017 12:02:23 +0100 +Subject: [PATCH] crypto: af_alg - wait for data at beginning of recvmsg + +commit 11edb555966ed2c66c533d17c604f9d7e580a829 upstream. + +The wait for data is a non-atomic operation that can sleep and therefore +potentially release the socket lock. The release of the socket lock +allows another thread to modify the context data structure. The waiting +operation for new data therefore must be called at the beginning of +recvmsg. This prevents a race condition where checks of the members of +the context data structure are performed by recvmsg while there is a +potential for modification of these values. 
+ +Fixes: e870456d8e7c ("crypto: algif_skcipher - overhaul memory management") +Fixes: d887c52d6ae4 ("crypto: algif_aead - overhaul memory management") +Reported-by: syzbot <syzkaller@googlegroups.com> +Cc: <stable@vger.kernel.org> # v4.14+ +Signed-off-by: Stephan Mueller <smueller@chronox.de> +Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> + +diff --git a/crypto/af_alg.c b/crypto/af_alg.c +index 358749c38894..f1a2caf1b59b 100644 +--- a/crypto/af_alg.c ++++ b/crypto/af_alg.c +@@ -1137,12 +1137,6 @@ int af_alg_get_rsgl(struct sock *sk, struct msghdr *msg, int flags, + if (!af_alg_readable(sk)) + break; + +- if (!ctx->used) { +- err = af_alg_wait_for_data(sk, flags); +- if (err) +- return err; +- } +- + seglen = min_t(size_t, (maxsize - len), + msg_data_left(msg)); + +diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c +index 805f485ddf1b..c8a32bef208a 100644 +--- a/crypto/algif_aead.c ++++ b/crypto/algif_aead.c +@@ -111,6 +111,12 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg, + size_t usedpages = 0; /* [in] RX bufs to be used from user */ + size_t processed = 0; /* [in] TX bufs to be consumed */ + ++ if (!ctx->used) { ++ err = af_alg_wait_for_data(sk, flags); ++ if (err) ++ return err; ++ } ++ + /* + * Data length provided by caller via sendmsg/sendpage that has not + * yet been processed. +diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c +index 30cff827dd8f..6fb595cd63ac 100644 +--- a/crypto/algif_skcipher.c ++++ b/crypto/algif_skcipher.c +@@ -72,6 +72,12 @@ static int _skcipher_recvmsg(struct socket *sock, struct msghdr *msg, + int err = 0; + size_t len = 0; + ++ if (!ctx->used) { ++ err = af_alg_wait_for_data(sk, flags); ++ if (err) ++ return err; ++ } ++ + /* Allocate cipher request for current operation. */ + areq = af_alg_alloc_areq(sk, sizeof(struct af_alg_async_req) + + crypto_skcipher_reqsize(tfm)); +-- +2.15.0 + diff --git a/queue/crypto-mcryptd-protect-the-per-CPU-queue-with-a-lock.patch b/queue/crypto-mcryptd-protect-the-per-CPU-queue-with-a-lock.patch new file mode 100644 index 0000000..f2ccc22 --- /dev/null +++ b/queue/crypto-mcryptd-protect-the-per-CPU-queue-with-a-lock.patch @@ -0,0 +1,113 @@ +From 9abffc6f2efe46c3564c04312e52e07622d40e51 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Thu, 30 Nov 2017 13:39:27 +0100 +Subject: [PATCH] crypto: mcryptd - protect the per-CPU queue with a lock + +commit 9abffc6f2efe46c3564c04312e52e07622d40e51 upstream. + +mcryptd_enqueue_request() grabs the per-CPU queue struct and protects +access to it with disabled preemption. Then it schedules a worker on the +same CPU. The worker in mcryptd_queue_worker() guards access to the same +per-CPU variable with disabled preemption. + +If we take CPU-hotplug into account then it is possible that between +queue_work_on() and the actual invocation of the worker the CPU goes +down and the worker will be scheduled on _another_ CPU. And here the +preempt_disable() protection does not work anymore. The easiest thing is +to add a spin_lock() to guard access to the list. + +Another detail: mcryptd_queue_worker() is not processing more than +MCRYPTD_BATCH invocation in a row. If there are still items left, then +it will invoke queue_work() to proceed with more later. *I* would +suggest to simply drop that check because it does not use a system +workqueue and the workqueue is already marked as "CPU_INTENSIVE". And if +preemption is required then the scheduler should do it. 
+However if queue_work() is used then the work item is marked as CPU +unbound. That means it will try to run on the local CPU but it may run +on another CPU as well. Especially with CONFIG_DEBUG_WQ_FORCE_RR_CPU=y. +Again, the preempt_disable() won't work here but lock which was +introduced will help. +In order to keep work-item on the local CPU (and avoid RR) I changed it +to queue_work_on(). + +Cc: stable@vger.kernel.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> + +diff --git a/crypto/mcryptd.c b/crypto/mcryptd.c +index 4e6472658852..eca04d3729b3 100644 +--- a/crypto/mcryptd.c ++++ b/crypto/mcryptd.c +@@ -81,6 +81,7 @@ static int mcryptd_init_queue(struct mcryptd_queue *queue, + pr_debug("cpu_queue #%d %p\n", cpu, queue->cpu_queue); + crypto_init_queue(&cpu_queue->queue, max_cpu_qlen); + INIT_WORK(&cpu_queue->work, mcryptd_queue_worker); ++ spin_lock_init(&cpu_queue->q_lock); + } + return 0; + } +@@ -104,15 +105,16 @@ static int mcryptd_enqueue_request(struct mcryptd_queue *queue, + int cpu, err; + struct mcryptd_cpu_queue *cpu_queue; + +- cpu = get_cpu(); +- cpu_queue = this_cpu_ptr(queue->cpu_queue); +- rctx->tag.cpu = cpu; ++ cpu_queue = raw_cpu_ptr(queue->cpu_queue); ++ spin_lock(&cpu_queue->q_lock); ++ cpu = smp_processor_id(); ++ rctx->tag.cpu = smp_processor_id(); + + err = crypto_enqueue_request(&cpu_queue->queue, request); + pr_debug("enqueue request: cpu %d cpu_queue %p request %p\n", + cpu, cpu_queue, request); ++ spin_unlock(&cpu_queue->q_lock); + queue_work_on(cpu, kcrypto_wq, &cpu_queue->work); +- put_cpu(); + + return err; + } +@@ -161,16 +163,11 @@ static void mcryptd_queue_worker(struct work_struct *work) + cpu_queue = container_of(work, struct mcryptd_cpu_queue, work); + i = 0; + while (i < MCRYPTD_BATCH || single_task_running()) { +- /* +- * preempt_disable/enable is used to prevent +- * being preempted by mcryptd_enqueue_request() +- */ +- local_bh_disable(); +- preempt_disable(); ++ ++ spin_lock_bh(&cpu_queue->q_lock); + backlog = crypto_get_backlog(&cpu_queue->queue); + req = crypto_dequeue_request(&cpu_queue->queue); +- preempt_enable(); +- local_bh_enable(); ++ spin_unlock_bh(&cpu_queue->q_lock); + + if (!req) { + mcryptd_opportunistic_flush(); +@@ -185,7 +182,7 @@ static void mcryptd_queue_worker(struct work_struct *work) + ++i; + } + if (cpu_queue->queue.qlen) +- queue_work(kcrypto_wq, &cpu_queue->work); ++ queue_work_on(smp_processor_id(), kcrypto_wq, &cpu_queue->work); + } + + void mcryptd_flusher(struct work_struct *__work) +diff --git a/include/crypto/mcryptd.h b/include/crypto/mcryptd.h +index cceafa01f907..b67404fc4b34 100644 +--- a/include/crypto/mcryptd.h ++++ b/include/crypto/mcryptd.h +@@ -27,6 +27,7 @@ static inline struct mcryptd_ahash *__mcryptd_ahash_cast( + + struct mcryptd_cpu_queue { + struct crypto_queue queue; ++ spinlock_t q_lock; + struct work_struct work; + }; + +-- +2.15.0 + diff --git a/queue/crypto-skcipher-set-walk.iv-for-zero-length-inputs.patch b/queue/crypto-skcipher-set-walk.iv-for-zero-length-inputs.patch new file mode 100644 index 0000000..4795d22 --- /dev/null +++ b/queue/crypto-skcipher-set-walk.iv-for-zero-length-inputs.patch @@ -0,0 +1,79 @@ +From 2b4f27c36bcd46e820ddb9a8e6fe6a63fa4250b8 Mon Sep 17 00:00:00 2001 +From: Eric Biggers <ebiggers@google.com> +Date: Wed, 29 Nov 2017 01:18:57 -0800 +Subject: [PATCH] crypto: skcipher - set walk.iv for zero-length inputs + +commit 2b4f27c36bcd46e820ddb9a8e6fe6a63fa4250b8 upstream. 
+
+All the ChaCha20 algorithms as well as the ARM bit-sliced AES-XTS
+algorithms call skcipher_walk_virt(), then access the IV (walk.iv)
+before checking whether any bytes need to be processed (walk.nbytes).
+
+But if the input is empty, then skcipher_walk_virt() doesn't set the IV,
+and the algorithms crash trying to use the uninitialized IV pointer.
+
+Fix it by setting the IV earlier in skcipher_walk_virt(). Also fix it
+for the AEAD walk functions.
+
+This isn't a perfect solution because we can't actually align the IV to
+->cra_alignmask unless there are bytes to process, for one because the
+temporary buffer for the aligned IV is freed by skcipher_walk_done(),
+which is only called when there are bytes to process. Thus, algorithms
+that require aligned IVs will still need to avoid accessing the IV when
+walk.nbytes == 0. Still, many algorithms/architectures are fine with
+IVs having any alignment, and even for those that aren't, a misaligned
+pointer bug is much less severe than an uninitialized pointer bug.
+
+This change also matches the behavior of the older blkcipher_walk API.
+
+Fixes: 0cabf2af6f5a ("crypto: skcipher - Fix crash on zero-length input")
+Reported-by: syzbot <syzkaller@googlegroups.com>
+Cc: <stable@vger.kernel.org> # v4.14+
+Signed-off-by: Eric Biggers <ebiggers@google.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+
+diff --git a/crypto/skcipher.c b/crypto/skcipher.c
+index 778e0ff42bfa..11af5fd6a443 100644
+--- a/crypto/skcipher.c
++++ b/crypto/skcipher.c
+@@ -449,6 +449,8 @@ static int skcipher_walk_skcipher(struct skcipher_walk *walk,
+
+ walk->total = req->cryptlen;
+ walk->nbytes = 0;
++ walk->iv = req->iv;
++ walk->oiv = req->iv;
+
+ if (unlikely(!walk->total))
+ return 0;
+@@ -456,9 +458,6 @@ static int skcipher_walk_skcipher(struct skcipher_walk *walk,
+ scatterwalk_start(&walk->in, req->src);
+ scatterwalk_start(&walk->out, req->dst);
+
+- walk->iv = req->iv;
+- walk->oiv = req->iv;
+-
+ walk->flags &= ~SKCIPHER_WALK_SLEEP;
+ walk->flags |= req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ?
+ SKCIPHER_WALK_SLEEP : 0;
+@@ -510,6 +509,8 @@ static int skcipher_walk_aead_common(struct skcipher_walk *walk,
+ int err;
+
+ walk->nbytes = 0;
++ walk->iv = req->iv;
++ walk->oiv = req->iv;
+
+ if (unlikely(!walk->total))
+ return 0;
+@@ -525,9 +526,6 @@ static int skcipher_walk_aead_common(struct skcipher_walk *walk,
+ scatterwalk_done(&walk->in, 0, walk->total);
+ scatterwalk_done(&walk->out, 0, walk->total);
+
+- walk->iv = req->iv;
+- walk->oiv = req->iv;
+-
+ if (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP)
+ walk->flags |= SKCIPHER_WALK_SLEEP;
+ else
+--
+2.15.0
+
diff --git a/queue/drm-i915-Flush-pending-GTT-writes-before-unbinding.patch b/queue/drm-i915-Flush-pending-GTT-writes-before-unbinding.patch
new file mode 100644
index 0000000..92591f8
--- /dev/null
+++ b/queue/drm-i915-Flush-pending-GTT-writes-before-unbinding.patch
@@ -0,0 +1,55 @@
+From 2797c4a11f373b2545c2398ccb02e362ee66a142 Mon Sep 17 00:00:00 2001
+From: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon, 4 Dec 2017 13:25:13 +0000
+Subject: [PATCH] drm/i915: Flush pending GTT writes before unbinding
+
+commit 2797c4a11f373b2545c2398ccb02e362ee66a142 upstream.
+
+From the shrinker paths, we want to relinquish the GPU and GGTT access to
+the object, releasing the backing storage back to the system for
+swapout. As a part of that process we would unpin the pages, marking
+them for access by the CPU (for the swapout/swapin). However, if that
+process was interrupted after unbind the vma, we missed a flush of the
+inflight GGTT writes before we made that GTT space available again for
+reuse, with the prospect that we would redirect them to another page.
+
+The bug dates back to the introduction of multiple GGTT vma, but the
+code itself dates to commit 02bef8f98d26 ("drm/i915: Unbind closed vma
+for i915_gem_object_unbind()").
+
+Fixes: 02bef8f98d26 ("drm/i915: Unbind closed vma for i915_gem_object_unbind()")
+Fixes: c5ad54cf7dd8 ("drm/i915: Use partial view in mmap fault handler")
+Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20171204132513.7303-1-chris@chris-wilson.co.uk
+(cherry picked from commit 5888fc9eac3c2ff96e76aeeb865fdb46ab2d711e)
+Signed-off-by: Jani Nikula <jani.nikula@intel.com>
+
+diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
+index ad4050f7ab3b..18de6569d04a 100644
+--- a/drivers/gpu/drm/i915/i915_gem.c
++++ b/drivers/gpu/drm/i915/i915_gem.c
+@@ -330,17 +330,10 @@ int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
+ * must wait for all rendering to complete to the object (as unbinding
+ * must anyway), and retire the requests.
+ */
+- ret = i915_gem_object_wait(obj,
+- I915_WAIT_INTERRUPTIBLE |
+- I915_WAIT_LOCKED |
+- I915_WAIT_ALL,
+- MAX_SCHEDULE_TIMEOUT,
+- NULL);
++ ret = i915_gem_object_set_to_cpu_domain(obj, false);
+ if (ret)
+ return ret;
+
+- i915_gem_retire_requests(to_i915(obj->base.dev));
+-
+ while ((vma = list_first_entry_or_null(&obj->vma_list,
+ struct i915_vma,
+ obj_link))) {
+--
+2.15.0
+
diff --git a/queue/drm-sun4i-Fix-error-path-handling.patch b/queue/drm-sun4i-Fix-error-path-handling.patch
new file mode 100644
index 0000000..b958664
--- /dev/null
+++ b/queue/drm-sun4i-Fix-error-path-handling.patch
@@ -0,0 +1,44 @@
+From 92411f6d7f1afcc95e54295d40e96a75385212ec Mon Sep 17 00:00:00 2001
+From: Maxime Ripard <maxime.ripard@free-electrons.com>
+Date: Thu, 7 Dec 2017 16:58:50 +0100
+Subject: [PATCH] drm/sun4i: Fix error path handling
+
+commit 92411f6d7f1afcc95e54295d40e96a75385212ec upstream.
+
+The commit 4c7f16d14a33 ("drm/sun4i: Fix TCON clock and regmap
+initialization sequence") moved a bunch of logic around, but forgot to
+update the gotos after the introduction of the err_free_dotclock label.
+
+It means that if we fail later that the one introduced in that commit,
+we'll just to the old label which isn't free the clock we created. This
+will result in a breakage as soon as someone tries to do something with
+that clock, since its resources will have been long reclaimed.
+
+Cc: <stable@vger.kernel.org>
+Fixes: 4c7f16d14a33 ("drm/sun4i: Fix TCON clock and regmap initialization sequence")
+Reviewed-by: Chen-Yu Tsai <wens@csie.org>
+Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/f83c1cebc731f0b4251f5ddd7b38c718cd79bb0b.1512662253.git-series.maxime.ripard@free-electrons.com
+
+diff --git a/drivers/gpu/drm/sun4i/sun4i_tcon.c b/drivers/gpu/drm/sun4i/sun4i_tcon.c
+index e122f5b2a395..f4284b51bdca 100644
+--- a/drivers/gpu/drm/sun4i/sun4i_tcon.c
++++ b/drivers/gpu/drm/sun4i/sun4i_tcon.c
+@@ -724,12 +724,12 @@ static int sun4i_tcon_bind(struct device *dev, struct device *master,
+ if (IS_ERR(tcon->crtc)) {
+ dev_err(dev, "Couldn't create our CRTC\n");
+ ret = PTR_ERR(tcon->crtc);
+- goto err_free_clocks;
++ goto err_free_dotclock;
+ }
+
+ ret = sun4i_rgb_init(drm, tcon);
+ if (ret < 0)
+- goto err_free_clocks;
++ goto err_free_dotclock;
+
+ if (tcon->quirks->needs_de_be_mux) {
+ /*
+--
+2.15.0
+
diff --git a/queue/init-Invoke-init_espfix_bsp-from-mm_init.patch b/queue/init-Invoke-init_espfix_bsp-from-mm_init.patch
new file mode 100644
index 0000000..55c5643
--- /dev/null
+++ b/queue/init-Invoke-init_espfix_bsp-from-mm_init.patch
@@ -0,0 +1,109 @@
+From 613e396bc0d4c7604fba23256644e78454c68cf6 Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Sun, 17 Dec 2017 10:56:29 +0100
+Subject: [PATCH] init: Invoke init_espfix_bsp() from mm_init()
+
+commit 613e396bc0d4c7604fba23256644e78454c68cf6 upstream.
+
+init_espfix_bsp() needs to be invoked before the page table isolation
+initialization. Move it into mm_init() which is the place where pti_init()
+will be added.
+
+While at it get rid of the #ifdeffery and provide proper stub functions.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+
+diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h
+index 0211029076ea..6777480d8a42 100644
+--- a/arch/x86/include/asm/espfix.h
++++ b/arch/x86/include/asm/espfix.h
+@@ -2,7 +2,7 @@
+ #ifndef _ASM_X86_ESPFIX_H
+ #define _ASM_X86_ESPFIX_H
+
+-#ifdef CONFIG_X86_64
++#ifdef CONFIG_X86_ESPFIX64
+
+ #include <asm/percpu.h>
+
+@@ -11,7 +11,8 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
+
+ extern void init_espfix_bsp(void);
+ extern void init_espfix_ap(int cpu);
+-
+-#endif /* CONFIG_X86_64 */
++#else
++static inline void init_espfix_ap(int cpu) { }
++#endif
+
+ #endif /* _ASM_X86_ESPFIX_H */
+diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
+index d56c1d209283..33d6000265aa 100644
+--- a/arch/x86/kernel/smpboot.c
++++ b/arch/x86/kernel/smpboot.c
+@@ -990,12 +990,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
+ initial_code = (unsigned long)start_secondary;
+ initial_stack = idle->thread.sp;
+
+- /*
+- * Enable the espfix hack for this CPU
+- */
+-#ifdef CONFIG_X86_ESPFIX64
++ /* Enable the espfix hack for this CPU */
+ init_espfix_ap(cpu);
+-#endif
+
+ /* So we see what's up */
+ announce_cpu(cpu, apicid);
+diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
+index 757dc6ffc7ba..231b35a76dd9 100644
+--- a/include/asm-generic/pgtable.h
++++ b/include/asm-generic/pgtable.h
+@@ -1017,6 +1017,11 @@ static inline int pmd_clear_huge(pmd_t *pmd)
+ struct file;
+ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t *vma_prot);
++
++#ifndef CONFIG_X86_ESPFIX64
++static inline void init_espfix_bsp(void) { }
++#endif
++
+ #endif /* !__ASSEMBLY__ */
+
+ #ifndef io_remap_pfn_range
+diff --git a/init/main.c b/init/main.c
+index 0ee9c6866ada..8a390f60ec81 100644
+--- a/init/main.c
++++ b/init/main.c
+@@ -504,6 +504,8 @@ static void __init mm_init(void)
+ pgtable_init();
+ vmalloc_init();
+ ioremap_huge_init();
++ /* Should be run before the first non-init thread is created */
++ init_espfix_bsp();
+ }
+
+ asmlinkage __visible void __init start_kernel(void)
+@@ -673,10 +675,6 @@ asmlinkage __visible void __init start_kernel(void)
+ #ifdef CONFIG_X86
+ if (efi_enabled(EFI_RUNTIME_SERVICES))
+ efi_enter_virtual_mode();
+-#endif
+-#ifdef CONFIG_X86_ESPFIX64
+- /* Should be run before the first non-init thread is created */
+- init_espfix_bsp();
+ #endif
+ thread_stack_cache_init();
+ cred_init();
+--
+2.15.0
+
diff --git a/queue/kvm-x86-fix-RSM-when-PCID-is-non-zero.patch b/queue/kvm-x86-fix-RSM-when-PCID-is-non-zero.patch
new file mode 100644
index 0000000..9254afb
--- /dev/null
+++ b/queue/kvm-x86-fix-RSM-when-PCID-is-non-zero.patch
@@ -0,0 +1,113 @@
+From fae1a3e775cca8c3a9e0eb34443b310871a15a92 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Thu, 21 Dec 2017 00:49:14 +0100
+Subject: [PATCH] kvm: x86: fix RSM when PCID is non-zero
+
+commit fae1a3e775cca8c3a9e0eb34443b310871a15a92 upstream.
+
+rsm_load_state_64() and rsm_enter_protected_mode() load CR3, then
+CR4 & ~PCIDE, then CR0, then CR4.
+
+However, setting CR4.PCIDE fails if CR3[11:0] != 0. It's probably easier
+in the long run to replace rsm_enter_protected_mode() with an emulator
+callback that sets all the special registers (like KVM_SET_SREGS would
+do). For now, set the PCID field of CR3 only after CR4.PCIDE is 1.
+
+Reported-by: Laszlo Ersek <lersek@redhat.com>
+Tested-by: Laszlo Ersek <lersek@redhat.com>
+Fixes: 660a5d517aaab9187f93854425c4c63f4a09195c
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+
+diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
+index abe74f779f9d..b514b2b2845a 100644
+--- a/arch/x86/kvm/emulate.c
++++ b/arch/x86/kvm/emulate.c
+@@ -2390,9 +2390,21 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
+ }
+
+ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
+- u64 cr0, u64 cr4)
++ u64 cr0, u64 cr3, u64 cr4)
+ {
+ int bad;
++ u64 pcid;
++
++ /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */
++ pcid = 0;
++ if (cr4 & X86_CR4_PCIDE) {
++ pcid = cr3 & 0xfff;
++ cr3 &= ~0xfff;
++ }
++
++ bad = ctxt->ops->set_cr(ctxt, 3, cr3);
++ if (bad)
++ return X86EMUL_UNHANDLEABLE;
+
+ /*
+ * First enable PAE, long mode needs it before CR0.PG = 1 is set.
+@@ -2411,6 +2423,12 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
+ bad = ctxt->ops->set_cr(ctxt, 4, cr4);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
++ if (pcid) {
++ bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid);
++ if (bad)
++ return X86EMUL_UNHANDLEABLE;
++ }
++
+ }
+
+ return X86EMUL_CONTINUE;
+@@ -2421,11 +2439,11 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
+ struct desc_struct desc;
+ struct desc_ptr dt;
+ u16 selector;
+- u32 val, cr0, cr4;
++ u32 val, cr0, cr3, cr4;
+ int i;
+
+ cr0 = GET_SMSTATE(u32, smbase, 0x7ffc);
+- ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8));
++ cr3 = GET_SMSTATE(u32, smbase, 0x7ff8);
+ ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED;
+ ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0);
+
+@@ -2467,14 +2485,14 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
+
+ ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8));
+
+- return rsm_enter_protected_mode(ctxt, cr0, cr4);
++ return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
+ }
+
+ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
+ {
+ struct desc_struct desc;
+ struct desc_ptr dt;
+- u64 val, cr0, cr4;
++ u64 val, cr0, cr3, cr4;
+ u32 base3;
+ u16 selector;
+ int i, r;
+@@ -2491,7 +2509,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
+ ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
+
+ cr0 = GET_SMSTATE(u64, smbase, 0x7f58);
+- ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u64, smbase, 0x7f50));
++ cr3 = GET_SMSTATE(u64, smbase, 0x7f50);
+ cr4 = GET_SMSTATE(u64, smbase, 0x7f48);
+ ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00));
+ val = GET_SMSTATE(u64, smbase, 0x7ed0);
+@@ -2519,7 +2537,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
+ dt.address = GET_SMSTATE(u64, smbase, 0x7e68);
+ ctxt->ops->set_gdt(ctxt, &dt);
+
+- r = rsm_enter_protected_mode(ctxt, cr0, cr4);
++ r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+
+--
+2.15.0
+
diff --git a/queue/libnvdimm-btt-Fix-an-incompatibility-in-the-log-layo.patch b/queue/libnvdimm-btt-Fix-an-incompatibility-in-the-log-layo.patch
new file mode 100644
index 0000000..89f125e
--- /dev/null
+++ b/queue/libnvdimm-btt-Fix-an-incompatibility-in-the-log-layo.patch
@@ -0,0 +1,433 @@
+From 24e3a7fb60a9187e5df90e5fa655ffc94b9c4f77 Mon Sep 17 00:00:00 2001
+From: Vishal Verma <vishal.l.verma@intel.com>
+Date: Mon, 18 Dec 2017 09:28:39 -0700
+Subject: [PATCH] libnvdimm, btt: Fix an incompatibility in the log layout
+
+commit 24e3a7fb60a9187e5df90e5fa655ffc94b9c4f77 upstream.
+
+Due to a spec misinterpretation, the Linux implementation of the BTT log
+area had different padding scheme from other implementations, such as
+UEFI and NVML.
+
+This fixes the padding scheme, and defaults to it for new BTT layouts.
+We attempt to detect the padding scheme in use when probing for an
+existing BTT. If we detect the older/incompatible scheme, we continue
+using it.
+
+Reported-by: Juston Li <juston.li@intel.com>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: <stable@vger.kernel.org>
+Fixes: 5212e11fde4d ("nd_btt: atomic sector updates")
+Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+
+diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
+index e949e3302af4..c586bcdb5190 100644
+--- a/drivers/nvdimm/btt.c
++++ b/drivers/nvdimm/btt.c
+@@ -211,12 +211,12 @@ static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping,
+ return ret;
+ }
+
+-static int btt_log_read_pair(struct arena_info *arena, u32 lane,
+- struct log_entry *ent)
++static int btt_log_group_read(struct arena_info *arena, u32 lane,
++ struct log_group *log)
+ {
+ return arena_read_bytes(arena,
+- arena->logoff + (2 * lane * LOG_ENT_SIZE), ent,
+- 2 * LOG_ENT_SIZE, 0);
++ arena->logoff + (lane * LOG_GRP_SIZE), log,
++ LOG_GRP_SIZE, 0);
+ }
+
+ static struct dentry *debugfs_root;
+@@ -256,6 +256,8 @@ static void arena_debugfs_init(struct arena_info *a, struct dentry *parent,
+ debugfs_create_x64("logoff", S_IRUGO, d, &a->logoff);
+ debugfs_create_x64("info2off", S_IRUGO, d, &a->info2off);
+ debugfs_create_x32("flags", S_IRUGO, d, &a->flags);
++ debugfs_create_u32("log_index_0", S_IRUGO, d, &a->log_index[0]);
++ debugfs_create_u32("log_index_1", S_IRUGO, d, &a->log_index[1]);
+ }
+
+ static void btt_debugfs_init(struct btt *btt)
+@@ -274,6 +276,11 @@ static void btt_debugfs_init(struct btt *btt)
+ }
+ }
+
++static u32 log_seq(struct log_group *log, int log_idx)
++{
++ return le32_to_cpu(log->ent[log_idx].seq);
++}
++
+ /*
+ * This function accepts two log entries, and uses the
+ * sequence number to find the 'older' entry.
+@@ -283,8 +290,10 @@ static void btt_debugfs_init(struct btt *btt)
+ *
+ * TODO The logic feels a bit kludge-y. make it better..
+ */ +-static int btt_log_get_old(struct log_entry *ent) ++static int btt_log_get_old(struct arena_info *a, struct log_group *log) + { ++ int idx0 = a->log_index[0]; ++ int idx1 = a->log_index[1]; + int old; + + /* +@@ -292,23 +301,23 @@ static int btt_log_get_old(struct log_entry *ent) + * the next time, the following logic works out to put this + * (next) entry into [1] + */ +- if (ent[0].seq == 0) { +- ent[0].seq = cpu_to_le32(1); ++ if (log_seq(log, idx0) == 0) { ++ log->ent[idx0].seq = cpu_to_le32(1); + return 0; + } + +- if (ent[0].seq == ent[1].seq) ++ if (log_seq(log, idx0) == log_seq(log, idx1)) + return -EINVAL; +- if (le32_to_cpu(ent[0].seq) + le32_to_cpu(ent[1].seq) > 5) ++ if (log_seq(log, idx0) + log_seq(log, idx1) > 5) + return -EINVAL; + +- if (le32_to_cpu(ent[0].seq) < le32_to_cpu(ent[1].seq)) { +- if (le32_to_cpu(ent[1].seq) - le32_to_cpu(ent[0].seq) == 1) ++ if (log_seq(log, idx0) < log_seq(log, idx1)) { ++ if ((log_seq(log, idx1) - log_seq(log, idx0)) == 1) + old = 0; + else + old = 1; + } else { +- if (le32_to_cpu(ent[0].seq) - le32_to_cpu(ent[1].seq) == 1) ++ if ((log_seq(log, idx0) - log_seq(log, idx1)) == 1) + old = 1; + else + old = 0; +@@ -328,17 +337,18 @@ static int btt_log_read(struct arena_info *arena, u32 lane, + { + int ret; + int old_ent, ret_ent; +- struct log_entry log[2]; ++ struct log_group log; + +- ret = btt_log_read_pair(arena, lane, log); ++ ret = btt_log_group_read(arena, lane, &log); + if (ret) + return -EIO; + +- old_ent = btt_log_get_old(log); ++ old_ent = btt_log_get_old(arena, &log); + if (old_ent < 0 || old_ent > 1) { + dev_err(to_dev(arena), + "log corruption (%d): lane %d seq [%d, %d]\n", +- old_ent, lane, log[0].seq, log[1].seq); ++ old_ent, lane, log.ent[arena->log_index[0]].seq, ++ log.ent[arena->log_index[1]].seq); + /* TODO set error state? */ + return -EIO; + } +@@ -346,7 +356,7 @@ static int btt_log_read(struct arena_info *arena, u32 lane, + ret_ent = (old_flag ? old_ent : (1 - old_ent)); + + if (ent != NULL) +- memcpy(ent, &log[ret_ent], LOG_ENT_SIZE); ++ memcpy(ent, &log.ent[arena->log_index[ret_ent]], LOG_ENT_SIZE); + + return ret_ent; + } +@@ -360,17 +370,13 @@ static int __btt_log_write(struct arena_info *arena, u32 lane, + u32 sub, struct log_entry *ent, unsigned long flags) + { + int ret; +- /* +- * Ignore the padding in log_entry for calculating log_half. +- * The entry is 'committed' when we write the sequence number, +- * and we want to ensure that that is the last thing written. 
+- * We don't bother writing the padding as that would be extra +- * media wear and write amplification +- */ +- unsigned int log_half = (LOG_ENT_SIZE - 2 * sizeof(u64)) / 2; +- u64 ns_off = arena->logoff + (((2 * lane) + sub) * LOG_ENT_SIZE); ++ u32 group_slot = arena->log_index[sub]; ++ unsigned int log_half = LOG_ENT_SIZE / 2; + void *src = ent; ++ u64 ns_off; + ++ ns_off = arena->logoff + (lane * LOG_GRP_SIZE) + ++ (group_slot * LOG_ENT_SIZE); + /* split the 16B write into atomic, durable halves */ + ret = arena_write_bytes(arena, ns_off, src, log_half, flags); + if (ret) +@@ -453,7 +459,7 @@ static int btt_log_init(struct arena_info *arena) + { + size_t logsize = arena->info2off - arena->logoff; + size_t chunk_size = SZ_4K, offset = 0; +- struct log_entry log; ++ struct log_entry ent; + void *zerobuf; + int ret; + u32 i; +@@ -485,11 +491,11 @@ static int btt_log_init(struct arena_info *arena) + } + + for (i = 0; i < arena->nfree; i++) { +- log.lba = cpu_to_le32(i); +- log.old_map = cpu_to_le32(arena->external_nlba + i); +- log.new_map = cpu_to_le32(arena->external_nlba + i); +- log.seq = cpu_to_le32(LOG_SEQ_INIT); +- ret = __btt_log_write(arena, i, 0, &log, 0); ++ ent.lba = cpu_to_le32(i); ++ ent.old_map = cpu_to_le32(arena->external_nlba + i); ++ ent.new_map = cpu_to_le32(arena->external_nlba + i); ++ ent.seq = cpu_to_le32(LOG_SEQ_INIT); ++ ret = __btt_log_write(arena, i, 0, &ent, 0); + if (ret) + goto free; + } +@@ -594,6 +600,123 @@ static int btt_freelist_init(struct arena_info *arena) + return 0; + } + ++static bool ent_is_padding(struct log_entry *ent) ++{ ++ return (ent->lba == 0) && (ent->old_map == 0) && (ent->new_map == 0) ++ && (ent->seq == 0); ++} ++ ++/* ++ * Detecting valid log indices: We read a log group (see the comments in btt.h ++ * for a description of a 'log_group' and its 'slots'), and iterate over its ++ * four slots. We expect that a padding slot will be all-zeroes, and use this ++ * to detect a padding slot vs. an actual entry. ++ * ++ * If a log_group is in the initial state, i.e. hasn't been used since the ++ * creation of this BTT layout, it will have three of the four slots with ++ * zeroes. We skip over these log_groups for the detection of log_index. If ++ * all log_groups are in the initial state (i.e. the BTT has never been ++ * written to), it is safe to assume the 'new format' of log entries in slots ++ * (0, 1). ++ */ ++static int log_set_indices(struct arena_info *arena) ++{ ++ bool idx_set = false, initial_state = true; ++ int ret, log_index[2] = {-1, -1}; ++ u32 i, j, next_idx = 0; ++ struct log_group log; ++ u32 pad_count = 0; ++ ++ for (i = 0; i < arena->nfree; i++) { ++ ret = btt_log_group_read(arena, i, &log); ++ if (ret < 0) ++ return ret; ++ ++ for (j = 0; j < 4; j++) { ++ if (!idx_set) { ++ if (ent_is_padding(&log.ent[j])) { ++ pad_count++; ++ continue; ++ } else { ++ /* Skip if index has been recorded */ ++ if ((next_idx == 1) && ++ (j == log_index[0])) ++ continue; ++ /* valid entry, record index */ ++ log_index[next_idx] = j; ++ next_idx++; ++ } ++ if (next_idx == 2) { ++ /* two valid entries found */ ++ idx_set = true; ++ } else if (next_idx > 2) { ++ /* too many valid indices */ ++ return -ENXIO; ++ } ++ } else { ++ /* ++ * once the indices have been set, just verify ++ * that all subsequent log groups are either in ++ * their initial state or follow the same ++ * indices. 
++ */ ++ if (j == log_index[0]) { ++ /* entry must be 'valid' */ ++ if (ent_is_padding(&log.ent[j])) ++ return -ENXIO; ++ } else if (j == log_index[1]) { ++ ; ++ /* ++ * log_index[1] can be padding if the ++ * lane never got used and it is still ++ * in the initial state (three 'padding' ++ * entries) ++ */ ++ } else { ++ /* entry must be invalid (padding) */ ++ if (!ent_is_padding(&log.ent[j])) ++ return -ENXIO; ++ } ++ } ++ } ++ /* ++ * If any of the log_groups have more than one valid, ++ * non-padding entry, then the we are no longer in the ++ * initial_state ++ */ ++ if (pad_count < 3) ++ initial_state = false; ++ pad_count = 0; ++ } ++ ++ if (!initial_state && !idx_set) ++ return -ENXIO; ++ ++ /* ++ * If all the entries in the log were in the initial state, ++ * assume new padding scheme ++ */ ++ if (initial_state) ++ log_index[1] = 1; ++ ++ /* ++ * Only allow the known permutations of log/padding indices, ++ * i.e. (0, 1), and (0, 2) ++ */ ++ if ((log_index[0] == 0) && ((log_index[1] == 1) || (log_index[1] == 2))) ++ ; /* known index possibilities */ ++ else { ++ dev_err(to_dev(arena), "Found an unknown padding scheme\n"); ++ return -ENXIO; ++ } ++ ++ arena->log_index[0] = log_index[0]; ++ arena->log_index[1] = log_index[1]; ++ dev_dbg(to_dev(arena), "log_index_0 = %d\n", log_index[0]); ++ dev_dbg(to_dev(arena), "log_index_1 = %d\n", log_index[1]); ++ return 0; ++} ++ + static int btt_rtt_init(struct arena_info *arena) + { + arena->rtt = kcalloc(arena->nfree, sizeof(u32), GFP_KERNEL); +@@ -650,8 +773,7 @@ static struct arena_info *alloc_arena(struct btt *btt, size_t size, + available -= 2 * BTT_PG_SIZE; + + /* The log takes a fixed amount of space based on nfree */ +- logsize = roundup(2 * arena->nfree * sizeof(struct log_entry), +- BTT_PG_SIZE); ++ logsize = roundup(arena->nfree * LOG_GRP_SIZE, BTT_PG_SIZE); + available -= logsize; + + /* Calculate optimal split between map and data area */ +@@ -668,6 +790,10 @@ static struct arena_info *alloc_arena(struct btt *btt, size_t size, + arena->mapoff = arena->dataoff + datasize; + arena->logoff = arena->mapoff + mapsize; + arena->info2off = arena->logoff + logsize; ++ ++ /* Default log indices are (0,1) */ ++ arena->log_index[0] = 0; ++ arena->log_index[1] = 1; + return arena; + } + +@@ -758,6 +884,13 @@ static int discover_arenas(struct btt *btt) + arena->external_lba_start = cur_nlba; + parse_arena_meta(arena, super, cur_off); + ++ ret = log_set_indices(arena); ++ if (ret) { ++ dev_err(to_dev(arena), ++ "Unable to deduce log/padding indices\n"); ++ goto out; ++ } ++ + mutex_init(&arena->err_lock); + ret = btt_freelist_init(arena); + if (ret) +diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h +index 884fbbbdd18a..db3cb6d4d0d4 100644 +--- a/drivers/nvdimm/btt.h ++++ b/drivers/nvdimm/btt.h +@@ -27,6 +27,7 @@ + #define MAP_ERR_MASK (1 << MAP_ERR_SHIFT) + #define MAP_LBA_MASK (~((1 << MAP_TRIM_SHIFT) | (1 << MAP_ERR_SHIFT))) + #define MAP_ENT_NORMAL 0xC0000000 ++#define LOG_GRP_SIZE sizeof(struct log_group) + #define LOG_ENT_SIZE sizeof(struct log_entry) + #define ARENA_MIN_SIZE (1UL << 24) /* 16 MB */ + #define ARENA_MAX_SIZE (1ULL << 39) /* 512 GB */ +@@ -50,12 +51,52 @@ enum btt_init_state { + INIT_READY + }; + ++/* ++ * A log group represents one log 'lane', and consists of four log entries. ++ * Two of the four entries are valid entries, and the remaining two are ++ * padding. Due to an old bug in the padding location, we need to perform a ++ * test to determine the padding scheme being used, and use that scheme ++ * thereafter. 
++ * ++ * In kernels prior to 4.15, 'log group' would have actual log entries at ++ * indices (0, 2) and padding at indices (1, 3), where as the correct/updated ++ * format has log entries at indices (0, 1) and padding at indices (2, 3). ++ * ++ * Old (pre 4.15) format: ++ * +-----------------+-----------------+ ++ * | ent[0] | ent[1] | ++ * | 16B | 16B | ++ * | lba/old/new/seq | pad | ++ * +-----------------------------------+ ++ * | ent[2] | ent[3] | ++ * | 16B | 16B | ++ * | lba/old/new/seq | pad | ++ * +-----------------+-----------------+ ++ * ++ * New format: ++ * +-----------------+-----------------+ ++ * | ent[0] | ent[1] | ++ * | 16B | 16B | ++ * | lba/old/new/seq | lba/old/new/seq | ++ * +-----------------------------------+ ++ * | ent[2] | ent[3] | ++ * | 16B | 16B | ++ * | pad | pad | ++ * +-----------------+-----------------+ ++ * ++ * We detect during start-up which format is in use, and set ++ * arena->log_index[(0, 1)] with the detected format. ++ */ ++ + struct log_entry { + __le32 lba; + __le32 old_map; + __le32 new_map; + __le32 seq; +- __le64 padding[2]; ++}; ++ ++struct log_group { ++ struct log_entry ent[4]; + }; + + struct btt_sb { +@@ -126,6 +167,7 @@ struct aligned_lock { + * @debugfs_dir: Debugfs dentry + * @flags: Arena flags - may signify error states. + * @err_lock: Mutex for synchronizing error clearing. ++ * @log_index: Indices of the valid log entries in a log_group + * + * arena_info is a per-arena handle. Once an arena is narrowed down for an + * IO, this struct is passed around for the duration of the IO. +@@ -158,6 +200,7 @@ struct arena_info { + /* Arena flags */ + u32 flags; + struct mutex err_lock; ++ int log_index[2]; + }; + + /** +-- +2.15.0 + diff --git a/queue/libnvdimm-dax-fix-1GB-aligned-namespaces-vs-physical.patch b/queue/libnvdimm-dax-fix-1GB-aligned-namespaces-vs-physical.patch new file mode 100644 index 0000000..ffffffc --- /dev/null +++ b/queue/libnvdimm-dax-fix-1GB-aligned-namespaces-vs-physical.patch @@ -0,0 +1,73 @@ +From 41fce90f26333c4fa82e8e43b9ace86c4e8a0120 Mon Sep 17 00:00:00 2001 +From: Dan Williams <dan.j.williams@intel.com> +Date: Mon, 4 Dec 2017 14:07:43 -0800 +Subject: [PATCH] libnvdimm, dax: fix 1GB-aligned namespaces vs physical + misalignment + +commit 41fce90f26333c4fa82e8e43b9ace86c4e8a0120 upstream. + +The following namespace configuration attempt: + + # ndctl create-namespace -e namespace0.0 -m devdax -a 1G -f + libndctl: ndctl_dax_enable: dax0.1: failed to enable + Error: namespace0.0: failed to enable + + failed to reconfigure namespace: No such device or address + +...fails when the backing memory range is not physically aligned to 1G: + + # cat /proc/iomem | grep Persistent + 210000000-30fffffff : Persistent Memory (legacy) + +In the above example the 4G persistent memory range starts and ends on a +256MB boundary. + +We handle this case correctly when needing to handle cases that violate +section alignment (128MB) collisions against "System RAM", and we simply +need to extend that padding/truncation for the 1GB alignment use case. 
+
+Cc: <stable@vger.kernel.org>
+Fixes: 315c562536c4 ("libnvdimm, pfn: add 'align' attribute...")
+Reported-and-tested-by: Jane Chu <jane.chu@oracle.com>
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+
+diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
+index db2fc7c02e01..2adada1a5855 100644
+--- a/drivers/nvdimm/pfn_devs.c
++++ b/drivers/nvdimm/pfn_devs.c
+@@ -583,6 +583,12 @@ static struct vmem_altmap *__nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
+ return altmap;
+ }
+
++static u64 phys_pmem_align_down(struct nd_pfn *nd_pfn, u64 phys)
++{
++ return min_t(u64, PHYS_SECTION_ALIGN_DOWN(phys),
++ ALIGN_DOWN(phys, nd_pfn->align));
++}
++
+ static int nd_pfn_init(struct nd_pfn *nd_pfn)
+ {
+ u32 dax_label_reserve = is_nd_dax(&nd_pfn->dev) ? SZ_128K : 0;
+@@ -638,13 +644,16 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
+ start = nsio->res.start;
+ size = PHYS_SECTION_ALIGN_UP(start + size) - start;
+ if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
+- IORES_DESC_NONE) == REGION_MIXED) {
++ IORES_DESC_NONE) == REGION_MIXED
++ || !IS_ALIGNED(start + resource_size(&nsio->res),
++ nd_pfn->align)) {
+ size = resource_size(&nsio->res);
+- end_trunc = start + size - PHYS_SECTION_ALIGN_DOWN(start + size);
++ end_trunc = start + size - phys_pmem_align_down(nd_pfn,
++ start + size);
+ }
+
+ if (start_pad + end_trunc)
+- dev_info(&nd_pfn->dev, "%s section collision, truncate %d bytes\n",
++ dev_info(&nd_pfn->dev, "%s alignment collision, truncate %d bytes\n",
+ dev_name(&ndns->dev), start_pad + end_trunc);
+
+ /*
+--
+2.15.0
+
diff --git a/queue/libnvdimm-pfn-fix-start_pad-handling-for-aligned-nam.patch b/queue/libnvdimm-pfn-fix-start_pad-handling-for-aligned-nam.patch
new file mode 100644
index 0000000..e4af80d
--- /dev/null
+++ b/queue/libnvdimm-pfn-fix-start_pad-handling-for-aligned-nam.patch
@@ -0,0 +1,55 @@
+From 19deaa217bc04e83b59b5e8c8229eb0e53ad9efc Mon Sep 17 00:00:00 2001
+From: Dan Williams <dan.j.williams@intel.com>
+Date: Tue, 19 Dec 2017 15:07:10 -0800
+Subject: [PATCH] libnvdimm, pfn: fix start_pad handling for aligned namespaces
+
+commit 19deaa217bc04e83b59b5e8c8229eb0e53ad9efc upstream.
+
+The alignment checks at pfn driver startup fail to properly account for
+the 'start_pad' in the case where the namespace is misaligned relative
+to its internal alignment. This is typically triggered in 1G aligned
+namespace, but could theoretically trigger with small namespace
+alignments. When this triggers the kernel reports messages of the form:
+
+ dax2.1: bad offset: 0x3c000000 dax disabled align: 0x40000000
+
+Cc: <stable@vger.kernel.org>
+Fixes: 1ee6667cd8d1 ("libnvdimm, pfn, dax: fix initialization vs autodetect...")
+Reported-by: Jane Chu <jane.chu@oracle.com>
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+
+diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
+index 65cc171c721d..db2fc7c02e01 100644
+--- a/drivers/nvdimm/pfn_devs.c
++++ b/drivers/nvdimm/pfn_devs.c
+@@ -364,9 +364,9 @@ struct device *nd_pfn_create(struct nd_region *nd_region)
+ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
+ {
+ u64 checksum, offset;
+- unsigned long align;
+ enum nd_pfn_mode mode;
+ struct nd_namespace_io *nsio;
++ unsigned long align, start_pad;
+ struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
+ struct nd_namespace_common *ndns = nd_pfn->ndns;
+ const u8 *parent_uuid = nd_dev_to_uuid(&ndns->dev);
+@@ -410,6 +410,7 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
+
+ align = le32_to_cpu(pfn_sb->align);
+ offset = le64_to_cpu(pfn_sb->dataoff);
++ start_pad = le32_to_cpu(pfn_sb->start_pad);
+ if (align == 0)
+ align = 1UL << ilog2(offset);
+ mode = le32_to_cpu(pfn_sb->mode);
+@@ -468,7 +469,7 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
+ return -EBUSY;
+ }
+
+- if ((align && !IS_ALIGNED(offset, align))
++ if ((align && !IS_ALIGNED(nsio->res.start + offset + start_pad, align))
+ || !IS_ALIGNED(offset, PAGE_SIZE)) {
+ dev_err(&nd_pfn->dev,
+ "bad offset: %#llx dax disabled align: %#lx\n",
+--
+2.15.0
+
diff --git a/queue/mfd-cros-ec-spi-Don-t-send-first-message-too-soon.patch b/queue/mfd-cros-ec-spi-Don-t-send-first-message-too-soon.patch
new file mode 100644
index 0000000..fb46360
--- /dev/null
+++ b/queue/mfd-cros-ec-spi-Don-t-send-first-message-too-soon.patch
@@ -0,0 +1,45 @@
+From 15d8374874ded0bec37ef27f8301a6d54032c0e5 Mon Sep 17 00:00:00 2001
+From: Jon Hunter <jonathanh@nvidia.com>
+Date: Tue, 14 Nov 2017 14:43:27 +0000
+Subject: [PATCH] mfd: cros ec: spi: Don't send first message too soon
+
+commit 15d8374874ded0bec37ef27f8301a6d54032c0e5 upstream.
+
+On the Tegra124 Nyan-Big chromebook the very first SPI message sent to
+the EC is failing.
+
+The Tegra SPI driver configures the SPI chip-selects to be active-high
+by default (and always has for many years). The EC SPI requires an
+active-low chip-select and so the Tegra chip-select is reconfigured to
+be active-low when the EC SPI driver calls spi_setup(). The problem is
+that if the first SPI message to the EC is sent too soon after
+reconfiguring the SPI chip-select, it fails.
+
+The EC SPI driver prevents back-to-back SPI messages being sent too
+soon by keeping track of the time the last transfer was sent via the
+variable 'last_transfer_ns'. To prevent the very first transfer being
+sent too soon, initialise the 'last_transfer_ns' variable after calling
+spi_setup() and before sending the first SPI message.
+ +Cc: <stable@vger.kernel.org> +Signed-off-by: Jon Hunter <jonathanh@nvidia.com> +Reviewed-by: Brian Norris <briannorris@chromium.org> +Reviewed-by: Douglas Anderson <dianders@chromium.org> +Acked-by: Benson Leung <bleung@chromium.org> +Signed-off-by: Lee Jones <lee.jones@linaro.org> + +diff --git a/drivers/mfd/cros_ec_spi.c b/drivers/mfd/cros_ec_spi.c +index c9714072e224..a14196e95e9b 100644 +--- a/drivers/mfd/cros_ec_spi.c ++++ b/drivers/mfd/cros_ec_spi.c +@@ -667,6 +667,7 @@ static int cros_ec_spi_probe(struct spi_device *spi) + sizeof(struct ec_response_get_protocol_info); + ec_dev->dout_size = sizeof(struct ec_host_request); + ++ ec_spi->last_transfer_ns = ktime_get_ns(); + + err = cros_ec_register(ec_dev); + if (err) { +-- +2.15.0 + diff --git a/queue/mfd-twl4030-audio-Fix-sibling-node-lookup.patch b/queue/mfd-twl4030-audio-Fix-sibling-node-lookup.patch new file mode 100644 index 0000000..6f66534 --- /dev/null +++ b/queue/mfd-twl4030-audio-Fix-sibling-node-lookup.patch @@ -0,0 +1,49 @@ +From 0a423772de2f3d7b00899987884f62f63ae00dcb Mon Sep 17 00:00:00 2001 +From: Johan Hovold <johan@kernel.org> +Date: Sat, 11 Nov 2017 16:38:43 +0100 +Subject: [PATCH] mfd: twl4030-audio: Fix sibling-node lookup + +commit 0a423772de2f3d7b00899987884f62f63ae00dcb upstream. + +A helper purported to look up a child node based on its name was using +the wrong of-helper and ended up prematurely freeing the parent of-node +while leaking any matching node. + +To make things worse, any matching node would not even necessarily be a +child node as the whole device tree was searched depth-first starting at +the parent. + +Fixes: 019a7e6b7b31 ("mfd: twl4030-audio: Add DT support") +Cc: stable <stable@vger.kernel.org> # 3.7 +Signed-off-by: Johan Hovold <johan@kernel.org> +Acked-by: Peter Ujfalusi <peter.ujfalusi@ti.com> +Signed-off-by: Lee Jones <lee.jones@linaro.org> + +diff --git a/drivers/mfd/twl4030-audio.c b/drivers/mfd/twl4030-audio.c +index da16bf45fab4..dc94ffc6321a 100644 +--- a/drivers/mfd/twl4030-audio.c ++++ b/drivers/mfd/twl4030-audio.c +@@ -159,13 +159,18 @@ unsigned int twl4030_audio_get_mclk(void) + EXPORT_SYMBOL_GPL(twl4030_audio_get_mclk); + + static bool twl4030_audio_has_codec(struct twl4030_audio_data *pdata, +- struct device_node *node) ++ struct device_node *parent) + { ++ struct device_node *node; ++ + if (pdata && pdata->codec) + return true; + +- if (of_find_node_by_name(node, "codec")) ++ node = of_get_child_by_name(parent, "codec"); ++ if (node) { ++ of_node_put(node); + return true; ++ } + + return false; + } +-- +2.15.0 + diff --git a/queue/mfd-twl6040-Fix-child-node-lookup.patch b/queue/mfd-twl6040-Fix-child-node-lookup.patch new file mode 100644 index 0000000..5ad6cd4 --- /dev/null +++ b/queue/mfd-twl6040-Fix-child-node-lookup.patch @@ -0,0 +1,53 @@ +From 85e9b13cbb130a3209f21bd7933933399c389ffe Mon Sep 17 00:00:00 2001 +From: Johan Hovold <johan@kernel.org> +Date: Sat, 11 Nov 2017 16:38:44 +0100 +Subject: [PATCH] mfd: twl6040: Fix child-node lookup + +commit 85e9b13cbb130a3209f21bd7933933399c389ffe upstream. + +Fix child-node lookup during probe, which ended up searching the whole +device tree depth-first starting at the parent rather than just matching +on its children. + +To make things worse, the parent node was prematurely freed, while the +child node was leaked. + +Note that the CONFIG_OF compile guard can be removed as +of_get_child_by_name() provides a !CONFIG_OF implementation which always +fails. 
+ +Cc: stable <stable@vger.kernel.org> # 3.5 +Fixes: 37e13cecaa14 ("mfd: Add support for Device Tree to twl6040") +Fixes: ca2cad6ae38e ("mfd: Fix twl6040 build failure") +Signed-off-by: Johan Hovold <johan@kernel.org> +Acked-by: Peter Ujfalusi <peter.ujfalusi@ti.com> +Signed-off-by: Lee Jones <lee.jones@linaro.org> + +diff --git a/drivers/mfd/twl6040.c b/drivers/mfd/twl6040.c +index d66502d36ba0..dd19f17a1b63 100644 +--- a/drivers/mfd/twl6040.c ++++ b/drivers/mfd/twl6040.c +@@ -97,12 +97,16 @@ static struct reg_sequence twl6040_patch[] = { + }; + + +-static bool twl6040_has_vibra(struct device_node *node) ++static bool twl6040_has_vibra(struct device_node *parent) + { +-#ifdef CONFIG_OF +- if (of_find_node_by_name(node, "vibra")) ++ struct device_node *node; ++ ++ node = of_get_child_by_name(parent, "vibra"); ++ if (node) { ++ of_node_put(node); + return true; +-#endif ++ } ++ + return false; + } + +-- +2.15.0 + diff --git a/queue/net-mvneta-clear-interface-link-status-on-port-disab.patch b/queue/net-mvneta-clear-interface-link-status-on-port-disab.patch new file mode 100644 index 0000000..3281ee6 --- /dev/null +++ b/queue/net-mvneta-clear-interface-link-status-on-port-disab.patch @@ -0,0 +1,37 @@ +From 4423c18e466afdfb02a36ee8b9f901d144b3c607 Mon Sep 17 00:00:00 2001 +From: Yelena Krivosheev <yelena@marvell.com> +Date: Tue, 19 Dec 2017 17:59:45 +0100 +Subject: [PATCH] net: mvneta: clear interface link status on port disable + +commit 4423c18e466afdfb02a36ee8b9f901d144b3c607 upstream. + +When port connect to PHY in polling mode (with poll interval 1 sec), +port and phy link status must be synchronize in order don't loss link +change event. + +[gregory.clement@free-electrons.com: add fixes tag] +Cc: <stable@vger.kernel.org> +Fixes: c5aff18204da ("net: mvneta: driver for Marvell Armada 370/XP network unit") +Signed-off-by: Yelena Krivosheev <yelena@marvell.com> +Tested-by: Dmitri Epshtein <dima@marvell.com> +Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com> +Signed-off-by: David S. Miller <davem@davemloft.net> + +diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c +index bc93b69cfd1e..16b2bfb2cf51 100644 +--- a/drivers/net/ethernet/marvell/mvneta.c ++++ b/drivers/net/ethernet/marvell/mvneta.c +@@ -1214,6 +1214,10 @@ static void mvneta_port_disable(struct mvneta_port *pp) + val &= ~MVNETA_GMAC0_PORT_ENABLE; + mvreg_write(pp, MVNETA_GMAC_CTRL_0, val); + ++ pp->link = 0; ++ pp->duplex = -1; ++ pp->speed = 0; ++ + udelay(200); + } + +-- +2.15.0 + diff --git a/queue/net-mvneta-eliminate-wrong-call-to-handle-rx-descrip.patch b/queue/net-mvneta-eliminate-wrong-call-to-handle-rx-descrip.patch new file mode 100644 index 0000000..1255b5b --- /dev/null +++ b/queue/net-mvneta-eliminate-wrong-call-to-handle-rx-descrip.patch @@ -0,0 +1,38 @@ +From 2eecb2e04abb62ef8ea7b43e1a46bdb5b99d1bf8 Mon Sep 17 00:00:00 2001 +From: Yelena Krivosheev <yelena@marvell.com> +Date: Tue, 19 Dec 2017 17:59:47 +0100 +Subject: [PATCH] net: mvneta: eliminate wrong call to handle rx descriptor + error + +commit 2eecb2e04abb62ef8ea7b43e1a46bdb5b99d1bf8 upstream. + +There are few reasons in mvneta_rx_swbm() function when received packet +is dropped. mvneta_rx_error() should be called only if error bit [16] +is set in rx descriptor. 
+ +[gregory.clement@free-electrons.com: add fixes tag] +Cc: stable@vger.kernel.org +Fixes: dc35a10f68d3 ("net: mvneta: bm: add support for hardware buffer management") +Signed-off-by: Yelena Krivosheev <yelena@marvell.com> +Tested-by: Dmitri Epshtein <dima@marvell.com> +Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com> +Signed-off-by: David S. Miller <davem@davemloft.net> + +diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c +index 1e0835655c93..a539263cd79c 100644 +--- a/drivers/net/ethernet/marvell/mvneta.c ++++ b/drivers/net/ethernet/marvell/mvneta.c +@@ -1962,9 +1962,9 @@ static int mvneta_rx_swbm(struct mvneta_port *pp, int rx_todo, + + if (!mvneta_rxq_desc_is_first_last(rx_status) || + (rx_status & MVNETA_RXD_ERR_SUMMARY)) { ++ mvneta_rx_error(pp, rx_desc); + err_drop_frame: + dev->stats.rx_errors++; +- mvneta_rx_error(pp, rx_desc); + /* leave the descriptor untouched */ + continue; + } +-- +2.15.0 + diff --git a/queue/net-mvneta-use-proper-rxq_number-in-loop-on-rx-queue.patch b/queue/net-mvneta-use-proper-rxq_number-in-loop-on-rx-queue.patch new file mode 100644 index 0000000..e56ec7a --- /dev/null +++ b/queue/net-mvneta-use-proper-rxq_number-in-loop-on-rx-queue.patch @@ -0,0 +1,34 @@ +From ca5902a6547f662419689ca28b3c29a772446caa Mon Sep 17 00:00:00 2001 +From: Yelena Krivosheev <yelena@marvell.com> +Date: Tue, 19 Dec 2017 17:59:46 +0100 +Subject: [PATCH] net: mvneta: use proper rxq_number in loop on rx queues + +commit ca5902a6547f662419689ca28b3c29a772446caa upstream. + +When adding the RX queue association with each CPU, a typo was made in +the mvneta_cleanup_rxqs() function. This patch fixes it. + +[gregory.clement@free-electrons.com: add commit log and fixes tag] +Cc: stable@vger.kernel.org +Fixes: 2dcf75e2793c ("net: mvneta: Associate RX queues with each CPU") +Signed-off-by: Yelena Krivosheev <yelena@marvell.com> +Tested-by: Dmitri Epshtein <dima@marvell.com> +Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com> +Signed-off-by: David S. Miller <davem@davemloft.net> + +diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c +index 16b2bfb2cf51..1e0835655c93 100644 +--- a/drivers/net/ethernet/marvell/mvneta.c ++++ b/drivers/net/ethernet/marvell/mvneta.c +@@ -3015,7 +3015,7 @@ static void mvneta_cleanup_rxqs(struct mvneta_port *pp) + { + int queue; + +- for (queue = 0; queue < txq_number; queue++) ++ for (queue = 0; queue < rxq_number; queue++) + mvneta_rxq_deinit(pp, &pp->rxqs[queue]); + } + +-- +2.15.0 + diff --git a/queue/parisc-Align-os_hpmc_size-on-word-boundary.patch b/queue/parisc-Align-os_hpmc_size-on-word-boundary.patch new file mode 100644 index 0000000..4048dcd --- /dev/null +++ b/queue/parisc-Align-os_hpmc_size-on-word-boundary.patch @@ -0,0 +1,29 @@ +From 0ed9d3de5f8f97e6efd5ca0e3377cab5f0451ead Mon Sep 17 00:00:00 2001 +From: Helge Deller <deller@gmx.de> +Date: Tue, 12 Dec 2017 21:25:41 +0100 +Subject: [PATCH] parisc: Align os_hpmc_size on word boundary + +commit 0ed9d3de5f8f97e6efd5ca0e3377cab5f0451ead upstream. + +The os_hpmc_size variable sometimes wasn't aligned at word boundary and thus +triggered the unaligned fault handler at startup. +Fix it by aligning it properly. 
+ +Signed-off-by: Helge Deller <deller@gmx.de> +Cc: <stable@vger.kernel.org> # v4.14+ + +diff --git a/arch/parisc/kernel/hpmc.S b/arch/parisc/kernel/hpmc.S +index e3a8e5e4d5de..8d072c44f300 100644 +--- a/arch/parisc/kernel/hpmc.S ++++ b/arch/parisc/kernel/hpmc.S +@@ -305,6 +305,7 @@ ENDPROC_CFI(os_hpmc) + + + __INITRODATA ++ .align 4 + .export os_hpmc_size + os_hpmc_size: + .word .os_hpmc_end-.os_hpmc +-- +2.15.0 + diff --git a/queue/parisc-Fix-indenting-in-puts.patch b/queue/parisc-Fix-indenting-in-puts.patch new file mode 100644 index 0000000..1ac8ac1 --- /dev/null +++ b/queue/parisc-Fix-indenting-in-puts.patch @@ -0,0 +1,34 @@ +From 203c110b39a89b48156c7450504e454fedb7f7f6 Mon Sep 17 00:00:00 2001 +From: Helge Deller <deller@gmx.de> +Date: Tue, 12 Dec 2017 21:32:16 +0100 +Subject: [PATCH] parisc: Fix indenting in puts() + +commit 203c110b39a89b48156c7450504e454fedb7f7f6 upstream. + +Static analysis tools complain that we intended to have curly braces +around this indent block. In this case this assumption is wrong, so fix +the indenting. + +Fixes: 2f3c7b8137ef ("parisc: Add core code for self-extracting kernel") +Reported-by: Dan Carpenter <dan.carpenter@oracle.com> +Signed-off-by: Helge Deller <deller@gmx.de> +Cc: <stable@vger.kernel.org> # v4.14+ + +diff --git a/arch/parisc/boot/compressed/misc.c b/arch/parisc/boot/compressed/misc.c +index 9345b44b86f0..f57118e1f6b4 100644 +--- a/arch/parisc/boot/compressed/misc.c ++++ b/arch/parisc/boot/compressed/misc.c +@@ -123,8 +123,8 @@ int puts(const char *s) + while ((nuline = strchr(s, '\n')) != NULL) { + if (nuline != s) + pdc_iodc_print(s, nuline - s); +- pdc_iodc_print("\r\n", 2); +- s = nuline + 1; ++ pdc_iodc_print("\r\n", 2); ++ s = nuline + 1; + } + if (*s != '\0') + pdc_iodc_print(s, strlen(s)); +-- +2.15.0 + diff --git a/queue/parisc-Hide-Diva-built-in-serial-aux-and-graphics-ca.patch b/queue/parisc-Hide-Diva-built-in-serial-aux-and-graphics-ca.patch new file mode 100644 index 0000000..a529a62 --- /dev/null +++ b/queue/parisc-Hide-Diva-built-in-serial-aux-and-graphics-ca.patch @@ -0,0 +1,59 @@ +From bcf3f1752a622f1372d3252d0fea8855d89812e7 Mon Sep 17 00:00:00 2001 +From: Helge Deller <deller@gmx.de> +Date: Tue, 12 Dec 2017 21:52:26 +0100 +Subject: [PATCH] parisc: Hide Diva-built-in serial aux and graphics card + +commit bcf3f1752a622f1372d3252d0fea8855d89812e7 upstream. + +Diva GSP card has built-in serial AUX port and ATI graphic card which simply +don't work and which both don't have external connectors. User Guides even +mention that those devices shouldn't be used. +So, prevent that Linux drivers try to enable those devices. + +Signed-off-by: Helge Deller <deller@gmx.de> +Cc: <stable@vger.kernel.org> # v3.0+ + +diff --git a/drivers/parisc/lba_pci.c b/drivers/parisc/lba_pci.c +index a25fed52f7e9..41b740aed3a3 100644 +--- a/drivers/parisc/lba_pci.c ++++ b/drivers/parisc/lba_pci.c +@@ -1692,3 +1692,36 @@ void lba_set_iregs(struct parisc_device *lba, u32 ibase, u32 imask) + iounmap(base_addr); + } + ++ ++/* ++ * The design of the Diva management card in rp34x0 machines (rp3410, rp3440) ++ * seems rushed, so that many built-in components simply don't work. ++ * The following quirks disable the serial AUX port and the built-in ATI RV100 ++ * Radeon 7000 graphics card which both don't have any external connectors and ++ * thus are useless, and even worse, e.g. the AUX port occupies ttyS0 and as ++ * such makes those machines the only PARISC machines on which we can't use ++ * ttyS0 as boot console. 
++ */ ++static void quirk_diva_ati_card(struct pci_dev *dev) ++{ ++ if (dev->subsystem_vendor != PCI_VENDOR_ID_HP || ++ dev->subsystem_device != 0x1292) ++ return; ++ ++ dev_info(&dev->dev, "Hiding Diva built-in ATI card"); ++ dev->device = 0; ++} ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_RADEON_QY, ++ quirk_diva_ati_card); ++ ++static void quirk_diva_aux_disable(struct pci_dev *dev) ++{ ++ if (dev->subsystem_vendor != PCI_VENDOR_ID_HP || ++ dev->subsystem_device != 0x1291) ++ return; ++ ++ dev_info(&dev->dev, "Hiding Diva built-in AUX serial device"); ++ dev->device = 0; ++} ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_DIVA_AUX, ++ quirk_diva_aux_disable); +-- +2.15.0 + diff --git a/queue/pinctrl-cherryview-Mask-all-interrupts-on-Intel_Stra.patch b/queue/pinctrl-cherryview-Mask-all-interrupts-on-Intel_Stra.patch new file mode 100644 index 0000000..807e65a --- /dev/null +++ b/queue/pinctrl-cherryview-Mask-all-interrupts-on-Intel_Stra.patch @@ -0,0 +1,52 @@ +From d2b3c353595a855794f8b9df5b5bdbe8deb0c413 Mon Sep 17 00:00:00 2001 +From: Mika Westerberg <mika.westerberg@linux.intel.com> +Date: Mon, 4 Dec 2017 12:11:02 +0300 +Subject: [PATCH] pinctrl: cherryview: Mask all interrupts on Intel_Strago + based systems + +commit d2b3c353595a855794f8b9df5b5bdbe8deb0c413 upstream. + +Guenter Roeck reported an interrupt storm on a prototype system which is +based on Cyan Chromebook. The root cause turned out to be a incorrectly +configured pin that triggers spurious interrupts. This will be fixed in +coreboot but currently we need to prevent the interrupt storm from +happening by masking all interrupts (but not GPEs) on those systems. + +Link: https://bugzilla.kernel.org/show_bug.cgi?id=197953 +Fixes: bcb48cca23ec ("pinctrl: cherryview: Do not mask all interrupts in probe") +Reported-and-tested-by: Guenter Roeck <linux@roeck-us.net> +Reported-by: Dmitry Torokhov <dmitry.torokhov@gmail.com> +Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com> +Cc: stable@vger.kernel.org +Signed-off-by: Linus Walleij <linus.walleij@linaro.org> + +diff --git a/drivers/pinctrl/intel/pinctrl-cherryview.c b/drivers/pinctrl/intel/pinctrl-cherryview.c +index bdedb6325c72..4471fd94e1fe 100644 +--- a/drivers/pinctrl/intel/pinctrl-cherryview.c ++++ b/drivers/pinctrl/intel/pinctrl-cherryview.c +@@ -1620,6 +1620,22 @@ static int chv_gpio_probe(struct chv_pinctrl *pctrl, int irq) + clear_bit(i, chip->irq.valid_mask); + } + ++ /* ++ * The same set of machines in chv_no_valid_mask[] have incorrectly ++ * configured GPIOs that generate spurious interrupts so we use ++ * this same list to apply another quirk for them. ++ * ++ * See also https://bugzilla.kernel.org/show_bug.cgi?id=197953. ++ */ ++ if (!need_valid_mask) { ++ /* ++ * Mask all interrupts the community is able to generate ++ * but leave the ones that can only generate GPEs unmasked. 
++ */
++ chv_writel(GENMASK(31, pctrl->community->nirqs),
++ pctrl->regs + CHV_INTMASK);
++ }
++
+ /* Clear all interrupts */
+ chv_writel(0xffff, pctrl->regs + CHV_INTSTAT);
+
+--
+2.15.0
+
diff --git a/queue/powerpc-perf-Dereference-BHRB-entries-safely.patch b/queue/powerpc-perf-Dereference-BHRB-entries-safely.patch
new file mode 100644
index 0000000..e71f8b7
--- /dev/null
+++ b/queue/powerpc-perf-Dereference-BHRB-entries-safely.patch
@@ -0,0 +1,54 @@
+From f41d84dddc66b164ac16acf3f584c276146f1c48 Mon Sep 17 00:00:00 2001
+From: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com>
+Date: Tue, 12 Dec 2017 17:59:15 +0530
+Subject: [PATCH] powerpc/perf: Dereference BHRB entries safely
+
+commit f41d84dddc66b164ac16acf3f584c276146f1c48 upstream.
+
+It's theoretically possible that branch instructions recorded in
+BHRB (Branch History Rolling Buffer) entries have already been
+unmapped before they are processed by the kernel. Hence, trying to
+dereference such memory location will result in a crash. eg:
+
+ Unable to handle kernel paging request for data at address 0xd000000019c41764
+ Faulting instruction address: 0xc000000000084a14
+ NIP [c000000000084a14] branch_target+0x4/0x70
+ LR [c0000000000eb828] record_and_restart+0x568/0x5c0
+ Call Trace:
+ [c0000000000eb3b4] record_and_restart+0xf4/0x5c0 (unreliable)
+ [c0000000000ec378] perf_event_interrupt+0x298/0x460
+ [c000000000027964] performance_monitor_exception+0x54/0x70
+ [c000000000009ba4] performance_monitor_common+0x114/0x120
+
+Fix it by deferefencing the addresses safely.
+
+Fixes: 691231846ceb ("powerpc/perf: Fix setting of "to" addresses for BHRB")
+Cc: stable@vger.kernel.org # v3.10+
+Suggested-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
+Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com>
+Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
+[mpe: Use probe_kernel_read() which is clearer, tweak change log]
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+
+diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
+index 153812966365..fce545774d50 100644
+--- a/arch/powerpc/perf/core-book3s.c
++++ b/arch/powerpc/perf/core-book3s.c
+@@ -410,8 +410,12 @@ static __u64 power_pmu_bhrb_to(u64 addr)
+ int ret;
+ __u64 target;
+
+- if (is_kernel_addr(addr))
+- return branch_target((unsigned int *)addr);
++ if (is_kernel_addr(addr)) {
++ if (probe_kernel_read(&instr, (void *)addr, sizeof(instr)))
++ return 0;
++
++ return branch_target(&instr);
++ }
+
+ /* Userspace: need copy instruction here then translate it */
+ pagefault_disable();
+--
+2.15.0
+
diff --git a/queue/series b/queue/series
index f321485..4844541 100644
--- a/queue/series
+++ b/queue/series
@@ -80,3 +80,76 @@ bpf-don-t-prune-branches-when-a-scalar-is-replaced-w.patch
bpf-fix-integer-overflows.patch
selftests-bpf-add-tests-for-recent-bugfixes.patch
linux-compiler.h-Split-into-compiler.h-and-compiler_.patch
+tools-headers-Sync-objtool-UAPI-header.patch
+x86-decoder-Fix-and-update-the-opcodes-map.patch
+x86-insn-eval-Add-utility-functions-to-get-segment-s.patch
+x86-Kconfig-Limit-NR_CPUS-on-32-bit-to-a-sane-amount.patch
+x86-mm-dump_pagetables-Check-PAGE_PRESENT-for-real.patch
+x86-mm-dump_pagetables-Make-the-address-hints-correc.patch
+x86-vsyscall-64-Explicitly-set-_PAGE_USER-in-the-pag.patch
+x86-vsyscall-64-Warn-and-fail-vsyscall-emulation-in-.patch
+arch-mm-Allow-arch_dup_mmap-to-fail.patch
+x86-ldt-Rework-locking.patch
+x86-ldt-Prevent-LDT-inheritance-on-exec.patch
+x86-mm-64-Improve-the-memory-map-documentation.patch
+x86-doc-Remove-obvious-weirdnesses-from-the-x86-MM-l.patch
+x86-entry-Rename-SYSENTER_stack-to-CPU_ENTRY_AREA_en.patch
+x86-uv-Use-the-right-TLB-flush-API.patch
+x86-microcode-Dont-abuse-the-TLB-flush-interface.patch
+x86-mm-Use-__flush_tlb_one-for-kernel-memory.patch
+x86-mm-Remove-superfluous-barriers.patch
+x86-mm-Add-comments-to-clarify-which-TLB-flush-funct.patch
+x86-mm-Move-the-CR3-construction-functions-to-tlbflu.patch
+x86-mm-Remove-hard-coded-ASID-limit-checks.patch
+x86-mm-Put-MMU-to-hardware-ASID-translation-in-one-p.patch
+x86-mm-Create-asm-invpcid.h.patch
+x86-cpu_entry_area-Move-it-to-a-separate-unit.patch
+x86-cpu_entry_area-Move-it-out-of-the-fixmap.patch
+init-Invoke-init_espfix_bsp-from-mm_init.patch
+x86-cpu_entry_area-Prevent-wraparound-in-setup_cpu_e.patch
+ACPI-APEI-ERST-Fix-missing-error-handling-in-erst_re.patch
+acpi-nfit-fix-health-event-notification.patch
+crypto-skcipher-set-walk.iv-for-zero-length-inputs.patch
+crypto-mcryptd-protect-the-per-CPU-queue-with-a-lock.patch
+crypto-af_alg-wait-for-data-at-beginning-of-recvmsg.patch
+crypto-af_alg-fix-race-accessing-cipher-request.patch
+mfd-cros-ec-spi-Don-t-send-first-message-too-soon.patch
+mfd-twl4030-audio-Fix-sibling-node-lookup.patch
+mfd-twl6040-Fix-child-node-lookup.patch
+ALSA-rawmidi-Avoid-racy-info-ioctl-via-ctl-device.patch
+ALSA-hda-realtek-Fix-Dell-AIO-LineOut-issue.patch
+ALSA-hda-Add-vendor-id-for-Cannonlake-HDMI-codec.patch
+ALSA-usb-audio-Add-native-DSD-support-for-Esoteric-D.patch
+ALSA-usb-audio-Fix-the-missing-ctl-name-suffix-at-pa.patch
+PCI-PM-Force-devices-to-D0-in-pci_pm_thaw_noirq.patch
+block-unalign-call_single_data-in-struct-request.patch
+block-throttle-avoid-double-charge.patch
+parisc-Align-os_hpmc_size-on-word-boundary.patch
+parisc-Fix-indenting-in-puts.patch
+parisc-Hide-Diva-built-in-serial-aux-and-graphics-ca.patch
+Revert-parisc-Re-enable-interrupts-early.patch
+spi-xilinx-Detect-stall-with-Unknown-commands.patch
+spi-a3700-Fix-clk-prescaling-for-coefficient-over-15.patch
+pinctrl-cherryview-Mask-all-interrupts-on-Intel_Stra.patch
+arm64-kvm-Prevent-restoring-stale-PMSCR_EL1-for-vcpu.patch
+KVM-arm-arm64-Fix-HYP-unmapping-going-off-limits.patch
+KVM-PPC-Book3S-fix-XIVE-migration-of-pending-interru.patch
+KVM-PPC-Book3S-HV-Fix-pending_pri-value-in-kvmppc_xi.patch
+KVM-MMU-Fix-infinite-loop-when-there-is-no-available.patch
+KVM-X86-Fix-load-RFLAGS-w-o-the-fixed-bit.patch
+kvm-x86-fix-RSM-when-PCID-is-non-zero.patch
+clk-sunxi-sun9i-mmc-Implement-reset-callback-for-res.patch
+powerpc-perf-Dereference-BHRB-entries-safely.patch
+drm-i915-Flush-pending-GTT-writes-before-unbinding.patch
+drm-sun4i-Fix-error-path-handling.patch
+libnvdimm-dax-fix-1GB-aligned-namespaces-vs-physical.patch
+libnvdimm-btt-Fix-an-incompatibility-in-the-log-layo.patch
+libnvdimm-pfn-fix-start_pad-handling-for-aligned-nam.patch
+tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch
+tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch
+tcp-detect-malicious-patterns-in-tcp_collapse_ofo_qu.patch
+tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch
+tcp-add-tcp_ooo_try_coalesce-helper.patch
+net-mvneta-clear-interface-link-status-on-port-disab.patch
+net-mvneta-use-proper-rxq_number-in-loop-on-rx-queue.patch
+net-mvneta-eliminate-wrong-call-to-handle-rx-descrip.patch
diff --git a/queue/spi-a3700-Fix-clk-prescaling-for-coefficient-over-15.patch b/queue/spi-a3700-Fix-clk-prescaling-for-coefficient-over-15.patch
new file mode 100644
index 0000000..d72cb10
--- /dev/null
+++ b/queue/spi-a3700-Fix-clk-prescaling-for-coefficient-over-15.patch
@@ -0,0 +1,50 @@
+From 251c201bf4f8b5bf4f1ccb4f8920eed2e1f57580 Mon Sep 17 00:00:00 2001
+From: Maxime Chevallier <maxime.chevallier@smile.fr>
+Date: Mon, 27 Nov 2017 15:16:32 +0100
+Subject: [PATCH] spi: a3700: Fix clk prescaling for coefficient over 15
+
+commit 251c201bf4f8b5bf4f1ccb4f8920eed2e1f57580 upstream.
+
+The Armada 3700 SPI controller has 2 ranges of prescaler coefficients.
+One ranging from 0 to 15 by steps of 1, and one ranging from 0 to 30 by
+steps of 2.
+
+This commit fixes the prescaler coefficients that are over 15 so that
+they use the correct range of values. The prescaling coefficient is
+rounded to the upper value if it is odd.
+
+This was tested on Espressobin with spidev and a logic analyser.
+
+Signed-off-by: Maxime Chevallier <maxime.chevallier@smile.fr>
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Cc: stable@vger.kernel.org
+
+diff --git a/drivers/spi/spi-armada-3700.c b/drivers/spi/spi-armada-3700.c
+index 77fe55ce790c..d65345312527 100644
+--- a/drivers/spi/spi-armada-3700.c
++++ b/drivers/spi/spi-armada-3700.c
+@@ -79,6 +79,7 @@
+ #define A3700_SPI_BYTE_LEN		BIT(5)
+ #define A3700_SPI_CLK_PRESCALE		BIT(0)
+ #define A3700_SPI_CLK_PRESCALE_MASK	(0x1f)
++#define A3700_SPI_CLK_EVEN_OFFS		(0x10)
+
+ #define A3700_SPI_WFIFO_THRS_BIT	28
+ #define A3700_SPI_RFIFO_THRS_BIT	24
+@@ -220,6 +221,13 @@ static void a3700_spi_clock_set(struct a3700_spi *a3700_spi,
+
+ 	prescale = DIV_ROUND_UP(clk_get_rate(a3700_spi->clk), speed_hz);
+
++	/* For prescaler values over 15, we can only set it by steps of 2.
++	 * Starting from A3700_SPI_CLK_EVEN_OFFS, we set values from 0 up to
++	 * 30. We only use this range from 16 to 30.
++	 */
++	if (prescale > 15)
++		prescale = A3700_SPI_CLK_EVEN_OFFS + DIV_ROUND_UP(prescale, 2);
++
+ 	val = spireg_read(a3700_spi, A3700_SPI_IF_CFG_REG);
+ 	val = val & ~A3700_SPI_CLK_PRESCALE_MASK;
+
+--
+2.15.0
+
diff --git a/queue/spi-xilinx-Detect-stall-with-Unknown-commands.patch b/queue/spi-xilinx-Detect-stall-with-Unknown-commands.patch
new file mode 100644
index 0000000..0f8b594
--- /dev/null
+++ b/queue/spi-xilinx-Detect-stall-with-Unknown-commands.patch
@@ -0,0 +1,65 @@
+From 5a1314fa697fc65cefaba64cd4699bfc3e6882a6 Mon Sep 17 00:00:00 2001
+From: Ricardo Ribalda <ricardo.ribalda@gmail.com>
+Date: Tue, 21 Nov 2017 10:09:02 +0100
+Subject: [PATCH] spi: xilinx: Detect stall with Unknown commands
+
+commit 5a1314fa697fc65cefaba64cd4699bfc3e6882a6 upstream.
+
+When the core is configured in C_SPI_MODE > 0, it integrates a
+lookup table that automatically configures the core in dual or quad mode
+based on the command (first byte on the tx fifo).
+
+Unfortunately, that list mode_?_memoy_*.mif does not contain all the
+commands supported by the flash.
+
+Since 4.14 spi-nor automatically tries to probe the flash using SFDP
+(command 0x5a), and that command is not part of the list_mode table.
+
+With the right combination of C_SPI_MODE and C_SPI_MEMORY this leads
+to a stall that can only be recovered with a soft reset.
+
+This patch detects this kind of stall and returns -EIO to the caller on
+those commands. spi-nor can handle this error properly:
+
+m25p80 spi0.0: Detected stall. Check C_SPI_MODE and C_SPI_MEMORY. 0x21 0x2404
+m25p80 spi0.0: SPI transfer failed: -5
+spi_master spi0: failed to transfer one message from queue
+m25p80 spi0.0: s25sl064p (8192 Kbytes)
+
+Signed-off-by: Ricardo Ribalda Delgado <ricardo.ribalda@gmail.com>
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Cc: stable@vger.kernel.org
+
+diff --git a/drivers/spi/spi-xilinx.c b/drivers/spi/spi-xilinx.c
+index bc7100b93dfc..e0b9fe1d0e37 100644
+--- a/drivers/spi/spi-xilinx.c
++++ b/drivers/spi/spi-xilinx.c
+@@ -271,6 +271,7 @@ static int xilinx_spi_txrx_bufs(struct spi_device *spi, struct spi_transfer *t)
+ 	while (remaining_words) {
+ 		int n_words, tx_words, rx_words;
+ 		u32 sr;
++		int stalled;
+
+ 		n_words = min(remaining_words, xspi->buffer_size);
+
+@@ -299,7 +300,17 @@ static int xilinx_spi_txrx_bufs(struct spi_device *spi, struct spi_transfer *t)
+
+ 		/* Read out all the data from the Rx FIFO */
+ 		rx_words = n_words;
++		stalled = 10;
+ 		while (rx_words) {
++			if (rx_words == n_words && !(stalled--) &&
++			    !(sr & XSPI_SR_TX_EMPTY_MASK) &&
++			    (sr & XSPI_SR_RX_EMPTY_MASK)) {
++				dev_err(&spi->dev,
++					"Detected stall. Check C_SPI_MODE and C_SPI_MEMORY\n");
++				xspi_init_hw(xspi);
++				return -EIO;
++			}
++
+ 			if ((sr & XSPI_SR_TX_EMPTY_MASK) && (rx_words > 1)) {
+ 				xilinx_spi_rx(xspi);
+ 				rx_words--;
+--
+2.15.0
+
diff --git a/queue/tcp-add-tcp_ooo_try_coalesce-helper.patch b/queue/tcp-add-tcp_ooo_try_coalesce-helper.patch
new file mode 100644
index 0000000..acf8a19
--- /dev/null
+++ b/queue/tcp-add-tcp_ooo_try_coalesce-helper.patch
@@ -0,0 +1,73 @@
+From 58152ecbbcc6a0ce7fddd5bf5f6ee535834ece0c Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Mon, 23 Jul 2018 09:28:21 -0700
+Subject: [PATCH] tcp: add tcp_ooo_try_coalesce() helper
+
+commit 58152ecbbcc6a0ce7fddd5bf5f6ee535834ece0c upstream.
+
+In case an skb in the out_of_order_queue is the result of
+coalescing multiple skbs, we would like to get proper gso_segs
+counter tracking, so that future tcp_drop() can report an accurate
+number.
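
A rough model of the accounting the new helper performs, as a user-space
sketch rather than kernel code (skb_shinfo()/gso_segs are modeled by plain
integers here): a non-GSO skb counts as one segment, and the 16-bit counter
saturates instead of wrapping.

  #include <stdint.h>

  /* Hypothetical stand-in for skb_shinfo(skb)->gso_segs bookkeeping. */
  static uint16_t coalesced_gso_segs(uint16_t to_segs, uint16_t from_segs)
  {
          /* A non-GSO skb (gso_segs == 0) still represents one segment. */
          uint32_t segs = (to_segs ? to_segs : 1) + (from_segs ? from_segs : 1);

          /* gso_segs is a u16 in struct skb_shared_info, so clamp at 0xFFFF. */
          return segs > 0xFFFF ? 0xFFFF : (uint16_t)segs;
  }

The max_t()/min_t() pair in the hunk below implements exactly this
count-at-least-one-then-saturate logic.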
+
+I chose to not implement this tracking for skbs in the receive queue,
+since they are not dropped unless the socket is disconnected.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+Acked-by: Yuchung Cheng <ycheng@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index b062a7692238..3bcd30a2ba06 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4358,6 +4358,23 @@ static bool tcp_try_coalesce(struct sock *sk,
+ 	return true;
+ }
+
++static bool tcp_ooo_try_coalesce(struct sock *sk,
++			     struct sk_buff *to,
++			     struct sk_buff *from,
++			     bool *fragstolen)
++{
++	bool res = tcp_try_coalesce(sk, to, from, fragstolen);
++
++	/* In case tcp_drop() is called later, update to->gso_segs */
++	if (res) {
++		u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
++			       max_t(u16, 1, skb_shinfo(from)->gso_segs);
++
++		skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
++	}
++	return res;
++}
++
+ static void tcp_drop(struct sock *sk, struct sk_buff *skb)
+ {
+ 	sk_drops_add(sk, skb);
+@@ -4481,8 +4498,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ 	/* In the typical case, we are adding an skb to the end of the list.
+ 	 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
+ 	 */
+-	if (tcp_try_coalesce(sk, tp->ooo_last_skb,
+-			     skb, &fragstolen)) {
++	if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
++				 skb, &fragstolen)) {
+ coalesce_done:
+ 		tcp_grow_window(sk, skb);
+ 		kfree_skb_partial(skb, fragstolen);
+@@ -4532,8 +4549,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ 			tcp_drop(sk, skb1);
+ 			goto merge_right;
+ 		}
+-	} else if (tcp_try_coalesce(sk, skb1,
+-				    skb, &fragstolen)) {
++	} else if (tcp_ooo_try_coalesce(sk, skb1,
++					skb, &fragstolen)) {
+ 		goto coalesce_done;
+ 	}
+ 	p = &parent->rb_right;
+--
+2.15.0
+
diff --git a/queue/tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch b/queue/tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch
new file mode 100644
index 0000000..9ecea52
--- /dev/null
+++ b/queue/tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch
@@ -0,0 +1,45 @@
+From f4a3313d8e2ca9fd8d8f45e40a2903ba782607e7 Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Mon, 23 Jul 2018 09:28:18 -0700
+Subject: [PATCH] tcp: avoid collapses in tcp_prune_queue() if possible
+
+commit f4a3313d8e2ca9fd8d8f45e40a2903ba782607e7 upstream.
+
+Right after a TCP flow is created, receiving tiny out of order
+packets always hit the condition:
+
+if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+	tcp_clamp_window(sk);
+
+tcp_clamp_window() increases sk_rcvbuf to match sk_rmem_alloc
+(guarded by tcp_rmem[2])
+
+Calling tcp_collapse_ofo_queue() in this case is not useful,
+and offers an O(N^2) surface attack to malicious peers.
+
+Better not attempt anything before full queue capacity is reached,
+forcing the attacker to spend lots of resources and allowing us to more
+easily detect the abuse.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+Acked-by: Yuchung Cheng <ycheng@google.com>
+Signed-off-by: David S.
Miller <davem@davemloft.net> + +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 64e45b279431..53289911362a 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -5004,6 +5004,9 @@ static int tcp_prune_queue(struct sock *sk) + else if (tcp_under_memory_pressure(sk)) + tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); + ++ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) ++ return 0; ++ + tcp_collapse_ofo_queue(sk); + if (!skb_queue_empty(&sk->sk_receive_queue)) + tcp_collapse(sk, &sk->sk_receive_queue, NULL, +-- +2.15.0 + diff --git a/queue/tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch b/queue/tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch new file mode 100644 index 0000000..570a59e --- /dev/null +++ b/queue/tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch @@ -0,0 +1,41 @@ +From 8541b21e781a22dce52a74fef0b9bed00404a1cd Mon Sep 17 00:00:00 2001 +From: Eric Dumazet <edumazet@google.com> +Date: Mon, 23 Jul 2018 09:28:20 -0700 +Subject: [PATCH] tcp: call tcp_drop() from tcp_data_queue_ofo() + +commit 8541b21e781a22dce52a74fef0b9bed00404a1cd upstream. + +In order to be able to give better diagnostics and detect +malicious traffic, we need to have better sk->sk_drops tracking. + +Fixes: 9f5afeae5152 ("tcp: use an RB tree for ooo receive queue") +Signed-off-by: Eric Dumazet <edumazet@google.com> +Acked-by: Soheil Hassas Yeganeh <soheil@google.com> +Acked-by: Yuchung Cheng <ycheng@google.com> +Signed-off-by: David S. Miller <davem@davemloft.net> + +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 78068b902e7b..b062a7692238 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4510,7 +4510,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) + /* All the bits are present. Drop. */ + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); +- __kfree_skb(skb); ++ tcp_drop(sk, skb); + skb = NULL; + tcp_dsack_set(sk, seq, end_seq); + goto add_sack; +@@ -4529,7 +4529,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb1)->end_seq); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); +- __kfree_skb(skb1); ++ tcp_drop(sk, skb1); + goto merge_right; + } + } else if (tcp_try_coalesce(sk, skb1, +-- +2.15.0 + diff --git a/queue/tcp-detect-malicious-patterns-in-tcp_collapse_ofo_qu.patch b/queue/tcp-detect-malicious-patterns-in-tcp_collapse_ofo_qu.patch new file mode 100644 index 0000000..e612f61 --- /dev/null +++ b/queue/tcp-detect-malicious-patterns-in-tcp_collapse_ofo_qu.patch @@ -0,0 +1,71 @@ +From 3d4bf93ac12003f9b8e1e2de37fe27983deebdcf Mon Sep 17 00:00:00 2001 +From: Eric Dumazet <edumazet@google.com> +Date: Mon, 23 Jul 2018 09:28:19 -0700 +Subject: [PATCH] tcp: detect malicious patterns in tcp_collapse_ofo_queue() + +commit 3d4bf93ac12003f9b8e1e2de37fe27983deebdcf upstream. + +In case an attacker feeds tiny packets completely out of order, +tcp_collapse_ofo_queue() might scan the whole rb-tree, performing +expensive copies, but not changing socket memory usage at all. + +1) Do not attempt to collapse tiny skbs. +2) Add logic to exit early when too many tiny skbs are detected. + +We prefer not doing aggressive collapsing (which copies packets) +for pathological flows, and revert to tcp_prune_ofo_queue() which +will be less expensive. + +In the future, we might add the possibility of terminating flows +that are proven to be malicious. + +Signed-off-by: Eric Dumazet <edumazet@google.com> +Acked-by: Soheil Hassas Yeganeh <soheil@google.com> +Signed-off-by: David S. 
Miller <davem@davemloft.net>
+
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 53289911362a..78068b902e7b 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4902,6 +4902,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
+ static void tcp_collapse_ofo_queue(struct sock *sk)
+ {
+ 	struct tcp_sock *tp = tcp_sk(sk);
++	u32 range_truesize, sum_tiny = 0;
+ 	struct sk_buff *skb, *head;
+ 	u32 start, end;
+
+@@ -4913,6 +4914,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
+ 	}
+ 	start = TCP_SKB_CB(skb)->seq;
+ 	end = TCP_SKB_CB(skb)->end_seq;
++	range_truesize = skb->truesize;
+
+ 	for (head = skb;;) {
+ 		skb = skb_rb_next(skb);
+
+ 		/* Range is terminated when we see a gap or when
+ 		 * we are at the queue end.
+ 		 */
+ 		if (!skb ||
+ 		    after(TCP_SKB_CB(skb)->seq, end) ||
+ 		    before(TCP_SKB_CB(skb)->end_seq, start)) {
+-			tcp_collapse(sk, NULL, &tp->out_of_order_queue,
+-				     head, skb, start, end);
++			/* Do not attempt collapsing tiny skbs */
++			if (range_truesize != head->truesize ||
++			    end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
++				tcp_collapse(sk, NULL, &tp->out_of_order_queue,
++					     head, skb, start, end);
++			} else {
++				sum_tiny += range_truesize;
++				if (sum_tiny > sk->sk_rcvbuf >> 3)
++					return;
++			}
+ 			goto new_range;
+ 		}
+
++		range_truesize += skb->truesize;
+ 		if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
+ 			start = TCP_SKB_CB(skb)->seq;
+ 		if (after(TCP_SKB_CB(skb)->end_seq, end))
+--
+2.15.0
+
diff --git a/queue/tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch b/queue/tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch
new file mode 100644
index 0000000..8ae006c
--- /dev/null
+++ b/queue/tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch
@@ -0,0 +1,75 @@
+From 72cd43ba64fc172a443410ce01645895850844c8 Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Mon, 23 Jul 2018 09:28:17 -0700
+Subject: [PATCH] tcp: free batches of packets in tcp_prune_ofo_queue()
+
+commit 72cd43ba64fc172a443410ce01645895850844c8 upstream.
+
+Juha-Matti Tilli reported that malicious peers could inject tiny
+packets in out_of_order_queue, forcing very expensive calls
+to tcp_collapse_ofo_queue() and tcp_prune_ofo_queue() for
+every incoming packet. The out_of_order_queue rb-tree can contain
+thousands of nodes, and iterating over all of them is not nice.
+
+Before linux-4.9, we would have pruned all packets in ofo_queue
+in one go, every XXXX packets. XXXX depends on sk_rcvbuf and skbs
+truesize, but is about 7000 packets with tcp_rmem[2] default of 6 MB.
+
+Since we plan to increase tcp_rmem[2] in the future to cope with
+modern BDP, we cannot revert to the old behavior without great pain.
+
+The strategy taken in this patch is to purge ~12.5 % of the queue capacity.
+
+Fixes: 36a6503fedda ("tcp: refine tcp_prune_ofo_queue() to not drop all packets")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: Juha-Matti Tilli <juha-matti.tilli@iki.fi>
+Acked-by: Yuchung Cheng <ycheng@google.com>
+Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 6bade06aaf72..64e45b279431 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4942,6 +4942,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
+  * 2) not add too big latencies if thousands of packets sit there.
+ * (But if application shrinks SO_RCVBUF, we could still end up + * freeing whole queue here) ++ * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks. + * + * Return true if queue has shrunk. + */ +@@ -4949,20 +4950,26 @@ static bool tcp_prune_ofo_queue(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); + struct rb_node *node, *prev; ++ int goal; + + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) + return false; + + NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); ++ goal = sk->sk_rcvbuf >> 3; + node = &tp->ooo_last_skb->rbnode; + do { + prev = rb_prev(node); + rb_erase(node, &tp->out_of_order_queue); ++ goal -= rb_to_skb(node)->truesize; + tcp_drop(sk, rb_to_skb(node)); +- sk_mem_reclaim(sk); +- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && +- !tcp_under_memory_pressure(sk)) +- break; ++ if (!prev || goal <= 0) { ++ sk_mem_reclaim(sk); ++ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && ++ !tcp_under_memory_pressure(sk)) ++ break; ++ goal = sk->sk_rcvbuf >> 3; ++ } + node = prev; + } while (node); + tp->ooo_last_skb = rb_to_skb(prev); +-- +2.15.0 + diff --git a/queue/tools-headers-Sync-objtool-UAPI-header.patch b/queue/tools-headers-Sync-objtool-UAPI-header.patch new file mode 100644 index 0000000..6d21bef --- /dev/null +++ b/queue/tools-headers-Sync-objtool-UAPI-header.patch @@ -0,0 +1,46 @@ +From a356d2ae50790f49858ebed35da9e206336fafee Mon Sep 17 00:00:00 2001 +From: Ingo Molnar <mingo@kernel.org> +Date: Tue, 14 Nov 2017 07:24:22 +0100 +Subject: [PATCH] tools/headers: Sync objtool UAPI header + +commit a356d2ae50790f49858ebed35da9e206336fafee upstream. + +objtool grew this new warning: + + Warning: synced file at 'tools/objtool/arch/x86/include/asm/inat.h' differs from latest kernel version at 'arch/x86/include/asm/inat.h' + +which upstream header grew new INAT_SEG_* definitions. + +Sync up the tooling version of the header. 
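
For context on why the synced header matters: x86 instructions can carry a
segment-override prefix byte, and decoder helpers map that byte to one of the
INAT_SEG_REG_* identifiers shown in the hunk below. The following is a hedged
sketch of that mapping, not a copy of the kernel helper; the prefix byte
values come from the x86 instruction encoding, and the enum merely mirrors
the new identifiers so the snippet is self-contained.

  /* Values mirror the INAT_SEG_REG_* identifiers added in the hunk below. */
  enum seg_reg { SEG_IGNORE = 0, SEG_DEFAULT = 1, SEG_CS = 2, SEG_SS = 3,
                 SEG_DS = 4, SEG_ES = 5, SEG_FS = 6, SEG_GS = 7 };

  /* Map an x86 segment-override prefix byte to a segment identifier. */
  static enum seg_reg seg_from_prefix(unsigned char prefix)
  {
          switch (prefix) {
          case 0x2e: return SEG_CS;
          case 0x36: return SEG_SS;
          case 0x3e: return SEG_DS;
          case 0x26: return SEG_ES;
          case 0x64: return SEG_FS;
          case 0x65: return SEG_GS;
          default:   return SEG_DEFAULT;
          }
  }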
+
+Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+
+diff --git a/tools/objtool/arch/x86/include/asm/inat.h b/tools/objtool/arch/x86/include/asm/inat.h
+index 02aff0867211..1c78580e58be 100644
+--- a/tools/objtool/arch/x86/include/asm/inat.h
++++ b/tools/objtool/arch/x86/include/asm/inat.h
+@@ -97,6 +97,16 @@
+ #define INAT_MAKE_GROUP(grp)	((grp << INAT_GRP_OFFS) | INAT_MODRM)
+ #define INAT_MAKE_IMM(imm)	(imm << INAT_IMM_OFFS)
+
++/* Identifiers for segment registers */
++#define INAT_SEG_REG_IGNORE	0
++#define INAT_SEG_REG_DEFAULT	1
++#define INAT_SEG_REG_CS		2
++#define INAT_SEG_REG_SS		3
++#define INAT_SEG_REG_DS		4
++#define INAT_SEG_REG_ES		5
++#define INAT_SEG_REG_FS		6
++#define INAT_SEG_REG_GS		7
++
+ /* Attribute search APIs */
+ extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode);
+ extern int inat_get_last_prefix_id(insn_byte_t last_pfx);
+--
+2.15.0
+
diff --git a/queue/x86-Kconfig-Limit-NR_CPUS-on-32-bit-to-a-sane-amount.patch b/queue/x86-Kconfig-Limit-NR_CPUS-on-32-bit-to-a-sane-amount.patch
new file mode 100644
index 0000000..6315bcc
--- /dev/null
+++ b/queue/x86-Kconfig-Limit-NR_CPUS-on-32-bit-to-a-sane-amount.patch
@@ -0,0 +1,45 @@
+From 7bbcbd3d1cdcbacd0f9f8dc4c98d550972f1ca30 Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed, 20 Dec 2017 18:02:34 +0100
+Subject: [PATCH] x86/Kconfig: Limit NR_CPUS on 32-bit to a sane amount
+
+commit 7bbcbd3d1cdcbacd0f9f8dc4c98d550972f1ca30 upstream.
+
+The recent cpu_entry_area changes fail to compile on 32-bit when BIGSMP=y
+and NR_CPUS=512, because the fixmap area becomes too big.
+
+Limit the number of CPUs with BIGSMP to 64, which is already way too big
+for 32-bit, but it's at least a working limitation.
+
+We performed a quick survey of 32-bit-only machines that might be affected
+by this change negatively, but found none.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: H.
Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 665eba1b6103..cd5199de231e 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -925,7 +925,8 @@ config MAXSMP + config NR_CPUS + int "Maximum number of CPUs" if SMP && !MAXSMP + range 2 8 if SMP && X86_32 && !X86_BIGSMP +- range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK ++ range 2 64 if SMP && X86_32 && X86_BIGSMP ++ range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK && X86_64 + range 2 8192 if SMP && !MAXSMP && CPUMASK_OFFSTACK && X86_64 + default "1" if !SMP + default "8192" if MAXSMP +-- +2.15.0 + diff --git a/queue/x86-cpu_entry_area-Move-it-out-of-the-fixmap.patch b/queue/x86-cpu_entry_area-Move-it-out-of-the-fixmap.patch new file mode 100644 index 0000000..d1454fe --- /dev/null +++ b/queue/x86-cpu_entry_area-Move-it-out-of-the-fixmap.patch @@ -0,0 +1,562 @@ +From 92a0f81d89571e3e8759366e050ee05cc545ef99 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Wed, 20 Dec 2017 18:51:31 +0100 +Subject: [PATCH] x86/cpu_entry_area: Move it out of the fixmap + +commit 92a0f81d89571e3e8759366e050ee05cc545ef99 upstream. + +Put the cpu_entry_area into a separate P4D entry. The fixmap gets too big +and 0-day already hit a case where the fixmap PTEs were cleared by +cleanup_highmap(). + +Aside of that the fixmap API is a pain as it's all backwards. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: linux-kernel@vger.kernel.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt +index 63a41671d25b..51101708a03a 100644 +--- a/Documentation/x86/x86_64/mm.txt ++++ b/Documentation/x86/x86_64/mm.txt +@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) + ... unused hole ... + ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB) + ... unused hole ... ++fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping + ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks + ... unused hole ... + ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space +@@ -35,6 +36,7 @@ ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) + ... unused hole ... + ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB) + ... unused hole ... ++fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping + ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks + ... unused hole ... 
+ ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space +diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h +index 5471826803af..2fbc69a0916e 100644 +--- a/arch/x86/include/asm/cpu_entry_area.h ++++ b/arch/x86/include/asm/cpu_entry_area.h +@@ -43,10 +43,26 @@ struct cpu_entry_area { + }; + + #define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) +-#define CPU_ENTRY_AREA_PAGES (CPU_ENTRY_AREA_SIZE / PAGE_SIZE) ++#define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS) + + DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); + + extern void setup_cpu_entry_areas(void); ++extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags); ++ ++#define CPU_ENTRY_AREA_RO_IDT CPU_ENTRY_AREA_BASE ++#define CPU_ENTRY_AREA_PER_CPU (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE) ++ ++#define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT) ++ ++#define CPU_ENTRY_AREA_MAP_SIZE \ ++ (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE) ++ ++extern struct cpu_entry_area *get_cpu_entry_area(int cpu); ++ ++static inline struct entry_stack *cpu_entry_stack(int cpu) ++{ ++ return &get_cpu_entry_area(cpu)->entry_stack_page.stack; ++} + + #endif +diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h +index 2ace1f90d138..bc359dd2f7f6 100644 +--- a/arch/x86/include/asm/desc.h ++++ b/arch/x86/include/asm/desc.h +@@ -7,6 +7,7 @@ + #include <asm/mmu.h> + #include <asm/fixmap.h> + #include <asm/irq_vectors.h> ++#include <asm/cpu_entry_area.h> + + #include <linux/smp.h> + #include <linux/percpu.h> +diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h +index fb801662a230..64c4a30e0d39 100644 +--- a/arch/x86/include/asm/fixmap.h ++++ b/arch/x86/include/asm/fixmap.h +@@ -25,7 +25,6 @@ + #else + #include <uapi/asm/vsyscall.h> + #endif +-#include <asm/cpu_entry_area.h> + + /* + * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall +@@ -84,7 +83,6 @@ enum fixed_addresses { + FIX_IO_APIC_BASE_0, + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, + #endif +- FIX_RO_IDT, /* Virtual mapping for read-only IDT */ + #ifdef CONFIG_X86_32 + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ + FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, +@@ -100,9 +98,6 @@ enum fixed_addresses { + #ifdef CONFIG_X86_INTEL_MID + FIX_LNW_VRTC, + #endif +- /* Fixmap entries to remap the GDTs, one per processor. 
*/ +- FIX_CPU_ENTRY_AREA_TOP, +- FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1, + + #ifdef CONFIG_ACPI_APEI_GHES + /* Used for GHES mapping from assorted contexts */ +@@ -143,7 +138,7 @@ enum fixed_addresses { + extern void reserve_top_address(unsigned long reserve); + + #define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) +-#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) ++#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) + + extern int fixmaps_set; + +@@ -191,30 +186,5 @@ void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, + void __early_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags); + +-static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page) +-{ +- BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); +- +- return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page; +-} +- +-#define __get_cpu_entry_area_offset_index(cpu, offset) ({ \ +- BUILD_BUG_ON(offset % PAGE_SIZE != 0); \ +- __get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE); \ +- }) +- +-#define get_cpu_entry_area_index(cpu, field) \ +- __get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field)) +- +-static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) +-{ +- return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0)); +-} +- +-static inline struct entry_stack *cpu_entry_stack(int cpu) +-{ +- return &get_cpu_entry_area(cpu)->entry_stack_page.stack; +-} +- + #endif /* !__ASSEMBLY__ */ + #endif /* _ASM_X86_FIXMAP_H */ +diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h +index f2ca9b28fd68..ce245b0cdfca 100644 +--- a/arch/x86/include/asm/pgtable_32_types.h ++++ b/arch/x86/include/asm/pgtable_32_types.h +@@ -38,13 +38,22 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ + #define LAST_PKMAP 1024 + #endif + +-#define PKMAP_BASE ((FIXADDR_START - PAGE_SIZE * (LAST_PKMAP + 1)) \ +- & PMD_MASK) ++/* ++ * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c ++ * to avoid include recursion hell ++ */ ++#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 40) ++ ++#define CPU_ENTRY_AREA_BASE \ ++ ((FIXADDR_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) & PMD_MASK) ++ ++#define PKMAP_BASE \ ++ ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK) + + #ifdef CONFIG_HIGHMEM + # define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE) + #else +-# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) ++# define VMALLOC_END (CPU_ENTRY_AREA_BASE - 2 * PAGE_SIZE) + #endif + + #define MODULES_VADDR VMALLOC_START +diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h +index 6d5f45dcd4a1..3d27831bc58d 100644 +--- a/arch/x86/include/asm/pgtable_64_types.h ++++ b/arch/x86/include/asm/pgtable_64_types.h +@@ -76,32 +76,41 @@ typedef struct { pteval_t pte; } pte_t; + #define PGDIR_MASK (~(PGDIR_SIZE - 1)) + + /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. 
*/ +-#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) ++#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) ++ + #ifdef CONFIG_X86_5LEVEL +-#define VMALLOC_SIZE_TB _AC(16384, UL) +-#define __VMALLOC_BASE _AC(0xff92000000000000, UL) +-#define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) ++# define VMALLOC_SIZE_TB _AC(16384, UL) ++# define __VMALLOC_BASE _AC(0xff92000000000000, UL) ++# define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) + #else +-#define VMALLOC_SIZE_TB _AC(32, UL) +-#define __VMALLOC_BASE _AC(0xffffc90000000000, UL) +-#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) ++# define VMALLOC_SIZE_TB _AC(32, UL) ++# define __VMALLOC_BASE _AC(0xffffc90000000000, UL) ++# define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) + #endif ++ + #ifdef CONFIG_RANDOMIZE_MEMORY +-#define VMALLOC_START vmalloc_base +-#define VMEMMAP_START vmemmap_base ++# define VMALLOC_START vmalloc_base ++# define VMEMMAP_START vmemmap_base + #else +-#define VMALLOC_START __VMALLOC_BASE +-#define VMEMMAP_START __VMEMMAP_BASE ++# define VMALLOC_START __VMALLOC_BASE ++# define VMEMMAP_START __VMEMMAP_BASE + #endif /* CONFIG_RANDOMIZE_MEMORY */ +-#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) +-#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) ++ ++#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) ++ ++#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) + /* The module sections ends with the start of the fixmap */ +-#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1) +-#define MODULES_LEN (MODULES_END - MODULES_VADDR) +-#define ESPFIX_PGD_ENTRY _AC(-2, UL) +-#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT) +-#define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) +-#define EFI_VA_END (-68 * (_AC(1, UL) << 30)) ++#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1) ++#define MODULES_LEN (MODULES_END - MODULES_VADDR) ++ ++#define ESPFIX_PGD_ENTRY _AC(-2, UL) ++#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT) ++ ++#define CPU_ENTRY_AREA_PGD _AC(-3, UL) ++#define CPU_ENTRY_AREA_BASE (CPU_ENTRY_AREA_PGD << P4D_SHIFT) ++ ++#define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) ++#define EFI_VA_END (-68 * (_AC(1, UL) << 30)) + + #define EARLY_DYNAMIC_PAGE_TABLES 64 + +diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c +index 1dd3f533d78c..36b17e0febe8 100644 +--- a/arch/x86/kernel/dumpstack.c ++++ b/arch/x86/kernel/dumpstack.c +@@ -18,6 +18,7 @@ + #include <linux/nmi.h> + #include <linux/sysfs.h> + ++#include <asm/cpu_entry_area.h> + #include <asm/stacktrace.h> + #include <asm/unwind.h> + +diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index 464daed6894f..7c16fe0b60c2 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -951,8 +951,9 @@ void __init trap_init(void) + * "sidt" instruction will not leak the location of the kernel, and + * to defend the IDT against arbitrary memory write vulnerabilities. 
+ * It will be reloaded in cpu_init() */ +- __set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO); +- idt_descr.address = fix_to_virt(FIX_RO_IDT); ++ cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table), ++ PAGE_KERNEL_RO); ++ idt_descr.address = CPU_ENTRY_AREA_RO_IDT; + + /* + * Should be a barrier for any external CPU state: +diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c +index 235ff9cfaaf4..21e8b595cbb1 100644 +--- a/arch/x86/mm/cpu_entry_area.c ++++ b/arch/x86/mm/cpu_entry_area.c +@@ -15,11 +15,27 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); + #endif + ++struct cpu_entry_area *get_cpu_entry_area(int cpu) ++{ ++ unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE; ++ BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); ++ ++ return (struct cpu_entry_area *) va; ++} ++EXPORT_SYMBOL(get_cpu_entry_area); ++ ++void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags) ++{ ++ unsigned long va = (unsigned long) cea_vaddr; ++ ++ set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags)); ++} ++ + static void __init +-set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) ++cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) + { +- for ( ; pages; pages--, idx--, ptr += PAGE_SIZE) +- __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot); ++ for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE) ++ cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); + } + + /* Setup the fixmap mappings only once per-processor */ +@@ -47,10 +63,12 @@ static void __init setup_cpu_entry_area(int cpu) + pgprot_t tss_prot = PAGE_KERNEL; + #endif + +- __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); +- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page), +- per_cpu_ptr(&entry_stack_storage, cpu), 1, +- PAGE_KERNEL); ++ cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu), ++ gdt_prot); ++ ++ cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page, ++ per_cpu_ptr(&entry_stack_storage, cpu), 1, ++ PAGE_KERNEL); + + /* + * The Intel SDM says (Volume 3, 7.2.1): +@@ -72,10 +90,9 @@ static void __init setup_cpu_entry_area(int cpu) + BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ + offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); + BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); +- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), +- &per_cpu(cpu_tss_rw, cpu), +- sizeof(struct tss_struct) / PAGE_SIZE, +- tss_prot); ++ cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss, ++ &per_cpu(cpu_tss_rw, cpu), ++ sizeof(struct tss_struct) / PAGE_SIZE, tss_prot); + + #ifdef CONFIG_X86_32 + per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); +@@ -85,20 +102,37 @@ static void __init setup_cpu_entry_area(int cpu) + BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); + BUILD_BUG_ON(sizeof(exception_stacks) != + sizeof(((struct cpu_entry_area *)0)->exception_stacks)); +- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks), +- &per_cpu(exception_stacks, cpu), +- sizeof(exception_stacks) / PAGE_SIZE, +- PAGE_KERNEL); ++ cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks, ++ &per_cpu(exception_stacks, cpu), ++ sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL); + +- __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), ++ cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline, + 
__pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); + #endif + } + ++static __init void setup_cpu_entry_area_ptes(void) ++{ ++#ifdef CONFIG_X86_32 ++ unsigned long start, end; ++ ++ BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE); ++ BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK); ++ ++ start = CPU_ENTRY_AREA_BASE; ++ end = start + CPU_ENTRY_AREA_MAP_SIZE; ++ ++ for (; start < end; start += PMD_SIZE) ++ populate_extra_pte(start); ++#endif ++} ++ + void __init setup_cpu_entry_areas(void) + { + unsigned int cpu; + ++ setup_cpu_entry_area_ptes(); ++ + for_each_possible_cpu(cpu) + setup_cpu_entry_area(cpu); + } +diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c +index fdf09d8f98da..43dedbfb7257 100644 +--- a/arch/x86/mm/dump_pagetables.c ++++ b/arch/x86/mm/dump_pagetables.c +@@ -58,6 +58,7 @@ enum address_markers_idx { + KASAN_SHADOW_START_NR, + KASAN_SHADOW_END_NR, + #endif ++ CPU_ENTRY_AREA_NR, + #ifdef CONFIG_X86_ESPFIX64 + ESPFIX_START_NR, + #endif +@@ -81,6 +82,7 @@ static struct addr_marker address_markers[] = { + [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" }, + [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" }, + #endif ++ [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, + #ifdef CONFIG_X86_ESPFIX64 + [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, + #endif +@@ -104,6 +106,7 @@ enum address_markers_idx { + #ifdef CONFIG_HIGHMEM + PKMAP_BASE_NR, + #endif ++ CPU_ENTRY_AREA_NR, + FIXADDR_START_NR, + END_OF_SPACE_NR, + }; +@@ -116,6 +119,7 @@ static struct addr_marker address_markers[] = { + #ifdef CONFIG_HIGHMEM + [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" }, + #endif ++ [CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" }, + [FIXADDR_START_NR] = { 0UL, "Fixmap area" }, + [END_OF_SPACE_NR] = { -1, NULL } + }; +@@ -541,8 +545,8 @@ static int __init pt_dump_init(void) + address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE; + # endif + address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; ++ address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE; + #endif +- + return 0; + } + __initcall(pt_dump_init); +diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c +index 8a64a6f2848d..135c9a7898c7 100644 +--- a/arch/x86/mm/init_32.c ++++ b/arch/x86/mm/init_32.c +@@ -50,6 +50,7 @@ + #include <asm/setup.h> + #include <asm/set_memory.h> + #include <asm/page_types.h> ++#include <asm/cpu_entry_area.h> + #include <asm/init.h> + + #include "mm_internal.h" +@@ -766,6 +767,7 @@ void __init mem_init(void) + mem_init_print_info(NULL); + printk(KERN_INFO "virtual kernel memory layout:\n" + " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" ++ " cpu_entry : 0x%08lx - 0x%08lx (%4ld kB)\n" + #ifdef CONFIG_HIGHMEM + " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" + #endif +@@ -777,6 +779,10 @@ void __init mem_init(void) + FIXADDR_START, FIXADDR_TOP, + (FIXADDR_TOP - FIXADDR_START) >> 10, + ++ CPU_ENTRY_AREA_BASE, ++ CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE, ++ CPU_ENTRY_AREA_MAP_SIZE >> 10, ++ + #ifdef CONFIG_HIGHMEM + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, + (LAST_PKMAP*PAGE_SIZE) >> 10, +diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c +index 9ec70d780f1f..47388f0c0e59 100644 +--- a/arch/x86/mm/kasan_init_64.c ++++ b/arch/x86/mm/kasan_init_64.c +@@ -15,6 +15,7 @@ + #include <asm/tlbflush.h> + #include <asm/sections.h> + #include <asm/pgtable.h> ++#include <asm/cpu_entry_area.h> + + extern struct range pfn_mapped[E820_MAX_ENTRIES]; + +@@ -322,31 +323,33 @@ void 
__init kasan_init(void) + map_range(&pfn_mapped[i]); + } + +- kasan_populate_zero_shadow( +- kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), +- kasan_mem_to_shadow((void *)__START_KERNEL_map)); +- +- kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), +- (unsigned long)kasan_mem_to_shadow(_end), +- early_pfn_to_nid(__pa(_stext))); +- +- shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM); ++ shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE; + shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin); + shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin, + PAGE_SIZE); + +- shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE); ++ shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE + ++ CPU_ENTRY_AREA_MAP_SIZE); + shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end); + shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end, + PAGE_SIZE); + +- kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), +- shadow_cpu_entry_begin); ++ kasan_populate_zero_shadow( ++ kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), ++ shadow_cpu_entry_begin); + + kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin, + (unsigned long)shadow_cpu_entry_end, 0); + +- kasan_populate_zero_shadow(shadow_cpu_entry_end, (void *)KASAN_SHADOW_END); ++ kasan_populate_zero_shadow(shadow_cpu_entry_end, ++ kasan_mem_to_shadow((void *)__START_KERNEL_map)); ++ ++ kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), ++ (unsigned long)kasan_mem_to_shadow(_end), ++ early_pfn_to_nid(__pa(_stext))); ++ ++ kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), ++ (void *)KASAN_SHADOW_END); + + load_cr3(init_top_pgt); + __flush_tlb_all(); +diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c +index 6b9bf023a700..c3c5274410a9 100644 +--- a/arch/x86/mm/pgtable_32.c ++++ b/arch/x86/mm/pgtable_32.c +@@ -10,6 +10,7 @@ + #include <linux/pagemap.h> + #include <linux/spinlock.h> + ++#include <asm/cpu_entry_area.h> + #include <asm/pgtable.h> + #include <asm/pgalloc.h> + #include <asm/fixmap.h> +diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c +index c2454237fa67..a0e2b8c6e5c7 100644 +--- a/arch/x86/xen/mmu_pv.c ++++ b/arch/x86/xen/mmu_pv.c +@@ -2261,7 +2261,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) + + switch (idx) { + case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: +- case FIX_RO_IDT: + #ifdef CONFIG_X86_32 + case FIX_WP_TEST: + # ifdef CONFIG_HIGHMEM +@@ -2272,7 +2271,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) + #endif + case FIX_TEXT_POKE0: + case FIX_TEXT_POKE1: +- case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM: + /* All local page mappings */ + pte = pfn_pte(phys, prot); + break; +-- +2.15.0 + diff --git a/queue/x86-cpu_entry_area-Move-it-to-a-separate-unit.patch b/queue/x86-cpu_entry_area-Move-it-to-a-separate-unit.patch new file mode 100644 index 0000000..b462e5d --- /dev/null +++ b/queue/x86-cpu_entry_area-Move-it-to-a-separate-unit.patch @@ -0,0 +1,382 @@ +From ed1bbc40a0d10e0c5c74fe7bdc6298295cf40255 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Wed, 20 Dec 2017 18:28:54 +0100 +Subject: [PATCH] x86/cpu_entry_area: Move it to a separate unit + +commit ed1bbc40a0d10e0c5c74fe7bdc6298295cf40255 upstream. + +Separate the cpu_entry_area code out of cpu/common.c and the fixmap. 
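
The layout trick both cpu_entry_area patches in this queue rely on is plain
address arithmetic from a constant virtual base: every CPU gets a fixed-size,
page-multiple slot. A minimal sketch with placeholder constants (the real
base and per-CPU size come from the kernel headers in the patch above, not
from these values):

  #include <stdint.h>

  #define AREA_BASE 0xff000000UL   /* placeholder, not the real CPU_ENTRY_AREA_BASE */
  #define AREA_SIZE (8 * 4096UL)   /* placeholder per-CPU size; a page multiple */

  /* Mirrors the shape of get_cpu_entry_area(): base + cpu * size. */
  static uintptr_t area_for_cpu(unsigned int cpu)
  {
          return (uintptr_t)AREA_BASE + (uintptr_t)cpu * AREA_SIZE;
  }

This is also why the code asserts, via BUILD_BUG_ON(), that
sizeof(struct cpu_entry_area) is an exact multiple of PAGE_SIZE: otherwise
the per-CPU slots would not stay page-aligned.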
+ +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h +new file mode 100644 +index 000000000000..5471826803af +--- /dev/null ++++ b/arch/x86/include/asm/cpu_entry_area.h +@@ -0,0 +1,52 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#ifndef _ASM_X86_CPU_ENTRY_AREA_H ++#define _ASM_X86_CPU_ENTRY_AREA_H ++ ++#include <linux/percpu-defs.h> ++#include <asm/processor.h> ++ ++/* ++ * cpu_entry_area is a percpu region that contains things needed by the CPU ++ * and early entry/exit code. Real types aren't used for all fields here ++ * to avoid circular header dependencies. ++ * ++ * Every field is a virtual alias of some other allocated backing store. ++ * There is no direct allocation of a struct cpu_entry_area. ++ */ ++struct cpu_entry_area { ++ char gdt[PAGE_SIZE]; ++ ++ /* ++ * The GDT is just below entry_stack and thus serves (on x86_64) as ++ * a a read-only guard page. ++ */ ++ struct entry_stack_page entry_stack_page; ++ ++ /* ++ * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because ++ * we need task switches to work, and task switches write to the TSS. ++ */ ++ struct tss_struct tss; ++ ++ char entry_trampoline[PAGE_SIZE]; ++ ++#ifdef CONFIG_X86_64 ++ /* ++ * Exception stacks used for IST entries. ++ * ++ * In the future, this should have a separate slot for each stack ++ * with guard pages between them. ++ */ ++ char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; ++#endif ++}; ++ ++#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) ++#define CPU_ENTRY_AREA_PAGES (CPU_ENTRY_AREA_SIZE / PAGE_SIZE) ++ ++DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); ++ ++extern void setup_cpu_entry_areas(void); ++ ++#endif +diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h +index 8153b8d86a3c..fb801662a230 100644 +--- a/arch/x86/include/asm/fixmap.h ++++ b/arch/x86/include/asm/fixmap.h +@@ -25,6 +25,7 @@ + #else + #include <uapi/asm/vsyscall.h> + #endif ++#include <asm/cpu_entry_area.h> + + /* + * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall +@@ -44,46 +45,6 @@ extern unsigned long __FIXADDR_TOP; + PAGE_SIZE) + #endif + +-/* +- * cpu_entry_area is a percpu region in the fixmap that contains things +- * needed by the CPU and early entry/exit code. Real types aren't used +- * for all fields here to avoid circular header dependencies. +- * +- * Every field is a virtual alias of some other allocated backing store. +- * There is no direct allocation of a struct cpu_entry_area. +- */ +-struct cpu_entry_area { +- char gdt[PAGE_SIZE]; +- +- /* +- * The GDT is just below entry_stack and thus serves (on x86_64) as +- * a a read-only guard page. +- */ +- struct entry_stack_page entry_stack_page; +- +- /* +- * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because +- * we need task switches to work, and task switches write to the TSS. +- */ +- struct tss_struct tss; +- +- char entry_trampoline[PAGE_SIZE]; +- +-#ifdef CONFIG_X86_64 +- /* +- * Exception stacks used for IST entries. 
+- * +- * In the future, this should have a separate slot for each stack +- * with guard pages between them. +- */ +- char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; +-#endif +-}; +- +-#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) +- +-extern void setup_cpu_entry_areas(void); +- + /* + * Here we define all the compile-time 'special' virtual + * addresses. The point is to have a constant address at +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index ed4acbce37a8..8ddcfa4d4165 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -482,102 +482,8 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, + [DEBUG_STACK - 1] = DEBUG_STKSZ + }; +- +-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks +- [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); +-#endif +- +-static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, +- entry_stack_storage); +- +-static void __init +-set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) +-{ +- for ( ; pages; pages--, idx--, ptr += PAGE_SIZE) +- __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot); +-} +- +-/* Setup the fixmap mappings only once per-processor */ +-static void __init setup_cpu_entry_area(int cpu) +-{ +-#ifdef CONFIG_X86_64 +- extern char _entry_trampoline[]; +- +- /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ +- pgprot_t gdt_prot = PAGE_KERNEL_RO; +- pgprot_t tss_prot = PAGE_KERNEL_RO; +-#else +- /* +- * On native 32-bit systems, the GDT cannot be read-only because +- * our double fault handler uses a task gate, and entering through +- * a task gate needs to change an available TSS to busy. If the +- * GDT is read-only, that will triple fault. The TSS cannot be +- * read-only because the CPU writes to it on task switches. +- * +- * On Xen PV, the GDT must be read-only because the hypervisor +- * requires it. +- */ +- pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? +- PAGE_KERNEL_RO : PAGE_KERNEL; +- pgprot_t tss_prot = PAGE_KERNEL; +-#endif +- +- __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); +- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page), +- per_cpu_ptr(&entry_stack_storage, cpu), 1, +- PAGE_KERNEL); +- +- /* +- * The Intel SDM says (Volume 3, 7.2.1): +- * +- * Avoid placing a page boundary in the part of the TSS that the +- * processor reads during a task switch (the first 104 bytes). The +- * processor may not correctly perform address translations if a +- * boundary occurs in this area. During a task switch, the processor +- * reads and writes into the first 104 bytes of each TSS (using +- * contiguous physical addresses beginning with the physical address +- * of the first byte of the TSS). So, after TSS access begins, if +- * part of the 104 bytes is not physically contiguous, the processor +- * will access incorrect information without generating a page-fault +- * exception. +- * +- * There are also a lot of errata involving the TSS spanning a page +- * boundary. Assert that we're not doing that. 
+- */ +- BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ +- offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); +- BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); +- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), +- &per_cpu(cpu_tss_rw, cpu), +- sizeof(struct tss_struct) / PAGE_SIZE, +- tss_prot); +- +-#ifdef CONFIG_X86_32 +- per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); + #endif + +-#ifdef CONFIG_X86_64 +- BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); +- BUILD_BUG_ON(sizeof(exception_stacks) != +- sizeof(((struct cpu_entry_area *)0)->exception_stacks)); +- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks), +- &per_cpu(exception_stacks, cpu), +- sizeof(exception_stacks) / PAGE_SIZE, +- PAGE_KERNEL); +- +- __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), +- __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); +-#endif +-} +- +-void __init setup_cpu_entry_areas(void) +-{ +- unsigned int cpu; +- +- for_each_possible_cpu(cpu) +- setup_cpu_entry_area(cpu); +-} +- + /* Load the original GDT from the per-cpu structure */ + void load_direct_gdt(int cpu) + { +diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c +index 74136fd16f49..464daed6894f 100644 +--- a/arch/x86/kernel/traps.c ++++ b/arch/x86/kernel/traps.c +@@ -52,6 +52,7 @@ + #include <asm/traps.h> + #include <asm/desc.h> + #include <asm/fpu/internal.h> ++#include <asm/cpu_entry_area.h> + #include <asm/mce.h> + #include <asm/fixmap.h> + #include <asm/mach_traps.h> +diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile +index 7ba7f3d7f477..2e0017af8f9b 100644 +--- a/arch/x86/mm/Makefile ++++ b/arch/x86/mm/Makefile +@@ -10,7 +10,7 @@ CFLAGS_REMOVE_mem_encrypt.o = -pg + endif + + obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ +- pat.o pgtable.o physaddr.o setup_nx.o tlb.o ++ pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o + + # Make sure __phys_addr has no stackprotector + nostackp := $(call cc-option, -fno-stack-protector) +diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c +new file mode 100644 +index 000000000000..235ff9cfaaf4 +--- /dev/null ++++ b/arch/x86/mm/cpu_entry_area.c +@@ -0,0 +1,104 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include <linux/spinlock.h> ++#include <linux/percpu.h> ++ ++#include <asm/cpu_entry_area.h> ++#include <asm/pgtable.h> ++#include <asm/fixmap.h> ++#include <asm/desc.h> ++ ++static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage); ++ ++#ifdef CONFIG_X86_64 ++static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks ++ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); ++#endif ++ ++static void __init ++set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) ++{ ++ for ( ; pages; pages--, idx--, ptr += PAGE_SIZE) ++ __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot); ++} ++ ++/* Setup the fixmap mappings only once per-processor */ ++static void __init setup_cpu_entry_area(int cpu) ++{ ++#ifdef CONFIG_X86_64 ++ extern char _entry_trampoline[]; ++ ++ /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ ++ pgprot_t gdt_prot = PAGE_KERNEL_RO; ++ pgprot_t tss_prot = PAGE_KERNEL_RO; ++#else ++ /* ++ * On native 32-bit systems, the GDT cannot be read-only because ++ * our double fault handler uses a task gate, and entering through ++ * a task gate needs to change an available TSS to busy. If the ++ * GDT is read-only, that will triple fault. 
The TSS cannot be ++ * read-only because the CPU writes to it on task switches. ++ * ++ * On Xen PV, the GDT must be read-only because the hypervisor ++ * requires it. ++ */ ++ pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? ++ PAGE_KERNEL_RO : PAGE_KERNEL; ++ pgprot_t tss_prot = PAGE_KERNEL; ++#endif ++ ++ __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); ++ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page), ++ per_cpu_ptr(&entry_stack_storage, cpu), 1, ++ PAGE_KERNEL); ++ ++ /* ++ * The Intel SDM says (Volume 3, 7.2.1): ++ * ++ * Avoid placing a page boundary in the part of the TSS that the ++ * processor reads during a task switch (the first 104 bytes). The ++ * processor may not correctly perform address translations if a ++ * boundary occurs in this area. During a task switch, the processor ++ * reads and writes into the first 104 bytes of each TSS (using ++ * contiguous physical addresses beginning with the physical address ++ * of the first byte of the TSS). So, after TSS access begins, if ++ * part of the 104 bytes is not physically contiguous, the processor ++ * will access incorrect information without generating a page-fault ++ * exception. ++ * ++ * There are also a lot of errata involving the TSS spanning a page ++ * boundary. Assert that we're not doing that. ++ */ ++ BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ ++ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); ++ BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); ++ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), ++ &per_cpu(cpu_tss_rw, cpu), ++ sizeof(struct tss_struct) / PAGE_SIZE, ++ tss_prot); ++ ++#ifdef CONFIG_X86_32 ++ per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); ++#endif ++ ++#ifdef CONFIG_X86_64 ++ BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); ++ BUILD_BUG_ON(sizeof(exception_stacks) != ++ sizeof(((struct cpu_entry_area *)0)->exception_stacks)); ++ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks), ++ &per_cpu(exception_stacks, cpu), ++ sizeof(exception_stacks) / PAGE_SIZE, ++ PAGE_KERNEL); ++ ++ __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), ++ __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); ++#endif ++} ++ ++void __init setup_cpu_entry_areas(void) ++{ ++ unsigned int cpu; ++ ++ for_each_possible_cpu(cpu) ++ setup_cpu_entry_area(cpu); ++} +-- +2.15.0 + diff --git a/queue/x86-cpu_entry_area-Prevent-wraparound-in-setup_cpu_e.patch b/queue/x86-cpu_entry_area-Prevent-wraparound-in-setup_cpu_e.patch new file mode 100644 index 0000000..dccbbf6 --- /dev/null +++ b/queue/x86-cpu_entry_area-Prevent-wraparound-in-setup_cpu_e.patch @@ -0,0 +1,38 @@ +From f6c4fd506cb626e4346aa81688f255e593a7c5a0 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Sat, 23 Dec 2017 19:45:11 +0100 +Subject: [PATCH] x86/cpu_entry_area: Prevent wraparound in + setup_cpu_entry_area_ptes() on 32bit + +commit f6c4fd506cb626e4346aa81688f255e593a7c5a0 upstream. + +The loop which populates the CPU entry area PMDs can wrap around on 32bit +machines when the number of CPUs is small. + +It worked wonderful for NR_CPUS=64 for whatever reason and the moron who +wrote that code did not bother to test it with !SMP. + +Check for the wraparound to fix it. 
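+
+A minimal sketch of the hazard, using illustrative constants in place of
+the real CPU_ENTRY_AREA_BASE, CPU_ENTRY_AREA_MAP_SIZE and PMD_SIZE:
+
+	unsigned long base = 0xff800000UL;	/* illustrative, near 4G */
+	unsigned long end  = 0xfffff000UL;	/* illustrative, in the last PMD */
+	unsigned long start;
+
+	/*
+	 * After start reaches 0xffe00000, "start += PMD_SIZE" overflows
+	 * to 0 on 32bit. 0 < end is still true, so a plain "start < end"
+	 * loop marches off across the whole address space. The extra
+	 * "start >= base" test goes false at the wrap and stops the loop.
+	 */
+	for (start = base; start < end && start >= base; start += 0x200000UL)
+		populate_extra_pte(start);	/* as in the hunk below */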
+ +Fixes: 92a0f81d8957 ("x86/cpu_entry_area: Move it out of the fixmap") +Reported-by: kernel test robot <fengguang.wu@intel.com> +Signed-off-by: Thomas "Feels stupid" Gleixner <tglx@linutronix.de> +Tested-by: Borislav Petkov <bp@alien8.de> + +diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c +index 21e8b595cbb1..fe814fd5e014 100644 +--- a/arch/x86/mm/cpu_entry_area.c ++++ b/arch/x86/mm/cpu_entry_area.c +@@ -122,7 +122,8 @@ static __init void setup_cpu_entry_area_ptes(void) + start = CPU_ENTRY_AREA_BASE; + end = start + CPU_ENTRY_AREA_MAP_SIZE; + +- for (; start < end; start += PMD_SIZE) ++ /* Careful here: start + PMD_SIZE might wrap around */ ++ for (; start < end && start >= CPU_ENTRY_AREA_BASE; start += PMD_SIZE) + populate_extra_pte(start); + #endif + } +-- +2.15.0 + diff --git a/queue/x86-decoder-Fix-and-update-the-opcodes-map.patch b/queue/x86-decoder-Fix-and-update-the-opcodes-map.patch new file mode 100644 index 0000000..6373de0 --- /dev/null +++ b/queue/x86-decoder-Fix-and-update-the-opcodes-map.patch @@ -0,0 +1,158 @@ +From f5b5fab1780c98b74526dbac527574bd02dc16f8 Mon Sep 17 00:00:00 2001 +From: Randy Dunlap <rdunlap@infradead.org> +Date: Mon, 11 Dec 2017 10:38:36 -0800 +Subject: [PATCH] x86/decoder: Fix and update the opcodes map + +commit f5b5fab1780c98b74526dbac527574bd02dc16f8 upstream. + +Update x86-opcode-map.txt based on the October 2017 Intel SDM publication. +Fix INVPID to INVVPID. +Add UD0 and UD1 instruction opcodes. + +Also sync the objtool and perf tooling copies of this file. + +Signed-off-by: Randy Dunlap <rdunlap@infradead.org> +Acked-by: Masami Hiramatsu <mhiramat@kernel.org> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Masami Hiramatsu <masami.hiramatsu@gmail.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Link: http://lkml.kernel.org/r/aac062d7-c0f6-96e3-5c92-ed299e2bd3da@infradead.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt +index c4d55919fac1..e0b85930dd77 100644 +--- a/arch/x86/lib/x86-opcode-map.txt ++++ b/arch/x86/lib/x86-opcode-map.txt +@@ -607,7 +607,7 @@ fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1) + fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1) + fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1) + fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1) +-ff: ++ff: UD0 + EndTable + + Table: 3-byte opcode 1 (0x0f 0x38) +@@ -717,7 +717,7 @@ AVXcode: 2 + 7e: vpermt2d/q Vx,Hx,Wx (66),(ev) + 7f: vpermt2ps/d Vx,Hx,Wx (66),(ev) + 80: INVEPT Gy,Mdq (66) +-81: INVPID Gy,Mdq (66) ++81: INVVPID Gy,Mdq (66) + 82: INVPCID Gy,Mdq (66) + 83: vpmultishiftqb Vx,Hx,Wx (66),(ev) + 88: vexpandps/d Vpd,Wpd (66),(ev) +@@ -970,6 +970,15 @@ GrpTable: Grp9 + EndTable + + GrpTable: Grp10 ++# all are UD1 ++0: UD1 ++1: UD1 ++2: UD1 ++3: UD1 ++4: UD1 ++5: UD1 ++6: UD1 ++7: UD1 + EndTable + + # Grp11A and Grp11B are expressed as Grp11 in Intel SDM +diff --git a/tools/objtool/arch/x86/insn/x86-opcode-map.txt b/tools/objtool/arch/x86/insn/x86-opcode-map.txt +index 12e377184ee4..e0b85930dd77 100644 +--- a/tools/objtool/arch/x86/insn/x86-opcode-map.txt ++++ b/tools/objtool/arch/x86/insn/x86-opcode-map.txt +@@ -607,7 +607,7 @@ fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1) + fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1) + fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1) + fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1) +-ff: ++ff: UD0 + EndTable + + Table: 3-byte opcode 1 (0x0f 0x38) +@@ -717,7 +717,7 @@ AVXcode: 2 
+ 7e: vpermt2d/q Vx,Hx,Wx (66),(ev) + 7f: vpermt2ps/d Vx,Hx,Wx (66),(ev) + 80: INVEPT Gy,Mdq (66) +-81: INVPID Gy,Mdq (66) ++81: INVVPID Gy,Mdq (66) + 82: INVPCID Gy,Mdq (66) + 83: vpmultishiftqb Vx,Hx,Wx (66),(ev) + 88: vexpandps/d Vpd,Wpd (66),(ev) +@@ -896,7 +896,7 @@ EndTable + + GrpTable: Grp3_1 + 0: TEST Eb,Ib +-1: ++1: TEST Eb,Ib + 2: NOT Eb + 3: NEG Eb + 4: MUL AL,Eb +@@ -970,6 +970,15 @@ GrpTable: Grp9 + EndTable + + GrpTable: Grp10 ++# all are UD1 ++0: UD1 ++1: UD1 ++2: UD1 ++3: UD1 ++4: UD1 ++5: UD1 ++6: UD1 ++7: UD1 + EndTable + + # Grp11A and Grp11B are expressed as Grp11 in Intel SDM +diff --git a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt +index 12e377184ee4..e0b85930dd77 100644 +--- a/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt ++++ b/tools/perf/util/intel-pt-decoder/x86-opcode-map.txt +@@ -607,7 +607,7 @@ fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1) + fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1) + fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1) + fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1) +-ff: ++ff: UD0 + EndTable + + Table: 3-byte opcode 1 (0x0f 0x38) +@@ -717,7 +717,7 @@ AVXcode: 2 + 7e: vpermt2d/q Vx,Hx,Wx (66),(ev) + 7f: vpermt2ps/d Vx,Hx,Wx (66),(ev) + 80: INVEPT Gy,Mdq (66) +-81: INVPID Gy,Mdq (66) ++81: INVVPID Gy,Mdq (66) + 82: INVPCID Gy,Mdq (66) + 83: vpmultishiftqb Vx,Hx,Wx (66),(ev) + 88: vexpandps/d Vpd,Wpd (66),(ev) +@@ -896,7 +896,7 @@ EndTable + + GrpTable: Grp3_1 + 0: TEST Eb,Ib +-1: ++1: TEST Eb,Ib + 2: NOT Eb + 3: NEG Eb + 4: MUL AL,Eb +@@ -970,6 +970,15 @@ GrpTable: Grp9 + EndTable + + GrpTable: Grp10 ++# all are UD1 ++0: UD1 ++1: UD1 ++2: UD1 ++3: UD1 ++4: UD1 ++5: UD1 ++6: UD1 ++7: UD1 + EndTable + + # Grp11A and Grp11B are expressed as Grp11 in Intel SDM +-- +2.15.0 + diff --git a/queue/x86-doc-Remove-obvious-weirdnesses-from-the-x86-MM-l.patch b/queue/x86-doc-Remove-obvious-weirdnesses-from-the-x86-MM-l.patch new file mode 100644 index 0000000..d4f8aca --- /dev/null +++ b/queue/x86-doc-Remove-obvious-weirdnesses-from-the-x86-MM-l.patch @@ -0,0 +1,74 @@ +From e8ffe96e5933d417195268478479933d56213a3f Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Tue, 5 Dec 2017 13:34:54 +0100 +Subject: [PATCH] x86/doc: Remove obvious weirdnesses from the x86 MM layout + documentation + +commit e8ffe96e5933d417195268478479933d56213a3f upstream. + +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt +index 83ca5a3b90ac..63a41671d25b 100644 +--- a/Documentation/x86/x86_64/mm.txt ++++ b/Documentation/x86/x86_64/mm.txt +@@ -1,6 +1,4 @@ + +-<previous description obsolete, deleted> +- + Virtual memory map with 4 level page tables: + + 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm +@@ -49,8 +47,9 @@ ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole + + Architecture defines a 64-bit virtual address. Implementations can support + less. Currently supported are 48- and 57-bit virtual addresses. Bits 63 +-through to the most-significant implemented bit are set to either all ones +-or all zero. This causes hole between user space and kernel addresses. ++through to the most-significant implemented bit are sign extended. ++This causes hole between user space and kernel addresses if you interpret them ++as unsigned. + + The direct mapping covers all memory in the system up to the highest + memory address (this means in some cases it can also include PCI memory +@@ -60,9 +59,6 @@ vmalloc space is lazily synchronized into the different PML4/PML5 pages of + the processes using the page fault handler, with init_top_pgt as + reference. + +-Current X86-64 implementations support up to 46 bits of address space (64 TB), +-which is our current limit. This expands into MBZ space in the page tables. +- + We map EFI runtime services in the 'efi_pgd' PGD in a 64Gb large virtual + memory window (this size is arbitrary, it can be raised later if needed). + The mappings are not part of any other kernel PGD and are only available +@@ -74,5 +70,3 @@ following fixmap section. + Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all + physical memory, vmalloc/ioremap space and virtual memory map are randomized. + Their order is preserved but their base will be offset early at boot time. +- +--Andi Kleen, Jul 2004 +-- +2.15.0 + diff --git a/queue/x86-entry-Rename-SYSENTER_stack-to-CPU_ENTRY_AREA_en.patch b/queue/x86-entry-Rename-SYSENTER_stack-to-CPU_ENTRY_AREA_en.patch new file mode 100644 index 0000000..73e5ba7 --- /dev/null +++ b/queue/x86-entry-Rename-SYSENTER_stack-to-CPU_ENTRY_AREA_en.patch @@ -0,0 +1,325 @@ +From 4fe2d8b11a370af286287a2661de9d4e6c9a145a Mon Sep 17 00:00:00 2001 +From: Dave Hansen <dave.hansen@linux.intel.com> +Date: Mon, 4 Dec 2017 17:25:07 -0800 +Subject: [PATCH] x86/entry: Rename SYSENTER_stack to + CPU_ENTRY_AREA_entry_stack + +commit 4fe2d8b11a370af286287a2661de9d4e6c9a145a upstream. + +If the kernel oopses while on the trampoline stack, it will print +"<SYSENTER>" even if SYSENTER is not involved. That is rather confusing. + +The "SYSENTER" stack is used for a lot more than SYSENTER now. Give it a +better string to display in stack dumps, and rename the kernel code to +match. + +Also move the 32-bit code over to the new naming even though it still uses +the entry stack only for SYSENTER. 
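+
+For reference, the reworked classification helper boils down to the
+following (condensed from the dumpstack hunks below):
+
+	bool in_entry_stack(unsigned long *stack, struct stack_info *info)
+	{
+		struct entry_stack *ss = cpu_entry_stack(smp_processor_id());
+
+		/* The entry stack is one page; classify purely by bounds. */
+		if ((void *)stack < (void *)ss || (void *)stack >= (void *)(ss + 1))
+			return false;
+
+		info->type  = STACK_TYPE_ENTRY;	/* dumped as "ENTRY_TRAMPOLINE" */
+		info->begin = (unsigned long *)ss;
+		info->end   = (unsigned long *)(ss + 1);
+		return true;
+	}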
+ +Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bp@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S +index bd8b57a5c874..ace8f321a5a1 100644 +--- a/arch/x86/entry/entry_32.S ++++ b/arch/x86/entry/entry_32.S +@@ -942,9 +942,9 @@ ENTRY(debug) + + /* Are we currently on the SYSENTER stack? */ + movl PER_CPU_VAR(cpu_entry_area), %ecx +- addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx +- subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ +- cmpl $SIZEOF_SYSENTER_stack, %ecx ++ addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx ++ subl %eax, %ecx /* ecx = (end of entry_stack) - esp */ ++ cmpl $SIZEOF_entry_stack, %ecx + jb .Ldebug_from_sysenter_stack + + TRACE_IRQS_OFF +@@ -986,9 +986,9 @@ ENTRY(nmi) + + /* Are we currently on the SYSENTER stack? */ + movl PER_CPU_VAR(cpu_entry_area), %ecx +- addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx +- subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ +- cmpl $SIZEOF_SYSENTER_stack, %ecx ++ addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx ++ subl %eax, %ecx /* ecx = (end of entry_stack) - esp */ ++ cmpl $SIZEOF_entry_stack, %ecx + jb .Lnmi_from_sysenter_stack + + /* Not on SYSENTER stack. */ +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 2812ce043a7a..87cebe78bbef 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -154,8 +154,8 @@ END(native_usergs_sysret64) + _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) + + /* The top word of the SYSENTER stack is hot and is usable as scratch space. */ +-#define RSP_SCRATCH CPU_ENTRY_AREA_SYSENTER_stack + \ +- SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA ++#define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \ ++ SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA + + ENTRY(entry_SYSCALL_64_trampoline) + UNWIND_HINT_EMPTY +diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h +index 94fc4fa14127..8153b8d86a3c 100644 +--- a/arch/x86/include/asm/fixmap.h ++++ b/arch/x86/include/asm/fixmap.h +@@ -56,10 +56,10 @@ struct cpu_entry_area { + char gdt[PAGE_SIZE]; + + /* +- * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as ++ * The GDT is just below entry_stack and thus serves (on x86_64) as + * a a read-only guard page. + */ +- struct SYSENTER_stack_page SYSENTER_stack_page; ++ struct entry_stack_page entry_stack_page; + + /* + * On x86_64, the TSS is mapped RO. 
On x86_32, it's mapped RW because +@@ -250,9 +250,9 @@ static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) + return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0)); + } + +-static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu) ++static inline struct entry_stack *cpu_entry_stack(int cpu) + { +- return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack; ++ return &get_cpu_entry_area(cpu)->entry_stack_page.stack; + } + + #endif /* !__ASSEMBLY__ */ +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index da943411d3d8..9e482d8b0b97 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -336,12 +336,12 @@ struct x86_hw_tss { + #define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss)) + #define INVALID_IO_BITMAP_OFFSET 0x8000 + +-struct SYSENTER_stack { ++struct entry_stack { + unsigned long words[64]; + }; + +-struct SYSENTER_stack_page { +- struct SYSENTER_stack stack; ++struct entry_stack_page { ++ struct entry_stack stack; + } __aligned(PAGE_SIZE); + + struct tss_struct { +diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h +index f8062bfd43a0..f73706878772 100644 +--- a/arch/x86/include/asm/stacktrace.h ++++ b/arch/x86/include/asm/stacktrace.h +@@ -16,7 +16,7 @@ enum stack_type { + STACK_TYPE_TASK, + STACK_TYPE_IRQ, + STACK_TYPE_SOFTIRQ, +- STACK_TYPE_SYSENTER, ++ STACK_TYPE_ENTRY, + STACK_TYPE_EXCEPTION, + STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, + }; +@@ -29,7 +29,7 @@ struct stack_info { + bool in_task_stack(unsigned long *stack, struct task_struct *task, + struct stack_info *info); + +-bool in_sysenter_stack(unsigned long *stack, struct stack_info *info); ++bool in_entry_stack(unsigned long *stack, struct stack_info *info); + + int get_stack_info(unsigned long *stack, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask); +diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c +index cd360a5e0dca..676b7cf4b62b 100644 +--- a/arch/x86/kernel/asm-offsets.c ++++ b/arch/x86/kernel/asm-offsets.c +@@ -97,6 +97,6 @@ void common(void) { + /* Layout info for cpu_entry_area */ + OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); + OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); +- OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page); +- DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack)); ++ OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page); ++ DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack)); + } +diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c +index 7d20d9c0b3d6..fa1261eefa16 100644 +--- a/arch/x86/kernel/asm-offsets_32.c ++++ b/arch/x86/kernel/asm-offsets_32.c +@@ -48,7 +48,7 @@ void foo(void) + + /* Offset from the sysenter stack to tss.sp0 */ + DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - +- offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack)); ++ offsetofend(struct cpu_entry_area, entry_stack_page.stack)); + + #ifdef CONFIG_CC_STACKPROTECTOR + BLANK(); +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 034900623adf..ed4acbce37a8 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -487,8 +487,8 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + 
DEBUG_STKSZ]); + #endif + +-static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page, +- SYSENTER_stack_storage); ++static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, ++ entry_stack_storage); + + static void __init + set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) +@@ -523,8 +523,8 @@ static void __init setup_cpu_entry_area(int cpu) + #endif + + __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); +- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page), +- per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1, ++ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page), ++ per_cpu_ptr(&entry_stack_storage, cpu), 1, + PAGE_KERNEL); + + /* +@@ -1323,7 +1323,7 @@ void enable_sep_cpu(void) + + tss->x86_tss.ss1 = __KERNEL_CS; + wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); +- wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0); ++ wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0); + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); + + put_cpu(); +@@ -1440,7 +1440,7 @@ void syscall_init(void) + * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). + */ + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); +- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); ++ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1)); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); + #else + wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); +@@ -1655,7 +1655,7 @@ void cpu_init(void) + */ + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); + load_TR_desc(); +- load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); ++ load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1)); + + load_mm_ldt(&init_mm); + +diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c +index bbd6d986e2d0..1dd3f533d78c 100644 +--- a/arch/x86/kernel/dumpstack.c ++++ b/arch/x86/kernel/dumpstack.c +@@ -43,9 +43,9 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, + return true; + } + +-bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) ++bool in_entry_stack(unsigned long *stack, struct stack_info *info) + { +- struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id()); ++ struct entry_stack *ss = cpu_entry_stack(smp_processor_id()); + + void *begin = ss; + void *end = ss + 1; +@@ -53,7 +53,7 @@ bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) + if ((void *)stack < begin || (void *)stack >= end) + return false; + +- info->type = STACK_TYPE_SYSENTER; ++ info->type = STACK_TYPE_ENTRY; + info->begin = begin; + info->end = end; + info->next_sp = NULL; +@@ -111,13 +111,13 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + * - task stack + * - interrupt stack + * - HW exception stacks (double fault, nmi, debug, mce) +- * - SYSENTER stack ++ * - entry stack + * + * x86-32 can have up to four stacks: + * - task stack + * - softirq stack + * - hardirq stack +- * - SYSENTER stack ++ * - entry stack + */ + for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + const char *stack_name; +diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c +index 5ff13a6b3680..04170f63e3a1 100644 +--- a/arch/x86/kernel/dumpstack_32.c ++++ b/arch/x86/kernel/dumpstack_32.c +@@ -26,8 +26,8 @@ const char *stack_type_name(enum stack_type type) + if (type == 
STACK_TYPE_SOFTIRQ) + return "SOFTIRQ"; + +- if (type == STACK_TYPE_SYSENTER) +- return "SYSENTER"; ++ if (type == STACK_TYPE_ENTRY) ++ return "ENTRY_TRAMPOLINE"; + + return NULL; + } +@@ -96,7 +96,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, + if (task != current) + goto unknown; + +- if (in_sysenter_stack(stack, info)) ++ if (in_entry_stack(stack, info)) + goto recursion_check; + + if (in_hardirq_stack(stack, info)) +diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c +index abc828f8c297..563e28d14f2c 100644 +--- a/arch/x86/kernel/dumpstack_64.c ++++ b/arch/x86/kernel/dumpstack_64.c +@@ -37,8 +37,14 @@ const char *stack_type_name(enum stack_type type) + if (type == STACK_TYPE_IRQ) + return "IRQ"; + +- if (type == STACK_TYPE_SYSENTER) +- return "SYSENTER"; ++ if (type == STACK_TYPE_ENTRY) { ++ /* ++ * On 64-bit, we have a generic entry stack that we ++ * use for all the kernel entry points, including ++ * SYSENTER. ++ */ ++ return "ENTRY_TRAMPOLINE"; ++ } + + if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) + return exception_stack_names[type - STACK_TYPE_EXCEPTION]; +@@ -118,7 +124,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, + if (in_irq_stack(stack, info)) + goto recursion_check; + +- if (in_sysenter_stack(stack, info)) ++ if (in_entry_stack(stack, info)) + goto recursion_check; + + goto unknown; +-- +2.15.0 + diff --git a/queue/x86-insn-eval-Add-utility-functions-to-get-segment-s.patch b/queue/x86-insn-eval-Add-utility-functions-to-get-segment-s.patch new file mode 100644 index 0000000..2cdca00 --- /dev/null +++ b/queue/x86-insn-eval-Add-utility-functions-to-get-segment-s.patch @@ -0,0 +1,462 @@ +From 32d0b95300db03c2b23b2ea2c94769a4a138e79d Mon Sep 17 00:00:00 2001 +From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com> +Date: Fri, 27 Oct 2017 13:25:40 -0700 +Subject: [PATCH] x86/insn-eval: Add utility functions to get segment selector + +commit 32d0b95300db03c2b23b2ea2c94769a4a138e79d upstream. + +When computing a linear address and segmentation is used, we need to know +the base address of the segment involved in the computation. In most of +the cases, the segment base address will be zero as in USER_DS/USER32_DS. +However, it may be possible that a user space program defines its own +segments via a local descriptor table. In such a case, the segment base +address may not be zero. Thus, the segment base address is needed to +calculate correctly the linear address. + +If running in protected mode, the segment selector to be used when +computing a linear address is determined by either any of segment override +prefixes in the instruction or inferred from the registers involved in the +computation of the effective address; in that order. Also, there are cases +when the segment override prefixes shall be ignored (i.e., code segments +are always selected by the CS segment register; string instructions always +use the ES segment register when using rDI register as operand). In long +mode, segment registers are ignored, except for FS and GS. In these two +cases, base addresses are obtained from the respective MSRs. + +For clarity, this process can be split into four steps (and an equal +number of functions): determine if segment prefixes overrides can be used; +parse the segment override prefixes, and use them if found; if not found +or cannot be used, use the default segment registers associated with the +operand registers. 
Once the segment register to use has been identified, +read its value to obtain the segment selector. + +The method to obtain the segment selector depends on several factors. In +32-bit builds, segment selectors are saved into a pt_regs structure +when switching to kernel mode. The same is also true for virtual-8086 +mode. In 64-bit builds, segmentation is mostly ignored, except when +running a program in 32-bit legacy mode. In this case, CS and SS can be +obtained from pt_regs. DS, ES, FS and GS can be read directly from +the respective segment registers. + +In order to identify the segment registers, a new set of #defines is +introduced. It also includes two special identifiers. One of them +indicates when the default segment register associated with instruction +operands shall be used. Another one indicates that the contents of the +segment register shall be ignored; this identifier is used when in long +mode. + +Improvements-by: Borislav Petkov <bp@suse.de> +Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Borislav Petkov <bp@suse.de> +Cc: "Michael S. Tsirkin" <mst@redhat.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: ricardo.neri@intel.com +Cc: Adrian Hunter <adrian.hunter@intel.com> +Cc: Paul Gortmaker <paul.gortmaker@windriver.com> +Cc: Huang Rui <ray.huang@amd.com> +Cc: Qiaowei Ren <qiaowei.ren@intel.com> +Cc: Shuah Khan <shuah@kernel.org> +Cc: Kees Cook <keescook@chromium.org> +Cc: Jonathan Corbet <corbet@lwn.net> +Cc: Jiri Slaby <jslaby@suse.cz> +Cc: Dmitry Vyukov <dvyukov@google.com> +Cc: "Ravi V. Shankar" <ravi.v.shankar@intel.com> +Cc: Chris Metcalf <cmetcalf@mellanox.com> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Arnaldo Carvalho de Melo <acme@redhat.com> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Colin Ian King <colin.king@canonical.com> +Cc: Chen Yucong <slaoub@gmail.com> +Cc: Adam Buchbinder <adam.buchbinder@gmail.com> +Cc: Vlastimil Babka <vbabka@suse.cz> +Cc: Lorenzo Stoakes <lstoakes@gmail.com> +Cc: Masami Hiramatsu <mhiramat@kernel.org> +Cc: Paolo Bonzini <pbonzini@redhat.com> +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: Thomas Garnier <thgarnie@google.com> +Link: https://lkml.kernel.org/r/1509135945-13762-14-git-send-email-ricardo.neri-calderon@linux.intel.com + +diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h +index 02aff0867211..1c78580e58be 100644 +--- a/arch/x86/include/asm/inat.h ++++ b/arch/x86/include/asm/inat.h +@@ -97,6 +97,16 @@ + #define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM) + #define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS) + ++/* Identifiers for segment registers */ ++#define INAT_SEG_REG_IGNORE 0 ++#define INAT_SEG_REG_DEFAULT 1 ++#define INAT_SEG_REG_CS 2 ++#define INAT_SEG_REG_SS 3 ++#define INAT_SEG_REG_DS 4 ++#define INAT_SEG_REG_ES 5 ++#define INAT_SEG_REG_FS 6 ++#define INAT_SEG_REG_GS 7 ++ + /* Attribute search APIs */ + extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode); + extern int inat_get_last_prefix_id(insn_byte_t last_pfx); +diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c +index ac7b87c228b9..6a902b155f5d 100644 +--- a/arch/x86/lib/insn-eval.c ++++ b/arch/x86/lib/insn-eval.c +@@ -9,6 +9,7 @@ + #include <asm/inat.h> + #include <asm/insn.h> + #include <asm/insn-eval.h> ++#include <asm/vm86.h> + + #undef pr_fmt + #define pr_fmt(fmt) "insn: " fmt +@@ -47,6 +48,345 @@ static bool is_string_insn(struct insn *insn) + } + } + 
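++/*
++ * resolve_seg_reg() below drives the lookup: it consults
++ * check_seg_overrides(), then get_seg_reg_override_idx(), and falls
++ * back to resolve_default_seg(). Once the register index is known,
++ * get_segment_selector() reads the selector itself.
++ */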
++/** ++ * get_seg_reg_override_idx() - obtain segment register override index ++ * @insn: Valid instruction with segment override prefixes ++ * ++ * Inspect the instruction prefixes in @insn and find segment overrides, if any. ++ * ++ * Returns: ++ * ++ * A constant identifying the segment register to use, among CS, SS, DS, ++ * ES, FS, or GS. INAT_SEG_REG_DEFAULT is returned if no segment override ++ * prefixes were found. ++ * ++ * -EINVAL in case of error. ++ */ ++static int get_seg_reg_override_idx(struct insn *insn) ++{ ++ int idx = INAT_SEG_REG_DEFAULT; ++ int num_overrides = 0, i; ++ ++ insn_get_prefixes(insn); ++ ++ /* Look for any segment override prefixes. */ ++ for (i = 0; i < insn->prefixes.nbytes; i++) { ++ insn_attr_t attr; ++ ++ attr = inat_get_opcode_attribute(insn->prefixes.bytes[i]); ++ switch (attr) { ++ case INAT_MAKE_PREFIX(INAT_PFX_CS): ++ idx = INAT_SEG_REG_CS; ++ num_overrides++; ++ break; ++ case INAT_MAKE_PREFIX(INAT_PFX_SS): ++ idx = INAT_SEG_REG_SS; ++ num_overrides++; ++ break; ++ case INAT_MAKE_PREFIX(INAT_PFX_DS): ++ idx = INAT_SEG_REG_DS; ++ num_overrides++; ++ break; ++ case INAT_MAKE_PREFIX(INAT_PFX_ES): ++ idx = INAT_SEG_REG_ES; ++ num_overrides++; ++ break; ++ case INAT_MAKE_PREFIX(INAT_PFX_FS): ++ idx = INAT_SEG_REG_FS; ++ num_overrides++; ++ break; ++ case INAT_MAKE_PREFIX(INAT_PFX_GS): ++ idx = INAT_SEG_REG_GS; ++ num_overrides++; ++ break; ++ /* No default action needed. */ ++ } ++ } ++ ++ /* More than one segment override prefix leads to undefined behavior. */ ++ if (num_overrides > 1) ++ return -EINVAL; ++ ++ return idx; ++} ++ ++/** ++ * check_seg_overrides() - check if segment override prefixes are allowed ++ * @insn: Valid instruction with segment override prefixes ++ * @regoff: Operand offset, in pt_regs, for which the check is performed ++ * ++ * For a particular register used in register-indirect addressing, determine if ++ * segment override prefixes can be used. Specifically, no overrides are allowed ++ * for rDI if used with a string instruction. ++ * ++ * Returns: ++ * ++ * True if segment override prefixes can be used with the register indicated ++ * in @regoff. False if otherwise. ++ */ ++static bool check_seg_overrides(struct insn *insn, int regoff) ++{ ++ if (regoff == offsetof(struct pt_regs, di) && is_string_insn(insn)) ++ return false; ++ ++ return true; ++} ++ ++/** ++ * resolve_default_seg() - resolve default segment register index for an operand ++ * @insn: Instruction with opcode and address size. Must be valid. ++ * @regs: Register values as seen when entering kernel mode ++ * @off: Operand offset, in pt_regs, for which resolution is needed ++ * ++ * Resolve the default segment register index associated with the instruction ++ * operand register indicated by @off. Such index is resolved based on defaults ++ * described in the Intel Software Development Manual. ++ * ++ * Returns: ++ * ++ * If in protected mode, a constant identifying the segment register to use, ++ * among CS, SS, ES or DS. If in long mode, INAT_SEG_REG_IGNORE. ++ * ++ * -EINVAL in case of error. ++ */ ++static int resolve_default_seg(struct insn *insn, struct pt_regs *regs, int off) ++{ ++ if (user_64bit_mode(regs)) ++ return INAT_SEG_REG_IGNORE; ++ /* ++ * Resolve the default segment register as described in Section 3.7.4 ++ * of the Intel Software Development Manual Vol. 1: ++ * ++ * + DS for all references involving r[ABCD]X, and rSI. ++ * + If used in a string instruction, ES for rDI. Otherwise, DS. 
++ * + AX, CX and DX are not valid register operands in 16-bit address ++ * encodings but are valid for 32-bit and 64-bit encodings. ++ * + -EDOM is reserved to identify for cases in which no register ++ * is used (i.e., displacement-only addressing). Use DS. ++ * + SS for rSP or rBP. ++ * + CS for rIP. ++ */ ++ ++ switch (off) { ++ case offsetof(struct pt_regs, ax): ++ case offsetof(struct pt_regs, cx): ++ case offsetof(struct pt_regs, dx): ++ /* Need insn to verify address size. */ ++ if (insn->addr_bytes == 2) ++ return -EINVAL; ++ ++ case -EDOM: ++ case offsetof(struct pt_regs, bx): ++ case offsetof(struct pt_regs, si): ++ return INAT_SEG_REG_DS; ++ ++ case offsetof(struct pt_regs, di): ++ if (is_string_insn(insn)) ++ return INAT_SEG_REG_ES; ++ return INAT_SEG_REG_DS; ++ ++ case offsetof(struct pt_regs, bp): ++ case offsetof(struct pt_regs, sp): ++ return INAT_SEG_REG_SS; ++ ++ case offsetof(struct pt_regs, ip): ++ return INAT_SEG_REG_CS; ++ ++ default: ++ return -EINVAL; ++ } ++} ++ ++/** ++ * resolve_seg_reg() - obtain segment register index ++ * @insn: Instruction with operands ++ * @regs: Register values as seen when entering kernel mode ++ * @regoff: Operand offset, in pt_regs, used to deterimine segment register ++ * ++ * Determine the segment register associated with the operands and, if ++ * applicable, prefixes and the instruction pointed by @insn. ++ * ++ * The segment register associated to an operand used in register-indirect ++ * addressing depends on: ++ * ++ * a) Whether running in long mode (in such a case segments are ignored, except ++ * if FS or GS are used). ++ * ++ * b) Whether segment override prefixes can be used. Certain instructions and ++ * registers do not allow override prefixes. ++ * ++ * c) Whether segment overrides prefixes are found in the instruction prefixes. ++ * ++ * d) If there are not segment override prefixes or they cannot be used, the ++ * default segment register associated with the operand register is used. ++ * ++ * The function checks first if segment override prefixes can be used with the ++ * operand indicated by @regoff. If allowed, obtain such overridden segment ++ * register index. Lastly, if not prefixes were found or cannot be used, resolve ++ * the segment register index to use based on the defaults described in the ++ * Intel documentation. In long mode, all segment register indexes will be ++ * ignored, except if overrides were found for FS or GS. All these operations ++ * are done using helper functions. ++ * ++ * The operand register, @regoff, is represented as the offset from the base of ++ * pt_regs. ++ * ++ * As stated, the main use of this function is to determine the segment register ++ * index based on the instruction, its operands and prefixes. Hence, @insn ++ * must be valid. However, if @regoff indicates rIP, we don't need to inspect ++ * @insn at all as in this case CS is used in all cases. This case is checked ++ * before proceeding further. ++ * ++ * Please note that this function does not return the value in the segment ++ * register (i.e., the segment selector) but our defined index. The segment ++ * selector needs to be obtained using get_segment_selector() and passing the ++ * segment register index resolved by this function. ++ * ++ * Returns: ++ * ++ * An index identifying the segment register to use, among CS, SS, DS, ++ * ES, FS, or GS. INAT_SEG_REG_IGNORE is returned if running in long mode. ++ * ++ * -EINVAL in case of error. 
++ */ ++static int resolve_seg_reg(struct insn *insn, struct pt_regs *regs, int regoff) ++{ ++ int idx; ++ ++ /* ++ * In the unlikely event of having to resolve the segment register ++ * index for rIP, do it first. Segment override prefixes should not ++ * be used. Hence, it is not necessary to inspect the instruction, ++ * which may be invalid at this point. ++ */ ++ if (regoff == offsetof(struct pt_regs, ip)) { ++ if (user_64bit_mode(regs)) ++ return INAT_SEG_REG_IGNORE; ++ else ++ return INAT_SEG_REG_CS; ++ } ++ ++ if (!insn) ++ return -EINVAL; ++ ++ if (!check_seg_overrides(insn, regoff)) ++ return resolve_default_seg(insn, regs, regoff); ++ ++ idx = get_seg_reg_override_idx(insn); ++ if (idx < 0) ++ return idx; ++ ++ if (idx == INAT_SEG_REG_DEFAULT) ++ return resolve_default_seg(insn, regs, regoff); ++ ++ /* ++ * In long mode, segment override prefixes are ignored, except for ++ * overrides for FS and GS. ++ */ ++ if (user_64bit_mode(regs)) { ++ if (idx != INAT_SEG_REG_FS && ++ idx != INAT_SEG_REG_GS) ++ idx = INAT_SEG_REG_IGNORE; ++ } ++ ++ return idx; ++} ++ ++/** ++ * get_segment_selector() - obtain segment selector ++ * @regs: Register values as seen when entering kernel mode ++ * @seg_reg_idx: Segment register index to use ++ * ++ * Obtain the segment selector from any of the CS, SS, DS, ES, FS, GS segment ++ * registers. In CONFIG_X86_32, the segment is obtained from either pt_regs or ++ * kernel_vm86_regs as applicable. In CONFIG_X86_64, CS and SS are obtained ++ * from pt_regs. DS, ES, FS and GS are obtained by reading the actual CPU ++ * registers. This done for only for completeness as in CONFIG_X86_64 segment ++ * registers are ignored. ++ * ++ * Returns: ++ * ++ * Value of the segment selector, including null when running in ++ * long mode. ++ * ++ * -EINVAL on error. ++ */ ++static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx) ++{ ++#ifdef CONFIG_X86_64 ++ unsigned short sel; ++ ++ switch (seg_reg_idx) { ++ case INAT_SEG_REG_IGNORE: ++ return 0; ++ case INAT_SEG_REG_CS: ++ return (unsigned short)(regs->cs & 0xffff); ++ case INAT_SEG_REG_SS: ++ return (unsigned short)(regs->ss & 0xffff); ++ case INAT_SEG_REG_DS: ++ savesegment(ds, sel); ++ return sel; ++ case INAT_SEG_REG_ES: ++ savesegment(es, sel); ++ return sel; ++ case INAT_SEG_REG_FS: ++ savesegment(fs, sel); ++ return sel; ++ case INAT_SEG_REG_GS: ++ savesegment(gs, sel); ++ return sel; ++ default: ++ return -EINVAL; ++ } ++#else /* CONFIG_X86_32 */ ++ struct kernel_vm86_regs *vm86regs = (struct kernel_vm86_regs *)regs; ++ ++ if (v8086_mode(regs)) { ++ switch (seg_reg_idx) { ++ case INAT_SEG_REG_CS: ++ return (unsigned short)(regs->cs & 0xffff); ++ case INAT_SEG_REG_SS: ++ return (unsigned short)(regs->ss & 0xffff); ++ case INAT_SEG_REG_DS: ++ return vm86regs->ds; ++ case INAT_SEG_REG_ES: ++ return vm86regs->es; ++ case INAT_SEG_REG_FS: ++ return vm86regs->fs; ++ case INAT_SEG_REG_GS: ++ return vm86regs->gs; ++ case INAT_SEG_REG_IGNORE: ++ /* fall through */ ++ default: ++ return -EINVAL; ++ } ++ } ++ ++ switch (seg_reg_idx) { ++ case INAT_SEG_REG_CS: ++ return (unsigned short)(regs->cs & 0xffff); ++ case INAT_SEG_REG_SS: ++ return (unsigned short)(regs->ss & 0xffff); ++ case INAT_SEG_REG_DS: ++ return (unsigned short)(regs->ds & 0xffff); ++ case INAT_SEG_REG_ES: ++ return (unsigned short)(regs->es & 0xffff); ++ case INAT_SEG_REG_FS: ++ return (unsigned short)(regs->fs & 0xffff); ++ case INAT_SEG_REG_GS: ++ /* ++ * GS may or may not be in regs as per CONFIG_X86_32_LAZY_GS. 
++ * The macro below takes care of both cases. ++ */ ++ return get_user_gs(regs); ++ case INAT_SEG_REG_IGNORE: ++ /* fall through */ ++ default: ++ return -EINVAL; ++ } ++#endif /* CONFIG_X86_64 */ ++} ++ + static int get_reg_offset(struct insn *insn, struct pt_regs *regs, + enum reg_type type) + { +-- +2.15.0 + diff --git a/queue/x86-ldt-Prevent-LDT-inheritance-on-exec.patch b/queue/x86-ldt-Prevent-LDT-inheritance-on-exec.patch new file mode 100644 index 0000000..3785ca8 --- /dev/null +++ b/queue/x86-ldt-Prevent-LDT-inheritance-on-exec.patch @@ -0,0 +1,164 @@ +From a4828f81037f491b2cc986595e3a969a6eeb2fb5 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Thu, 14 Dec 2017 12:27:31 +0100 +Subject: [PATCH] x86/ldt: Prevent LDT inheritance on exec + +commit a4828f81037f491b2cc986595e3a969a6eeb2fb5 upstream. + +The LDT is inherited across fork() or exec(), but that makes no sense +at all because exec() is supposed to start the process clean. + +The reason why this happens is that init_new_context_ldt() is called from +init_new_context() which obviously needs to be called for both fork() and +exec(). + +It would be surprising if anything relies on that behaviour, so it seems to +be safe to remove that misfeature. + +Split the context initialization into two parts. Clear the LDT pointer and +initialize the mutex from the general context init and move the LDT +duplication to arch_dup_mmap() which is only called on fork(). + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Peter Zijlstra <peterz@infradead.org> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Andy Lutomirsky <luto@kernel.org> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: dan.j.williams@intel.com +Cc: hughd@google.com +Cc: keescook@google.com +Cc: kirill.shutemov@linux.intel.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h +index 4fdbe5efe535..5e25423bf9bb 100644 +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -57,11 +57,17 @@ struct ldt_struct { + /* + * Used for LDT copy/destruction. 
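+ * init_new_context_ldt() now only initializes the state; the actual
+ * copy on fork() is done by ldt_dup_context() from arch_dup_mmap().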
+ */ +-int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm); ++static inline void init_new_context_ldt(struct mm_struct *mm) ++{ ++ mm->context.ldt = NULL; ++ init_rwsem(&mm->context.ldt_usr_sem); ++} ++int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm); + void destroy_context_ldt(struct mm_struct *mm); + #else /* CONFIG_MODIFY_LDT_SYSCALL */ +-static inline int init_new_context_ldt(struct task_struct *tsk, +- struct mm_struct *mm) ++static inline void init_new_context_ldt(struct mm_struct *mm) { } ++static inline int ldt_dup_context(struct mm_struct *oldmm, ++ struct mm_struct *mm) + { + return 0; + } +@@ -137,15 +143,16 @@ static inline int init_new_context(struct task_struct *tsk, + mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); + atomic64_set(&mm->context.tlb_gen, 0); + +- #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS ++#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS + if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { + /* pkey 0 is the default and always allocated */ + mm->context.pkey_allocation_map = 0x1; + /* -1 means unallocated or invalid */ + mm->context.execute_only_pkey = -1; + } +- #endif +- return init_new_context_ldt(tsk, mm); ++#endif ++ init_new_context_ldt(mm); ++ return 0; + } + static inline void destroy_context(struct mm_struct *mm) + { +@@ -181,7 +188,7 @@ do { \ + static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) + { + paravirt_arch_dup_mmap(oldmm, mm); +- return 0; ++ return ldt_dup_context(oldmm, mm); + } + + static inline void arch_exit_mmap(struct mm_struct *mm) +diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c +index 1600aebc1ec7..a6b5d62f45a7 100644 +--- a/arch/x86/kernel/ldt.c ++++ b/arch/x86/kernel/ldt.c +@@ -131,28 +131,20 @@ static void free_ldt_struct(struct ldt_struct *ldt) + } + + /* +- * we do not have to muck with descriptors here, that is +- * done in switch_mm() as needed. ++ * Called on fork from arch_dup_mmap(). Just copy the current LDT state, ++ * the new task is not running, so nothing can be installed. + */ +-int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) ++int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm) + { + struct ldt_struct *new_ldt; +- struct mm_struct *old_mm; + int retval = 0; + +- init_rwsem(&mm->context.ldt_usr_sem); +- +- old_mm = current->mm; +- if (!old_mm) { +- mm->context.ldt = NULL; ++ if (!old_mm) + return 0; +- } + + mutex_lock(&old_mm->context.lock); +- if (!old_mm->context.ldt) { +- mm->context.ldt = NULL; ++ if (!old_mm->context.ldt) + goto out_unlock; +- } + + new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries); + if (!new_ldt) { +diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c +index 66e5ce5b91f0..0304ffb714f2 100644 +--- a/tools/testing/selftests/x86/ldt_gdt.c ++++ b/tools/testing/selftests/x86/ldt_gdt.c +@@ -627,13 +627,10 @@ static void do_multicpu_tests(void) + static int finish_exec_test(void) + { + /* +- * In a sensible world, this would be check_invalid_segment(0, 1); +- * For better or for worse, though, the LDT is inherited across exec. +- * We can probably change this safely, but for now we test it. ++ * Older kernel versions did inherit the LDT on exec() which is ++ * wrong because exec() starts from a clean state. + */ +- check_valid_segment(0, 1, +- AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB, +- 42, true); ++ check_invalid_segment(0, 1); + + return nerrs ? 
1 : 0; + } +-- +2.15.0 + diff --git a/queue/x86-ldt-Rework-locking.patch b/queue/x86-ldt-Rework-locking.patch new file mode 100644 index 0000000..fc9f3e9 --- /dev/null +++ b/queue/x86-ldt-Rework-locking.patch @@ -0,0 +1,186 @@ +From c2b3496bb30bd159e9de42e5c952e1f1f33c9a77 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Thu, 14 Dec 2017 12:27:30 +0100 +Subject: [PATCH] x86/ldt: Rework locking + +commit c2b3496bb30bd159e9de42e5c952e1f1f33c9a77 upstream. + +The LDT is duplicated on fork() and on exec(), which is wrong as exec() +should start from a clean state, i.e. without LDT. To fix this the LDT +duplication code will be moved into arch_dup_mmap() which is only called +for fork(). + +This introduces a locking problem. arch_dup_mmap() holds mmap_sem of the +parent process, but the LDT duplication code needs to acquire +mm->context.lock to access the LDT data safely, which is the reverse lock +order of write_ldt() where mmap_sem nests into context.lock. + +Solve this by introducing a new rw semaphore which serializes the +read/write_ldt() syscall operations and use context.lock to protect the +actual installment of the LDT descriptor. + +So context.lock stabilizes mm->context.ldt and can nest inside of the new +semaphore or mmap_sem. + +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Andy Lutomirsky <luto@kernel.org> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Borislav Petkov <bpetkov@suse.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: dan.j.williams@intel.com +Cc: hughd@google.com +Cc: keescook@google.com +Cc: kirill.shutemov@linux.intel.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h +index 9ea26f167497..5ff3e8af2c20 100644 +--- a/arch/x86/include/asm/mmu.h ++++ b/arch/x86/include/asm/mmu.h +@@ -3,6 +3,7 @@ + #define _ASM_X86_MMU_H + + #include <linux/spinlock.h> ++#include <linux/rwsem.h> + #include <linux/mutex.h> + #include <linux/atomic.h> + +@@ -27,7 +28,8 @@ typedef struct { + atomic64_t tlb_gen; + + #ifdef CONFIG_MODIFY_LDT_SYSCALL +- struct ldt_struct *ldt; ++ struct rw_semaphore ldt_usr_sem; ++ struct ldt_struct *ldt; + #endif + + #ifdef CONFIG_X86_64 +diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h +index c76162439c8a..4fdbe5efe535 100644 +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -132,6 +132,8 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); + static inline int init_new_context(struct task_struct *tsk, + struct mm_struct *mm) + { ++ mutex_init(&mm->context.lock); ++ + mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); + atomic64_set(&mm->context.tlb_gen, 0); + +diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c +index 1c1eae961340..1600aebc1ec7 100644 +--- a/arch/x86/kernel/ldt.c ++++ b/arch/x86/kernel/ldt.c +@@ -5,6 +5,11 @@ + * Copyright (C) 2002 Andi Kleen + * + * This handles calls from both 32bit and 64bit mode. ++ * ++ * Lock order: ++ * contex.ldt_usr_sem ++ * mmap_sem ++ * context.lock + */ + + #include <linux/errno.h> +@@ -42,7 +47,7 @@ static void refresh_ldt_segments(void) + #endif + } + +-/* context.lock is held for us, so we don't need any locking. */ ++/* context.lock is held by the task which issued the smp function call */ + static void flush_ldt(void *__mm) + { + struct mm_struct *mm = __mm; +@@ -99,15 +104,17 @@ static void finalize_ldt_struct(struct ldt_struct *ldt) + paravirt_alloc_ldt(ldt->entries, ldt->nr_entries); + } + +-/* context.lock is held */ +-static void install_ldt(struct mm_struct *current_mm, +- struct ldt_struct *ldt) ++static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt) + { ++ mutex_lock(&mm->context.lock); ++ + /* Synchronizes with READ_ONCE in load_mm_ldt. */ +- smp_store_release(¤t_mm->context.ldt, ldt); ++ smp_store_release(&mm->context.ldt, ldt); + +- /* Activate the LDT for all CPUs using current_mm. */ +- on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true); ++ /* Activate the LDT for all CPUs using currents mm. 
*/ ++ on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true); ++ ++ mutex_unlock(&mm->context.lock); + } + + static void free_ldt_struct(struct ldt_struct *ldt) +@@ -133,7 +140,8 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) + struct mm_struct *old_mm; + int retval = 0; + +- mutex_init(&mm->context.lock); ++ init_rwsem(&mm->context.ldt_usr_sem); ++ + old_mm = current->mm; + if (!old_mm) { + mm->context.ldt = NULL; +@@ -180,7 +188,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount) + unsigned long entries_size; + int retval; + +- mutex_lock(&mm->context.lock); ++ down_read(&mm->context.ldt_usr_sem); + + if (!mm->context.ldt) { + retval = 0; +@@ -209,7 +217,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount) + retval = bytecount; + + out_unlock: +- mutex_unlock(&mm->context.lock); ++ up_read(&mm->context.ldt_usr_sem); + return retval; + } + +@@ -269,7 +277,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) + ldt.avl = 0; + } + +- mutex_lock(&mm->context.lock); ++ if (down_write_killable(&mm->context.ldt_usr_sem)) ++ return -EINTR; + + old_ldt = mm->context.ldt; + old_nr_entries = old_ldt ? old_ldt->nr_entries : 0; +@@ -291,7 +300,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) + error = 0; + + out_unlock: +- mutex_unlock(&mm->context.lock); ++ up_write(&mm->context.ldt_usr_sem); + out: + return error; + } +-- +2.15.0 + diff --git a/queue/x86-microcode-Dont-abuse-the-TLB-flush-interface.patch b/queue/x86-microcode-Dont-abuse-the-TLB-flush-interface.patch new file mode 100644 index 0000000..bf93f36 --- /dev/null +++ b/queue/x86-microcode-Dont-abuse-the-TLB-flush-interface.patch @@ -0,0 +1,114 @@ +From 23cb7d46f371844c004784ad9552a57446f73e5a Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Tue, 5 Dec 2017 13:34:51 +0100 +Subject: [PATCH] x86/microcode: Dont abuse the TLB-flush interface + +commit 23cb7d46f371844c004784ad9552a57446f73e5a upstream. + +Commit: + + ec400ddeff20 ("x86/microcode_intel_early.c: Early update ucode on Intel's CPU") + +... grubbed into tlbflush internals without coherent explanation. + +Since it says its a precaution and the SDM doesn't mention anything like +this, take it out back. + +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: fenghua.yu@intel.com +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 509046cfa5ce..c2e45da4e540 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -246,20 +246,9 @@ static inline void __native_flush_tlb(void) + preempt_enable(); + } + +-static inline void __native_flush_tlb_global_irq_disabled(void) +-{ +- unsigned long cr4; +- +- cr4 = this_cpu_read(cpu_tlbstate.cr4); +- /* clear PGE */ +- native_write_cr4(cr4 & ~X86_CR4_PGE); +- /* write old PGE again and flush TLBs */ +- native_write_cr4(cr4); +-} +- + static inline void __native_flush_tlb_global(void) + { +- unsigned long flags; ++ unsigned long cr4, flags; + + if (static_cpu_has(X86_FEATURE_INVPCID)) { + /* +@@ -277,7 +266,11 @@ static inline void __native_flush_tlb_global(void) + */ + raw_local_irq_save(flags); + +- __native_flush_tlb_global_irq_disabled(); ++ cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ /* toggle PGE */ ++ native_write_cr4(cr4 ^ X86_CR4_PGE); ++ /* write old PGE again and flush TLBs */ ++ native_write_cr4(cr4); + + raw_local_irq_restore(flags); + } +diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c +index 7dbcb7adf797..8ccdca6d3f9e 100644 +--- a/arch/x86/kernel/cpu/microcode/intel.c ++++ b/arch/x86/kernel/cpu/microcode/intel.c +@@ -565,15 +565,6 @@ static void print_ucode(struct ucode_cpu_info *uci) + } + #else + +-/* +- * Flush global tlb. We only do this in x86_64 where paging has been enabled +- * already and PGE should be enabled as well. +- */ +-static inline void flush_tlb_early(void) +-{ +- __native_flush_tlb_global_irq_disabled(); +-} +- + static inline void print_ucode(struct ucode_cpu_info *uci) + { + struct microcode_intel *mc; +@@ -602,10 +593,6 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) + if (rev != mc->hdr.rev) + return -1; + +-#ifdef CONFIG_X86_64 +- /* Flush global tlb. This is precaution. */ +- flush_tlb_early(); +-#endif + uci->cpu_sig.rev = rev; + + if (early) +-- +2.15.0 + diff --git a/queue/x86-mm-64-Improve-the-memory-map-documentation.patch b/queue/x86-mm-64-Improve-the-memory-map-documentation.patch new file mode 100644 index 0000000..0b5fb9e --- /dev/null +++ b/queue/x86-mm-64-Improve-the-memory-map-documentation.patch @@ -0,0 +1,59 @@ +From 5a7ccf4754fb3660569a6de52ba7f7fc3dfaf280 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Tue, 12 Dec 2017 07:56:43 -0800 +Subject: [PATCH] x86/mm/64: Improve the memory map documentation + +commit 5a7ccf4754fb3660569a6de52ba7f7fc3dfaf280 upstream. + +The old docs had the vsyscall range wrong and were missing the fixmap. +Fix both. + +There used to be 8 MB reserved for future vsyscalls, but that's long gone. + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@intel.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Kees Cook <keescook@chromium.org> +Cc: Kirill A. Shutemov <kirill@shutemov.name> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt +index 3448e675b462..83ca5a3b90ac 100644 +--- a/Documentation/x86/x86_64/mm.txt ++++ b/Documentation/x86/x86_64/mm.txt +@@ -19,8 +19,9 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks + ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space + ... unused hole ... + ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 +-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable) +-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls ++ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space (variable) ++[fixmap start] - ffffffffff5fffff kernel-internal fixmap range ++ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI + ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole + + Virtual memory map with 5 level page tables: +@@ -41,8 +42,9 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks + ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space + ... unused hole ... + ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 +-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space +-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls ++ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space ++[fixmap start] - ffffffffff5fffff kernel-internal fixmap range ++ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI + ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole + + Architecture defines a 64-bit virtual address. Implementations can support +-- +2.15.0 + diff --git a/queue/x86-mm-Add-comments-to-clarify-which-TLB-flush-funct.patch b/queue/x86-mm-Add-comments-to-clarify-which-TLB-flush-funct.patch new file mode 100644 index 0000000..fb559a2 --- /dev/null +++ b/queue/x86-mm-Add-comments-to-clarify-which-TLB-flush-funct.patch @@ -0,0 +1,101 @@ +From 3f67af51e56f291d7417d77c4f67cd774633c5e1 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Tue, 5 Dec 2017 13:34:52 +0100 +Subject: [PATCH] x86/mm: Add comments to clarify which TLB-flush functions are + supposed to flush what + +commit 3f67af51e56f291d7417d77c4f67cd774633c5e1 upstream. + +Per popular request.. + +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 3e2227386abe..552d581c8f9f 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -228,6 +228,9 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) + + extern void initialize_tlbstate_and_flush(void); + ++/* ++ * flush the entire current user mapping ++ */ + static inline void __native_flush_tlb(void) + { + /* +@@ -240,6 +243,9 @@ static inline void __native_flush_tlb(void) + preempt_enable(); + } + ++/* ++ * flush everything ++ */ + static inline void __native_flush_tlb_global(void) + { + unsigned long cr4, flags; +@@ -269,17 +275,27 @@ static inline void __native_flush_tlb_global(void) + raw_local_irq_restore(flags); + } + ++/* ++ * flush one page in the user mapping ++ */ + static inline void __native_flush_tlb_single(unsigned long addr) + { + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + } + ++/* ++ * flush everything ++ */ + static inline void __flush_tlb_all(void) + { +- if (boot_cpu_has(X86_FEATURE_PGE)) ++ if (boot_cpu_has(X86_FEATURE_PGE)) { + __flush_tlb_global(); +- else ++ } else { ++ /* ++ * !PGE -> !PCID (setup_pcid()), thus every flush is total. ++ */ + __flush_tlb(); ++ } + + /* + * Note: if we somehow had PCID but not PGE, then this wouldn't work -- +@@ -290,6 +306,9 @@ static inline void __flush_tlb_all(void) + */ + } + ++/* ++ * flush one page in the kernel mapping ++ */ + static inline void __flush_tlb_one(unsigned long addr) + { + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); +-- +2.15.0 + diff --git a/queue/x86-mm-Create-asm-invpcid.h.patch b/queue/x86-mm-Create-asm-invpcid.h.patch new file mode 100644 index 0000000..0e88349 --- /dev/null +++ b/queue/x86-mm-Create-asm-invpcid.h.patch @@ -0,0 +1,155 @@ +From 1a3b0caeb77edeac5ce5fa05e6a61c474c9a9745 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Tue, 5 Dec 2017 13:34:47 +0100 +Subject: [PATCH] x86/mm: Create asm/invpcid.h + +commit 1a3b0caeb77edeac5ce5fa05e6a61c474c9a9745 upstream. + +Unclutter tlbflush.h a little. + +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. 
Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Thomas Gleixner <tglx@linutronix.de> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/invpcid.h b/arch/x86/include/asm/invpcid.h +new file mode 100644 +index 000000000000..989cfa86de85 +--- /dev/null ++++ b/arch/x86/include/asm/invpcid.h +@@ -0,0 +1,53 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _ASM_X86_INVPCID ++#define _ASM_X86_INVPCID ++ ++static inline void __invpcid(unsigned long pcid, unsigned long addr, ++ unsigned long type) ++{ ++ struct { u64 d[2]; } desc = { { pcid, addr } }; ++ ++ /* ++ * The memory clobber is because the whole point is to invalidate ++ * stale TLB entries and, especially if we're flushing global ++ * mappings, we don't want the compiler to reorder any subsequent ++ * memory accesses before the TLB flush. ++ * ++ * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and ++ * invpcid (%rcx), %rax in long mode. ++ */ ++ asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" ++ : : "m" (desc), "a" (type), "c" (&desc) : "memory"); ++} ++ ++#define INVPCID_TYPE_INDIV_ADDR 0 ++#define INVPCID_TYPE_SINGLE_CTXT 1 ++#define INVPCID_TYPE_ALL_INCL_GLOBAL 2 ++#define INVPCID_TYPE_ALL_NON_GLOBAL 3 ++ ++/* Flush all mappings for a given pcid and addr, not including globals. */ ++static inline void invpcid_flush_one(unsigned long pcid, ++ unsigned long addr) ++{ ++ __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); ++} ++ ++/* Flush all mappings for a given PCID, not including globals. */ ++static inline void invpcid_flush_single_context(unsigned long pcid) ++{ ++ __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); ++} ++ ++/* Flush all mappings, including globals, for all PCIDs. */ ++static inline void invpcid_flush_all(void) ++{ ++ __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); ++} ++ ++/* Flush all mappings for all PCIDs except globals. */ ++static inline void invpcid_flush_all_nonglobals(void) ++{ ++ __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); ++} ++ ++#endif /* _ASM_X86_INVPCID */ +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 8b27daff7a7f..171b429f43a2 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -9,54 +9,7 @@ + #include <asm/cpufeature.h> + #include <asm/special_insns.h> + #include <asm/smp.h> +- +-static inline void __invpcid(unsigned long pcid, unsigned long addr, +- unsigned long type) +-{ +- struct { u64 d[2]; } desc = { { pcid, addr } }; +- +- /* +- * The memory clobber is because the whole point is to invalidate +- * stale TLB entries and, especially if we're flushing global +- * mappings, we don't want the compiler to reorder any subsequent +- * memory accesses before the TLB flush. +- * +- * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and +- * invpcid (%rcx), %rax in long mode. +- */ +- asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" +- : : "m" (desc), "a" (type), "c" (&desc) : "memory"); +-} +- +-#define INVPCID_TYPE_INDIV_ADDR 0 +-#define INVPCID_TYPE_SINGLE_CTXT 1 +-#define INVPCID_TYPE_ALL_INCL_GLOBAL 2 +-#define INVPCID_TYPE_ALL_NON_GLOBAL 3 +- +-/* Flush all mappings for a given pcid and addr, not including globals. 
*/ +-static inline void invpcid_flush_one(unsigned long pcid, +- unsigned long addr) +-{ +- __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); +-} +- +-/* Flush all mappings for a given PCID, not including globals. */ +-static inline void invpcid_flush_single_context(unsigned long pcid) +-{ +- __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); +-} +- +-/* Flush all mappings, including globals, for all PCIDs. */ +-static inline void invpcid_flush_all(void) +-{ +- __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); +-} +- +-/* Flush all mappings for all PCIDs except globals. */ +-static inline void invpcid_flush_all_nonglobals(void) +-{ +- __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); +-} ++#include <asm/invpcid.h> + + static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) + { +-- +2.15.0 + diff --git a/queue/x86-mm-Move-the-CR3-construction-functions-to-tlbflu.patch b/queue/x86-mm-Move-the-CR3-construction-functions-to-tlbflu.patch new file mode 100644 index 0000000..d14febd --- /dev/null +++ b/queue/x86-mm-Move-the-CR3-construction-functions-to-tlbflu.patch @@ -0,0 +1,166 @@ +From 50fb83a62cf472dc53ba23bd3f7bd6c1b2b3b53e Mon Sep 17 00:00:00 2001 +From: Dave Hansen <dave.hansen@linux.intel.com> +Date: Mon, 4 Dec 2017 15:07:54 +0100 +Subject: [PATCH] x86/mm: Move the CR3 construction functions to tlbflush.h + +commit 50fb83a62cf472dc53ba23bd3f7bd6c1b2b3b53e upstream. + +For flushing the TLB, the ASID which has been programmed into the hardware +must be known. That differs from what is in 'cpu_tlbstate'. + +Add functions to transform the 'cpu_tlbstate' values into to the one +programmed into the hardware (CR3). + +It's not easy to include mmu_context.h into tlbflush.h, so just move the +CR3 building over to tlbflush.h. + +Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h +index 5e25423bf9bb..5ede7cae1d67 100644 +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -290,33 +290,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, + return __pkru_allows_pkey(vma_pkey(vma), write); + } + +-/* +- * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID +- * bits. This serves two purposes. It prevents a nasty situation in +- * which PCID-unaware code saves CR3, loads some other value (with PCID +- * == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if +- * the saved ASID was nonzero. It also means that any bugs involving +- * loading a PCID-enabled CR3 with CR4.PCIDE off will trigger +- * deterministically. 
+- */ +- +-static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid) +-{ +- if (static_cpu_has(X86_FEATURE_PCID)) { +- VM_WARN_ON_ONCE(asid > 4094); +- return __sme_pa(mm->pgd) | (asid + 1); +- } else { +- VM_WARN_ON_ONCE(asid != 0); +- return __sme_pa(mm->pgd); +- } +-} +- +-static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid) +-{ +- VM_WARN_ON_ONCE(asid > 4094); +- return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH; +-} +- + /* + * This can be used from process context to figure out what the value of + * CR3 is without needing to do a (slow) __read_cr3(). +@@ -326,7 +299,7 @@ static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid) + */ + static inline unsigned long __get_current_cr3_fast(void) + { +- unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm), ++ unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd, + this_cpu_read(cpu_tlbstate.loaded_mm_asid)); + + /* For now, be very restrictive about when this can be called. */ +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 552d581c8f9f..ee7925adfb57 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -69,6 +69,32 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) + return atomic64_inc_return(&mm->context.tlb_gen); + } + ++/* ++ * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID bits. ++ * This serves two purposes. It prevents a nasty situation in which ++ * PCID-unaware code saves CR3, loads some other value (with PCID == 0), ++ * and then restores CR3, thus corrupting the TLB for ASID 0 if the saved ++ * ASID was nonzero. It also means that any bugs involving loading a ++ * PCID-enabled CR3 with CR4.PCIDE off will trigger deterministically. ++ */ ++struct pgd_t; ++static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) ++{ ++ if (static_cpu_has(X86_FEATURE_PCID)) { ++ VM_WARN_ON_ONCE(asid > 4094); ++ return __sme_pa(pgd) | (asid + 1); ++ } else { ++ VM_WARN_ON_ONCE(asid != 0); ++ return __sme_pa(pgd); ++ } ++} ++ ++static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) ++{ ++ VM_WARN_ON_ONCE(asid > 4094); ++ return __sme_pa(pgd) | (asid + 1) | CR3_NOFLUSH; ++} ++ + #ifdef CONFIG_PARAVIRT + #include <asm/paravirt.h> + #else +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 0569987f6da6..0a1be3adc97e 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -128,7 +128,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + * isn't free. + */ + #ifdef CONFIG_DEBUG_VM +- if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) { ++ if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) { + /* + * If we were to BUG here, we'd be very likely to kill + * the system so hard that we don't see the call trace. +@@ -195,7 +195,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + if (need_flush) { + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); +- write_cr3(build_cr3(next, new_asid)); ++ write_cr3(build_cr3(next->pgd, new_asid)); + + /* + * NB: This gets called via leave_mm() in the idle path +@@ -208,7 +208,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + } else { + /* The new ASID is already up to date. 
*/
+-		write_cr3(build_cr3_noflush(next, new_asid));
++		write_cr3(build_cr3_noflush(next->pgd, new_asid));
+
+ /* See above wrt _rcuidle. */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
+@@ -288,7 +288,7 @@ void initialize_tlbstate_and_flush(void)
+ !(cr4_read_shadow() & X86_CR4_PCIDE));
+
+ /* Force ASID 0 and force a TLB flush. */
+- write_cr3(build_cr3(mm, 0));
++ write_cr3(build_cr3(mm->pgd, 0));
+
+ /* Reinitialize tlbstate. */
+ this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
+-- 
+2.15.0
+
diff --git a/queue/x86-mm-Put-MMU-to-hardware-ASID-translation-in-one-p.patch b/queue/x86-mm-Put-MMU-to-hardware-ASID-translation-in-one-p.patch
new file mode 100644
index 0000000..4512d2b
--- /dev/null
+++ b/queue/x86-mm-Put-MMU-to-hardware-ASID-translation-in-one-p.patch
@@ -0,0 +1,95 @@
+From dd95f1a4b5ca904c78e6a097091eb21436478abb Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Mon, 4 Dec 2017 15:07:56 +0100
+Subject: [PATCH] x86/mm: Put MMU to hardware ASID translation in one place
+
+commit dd95f1a4b5ca904c78e6a097091eb21436478abb upstream.
+
+There are effectively two ASID types:
+
+ 1. The one stored in the mmu_context that goes from 0..5
+ 2. The one programmed into the hardware that goes from 1..6
+
+This consolidates the locations where converting between the two (by doing
+a +1) to a single place which gives us a nice place to comment.
+PAGE_TABLE_ISOLATION will also need to, given an ASID, know which hardware
+ASID to flush for the userspace mapping.
+
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Cc: linux-mm@kvack.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+
+diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
+index f88ccd3ae466..8b27daff7a7f 100644
+--- a/arch/x86/include/asm/tlbflush.h
++++ b/arch/x86/include/asm/tlbflush.h
+@@ -85,20 +85,26 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
+ */
+ #define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2)
+
+-/*
+- * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID bits.
+- * This serves two purposes. It prevents a nasty situation in which
+- * PCID-unaware code saves CR3, loads some other value (with PCID == 0),
+- * and then restores CR3, thus corrupting the TLB for ASID 0 if the saved
+- * ASID was nonzero. It also means that any bugs involving loading a
+- * PCID-enabled CR3 with CR4.PCIDE off will trigger deterministically.
+- */
++static inline u16 kern_pcid(u16 asid)
++{
++ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
++ /*
++ * If PCID is on, ASID-aware code paths put the ASID+1 into the
++ * PCID bits. This serves two purposes. It prevents a nasty
++ * situation in which PCID-unaware code saves CR3, loads some other
++ * value (with PCID == 0), and then restores CR3, thus corrupting
++ * the TLB for ASID 0 if the saved ASID was nonzero. It also means
++ * that any bugs involving loading a PCID-enabled CR3 with
++ * CR4.PCIDE off will trigger deterministically.
++ */
++ return asid + 1;
++}
++
+ struct pgd_t;
+ static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
+ {
+ if (static_cpu_has(X86_FEATURE_PCID)) {
+- VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+- return __sme_pa(pgd) | (asid + 1);
++ return __sme_pa(pgd) | kern_pcid(asid);
+ } else {
+ VM_WARN_ON_ONCE(asid != 0);
+ return __sme_pa(pgd);
+@@ -108,7 +114,8 @@ static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
+ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
+ {
+ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+- return __sme_pa(pgd) | (asid + 1) | CR3_NOFLUSH;
++ VM_WARN_ON_ONCE(!this_cpu_has(X86_FEATURE_PCID));
++ return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
+ }
+
+-- 
+2.15.0
+
diff --git a/queue/x86-mm-Remove-hard-coded-ASID-limit-checks.patch b/queue/x86-mm-Remove-hard-coded-ASID-limit-checks.patch
new file mode 100644
index 0000000..08ebf3d
--- /dev/null
+++ b/queue/x86-mm-Remove-hard-coded-ASID-limit-checks.patch
@@ -0,0 +1,85 @@
+From cb0a9144a744e55207e24dcef812f05cd15a499a Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Mon, 4 Dec 2017 15:07:55 +0100
+Subject: [PATCH] x86/mm: Remove hard-coded ASID limit checks
+
+commit cb0a9144a744e55207e24dcef812f05cd15a499a upstream.
+
+First, it's nice to remove the magic numbers.
+
+Second, PAGE_TABLE_ISOLATION is going to consume half of the available ASID
+space. The space is currently unused, but add a comment to spell out this
+new restriction.
+
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Cc: linux-mm@kvack.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+
+diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
+index ee7925adfb57..f88ccd3ae466 100644
+--- a/arch/x86/include/asm/tlbflush.h
++++ b/arch/x86/include/asm/tlbflush.h
+@@ -69,6 +69,22 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
+ return atomic64_inc_return(&mm->context.tlb_gen);
+ }
+
++/* There are 12 bits of space for ASIDS in CR3 */
++#define CR3_HW_ASID_BITS 12
++/*
++ * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
++ * user/kernel switches
++ */
++#define PTI_CONSUMED_ASID_BITS 0
++
++#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS)
++/*
++ * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
++ * for them being zero-based. Another -1 is because ASID 0 is reserved for
++ * use by non-PCID-aware users.
++ */
++#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2)
++
+ /*
+ * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID bits.
+ * This serves two purposes. It prevents a nasty situation in which
+ * PCID-unaware code saves CR3, loads some other value (with PCID == 0),
+ * and then restores CR3, thus corrupting the TLB for ASID 0 if the saved
+ * ASID was nonzero. It also means that any bugs involving loading a
+ * PCID-enabled CR3 with CR4.PCIDE off will trigger deterministically.
+@@ -81,7 +97,7 @@ struct pgd_t;
+ static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
+ {
+ if (static_cpu_has(X86_FEATURE_PCID)) {
+- VM_WARN_ON_ONCE(asid > 4094);
++ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+ return __sme_pa(pgd) | (asid + 1);
+ } else {
+ VM_WARN_ON_ONCE(asid != 0);
+@@ -91,7 +107,7 @@ static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
+
+ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
+ {
+- VM_WARN_ON_ONCE(asid > 4094);
++ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+ return __sme_pa(pgd) | (asid + 1) | CR3_NOFLUSH;
+ }
+
+-- 
+2.15.0
+
diff --git a/queue/x86-mm-Remove-superfluous-barriers.patch b/queue/x86-mm-Remove-superfluous-barriers.patch
new file mode 100644
index 0000000..6582b7e
--- /dev/null
+++ b/queue/x86-mm-Remove-superfluous-barriers.patch
@@ -0,0 +1,61 @@
+From b5fc6d943808b570bdfbec80f40c6b3855f1c48b Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Tue, 5 Dec 2017 13:34:46 +0100
+Subject: [PATCH] x86/mm: Remove superfluous barriers
+
+commit b5fc6d943808b570bdfbec80f40c6b3855f1c48b upstream.
+
+atomic64_inc_return() already implies smp_mb() before and after.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Cc: linux-mm@kvack.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+
+diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
+index c2e45da4e540..3e2227386abe 100644
+--- a/arch/x86/include/asm/tlbflush.h
++++ b/arch/x86/include/asm/tlbflush.h
+@@ -60,19 +60,13 @@ static inline void invpcid_flush_all_nonglobals(void)
+
+ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
+ {
+- u64 new_tlb_gen;
+-
+ /*
+ * Bump the generation count. This also serves as a full barrier
+ * that synchronizes with switch_mm(): callers are required to order
+ * their read of mm_cpumask after their writes to the paging
+ * structures.
+ */ +- smp_mb__before_atomic(); +- new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen); +- smp_mb__after_atomic(); +- +- return new_tlb_gen; ++ return atomic64_inc_return(&mm->context.tlb_gen); + } + + #ifdef CONFIG_PARAVIRT +-- +2.15.0 + diff --git a/queue/x86-mm-Use-__flush_tlb_one-for-kernel-memory.patch b/queue/x86-mm-Use-__flush_tlb_one-for-kernel-memory.patch new file mode 100644 index 0000000..cf55d8a --- /dev/null +++ b/queue/x86-mm-Use-__flush_tlb_one-for-kernel-memory.patch @@ -0,0 +1,50 @@ +From a501686b2923ce6f2ff2b1d0d50682c6411baf72 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Tue, 5 Dec 2017 13:34:49 +0100 +Subject: [PATCH] x86/mm: Use __flush_tlb_one() for kernel memory + +commit a501686b2923ce6f2ff2b1d0d50682c6411baf72 upstream. + +__flush_tlb_single() is for user mappings, __flush_tlb_one() for +kernel mappings. + +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: Denys Vlasenko <dvlasenk@redhat.com> +Cc: Eduardo Valentin <eduval@amazon.com> +Cc: Greg KH <gregkh@linuxfoundation.org> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Will Deacon <will.deacon@arm.com> +Cc: aliguori@amazon.com +Cc: daniel.gruss@iaik.tugraz.at +Cc: hughd@google.com +Cc: keescook@google.com +Cc: linux-mm@kvack.org +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 3118392cdf75..0569987f6da6 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -551,7 +551,7 @@ static void do_kernel_range_flush(void *info) + + /* flush range by one by one 'invlpg' */ + for (addr = f->start; addr < f->end; addr += PAGE_SIZE) +- __flush_tlb_single(addr); ++ __flush_tlb_one(addr); + } + + void flush_tlb_kernel_range(unsigned long start, unsigned long end) +-- +2.15.0 + diff --git a/queue/x86-mm-dump_pagetables-Check-PAGE_PRESENT-for-real.patch b/queue/x86-mm-dump_pagetables-Check-PAGE_PRESENT-for-real.patch new file mode 100644 index 0000000..97f1d89 --- /dev/null +++ b/queue/x86-mm-dump_pagetables-Check-PAGE_PRESENT-for-real.patch @@ -0,0 +1,45 @@ +From c05344947b37f7cda726e802457370bc6eac4d26 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Sat, 16 Dec 2017 01:14:39 +0100 +Subject: [PATCH] x86/mm/dump_pagetables: Check PAGE_PRESENT for real + +commit c05344947b37f7cda726e802457370bc6eac4d26 upstream. + +The check for a present page in printk_prot(): + + if (!pgprot_val(prot)) { + /* Not present */ + +is bogus. If a PTE is set to PAGE_NONE then the pgprot_val is not zero and +the entry is decoded in bogus ways, e.g. as RX GLB. That is confusing when +analyzing mapping correctness. Check for the present bit to make an +informed decision. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Andy Lutomirski <luto@kernel.org> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: H. 
Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+
+diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
+index 5e3ac6fe6c9e..1014cfb21c2c 100644
+--- a/arch/x86/mm/dump_pagetables.c
++++ b/arch/x86/mm/dump_pagetables.c
+@@ -140,7 +140,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
+ static const char * const level_name[] =
+ { "cr3", "pgd", "p4d", "pud", "pmd", "pte" };
+
+- if (!pgprot_val(prot)) {
++ if (!(pr & _PAGE_PRESENT)) {
+ /* Not present */
+ pt_dump_cont_printf(m, dmsg, " ");
+ } else {
+-- 
+2.15.0
+
diff --git a/queue/x86-mm-dump_pagetables-Make-the-address-hints-correc.patch b/queue/x86-mm-dump_pagetables-Make-the-address-hints-correc.patch
new file mode 100644
index 0000000..affb6a3
--- /dev/null
+++ b/queue/x86-mm-dump_pagetables-Make-the-address-hints-correc.patch
@@ -0,0 +1,158 @@
+From 146122e24bdf208015d629babba673e28d090709 Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed, 20 Dec 2017 18:07:42 +0100
+Subject: [PATCH] x86/mm/dump_pagetables: Make the address hints correct and
+ readable
+
+commit 146122e24bdf208015d629babba673e28d090709 upstream.
+
+The address hints are a trainwreck. The array entry numbers have to be kept
+magically in sync with the actual hints, which is doomed as some of the
+array members are initialized at runtime via the entry numbers.
+
+Designated initializers have been around before this code was
+implemented....
+
+Use the entry numbers to populate the address hints array and add the
+missing bits and pieces. Split 32 and 64 bit for readability's sake.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+
+diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
+index 1014cfb21c2c..fdf09d8f98da 100644
+--- a/arch/x86/mm/dump_pagetables.c
++++ b/arch/x86/mm/dump_pagetables.c
+@@ -44,10 +44,12 @@ struct addr_marker {
+ unsigned long max_lines;
+ };
+
+-/* indices for address_markers; keep sync'd w/ address_markers below */
++/* Address space markers hints */
++
++#ifdef CONFIG_X86_64
++
+ enum address_markers_idx {
+ USER_SPACE_NR = 0,
+-#ifdef CONFIG_X86_64
+ KERNEL_SPACE_NR,
+ LOW_KERNEL_NR,
+ VMALLOC_START_NR,
+@@ -56,56 +58,70 @@ enum address_markers_idx {
+ KASAN_SHADOW_START_NR,
+ KASAN_SHADOW_END_NR,
+ #endif
+-# ifdef CONFIG_X86_ESPFIX64
++#ifdef CONFIG_X86_ESPFIX64
+ ESPFIX_START_NR,
+-# endif
++#endif
++#ifdef CONFIG_EFI
++ EFI_END_NR,
++#endif
+ HIGH_KERNEL_NR,
+ MODULES_VADDR_NR,
+ MODULES_END_NR,
+-#else
++ FIXADDR_START_NR,
++ END_OF_SPACE_NR,
++};
++
++static struct addr_marker address_markers[] = {
++ [USER_SPACE_NR] = { 0, "User Space" },
++ [KERNEL_SPACE_NR] = { (1UL << 63), "Kernel Space" },
++ [LOW_KERNEL_NR] = { 0UL, "Low Kernel Mapping" },
++ [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" },
++ [VMEMMAP_START_NR] = { 0UL, "Vmemmap" },
++#ifdef CONFIG_KASAN
++ [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
++ [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" },
++#endif
++#ifdef CONFIG_X86_ESPFIX64
++ [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
++#endif
++#ifdef CONFIG_EFI
++ [EFI_END_NR] = { EFI_VA_END, "EFI Runtime Services" },
++#endif
++ [HIGH_KERNEL_NR] = { __START_KERNEL_map, "High Kernel Mapping" },
++ [MODULES_VADDR_NR] = { MODULES_VADDR, "Modules" },
++ [MODULES_END_NR] = { MODULES_END, "End Modules" },
++ [FIXADDR_START_NR] = { FIXADDR_START, "Fixmap Area" },
++ [END_OF_SPACE_NR] = { -1, NULL }
++};
++
++#else /* CONFIG_X86_64 */
++
++enum address_markers_idx {
++ USER_SPACE_NR = 0,
+ KERNEL_SPACE_NR,
+ VMALLOC_START_NR,
+ VMALLOC_END_NR,
+-# ifdef CONFIG_HIGHMEM
++#ifdef CONFIG_HIGHMEM
+ PKMAP_BASE_NR,
+-# endif
+- FIXADDR_START_NR,
+ #endif
++ FIXADDR_START_NR,
++ END_OF_SPACE_NR,
+ };
+
+-/* Address space markers hints */
+ static struct addr_marker address_markers[] = {
+- { 0, "User Space" },
+-#ifdef CONFIG_X86_64
+- { 0x8000000000000000UL, "Kernel Space" },
+- { 0/* PAGE_OFFSET */, "Low Kernel Mapping" },
+- { 0/* VMALLOC_START */, "vmalloc() Area" },
+- { 0/* VMEMMAP_START */, "Vmemmap" },
+-#ifdef CONFIG_KASAN
+- { KASAN_SHADOW_START, "KASAN shadow" },
+- { KASAN_SHADOW_END, "KASAN shadow end" },
++ [USER_SPACE_NR] = { 0, "User Space" },
++ [KERNEL_SPACE_NR] = { PAGE_OFFSET, "Kernel Mapping" },
++ [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" },
++ [VMALLOC_END_NR] = { 0UL, "vmalloc() End" },
++#ifdef CONFIG_HIGHMEM
++ [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" },
+ #endif
+-# ifdef CONFIG_X86_ESPFIX64
+- { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
+-# endif
+-# ifdef CONFIG_EFI
+- { EFI_VA_END, "EFI Runtime Services" },
+-# endif
+- { __START_KERNEL_map, "High Kernel Mapping" },
+- { MODULES_VADDR, "Modules" },
+- { MODULES_END, "End Modules" },
+-#else
+- { PAGE_OFFSET, "Kernel Mapping" },
+- { 0/* VMALLOC_START */, "vmalloc() Area" },
+- { 0/*VMALLOC_END*/, "vmalloc() End" },
+-# ifdef CONFIG_HIGHMEM
+- { 0/*PKMAP_BASE*/, "Persistent kmap() Area" },
+-# endif
+- { 0/*FIXADDR_START*/, "Fixmap Area" },
+-#endif
+- { -1, NULL } /* End of list */
++ [FIXADDR_START_NR] = { 0UL, "Fixmap area" },
++ [END_OF_SPACE_NR] = { -1, NULL }
+ };
+
++#endif /* !CONFIG_X86_64 */
++
+ /* Multipliers for offsets within the PTEs */
+ #define PTE_LEVEL_MULT (PAGE_SIZE)
+ #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
+-- 
+2.15.0
+
diff --git a/queue/x86-uv-Use-the-right-TLB-flush-API.patch b/queue/x86-uv-Use-the-right-TLB-flush-API.patch
new file mode 100644
index 0000000..79a4250
--- /dev/null
+++ b/queue/x86-uv-Use-the-right-TLB-flush-API.patch
@@ -0,0 +1,53 @@
+From 3e46e0f5ee3643a1239be9046c7ba6c66ca2b329 Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Tue, 5 Dec 2017 13:34:50 +0100
+Subject: [PATCH] x86/uv: Use the right TLB-flush API
+
+commit 3e46e0f5ee3643a1239be9046c7ba6c66ca2b329 upstream.
+
+Since uv_flush_tlb_others() implements flush_tlb_others() which is
+about flushing user mappings, we should use __flush_tlb_single(),
+which too is about flushing user mappings.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: Andrew Banman <abanman@hpe.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Brian Gerst <brgerst@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: David Laight <David.Laight@aculab.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Eduardo Valentin <eduval@amazon.com>
+Cc: Greg KH <gregkh@linuxfoundation.org>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: Josh Poimboeuf <jpoimboe@redhat.com>
+Cc: Juergen Gross <jgross@suse.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mike Travis <mike.travis@hpe.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: aliguori@amazon.com
+Cc: daniel.gruss@iaik.tugraz.at
+Cc: hughd@google.com
+Cc: keescook@google.com
+Cc: linux-mm@kvack.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+
+diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
+index f44c0bc95aa2..8538a6723171 100644
+--- a/arch/x86/platform/uv/tlb_uv.c
++++ b/arch/x86/platform/uv/tlb_uv.c
+@@ -299,7 +299,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,
+ local_flush_tlb();
+ stat->d_alltlb++;
+ } else {
+- __flush_tlb_one(msg->address);
++ __flush_tlb_single(msg->address);
+ stat->d_onetlb++;
+ }
+ stat->d_requestee++;
+-- 
+2.15.0
+
diff --git a/queue/x86-vsyscall-64-Explicitly-set-_PAGE_USER-in-the-pag.patch b/queue/x86-vsyscall-64-Explicitly-set-_PAGE_USER-in-the-pag.patch
new file mode 100644
index 0000000..338ad96
--- /dev/null
+++ b/queue/x86-vsyscall-64-Explicitly-set-_PAGE_USER-in-the-pag.patch
@@ -0,0 +1,97 @@
+From 49275fef986abfb8b476e4708aaecc07e7d3e087 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@kernel.org>
+Date: Sun, 10 Dec 2017 22:47:19 -0800
+Subject: [PATCH] x86/vsyscall/64: Explicitly set _PAGE_USER in the pagetable
+ hierarchy
+
+commit 49275fef986abfb8b476e4708aaecc07e7d3e087 upstream.
+
+The kernel is very erratic as to which pagetables have _PAGE_USER set. The
+vsyscall page gets lucky: it seems that all of the relevant pagetables are
+among the apparently arbitrary ones that set _PAGE_USER. Rather than
+relying on chance, just explicitly set _PAGE_USER.
+
+This will let us clean up pagetable setup to stop setting _PAGE_USER.
The +added code can also be reused by pagetable isolation to manage the +_PAGE_USER bit in the usermode tables. + +[ tglx: Folded paravirt fix from Juergen Gross ] + +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Kees Cook <keescook@chromium.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c +index f279ba2643dc..daad57c76e42 100644 +--- a/arch/x86/entry/vsyscall/vsyscall_64.c ++++ b/arch/x86/entry/vsyscall/vsyscall_64.c +@@ -37,6 +37,7 @@ + #include <asm/unistd.h> + #include <asm/fixmap.h> + #include <asm/traps.h> ++#include <asm/paravirt.h> + + #define CREATE_TRACE_POINTS + #include "vsyscall_trace.h" +@@ -329,16 +330,47 @@ int in_gate_area_no_mm(unsigned long addr) + return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; + } + ++/* ++ * The VSYSCALL page is the only user-accessible page in the kernel address ++ * range. Normally, the kernel page tables can have _PAGE_USER clear, but ++ * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls ++ * are enabled. ++ * ++ * Some day we may create a "minimal" vsyscall mode in which we emulate ++ * vsyscalls but leave the page not present. If so, we skip calling ++ * this. ++ */ ++static void __init set_vsyscall_pgtable_user_bits(void) ++{ ++ pgd_t *pgd; ++ p4d_t *p4d; ++ pud_t *pud; ++ pmd_t *pmd; ++ ++ pgd = pgd_offset_k(VSYSCALL_ADDR); ++ set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); ++ p4d = p4d_offset(pgd, VSYSCALL_ADDR); ++#if CONFIG_PGTABLE_LEVELS >= 5 ++ p4d->p4d |= _PAGE_USER; ++#endif ++ pud = pud_offset(p4d, VSYSCALL_ADDR); ++ set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER)); ++ pmd = pmd_offset(pud, VSYSCALL_ADDR); ++ set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER)); ++} ++ + void __init map_vsyscall(void) + { + extern char __vsyscall_page; + unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); + +- if (vsyscall_mode != NONE) ++ if (vsyscall_mode != NONE) { + __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, + vsyscall_mode == NATIVE + ? PAGE_KERNEL_VSYSCALL + : PAGE_KERNEL_VVAR); ++ set_vsyscall_pgtable_user_bits(); ++ } + + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != + (unsigned long)VSYSCALL_ADDR); +-- +2.15.0 + diff --git a/queue/x86-vsyscall-64-Warn-and-fail-vsyscall-emulation-in-.patch b/queue/x86-vsyscall-64-Warn-and-fail-vsyscall-emulation-in-.patch new file mode 100644 index 0000000..2922051 --- /dev/null +++ b/queue/x86-vsyscall-64-Warn-and-fail-vsyscall-emulation-in-.patch @@ -0,0 +1,44 @@ +From 4831b779403a836158917d59a7ca880483c67378 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@kernel.org> +Date: Sun, 10 Dec 2017 22:47:20 -0800 +Subject: [PATCH] x86/vsyscall/64: Warn and fail vsyscall emulation in NATIVE + mode + +commit 4831b779403a836158917d59a7ca880483c67378 upstream. + +If something goes wrong with pagetable setup, vsyscall=native will +accidentally fall back to emulation. Make it warn and fail so that we +notice. 
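[ Editor's note: not part of the queued patch. A minimal user-space sketch for
  exercising the path the two vsyscall patches above touch; it calls the legacy
  vsyscall gettimeofday() entry at its fixed ABI address, the one documented in
  the memory-map patch earlier in this series. Whether the call runs natively,
  is emulated, or faults depends on the vsyscall= boot mode, which is exactly
  the behavior these patches tighten up. The program name is hypothetical;
  build with a plain C compiler on x86-64. ]

    /* probe_vsyscall.c - illustrative test program, not from the kernel tree */
    #include <stdio.h>
    #include <sys/time.h>

    /* Fixed legacy vsyscall ABI address of gettimeofday(). */
    #define VSYSCALL_GTOD_ADDR 0xffffffffff600000UL

    typedef int (*vgtod_t)(struct timeval *tv, struct timezone *tz);

    int main(void)
    {
            /* Integer-to-function-pointer cast; fine for a probe like this. */
            vgtod_t vgtod = (vgtod_t)VSYSCALL_GTOD_ADDR;
            struct timeval tv;

            if (vgtod(&tv, NULL) == 0)
                    printf("vsyscall gettimeofday: %ld.%06ld\n",
                           (long)tv.tv_sec, (long)tv.tv_usec);
            return 0;
    }

[ Under vsyscall=none the call simply faults; the WARN added by the diff below
  makes the converse mixup, NATIVE mode unexpectedly reaching the emulation
  path, equally loud. ]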
+ +Signed-off-by: Andy Lutomirski <luto@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Borislav Petkov <bp@alien8.de> +Cc: Brian Gerst <brgerst@gmail.com> +Cc: Dave Hansen <dave.hansen@linux.intel.com> +Cc: David Laight <David.Laight@aculab.com> +Cc: H. Peter Anvin <hpa@zytor.com> +Cc: Josh Poimboeuf <jpoimboe@redhat.com> +Cc: Juergen Gross <jgross@suse.com> +Cc: Kees Cook <keescook@chromium.org> +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Peter Zijlstra <peterz@infradead.org> +Signed-off-by: Ingo Molnar <mingo@kernel.org> + +diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c +index daad57c76e42..1faf40f2dda9 100644 +--- a/arch/x86/entry/vsyscall/vsyscall_64.c ++++ b/arch/x86/entry/vsyscall/vsyscall_64.c +@@ -139,6 +139,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) + + WARN_ON_ONCE(address != regs->ip); + ++ /* This should be unreachable in NATIVE mode. */ ++ if (WARN_ON(vsyscall_mode == NATIVE)) ++ return false; ++ + if (vsyscall_mode == NONE) { + warn_bad_vsyscall(KERN_INFO, regs, + "vsyscall attempted with vsyscall=none"); +-- +2.15.0 + |
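[ Editor's closing note: several of the x86/mm patches queued above revolve
  around one small piece of arithmetic: the hardware PCID programmed into CR3
  is the mm's ASID plus one (kern_pcid()), CR3 itself is the PGD's physical
  address OR'd with that PCID, and bit 63 suppresses the implicit TLB flush on
  write. The stand-alone sketch below reproduces that arithmetic in plain C so
  the encoding can be inspected; the PGD address is hypothetical and the
  constants mirror, but are not taken verbatim from, the patches. ]

    /* cr3_sketch.c - illustrative only; mirrors build_cr3()/kern_pcid() above */
    #include <stdio.h>
    #include <stdint.h>

    #define CR3_NOFLUSH (1ULL << 63)  /* "don't flush" bit used with PCIDs */

    /* Hardware PCID is ASID + 1 so that PCID 0 stays reserved for
     * PCID-unaware code that blindly saves and restores CR3. */
    static uint64_t kern_pcid(uint16_t asid)
    {
            return (uint64_t)asid + 1;
    }

    static uint64_t build_cr3(uint64_t pgd_pa, uint16_t asid)
    {
            return pgd_pa | kern_pcid(asid);
    }

    int main(void)
    {
            uint64_t pgd_pa = 0x1234000;  /* hypothetical page-aligned PGD */

            printf("CR3, ASID 0:          %#llx\n",
                   (unsigned long long)build_cr3(pgd_pa, 0));
            printf("CR3, ASID 5, noflush: %#llx\n",
                   (unsigned long long)(build_cr3(pgd_pa, 5) | CR3_NOFLUSH));
            return 0;
    }

[ This prints 0x1234001 and 0x8000000001234006 respectively: the low 12 bits
  carry the PCID, the middle bits the page-frame address of the PGD, and bit
  63 the no-flush flag. ]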