diff options
author | Zefan Li <lizefan@huawei.com> | 2014-12-23 14:01:08 +0800 |
---|---|---|
committer | Zefan Li <lizefan@huawei.com> | 2014-12-23 22:17:07 +0800 |
commit | 6963c380cd946f8b8fb600e8a29cbeb700f30c73 (patch) | |
tree | ce0b338b1b39e4eec398064861032c955699a18e | |
parent | 701208ec2ba0225f6baa56f25b619009c81719e8 (diff) | |
download | linux-3.4.y-queue-6963c380cd946f8b8fb600e8a29cbeb700f30c73.tar.gz |
Add commits to 3.4.y, up to 3.18-rc2
88 files changed, 6592 insertions, 0 deletions
diff --git a/patches/alsa-emu10k1-fix-deadlock-in-synth-voice-lookup.patch b/patches/alsa-emu10k1-fix-deadlock-in-synth-voice-lookup.patch new file mode 100644 index 0000000..65b6b9a --- /dev/null +++ b/patches/alsa-emu10k1-fix-deadlock-in-synth-voice-lookup.patch @@ -0,0 +1,60 @@ +From 95926035b187cc9fee6fb61385b7da9c28123f74 Mon Sep 17 00:00:00 2001 +From: Takashi Iwai <tiwai@suse.de> +Date: Mon, 13 Oct 2014 23:18:02 +0200 +Subject: ALSA: emu10k1: Fix deadlock in synth voice lookup + +commit 95926035b187cc9fee6fb61385b7da9c28123f74 upstream. + +The emu10k1 voice allocator takes voice_lock spinlock. When there is +no empty stream available, it tries to release a voice used by synth, +and calls get_synth_voice. The callback function, +snd_emu10k1_synth_get_voice(), however, also takes the voice_lock, +thus it deadlocks. + +The fix is simply removing the voice_lock holds in +snd_emu10k1_synth_get_voice(), as this is always called in the +spinlock context. + +Reported-and-tested-by: Arthur Marsh <arthur.marsh@internode.on.net> +Signed-off-by: Takashi Iwai <tiwai@suse.de> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + sound/pci/emu10k1/emu10k1_callback.c | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +--- a/sound/pci/emu10k1/emu10k1_callback.c ++++ b/sound/pci/emu10k1/emu10k1_callback.c +@@ -85,6 +85,8 @@ snd_emu10k1_ops_setup(struct snd_emux *e + * get more voice for pcm + * + * terminate most inactive voice and give it as a pcm voice. ++ * ++ * voice_lock is already held. 
+ */ + int + snd_emu10k1_synth_get_voice(struct snd_emu10k1 *hw) +@@ -92,12 +94,10 @@ snd_emu10k1_synth_get_voice(struct snd_e + struct snd_emux *emu; + struct snd_emux_voice *vp; + struct best_voice best[V_END]; +- unsigned long flags; + int i; + + emu = hw->synth; + +- spin_lock_irqsave(&emu->voice_lock, flags); + lookup_voices(emu, hw, best, 1); /* no OFF voices */ + for (i = 0; i < V_END; i++) { + if (best[i].voice >= 0) { +@@ -113,11 +113,9 @@ snd_emu10k1_synth_get_voice(struct snd_e + vp->emu->num_voices--; + vp->ch = -1; + vp->state = SNDRV_EMUX_ST_OFF; +- spin_unlock_irqrestore(&emu->voice_lock, flags); + return ch; + } + } +- spin_unlock_irqrestore(&emu->voice_lock, flags); + + /* not found */ + return -ENOMEM; diff --git a/patches/alsa-pcm-use-the-same-dma-mmap-codepath-both-for-arm-and-arm64.patch b/patches/alsa-pcm-use-the-same-dma-mmap-codepath-both-for-arm-and-arm64.patch new file mode 100644 index 0000000..3b4634d --- /dev/null +++ b/patches/alsa-pcm-use-the-same-dma-mmap-codepath-both-for-arm-and-arm64.patch @@ -0,0 +1,35 @@ +From a011e213f3700233ed2a676f1ef0a74a052d7162 Mon Sep 17 00:00:00 2001 +From: Anatol Pomozov <anatol.pomozov@gmail.com> +Date: Fri, 17 Oct 2014 12:43:34 -0700 +Subject: ALSA: pcm: use the same dma mmap codepath both for arm and arm64 + +commit a011e213f3700233ed2a676f1ef0a74a052d7162 upstream. 
+ +This avoids following kernel crash when try to playback on arm64 + +[ 107.497203] [<ffffffc00046b310>] snd_pcm_mmap_data_fault+0x90/0xd4 +[ 107.503405] [<ffffffc0001541ac>] __do_fault+0xb0/0x498 +[ 107.508565] [<ffffffc0001576a0>] handle_mm_fault+0x224/0x7b0 +[ 107.514246] [<ffffffc000092640>] do_page_fault+0x11c/0x310 +[ 107.519738] [<ffffffc000081100>] do_mem_abort+0x38/0x98 + +Tested: backported to 3.14 and tried to playback on arm64 machine + +Signed-off-by: Anatol Pomozov <anatol.pomozov@gmail.com> +Signed-off-by: Takashi Iwai <tiwai@suse.de> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + sound/core/pcm_native.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/sound/core/pcm_native.c ++++ b/sound/core/pcm_native.c +@@ -3171,7 +3171,7 @@ static const struct vm_operations_struct + + #ifndef ARCH_HAS_DMA_MMAP_COHERENT + /* This should be defined / handled globally! */ +-#ifdef CONFIG_ARM ++#if defined(CONFIG_ARM) || defined(CONFIG_ARM64) + #define ARCH_HAS_DMA_MMAP_COHERENT + #endif + #endif diff --git a/patches/alsa-usb-audio-add-support-for-steinberg-ur22-usb-interface.patch b/patches/alsa-usb-audio-add-support-for-steinberg-ur22-usb-interface.patch new file mode 100644 index 0000000..b0ca337 --- /dev/null +++ b/patches/alsa-usb-audio-add-support-for-steinberg-ur22-usb-interface.patch @@ -0,0 +1,68 @@ +From f0b127fbfdc8756eba7437ab668f3169280bd358 Mon Sep 17 00:00:00 2001 +From: Vlad Catoi <vladcatoi@gmail.com> +Date: Sat, 18 Oct 2014 17:45:41 -0500 +Subject: ALSA: usb-audio: Add support for Steinberg UR22 USB interface + +commit f0b127fbfdc8756eba7437ab668f3169280bd358 upstream. 
+ +Adding support for Steinberg UR22 USB interface via quirks table patch + +See Ubuntu bug report: +https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1317244 +Also see threads: +http://linux-audio.4202.n7.nabble.com/Support-for-Steinberg-UR22-Yamaha-USB-chipset-0499-1509-tc82888.html#a82917 +http://www.steinberg.net/forums/viewtopic.php?t=62290 + +Tested by at least 4 people judging by the threads. +Did not test MIDI interface, but audio output and capture both are +functional. Built 3.17 kernel with this driver on Ubuntu 14.04 & tested with mpg123 +Patch applied to 3.13 Ubuntu kernel works well enough for daily use. + +Signed-off-by: Vlad Catoi <vladcatoi@gmail.com> +Acked-by: Clemens Ladisch <clemens@ladisch.de> +Signed-off-by: Takashi Iwai <tiwai@suse.de> +[lizf: Backported to 3.4: adjust context] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + sound/usb/quirks-table.h | 30 ++++++++++++++++++++++++++++++ + 1 file changed, 30 insertions(+) + +--- a/sound/usb/quirks-table.h ++++ b/sound/usb/quirks-table.h +@@ -301,6 +301,36 @@ YAMAHA_DEVICE(0x105d, NULL), + } + } + }, ++{ ++ USB_DEVICE(0x0499, 0x1509), ++ .driver_info = (unsigned long) & (const struct snd_usb_audio_quirk) { ++ /* .vendor_name = "Yamaha", */ ++ /* .product_name = "Steinberg UR22", */ ++ .ifnum = QUIRK_ANY_INTERFACE, ++ .type = QUIRK_COMPOSITE, ++ .data = (const struct snd_usb_audio_quirk[]) { ++ { ++ .ifnum = 1, ++ .type = QUIRK_AUDIO_STANDARD_INTERFACE ++ }, ++ { ++ .ifnum = 2, ++ .type = QUIRK_AUDIO_STANDARD_INTERFACE ++ }, ++ { ++ .ifnum = 3, ++ .type = QUIRK_MIDI_YAMAHA ++ }, ++ { ++ .ifnum = 4, ++ .type = QUIRK_IGNORE_INTERFACE ++ }, ++ { ++ .ifnum = -1 ++ } ++ } ++ } ++}, + YAMAHA_DEVICE(0x2000, "DGP-7"), + YAMAHA_DEVICE(0x2001, "DGP-5"), + YAMAHA_DEVICE(0x2002, NULL), diff --git a/patches/block-fix-alignment_offset-math-that-assumes-io_min-is-a-power-of-2.patch b/patches/block-fix-alignment_offset-math-that-assumes-io_min-is-a-power-of-2.patch new file mode 100644 index 
0000000..412ff1b --- /dev/null +++ b/patches/block-fix-alignment_offset-math-that-assumes-io_min-is-a-power-of-2.patch @@ -0,0 +1,61 @@ +From b8839b8c55f3fdd60dc36abcda7e0266aff7985c Mon Sep 17 00:00:00 2001 +From: Mike Snitzer <snitzer@redhat.com> +Date: Wed, 8 Oct 2014 18:26:13 -0400 +Subject: block: fix alignment_offset math that assumes io_min is a power-of-2 + +commit b8839b8c55f3fdd60dc36abcda7e0266aff7985c upstream. + +The math in both blk_stack_limits() and queue_limit_alignment_offset() +assume that a block device's io_min (aka minimum_io_size) is always a +power-of-2. Fix the math such that it works for non-power-of-2 io_min. + +This issue (of alignment_offset != 0) became apparent when testing +dm-thinp with a thinp blocksize that matches a RAID6 stripesize of +1280K. Commit fdfb4c8c1 ("dm thin: set minimum_io_size to pool's data +block size") unlocked the potential for alignment_offset != 0 due to +the dm-thin-pool's io_min possibly being a non-power-of-2. + +Signed-off-by: Mike Snitzer <snitzer@redhat.com> +Acked-by: Martin K. 
Petersen <martin.petersen@oracle.com> +Signed-off-by: Jens Axboe <axboe@fb.com> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + block/blk-settings.c | 4 ++-- + include/linux/blkdev.h | 5 ++--- + 2 files changed, 4 insertions(+), 5 deletions(-) + +--- a/block/blk-settings.c ++++ b/block/blk-settings.c +@@ -538,7 +538,7 @@ int blk_stack_limits(struct queue_limits + bottom = max(b->physical_block_size, b->io_min) + alignment; + + /* Verify that top and bottom intervals line up */ +- if (max(top, bottom) & (min(top, bottom) - 1)) { ++ if (max(top, bottom) % min(top, bottom)) { + t->misaligned = 1; + ret = -1; + } +@@ -579,7 +579,7 @@ int blk_stack_limits(struct queue_limits + + /* Find lowest common alignment_offset */ + t->alignment_offset = lcm(t->alignment_offset, alignment) +- & (max(t->physical_block_size, t->io_min) - 1); ++ % max(t->physical_block_size, t->io_min); + + /* Verify that new alignment_offset is on a logical block boundary */ + if (t->alignment_offset & (t->logical_block_size - 1)) { +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -1069,10 +1069,9 @@ static inline int queue_alignment_offset + static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t sector) + { + unsigned int granularity = max(lim->physical_block_size, lim->io_min); +- unsigned int alignment = (sector << 9) & (granularity - 1); ++ unsigned int alignment = sector_div(sector, granularity >> 9) << 9; + +- return (granularity + lim->alignment_offset - alignment) +- & (granularity - 1); ++ return (granularity + lim->alignment_offset - alignment) % granularity; + } + + static inline int bdev_alignment_offset(struct block_device *bdev) diff --git a/patches/bluetooth-fix-issue-with-usb-suspend-in-btusb-driver.patch b/patches/bluetooth-fix-issue-with-usb-suspend-in-btusb-driver.patch new file mode 100644 index 0000000..8048747 --- /dev/null +++ b/patches/bluetooth-fix-issue-with-usb-suspend-in-btusb-driver.patch @@ -0,0 +1,53 @@ +From 
85560c4a828ec9c8573840c9b66487b6ae584768 Mon Sep 17 00:00:00 2001 +From: Champion Chen <champion_chen@realsil.com.cn> +Date: Sat, 6 Sep 2014 14:06:08 -0500 +Subject: Bluetooth: Fix issue with USB suspend in btusb driver + +commit 85560c4a828ec9c8573840c9b66487b6ae584768 upstream. + +Suspend could fail for some platforms because +btusb_suspend==> btusb_stop_traffic ==> usb_kill_anchored_urbs. + +When btusb_bulk_complete returns before system suspend and resubmits +an URB, the system cannot enter suspend state. + +Signed-off-by: Champion Chen <champion_chen@realsil.com.cn> +Signed-off-by: Larry Finger <Larry.Finger@lwfinger.net> +Signed-off-by: Marcel Holtmann <marcel@holtmann.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/bluetooth/btusb.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/drivers/bluetooth/btusb.c ++++ b/drivers/bluetooth/btusb.c +@@ -304,6 +304,9 @@ static void btusb_intr_complete(struct u + BT_ERR("%s corrupted event packet", hdev->name); + hdev->stat.err_rx++; + } ++ } else if (urb->status == -ENOENT) { ++ /* Avoid suspend failed when usb_kill_urb */ ++ return; + } + + if (!test_bit(BTUSB_INTR_RUNNING, &data->flags)) +@@ -392,6 +395,9 @@ static void btusb_bulk_complete(struct u + BT_ERR("%s corrupted ACL packet", hdev->name); + hdev->stat.err_rx++; + } ++ } else if (urb->status == -ENOENT) { ++ /* Avoid suspend failed when usb_kill_urb */ ++ return; + } + + if (!test_bit(BTUSB_BULK_RUNNING, &data->flags)) +@@ -486,6 +492,9 @@ static void btusb_isoc_complete(struct u + hdev->stat.err_rx++; + } + } ++ } else if (urb->status == -ENOENT) { ++ /* Avoid suspend failed when usb_kill_urb */ ++ return; + } + + if (!test_bit(BTUSB_ISOC_RUNNING, &data->flags)) diff --git a/patches/bluetooth-fix-setting-correct-security-level-when-initiating-smp.patch b/patches/bluetooth-fix-setting-correct-security-level-when-initiating-smp.patch new file mode 100644 index 0000000..6ed9998 --- /dev/null +++ 
b/patches/bluetooth-fix-setting-correct-security-level-when-initiating-smp.patch @@ -0,0 +1,42 @@ +From 5eb596f55cacc2389554a8d7572d90d5e9d4269d Mon Sep 17 00:00:00 2001 +From: Johan Hedberg <johan.hedberg@intel.com> +Date: Thu, 18 Sep 2014 11:26:32 +0300 +Subject: Bluetooth: Fix setting correct security level when initiating SMP + +commit 5eb596f55cacc2389554a8d7572d90d5e9d4269d upstream. + +We can only determine the final security level when both pairing request +and response have been exchanged. When initiating pairing the starting +target security level is set to MEDIUM unless explicitly specified to be +HIGH, so that we can still perform pairing even if the remote doesn't +have MITM capabilities. However, once we've received the pairing +response we should re-consult the remote and local IO capabilities and +upgrade the target security level if necessary. + +Without this patch the resulting Long Term Key will occasionally be +reported to be unauthenticated when it in reality is an authenticated +one. + +Signed-off-by: Johan Hedberg <johan.hedberg@intel.com> +Signed-off-by: Marcel Holtmann <marcel@holtmann.org> +[lizf: Backported to 3.4: adjust context] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + net/bluetooth/smp.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/net/bluetooth/smp.c ++++ b/net/bluetooth/smp.c +@@ -325,8 +325,11 @@ static int tk_request(struct l2cap_conn + } + + /* Not Just Works/Confirm results in MITM Authentication */ +- if (method != JUST_CFM) ++ if (method != JUST_CFM) { + set_bit(SMP_FLAG_MITM_AUTH, &smp->smp_flags); ++ if (hcon->pending_sec_level < BT_SECURITY_HIGH) ++ hcon->pending_sec_level = BT_SECURITY_HIGH; ++ } + + /* If both devices have Keyoard-Display I/O, the master + * Confirms and the slave Enters the passkey. 
diff --git a/patches/cpufreq-expose-scaling_cur_freq-sysfs-file-for-set_policy-drivers.patch b/patches/cpufreq-expose-scaling_cur_freq-sysfs-file-for-set_policy-drivers.patch new file mode 100644 index 0000000..6e2a0c5 --- /dev/null +++ b/patches/cpufreq-expose-scaling_cur_freq-sysfs-file-for-set_policy-drivers.patch @@ -0,0 +1,61 @@ +From c034b02e213d271b98c45c4a7b54af8f69aaac1e Mon Sep 17 00:00:00 2001 +From: Dirk Brandewie <dirk.j.brandewie@intel.com> +Date: Mon, 13 Oct 2014 08:37:40 -0700 +Subject: cpufreq: expose scaling_cur_freq sysfs file for set_policy() drivers + +commit c034b02e213d271b98c45c4a7b54af8f69aaac1e upstream. + +Currently the core does not expose scaling_cur_freq for set_policy() +drivers this breaks some userspace monitoring tools. +Change the core to expose this file for all drivers and if the +set_policy() driver supports the get() callback use it to retrieve the +current frequency. + +Link: https://bugzilla.kernel.org/show_bug.cgi?id=73741 +Signed-off-by: Dirk Brandewie <dirk.j.brandewie@intel.com> +Signed-off-by: Rafael J. 
Wysocki <rafael.j.wysocki@intel.com> +[lizf: Backported to 3.4: adjust context] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/cpufreq/cpufreq.c | 23 +++++++++++++++++------ + 1 file changed, 17 insertions(+), 6 deletions(-) + +--- a/drivers/cpufreq/cpufreq.c ++++ b/drivers/cpufreq/cpufreq.c +@@ -371,7 +371,18 @@ show_one(cpuinfo_max_freq, cpuinfo.max_f + show_one(cpuinfo_transition_latency, cpuinfo.transition_latency); + show_one(scaling_min_freq, min); + show_one(scaling_max_freq, max); +-show_one(scaling_cur_freq, cur); ++ ++static ssize_t show_scaling_cur_freq( ++ struct cpufreq_policy *policy, char *buf) ++{ ++ ssize_t ret; ++ ++ if (cpufreq_driver && cpufreq_driver->setpolicy && cpufreq_driver->get) ++ ret = sprintf(buf, "%u\n", cpufreq_driver->get(policy->cpu)); ++ else ++ ret = sprintf(buf, "%u\n", policy->cur); ++ return ret; ++} + + static int __cpufreq_set_policy(struct cpufreq_policy *data, + struct cpufreq_policy *policy); +@@ -818,11 +829,11 @@ static int cpufreq_add_dev_interface(uns + if (ret) + goto err_out_kobj_put; + } +- if (cpufreq_driver->target) { +- ret = sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr); +- if (ret) +- goto err_out_kobj_put; +- } ++ ++ ret = sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr); ++ if (ret) ++ goto err_out_kobj_put; ++ + if (cpufreq_driver->bios_limit) { + ret = sysfs_create_file(&policy->kobj, &bios_limit.attr); + if (ret) diff --git a/patches/crypto-more-robust-crypto_memneq.patch b/patches/crypto-more-robust-crypto_memneq.patch new file mode 100644 index 0000000..b750fd7 --- /dev/null +++ b/patches/crypto-more-robust-crypto_memneq.patch @@ -0,0 +1,77 @@ +From fe8c8a126806fea4465c43d62a1f9d273a572bf5 Mon Sep 17 00:00:00 2001 +From: Cesar Eduardo Barros <cesarb@cesarb.eti.br> +Date: Mon, 25 Nov 2013 22:00:41 -0200 +Subject: crypto: more robust crypto_memneq + +commit fe8c8a126806fea4465c43d62a1f9d273a572bf5 upstream. 
+ +[Only use the compiler.h portion of this patch, to get the +OPTIMIZER_HIDE_VAR() macro, which we need for other -stable patches +- gregkh] + +Disabling compiler optimizations can be fragile, since a new +optimization could be added to -O0 or -Os that breaks the assumptions +the code is making. + +Instead of disabling compiler optimizations, use a dummy inline assembly +(based on RELOC_HIDE) to block the problematic kinds of optimization, +while still allowing other optimizations to be applied to the code. + +The dummy inline assembly is added after every OR, and has the +accumulator variable as its input and output. The compiler is forced to +assume that the dummy inline assembly could both depend on the +accumulator variable and change the accumulator variable, so it is +forced to compute the value correctly before the inline assembly, and +cannot assume anything about its value after the inline assembly. + +This change should be enough to make crypto_memneq work correctly (with +data-independent timing) even if it is inlined at its call sites. That +can be done later in a followup patch. + +Compile-tested on x86_64. + +Signed-off-by: Cesar Eduardo Barros <cesarb@cesarb.eti.br> +Acked-by: Daniel Borkmann <dborkman@redhat.com> +Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + include/linux/compiler-gcc.h | 3 +++ + include/linux/compiler-intel.h | 7 +++++++ + 2 files changed, 10 insertions(+) + +--- a/include/linux/compiler-gcc.h ++++ b/include/linux/compiler-gcc.h +@@ -37,6 +37,9 @@ + __asm__ ("" : "=r"(__ptr) : "0"(ptr)); \ + (typeof(ptr)) (__ptr + (off)); }) + ++/* Make the optimizer believe the variable can be manipulated arbitrarily. 
*/ ++#define OPTIMIZER_HIDE_VAR(var) __asm__ ("" : "=r" (var) : "0" (var)) ++ + #ifdef __CHECKER__ + #define __must_be_array(arr) 0 + #else +--- a/include/linux/compiler-intel.h ++++ b/include/linux/compiler-intel.h +@@ -15,6 +15,7 @@ + */ + #undef barrier + #undef RELOC_HIDE ++#undef OPTIMIZER_HIDE_VAR + + #define barrier() __memory_barrier() + +@@ -23,6 +24,12 @@ + __ptr = (unsigned long) (ptr); \ + (typeof(ptr)) (__ptr + (off)); }) + ++/* This should act as an optimization barrier on var. ++ * Given that this compiler does not have inline assembly, a compiler barrier ++ * is the best we can do. ++ */ ++#define OPTIMIZER_HIDE_VAR(var) barrier() ++ + /* Intel ECC compiler doesn't support __builtin_types_compatible_p() */ + #define __must_be_array(a) 0 + diff --git a/patches/dm-bufio-update-last_accessed-when-relinking-a-buffer.patch b/patches/dm-bufio-update-last_accessed-when-relinking-a-buffer.patch new file mode 100644 index 0000000..bf53cf1 --- /dev/null +++ b/patches/dm-bufio-update-last_accessed-when-relinking-a-buffer.patch @@ -0,0 +1,37 @@ +From eb76faf53b1ff7a77ce3f78cc98ad392ac70c2a0 Mon Sep 17 00:00:00 2001 +From: Joe Thornber <ejt@redhat.com> +Date: Tue, 30 Sep 2014 09:32:46 +0100 +Subject: dm bufio: update last_accessed when relinking a buffer + +commit eb76faf53b1ff7a77ce3f78cc98ad392ac70c2a0 upstream. + +The 'last_accessed' member of the dm_buffer structure was only set when +the the buffer was created. This led to each buffer being discarded +after dm_bufio_max_age time even if it was used recently. In practice +this resulted in all thinp metadata being evicted soon after being read +-- this is particularly problematic for metadata intensive workloads +like multithreaded small random IO. + +'last_accessed' is now updated each time the buffer is moved to the head +of the LRU list, so the buffer is now properly discarded if it was not +used in dm_bufio_max_age time. 
+ +Signed-off-by: Joe Thornber <ejt@redhat.com> +Signed-off-by: Mikulas Patocka <mpatocka@redhat.com> +Signed-off-by: Mike Snitzer <snitzer@redhat.com> +[lizf: Backported to 3.4: adjust context] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/md/dm-bufio.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/md/dm-bufio.c ++++ b/drivers/md/dm-bufio.c +@@ -467,6 +467,7 @@ static void __relink_lru(struct dm_buffe + b->list_mode = dirty; + list_del(&b->lru_list); + list_add(&b->lru_list, &c->lru[dirty]); ++ b->last_accessed = jiffies; + } + + /*---------------------------------------------------------------- diff --git a/patches/dm-log-userspace-fix-memory-leak-in-dm_ulog_tfr_init-failure-path.patch b/patches/dm-log-userspace-fix-memory-leak-in-dm_ulog_tfr_init-failure-path.patch new file mode 100644 index 0000000..3d3c901 --- /dev/null +++ b/patches/dm-log-userspace-fix-memory-leak-in-dm_ulog_tfr_init-failure-path.patch @@ -0,0 +1,31 @@ +From 56ec16cb1e1ce46354de8511eef962a417c32c92 Mon Sep 17 00:00:00 2001 +From: Alexey Khoroshilov <khoroshilov@ispras.ru> +Date: Wed, 1 Oct 2014 22:58:35 +0200 +Subject: dm log userspace: fix memory leak in dm_ulog_tfr_init failure path + +commit 56ec16cb1e1ce46354de8511eef962a417c32c92 upstream. + +If cn_add_callback() fails in dm_ulog_tfr_init(), it does not +deallocate prealloced memory but calls cn_del_callback(). + +Found by Linux Driver Verification project (linuxtesting.org). 
+ +Signed-off-by: Alexey Khoroshilov <khoroshilov@ispras.ru> +Reviewed-by: Jonathan Brassow <jbrassow@redhat.com> +Signed-off-by: Mike Snitzer <snitzer@redhat.com> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/md/dm-log-userspace-transfer.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/md/dm-log-userspace-transfer.c ++++ b/drivers/md/dm-log-userspace-transfer.c +@@ -272,7 +272,7 @@ int dm_ulog_tfr_init(void) + + r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback); + if (r) { +- cn_del_callback(&ulog_cn_id); ++ kfree(prealloced_cn_msg); + return r; + } + diff --git a/patches/documentation-lzo-document-part-of-the-encoding.patch b/patches/documentation-lzo-document-part-of-the-encoding.patch new file mode 100644 index 0000000..af9d075 --- /dev/null +++ b/patches/documentation-lzo-document-part-of-the-encoding.patch @@ -0,0 +1,188 @@ +From d98a0526434d27e261f622cf9d2e0028b5ff1a00 Mon Sep 17 00:00:00 2001 +From: Willy Tarreau <w@1wt.eu> +Date: Sat, 27 Sep 2014 12:31:35 +0200 +Subject: Documentation: lzo: document part of the encoding + +commit d98a0526434d27e261f622cf9d2e0028b5ff1a00 upstream. + +Add a complete description of the LZO format as processed by the +decompressor. I have not found a public specification of this format +hence this analysis, which will be used to better understand the code. + +Cc: Willem Pinckaers <willem@lekkertech.net> +Cc: "Don A. 
Bailey" <donb@securitymouse.com> +Signed-off-by: Willy Tarreau <w@1wt.eu> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + Documentation/lzo.txt | 164 ++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 164 insertions(+) + create mode 100644 Documentation/lzo.txt + +--- /dev/null ++++ b/Documentation/lzo.txt +@@ -0,0 +1,164 @@ ++ ++LZO stream format as understood by Linux's LZO decompressor ++=========================================================== ++ ++Introduction ++ ++ This is not a specification. No specification seems to be publicly available ++ for the LZO stream format. This document describes what input format the LZO ++ decompressor as implemented in the Linux kernel understands. The file subject ++ of this analysis is lib/lzo/lzo1x_decompress_safe.c. No analysis was made on ++ the compressor nor on any other implementations though it seems likely that ++ the format matches the standard one. The purpose of this document is to ++ better understand what the code does in order to propose more efficient fixes ++ for future bug reports. ++ ++Description ++ ++ The stream is composed of a series of instructions, operands, and data. The ++ instructions consist in a few bits representing an opcode, and bits forming ++ the operands for the instruction, whose size and position depend on the ++ opcode and on the number of literals copied by previous instruction. The ++ operands are used to indicate : ++ ++ - a distance when copying data from the dictionary (past output buffer) ++ - a length (number of bytes to copy from dictionary) ++ - the number of literals to copy, which is retained in variable "state" ++ as a piece of information for next instructions. ++ ++ Optionally depending on the opcode and operands, extra data may follow. 
These ++ extra data can be a complement for the operand (eg: a length or a distance ++ encoded on larger values), or a literal to be copied to the output buffer. ++ ++ The first byte of the block follows a different encoding from other bytes, it ++ seems to be optimized for literal use only, since there is no dictionary yet ++ prior to that byte. ++ ++ Lengths are always encoded on a variable size starting with a small number ++ of bits in the operand. If the number of bits isn't enough to represent the ++ length, up to 255 may be added in increments by consuming more bytes with a ++ rate of at most 255 per extra byte (thus the compression ratio cannot exceed ++ around 255:1). The variable length encoding using #bits is always the same : ++ ++ length = byte & ((1 << #bits) - 1) ++ if (!length) { ++ length = ((1 << #bits) - 1) ++ length += 255*(number of zero bytes) ++ length += first-non-zero-byte ++ } ++ length += constant (generally 2 or 3) ++ ++ For references to the dictionary, distances are relative to the output ++ pointer. Distances are encoded using very few bits belonging to certain ++ ranges, resulting in multiple copy instructions using different encodings. ++ Certain encodings involve one extra byte, others involve two extra bytes ++ forming a little-endian 16-bit quantity (marked LE16 below). ++ ++ After any instruction except the large literal copy, 0, 1, 2 or 3 literals ++ are copied before starting the next instruction. The number of literals that ++ were copied may change the meaning and behaviour of the next instruction. In ++ practice, only one instruction needs to know whether 0, less than 4, or more ++ literals were copied. This is the information stored in the <state> variable ++ in this implementation. This number of immediate literals to be copied is ++ generally encoded in the last two bits of the instruction but may also be ++ taken from the last two bits of an extra operand (eg: distance). 
++ ++ End of stream is declared when a block copy of distance 0 is seen. Only one ++ instruction may encode this distance (0001HLLL), it takes one LE16 operand ++ for the distance, thus requiring 3 bytes. ++ ++ IMPORTANT NOTE : in the code some length checks are missing because certain ++ instructions are called under the assumption that a certain number of bytes ++ follow because it has already been garanteed before parsing the instructions. ++ They just have to "refill" this credit if they consume extra bytes. This is ++ an implementation design choice independant on the algorithm or encoding. ++ ++Byte sequences ++ ++ First byte encoding : ++ ++ 0..17 : follow regular instruction encoding, see below. It is worth ++ noting that codes 16 and 17 will represent a block copy from ++ the dictionary which is empty, and that they will always be ++ invalid at this place. ++ ++ 18..21 : copy 0..3 literals ++ state = (byte - 17) = 0..3 [ copy <state> literals ] ++ skip byte ++ ++ 22..255 : copy literal string ++ length = (byte - 17) = 4..238 ++ state = 4 [ don't copy extra literals ] ++ skip byte ++ ++ Instruction encoding : ++ ++ 0 0 0 0 X X X X (0..15) ++ Depends on the number of literals copied by the last instruction. ++ If last instruction did not copy any literal (state == 0), this ++ encoding will be a copy of 4 or more literal, and must be interpreted ++ like this : ++ ++ 0 0 0 0 L L L L (0..15) : copy long literal string ++ length = 3 + (L ?: 15 + (zero_bytes * 255) + non_zero_byte) ++ state = 4 (no extra literals are copied) ++ ++ If last instruction used to copy between 1 to 3 literals (encoded in ++ the instruction's opcode or distance), the instruction is a copy of a ++ 2-byte block from the dictionary within a 1kB distance. It is worth ++ noting that this instruction provides little savings since it uses 2 ++ bytes to encode a copy of 2 other bytes but it encodes the number of ++ following literals for free. 
It must be interpreted like this : ++ ++ 0 0 0 0 D D S S (0..15) : copy 2 bytes from <= 1kB distance ++ length = 2 ++ state = S (copy S literals after this block) ++ Always followed by exactly one byte : H H H H H H H H ++ distance = (H << 2) + D + 1 ++ ++ If last instruction used to copy 4 or more literals (as detected by ++ state == 4), the instruction becomes a copy of a 3-byte block from the ++ dictionary from a 2..3kB distance, and must be interpreted like this : ++ ++ 0 0 0 0 D D S S (0..15) : copy 3 bytes from 2..3 kB distance ++ length = 3 ++ state = S (copy S literals after this block) ++ Always followed by exactly one byte : H H H H H H H H ++ distance = (H << 2) + D + 2049 ++ ++ 0 0 0 1 H L L L (16..31) ++ Copy of a block within 16..48kB distance (preferably less than 10B) ++ length = 2 + (L ?: 7 + (zero_bytes * 255) + non_zero_byte) ++ Always followed by exactly one LE16 : D D D D D D D D : D D D D D D S S ++ distance = 16384 + (H << 14) + D ++ state = S (copy S literals after this block) ++ End of stream is reached if distance == 16384 ++ ++ 0 0 1 L L L L L (32..63) ++ Copy of small block within 16kB distance (preferably less than 34B) ++ length = 2 + (L ?: 31 + (zero_bytes * 255) + non_zero_byte) ++ Always followed by exactly one LE16 : D D D D D D D D : D D D D D D S S ++ distance = D + 1 ++ state = S (copy S literals after this block) ++ ++ 0 1 L D D D S S (64..127) ++ Copy 3-4 bytes from block within 2kB distance ++ state = S (copy S literals after this block) ++ length = 3 + L ++ Always followed by exactly one byte : H H H H H H H H ++ distance = (H << 3) + D + 1 ++ ++ 1 L L D D D S S (128..255) ++ Copy 5-8 bytes from block within 2kB distance ++ state = S (copy S literals after this block) ++ length = 5 + L ++ Always followed by exactly one byte : H H H H H H H H ++ distance = (H << 3) + D + 1 ++ ++Authors ++ ++ This document was written by Willy Tarreau <w@1wt.eu> on 2014/07/19 during an ++ analysis of the decompression code available in Linux 
3.16-rc5. The code is ++ tricky, it is possible that this document contains mistakes or that a few ++ corner cases were overlooked. In any case, please report any doubt, fix, or ++ proposed updates to the author(s) so that the document can be updated. diff --git a/patches/drivers-hv-vmbus-cleanup-vmbus_close_internal.patch b/patches/drivers-hv-vmbus-cleanup-vmbus_close_internal.patch new file mode 100644 index 0000000..025624f --- /dev/null +++ b/patches/drivers-hv-vmbus-cleanup-vmbus_close_internal.patch @@ -0,0 +1,67 @@ +From 98d731bb064a9d1817a6ca9bf8b97051334a7cfe Mon Sep 17 00:00:00 2001 +From: "K. Y. Srinivasan" <kys@microsoft.com> +Date: Wed, 27 Aug 2014 16:25:33 -0700 +Subject: Drivers: hv: vmbus: Cleanup vmbus_close_internal() + +commit 98d731bb064a9d1817a6ca9bf8b97051334a7cfe upstream. + +Eliminate calls to BUG_ON() in vmbus_close_internal(). +We have chosen to potentially leak memory, than crash the guest +in case of failures. + +In this version of the patch I have addressed comments from +Dan Carpenter (dan.carpenter@oracle.com). + +Signed-off-by: K. Y. Srinivasan <kys@microsoft.com> +Tested-by: Sitsofe Wheeler <sitsofe@yahoo.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +[lizf: Backported to 3.4: s/return ret/return/g] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/hv/channel.c | 27 +++++++++++++++++++++------ + 1 file changed, 21 insertions(+), 6 deletions(-) + +--- a/drivers/hv/channel.c ++++ b/drivers/hv/channel.c +@@ -531,11 +531,28 @@ void vmbus_close(struct vmbus_channel *c + + ret = vmbus_post_msg(msg, sizeof(struct vmbus_channel_close_channel)); + +- BUG_ON(ret != 0); ++ if (ret) { ++ pr_err("Close failed: close post msg return is %d\n", ret); ++ /* ++ * If we failed to post the close msg, ++ * it is perhaps better to leak memory. 
++ */ ++ return; ++ } ++ + /* Tear down the gpadl for the channel's ring buffer */ +- if (channel->ringbuffer_gpadlhandle) +- vmbus_teardown_gpadl(channel, +- channel->ringbuffer_gpadlhandle); ++ if (channel->ringbuffer_gpadlhandle) { ++ ret = vmbus_teardown_gpadl(channel, ++ channel->ringbuffer_gpadlhandle); ++ if (ret) { ++ pr_err("Close failed: teardown gpadl return %d\n", ret); ++ /* ++ * If we failed to teardown gpadl, ++ * it is perhaps better to leak memory. ++ */ ++ return; ++ } ++ } + + /* Cleanup the ring buffers for this channel */ + hv_ringbuffer_cleanup(&channel->outbound); +@@ -543,8 +560,6 @@ void vmbus_close(struct vmbus_channel *c + + free_pages((unsigned long)channel->ringbuffer_pages, + get_order(channel->ringbuffer_pagecount * PAGE_SIZE)); +- +- + } + EXPORT_SYMBOL_GPL(vmbus_close); + diff --git a/patches/drivers-hv-vmbus-cleanup-vmbus_establish_gpadl.patch b/patches/drivers-hv-vmbus-cleanup-vmbus_establish_gpadl.patch new file mode 100644 index 0000000..93c192b --- /dev/null +++ b/patches/drivers-hv-vmbus-cleanup-vmbus_establish_gpadl.patch @@ -0,0 +1,40 @@ +From 72c6b71c245dac8f371167d97ef471b367d0b66b Mon Sep 17 00:00:00 2001 +From: "K. Y. Srinivasan" <kys@microsoft.com> +Date: Wed, 27 Aug 2014 16:25:34 -0700 +Subject: Drivers: hv: vmbus: Cleanup vmbus_establish_gpadl() + +commit 72c6b71c245dac8f371167d97ef471b367d0b66b upstream. + +Eliminate the call to BUG_ON() by waiting for the host to respond. We are +trying to reclaim the ownership of memory that was given to the host and so +we will have to wait until the host responds. + +Signed-off-by: K. Y. 
Srinivasan <kys@microsoft.com> +Tested-by: Sitsofe Wheeler <sitsofe@yahoo.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/hv/channel.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +--- a/drivers/hv/channel.c ++++ b/drivers/hv/channel.c +@@ -400,7 +400,6 @@ int vmbus_establish_gpadl(struct vmbus_c + u32 next_gpadl_handle; + unsigned long flags; + int ret = 0; +- int t; + + next_gpadl_handle = atomic_read(&vmbus_connection.next_gpadl_handle); + atomic_inc(&vmbus_connection.next_gpadl_handle); +@@ -447,9 +446,7 @@ int vmbus_establish_gpadl(struct vmbus_c + + } + } +- t = wait_for_completion_timeout(&msginfo->waitevent, 5*HZ); +- BUG_ON(t == 0); +- ++ wait_for_completion(&msginfo->waitevent); + + /* At this point, we received the gpadl created msg */ + *gpadl_handle = gpadlmsg->gpadl; diff --git a/patches/drivers-hv-vmbus-cleanup-vmbus_post_msg.patch b/patches/drivers-hv-vmbus-cleanup-vmbus_post_msg.patch new file mode 100644 index 0000000..eed7f59 --- /dev/null +++ b/patches/drivers-hv-vmbus-cleanup-vmbus_post_msg.patch @@ -0,0 +1,49 @@ +From fdeebcc62279119dbeafbc1a2e39e773839025fd Mon Sep 17 00:00:00 2001 +From: "K. Y. Srinivasan" <kys@microsoft.com> +Date: Wed, 27 Aug 2014 16:25:31 -0700 +Subject: Drivers: hv: vmbus: Cleanup vmbus_post_msg() + +commit fdeebcc62279119dbeafbc1a2e39e773839025fd upstream. + +Posting messages to the host can fail because of transient resource +related failures. Correctly deal with these failures and increase the +number of attempts to post the message before giving up. + +In this version of the patch, I have normalized the error code to +Linux error code. + +Signed-off-by: K. Y. 
Srinivasan <kys@microsoft.com> +Tested-by: Sitsofe Wheeler <sitsofe@yahoo.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/hv/connection.c | 17 ++++++++++++++--- + 1 file changed, 14 insertions(+), 3 deletions(-) + +--- a/drivers/hv/connection.c ++++ b/drivers/hv/connection.c +@@ -294,10 +294,21 @@ int vmbus_post_msg(void *buffer, size_t + * insufficient resources. Retry the operation a couple of + * times before giving up. + */ +- while (retries < 3) { +- ret = hv_post_message(conn_id, 1, buffer, buflen); +- if (ret != HV_STATUS_INSUFFICIENT_BUFFERS) ++ while (retries < 10) { ++ ret = hv_post_message(conn_id, 1, buffer, buflen); ++ ++ switch (ret) { ++ case HV_STATUS_INSUFFICIENT_BUFFERS: ++ ret = -ENOMEM; ++ case -ENOMEM: ++ break; ++ case HV_STATUS_SUCCESS: + return ret; ++ default: ++ pr_err("hv_post_msg() failed; error code:%d\n", ret); ++ return -EINVAL; ++ } ++ + retries++; + msleep(100); + } diff --git a/patches/drivers-hv-vmbus-cleanup-vmbus_teardown_gpadl.patch b/patches/drivers-hv-vmbus-cleanup-vmbus_teardown_gpadl.patch new file mode 100644 index 0000000..cd5c74b --- /dev/null +++ b/patches/drivers-hv-vmbus-cleanup-vmbus_teardown_gpadl.patch @@ -0,0 +1,49 @@ +From 66be653083057358724d56d817e870e53fb81ca7 Mon Sep 17 00:00:00 2001 +From: "K. Y. Srinivasan" <kys@microsoft.com> +Date: Wed, 27 Aug 2014 16:25:32 -0700 +Subject: Drivers: hv: vmbus: Cleanup vmbus_teardown_gpadl() + +commit 66be653083057358724d56d817e870e53fb81ca7 upstream. + +Eliminate calls to BUG_ON() by properly handling errors. In cases where +rollback is possible, we will return the appropriate error to have the +calling code decide how to rollback state. In the case where we are +transferring ownership of the guest physical pages to the host, +we will wait for the host to respond. + +Signed-off-by: K. Y. 
Srinivasan <kys@microsoft.com> +Tested-by: Sitsofe Wheeler <sitsofe@yahoo.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/hv/channel.c | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +--- a/drivers/hv/channel.c ++++ b/drivers/hv/channel.c +@@ -472,7 +472,7 @@ int vmbus_teardown_gpadl(struct vmbus_ch + struct vmbus_channel_gpadl_teardown *msg; + struct vmbus_channel_msginfo *info; + unsigned long flags; +- int ret, t; ++ int ret; + + info = kmalloc(sizeof(*info) + + sizeof(struct vmbus_channel_gpadl_teardown), GFP_KERNEL); +@@ -494,11 +494,12 @@ int vmbus_teardown_gpadl(struct vmbus_ch + ret = vmbus_post_msg(msg, + sizeof(struct vmbus_channel_gpadl_teardown)); + +- BUG_ON(ret != 0); +- t = wait_for_completion_timeout(&info->waitevent, 5*HZ); +- BUG_ON(t == 0); ++ if (ret) ++ goto post_msg_err; + +- /* Received a torndown response */ ++ wait_for_completion(&info->waitevent); ++ ++post_msg_err: + spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); + list_del(&info->msglistentry); + spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); diff --git a/patches/drivers-hv-vmbus-fix-a-bug-in-vmbus_open.patch b/patches/drivers-hv-vmbus-fix-a-bug-in-vmbus_open.patch new file mode 100644 index 0000000..4660a93 --- /dev/null +++ b/patches/drivers-hv-vmbus-fix-a-bug-in-vmbus_open.patch @@ -0,0 +1,33 @@ +From 45d727cee9e200f5b351528b9fb063b69cf702c8 Mon Sep 17 00:00:00 2001 +From: "K. Y. Srinivasan" <kys@microsoft.com> +Date: Wed, 27 Aug 2014 16:25:35 -0700 +Subject: Drivers: hv: vmbus: Fix a bug in vmbus_open() + +commit 45d727cee9e200f5b351528b9fb063b69cf702c8 upstream. + +Fix a bug in vmbus_open() and properly propagate the error. I would +like to thank Dexuan Cui <decui@microsoft.com> for identifying the +issue. + +Signed-off-by: K. Y. 
Srinivasan <kys@microsoft.com> +Tested-by: Sitsofe Wheeler <sitsofe@yahoo.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/hv/channel.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/drivers/hv/channel.c ++++ b/drivers/hv/channel.c +@@ -207,8 +207,10 @@ int vmbus_open(struct vmbus_channel *new + ret = vmbus_post_msg(open_msg, + sizeof(struct vmbus_channel_open_channel)); + +- if (ret != 0) ++ if (ret != 0) { ++ err = ret; + goto error1; ++ } + + t = wait_for_completion_timeout(&open_info->waitevent, 5*HZ); + if (t == 0) { diff --git a/patches/ecryptfs-avoid-to-access-null-pointer-when-write-metadata-in-xattr.patch b/patches/ecryptfs-avoid-to-access-null-pointer-when-write-metadata-in-xattr.patch new file mode 100644 index 0000000..99257ca --- /dev/null +++ b/patches/ecryptfs-avoid-to-access-null-pointer-when-write-metadata-in-xattr.patch @@ -0,0 +1,84 @@ +From 35425ea2492175fd39f6116481fe98b2b3ddd4ca Mon Sep 17 00:00:00 2001 +From: Chao Yu <chao2.yu@samsung.com> +Date: Thu, 24 Jul 2014 17:25:42 +0800 +Subject: ecryptfs: avoid to access NULL pointer when write metadata in xattr + +commit 35425ea2492175fd39f6116481fe98b2b3ddd4ca upstream. + +Christopher Head 2014-06-28 05:26:20 UTC described: +"I tried to reproduce this on 3.12.21. Instead, when I do "echo hello > foo" +in an ecryptfs mount with ecryptfs_xattr specified, I get a kernel crash: + +BUG: unable to handle kernel NULL pointer dereference at (null) +IP: [<ffffffff8110eb39>] fsstack_copy_attr_all+0x2/0x61 +PGD d7840067 PUD b2c3c067 PMD 0 +Oops: 0002 [#1] SMP +Modules linked in: nvidia(PO) +CPU: 3 PID: 3566 Comm: bash Tainted: P O 3.12.21-gentoo-r1 #2 +Hardware name: ASUSTek Computer Inc. 
G60JX/G60JX, BIOS 206 03/15/2010 +task: ffff8801948944c0 ti: ffff8800bad70000 task.ti: ffff8800bad70000 +RIP: 0010:[<ffffffff8110eb39>] [<ffffffff8110eb39>] fsstack_copy_attr_all+0x2/0x61 +RSP: 0018:ffff8800bad71c10 EFLAGS: 00010246 +RAX: 00000000000181a4 RBX: ffff880198648480 RCX: 0000000000000000 +RDX: 0000000000000004 RSI: ffff880172010450 RDI: 0000000000000000 +RBP: ffff880198490e40 R08: 0000000000000000 R09: 0000000000000000 +R10: ffff880172010450 R11: ffffea0002c51e80 R12: 0000000000002000 +R13: 000000000000001a R14: 0000000000000000 R15: ffff880198490e40 +FS: 00007ff224caa700(0000) GS:ffff88019fcc0000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 0000000000000000 CR3: 00000000bb07f000 CR4: 00000000000007e0 +Stack: +ffffffff811826e8 ffff8800a39d8000 0000000000000000 000000000000001a +ffff8800a01d0000 ffff8800a39d8000 ffffffff81185fd5 ffffffff81082c2c +00000001a39d8000 53d0abbc98490e40 0000000000000037 ffff8800a39d8220 +Call Trace: +[<ffffffff811826e8>] ? ecryptfs_setxattr+0x40/0x52 +[<ffffffff81185fd5>] ? ecryptfs_write_metadata+0x1b3/0x223 +[<ffffffff81082c2c>] ? should_resched+0x5/0x23 +[<ffffffff8118322b>] ? ecryptfs_initialize_file+0xaf/0xd4 +[<ffffffff81183344>] ? ecryptfs_create+0xf4/0x142 +[<ffffffff810f8c0d>] ? vfs_create+0x48/0x71 +[<ffffffff810f9c86>] ? do_last.isra.68+0x559/0x952 +[<ffffffff810f7ce7>] ? link_path_walk+0xbd/0x458 +[<ffffffff810fa2a3>] ? path_openat+0x224/0x472 +[<ffffffff810fa7bd>] ? do_filp_open+0x2b/0x6f +[<ffffffff81103606>] ? __alloc_fd+0xd6/0xe7 +[<ffffffff810ee6ab>] ? do_sys_open+0x65/0xe9 +[<ffffffff8157d022>] ? 
system_call_fastpath+0x16/0x1b +RIP [<ffffffff8110eb39>] fsstack_copy_attr_all+0x2/0x61 +RSP <ffff8800bad71c10> +CR2: 0000000000000000 +---[ end trace df9dba5f1ddb8565 ]---" + +If we create a file when we mount with ecryptfs_xattr_metadata option, we will +encounter a crash in this path: +->ecryptfs_create + ->ecryptfs_initialize_file + ->ecryptfs_write_metadata + ->ecryptfs_write_metadata_to_xattr + ->ecryptfs_setxattr + ->fsstack_copy_attr_all +It's because our dentry->d_inode used in fsstack_copy_attr_all is NULL, and it +will be initialized when ecryptfs_initialize_file finish. + +So we should skip copying attr from lower inode when the value of ->d_inode is +invalid. + +Signed-off-by: Chao Yu <chao2.yu@samsung.com> +Signed-off-by: Tyler Hicks <tyhicks@canonical.com> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + fs/ecryptfs/inode.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ecryptfs/inode.c ++++ b/fs/ecryptfs/inode.c +@@ -1093,7 +1093,7 @@ ecryptfs_setxattr(struct dentry *dentry, + } + + rc = vfs_setxattr(lower_dentry, name, value, size, flags); +- if (!rc) ++ if (!rc && dentry->d_inode) + fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode); + out: + return rc; diff --git a/patches/ext4-add-ext4_iget_normal-which-is-to-be-used-for-dir-tree-lookups.patch b/patches/ext4-add-ext4_iget_normal-which-is-to-be-used-for-dir-tree-lookups.patch new file mode 100644 index 0000000..c6a7308 --- /dev/null +++ b/patches/ext4-add-ext4_iget_normal-which-is-to-be-used-for-dir-tree-lookups.patch @@ -0,0 +1,84 @@ +From f4bb2981024fc91b23b4d09a8817c415396dbabb Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o <tytso@mit.edu> +Date: Sun, 5 Oct 2014 22:56:00 -0400 +Subject: ext4: add ext4_iget_normal() which is to be used for dir tree lookups + +commit f4bb2981024fc91b23b4d09a8817c415396dbabb upstream. 
+ +If there is a corrupted file system which has directory entries that +point at reserved, metadata inodes, prohibit them from being used by +treating them the same way we treat Boot Loader inodes --- that is, +mark them to be bad inodes. This prohibits them from being opened, +deleted, or modified via chmod, chown, utimes, etc. + +In particular, this prevents a corrupted file system which has a +directory entry which points at the journal inode from being deleted +and its blocks released, after which point Much Hilarity Ensues. + +Reported-by: Sami Liedes <sami.liedes@iki.fi> +Signed-off-by: Theodore Ts'o <tytso@mit.edu> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + fs/ext4/ext4.h | 1 + + fs/ext4/inode.c | 7 +++++++ + fs/ext4/namei.c | 4 ++-- + fs/ext4/super.c | 2 +- + 4 files changed, 11 insertions(+), 3 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1891,6 +1891,7 @@ int ext4_get_block(struct inode *inode, + struct buffer_head *bh_result, int create); + + extern struct inode *ext4_iget(struct super_block *, unsigned long); ++extern struct inode *ext4_iget_normal(struct super_block *, unsigned long); + extern int ext4_write_inode(struct inode *, struct writeback_control *); + extern int ext4_setattr(struct dentry *, struct iattr *); + extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -3838,6 +3838,13 @@ bad_inode: + return ERR_PTR(ret); + } + ++struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino) ++{ ++ if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) ++ return ERR_PTR(-EIO); ++ return ext4_iget(sb, ino); ++} ++ + static int ext4_inode_blocks_set(handle_t *handle, + struct ext4_inode *raw_inode, + struct ext4_inode_info *ei) +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -1051,7 +1051,7 @@ static struct dentry *ext4_lookup(struct + dentry->d_name.name); + return ERR_PTR(-EIO); + } +- inode = ext4_iget(dir->i_sb, ino); ++ inode = 
ext4_iget_normal(dir->i_sb, ino); + if (inode == ERR_PTR(-ESTALE)) { + EXT4_ERROR_INODE(dir, + "deleted inode referenced: %u", +@@ -1087,7 +1087,7 @@ struct dentry *ext4_get_parent(struct de + return ERR_PTR(-EIO); + } + +- return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); ++ return d_obtain_alias(ext4_iget_normal(child->d_inode->i_sb, ino)); + } + + #define S_SHIFT 12 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -1041,7 +1041,7 @@ static struct inode *ext4_nfs_get_inode( + * Currently we don't know the generation for parent directory, so + * a generation of 0 means "accept any" + */ +- inode = ext4_iget(sb, ino); ++ inode = ext4_iget_normal(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { diff --git a/patches/ext4-check-ea-value-offset-when-loading.patch b/patches/ext4-check-ea-value-offset-when-loading.patch new file mode 100644 index 0000000..eaecbdb --- /dev/null +++ b/patches/ext4-check-ea-value-offset-when-loading.patch @@ -0,0 +1,98 @@ +From a0626e75954078cfacddb00a4545dde821170bc5 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" <darrick.wong@oracle.com> +Date: Tue, 16 Sep 2014 14:34:59 -0400 +Subject: ext4: check EA value offset when loading + +commit a0626e75954078cfacddb00a4545dde821170bc5 upstream. + +When loading extended attributes, check each entry's value offset to +make sure it doesn't collide with the entries. + +Without this check it is easy to crash the kernel by mounting a +malicious FS containing a file with an EA wherein e_value_offs = 0 and +e_value_size > 0 and then deleting the EA, which corrupts the name +list. + +(See the f_ea_value_crash test's FS image in e2fsprogs for an example.) + +Signed-off-by: Darrick J. 
Wong <darrick.wong@oracle.com> +Signed-off-by: Theodore Ts'o <tytso@mit.edu> +[lizf: Backported to 3.4: adjust context] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + fs/ext4/xattr.c | 32 ++++++++++++++++++++++++-------- + 1 file changed, 24 insertions(+), 8 deletions(-) + +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -144,14 +144,28 @@ ext4_listxattr(struct dentry *dentry, ch + } + + static int +-ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end) ++ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, ++ void *value_start) + { +- while (!IS_LAST_ENTRY(entry)) { +- struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(entry); ++ struct ext4_xattr_entry *e = entry; ++ ++ while (!IS_LAST_ENTRY(e)) { ++ struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); + if ((void *)next >= end) + return -EIO; +- entry = next; ++ e = next; + } ++ ++ while (!IS_LAST_ENTRY(entry)) { ++ if (entry->e_value_size != 0 && ++ (value_start + le16_to_cpu(entry->e_value_offs) < ++ (void *)e + sizeof(__u32) || ++ value_start + le16_to_cpu(entry->e_value_offs) + ++ le32_to_cpu(entry->e_value_size) > end)) ++ return -EIO; ++ entry = EXT4_XATTR_NEXT(entry); ++ } ++ + return 0; + } + +@@ -161,7 +175,8 @@ ext4_xattr_check_block(struct buffer_hea + if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) + return -EIO; +- return ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); ++ return ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size, ++ bh->b_data); + } + + static inline int +@@ -274,7 +289,7 @@ ext4_xattr_ibody_get(struct inode *inode + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; +- error = ext4_xattr_check_names(entry, end); ++ error = ext4_xattr_check_names(entry, end, entry); + if (error) + goto cleanup; + error = ext4_xattr_find_entry(&entry, name_index, name, +@@ -402,7 +417,7 @@ ext4_xattr_ibody_list(struct dentry *den + 
raw_inode = ext4_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; +- error = ext4_xattr_check_names(IFIRST(header), end); ++ error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header)); + if (error) + goto cleanup; + error = ext4_xattr_list_entries(dentry, IFIRST(header), +@@ -914,7 +929,8 @@ ext4_xattr_ibody_find(struct inode *inod + is->s.here = is->s.first; + is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { +- error = ext4_xattr_check_names(IFIRST(header), is->s.end); ++ error = ext4_xattr_check_names(IFIRST(header), is->s.end, ++ IFIRST(header)); + if (error) + return error; + /* Find the named attribute. */ diff --git a/patches/ext4-don-t-check-quota-format-when-there-are-no-quota-files.patch b/patches/ext4-don-t-check-quota-format-when-there-are-no-quota-files.patch new file mode 100644 index 0000000..966ec9f --- /dev/null +++ b/patches/ext4-don-t-check-quota-format-when-there-are-no-quota-files.patch @@ -0,0 +1,35 @@ +From 279bf6d390933d5353ab298fcc306c391a961469 Mon Sep 17 00:00:00 2001 +From: Jan Kara <jack@suse.cz> +Date: Thu, 18 Sep 2014 01:12:15 -0400 +Subject: ext4: don't check quota format when there are no quota files + +commit 279bf6d390933d5353ab298fcc306c391a961469 upstream. + +The check whether quota format is set even though there are no +quota files with journalled quota is pointless and it actually +makes it impossible to turn off journalled quotas (as there's +no way to unset journalled quota format). Just remove the check. 
+ +Signed-off-by: Jan Kara <jack@suse.cz> +Signed-off-by: Theodore Ts'o <tytso@mit.edu> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + fs/ext4/super.c | 7 ------- + 1 file changed, 7 deletions(-) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -1642,13 +1642,6 @@ static int parse_options(char *options, + "not specified"); + return 0; + } +- } else { +- if (sbi->s_jquota_fmt) { +- ext4_msg(sb, KERN_ERR, "journaled quota format " +- "specified with no journaling " +- "enabled"); +- return 0; +- } + } + #endif + if (test_opt(sb, DIOREAD_NOLOCK)) { diff --git a/patches/ext4-don-t-orphan-or-truncate-the-boot-loader-inode.patch b/patches/ext4-don-t-orphan-or-truncate-the-boot-loader-inode.patch new file mode 100644 index 0000000..11a525d --- /dev/null +++ b/patches/ext4-don-t-orphan-or-truncate-the-boot-loader-inode.patch @@ -0,0 +1,66 @@ +From e2bfb088fac03c0f621886a04cffc7faa2b49b1d Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o <tytso@mit.edu> +Date: Sun, 5 Oct 2014 22:47:07 -0400 +Subject: ext4: don't orphan or truncate the boot loader inode + +commit e2bfb088fac03c0f621886a04cffc7faa2b49b1d upstream. + +The boot loader inode (inode #5) should never be visible in the +directory hierarchy, but it's possible if the file system is corrupted +that there will be a directory entry that points at inode #5. In +order to avoid accidentally trashing it, when such a directory inode +is opened, the inode will be marked as a bad inode, so that it's not +possible to modify (or read) the inode from userspace. + +Unfortunately, when we unlink this (invalid/illegal) directory entry, +we will put the bad inode on the orphan list, and then when we try to +unlink the directory, we don't actually remove the bad inode from the +orphan list before freeing in-memory inode structure. This means the +in-memory orphan list is corrupted, leading to a kernel oops.
+ +In addition, avoid truncating a bad inode in ext4_destroy_inode(), +since truncating the boot loader inode is not a smart thing to do. + +Reported-by: Sami Liedes <sami.liedes@iki.fi> +Reviewed-by: Jan Kara <jack@suse.cz> +Signed-off-by: Theodore Ts'o <tytso@mit.edu> +[lizf: Backported to 3.4: adjust context] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + fs/ext4/inode.c | 8 +++----- + fs/ext4/namei.c | 2 +- + 2 files changed, 4 insertions(+), 6 deletions(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -157,16 +157,14 @@ void ext4_evict_inode(struct inode *inod + goto no_delete; + } + +- if (!is_bad_inode(inode)) +- dquot_initialize(inode); ++ if (is_bad_inode(inode)) ++ goto no_delete; ++ dquot_initialize(inode); + + if (ext4_should_order_data(inode)) + ext4_begin_ordered_truncate(inode, 0); + truncate_inode_pages(&inode->i_data, 0); + +- if (is_bad_inode(inode)) +- goto no_delete; +- + handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3); + if (IS_ERR(handle)) { + ext4_std_error(inode->i_sb, PTR_ERR(handle)); +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -1992,7 +1992,7 @@ int ext4_orphan_add(handle_t *handle, st + struct ext4_iloc iloc; + int err = 0, rc; + +- if (!ext4_handle_valid(handle)) ++ if (!ext4_handle_valid(handle) || is_bad_inode(inode)) + return 0; + + mutex_lock(&EXT4_SB(sb)->s_orphan_lock); diff --git a/patches/ext4-fix-reservation-overflow-in-ext4_da_write_begin.patch b/patches/ext4-fix-reservation-overflow-in-ext4_da_write_begin.patch new file mode 100644 index 0000000..b4bffdf --- /dev/null +++ b/patches/ext4-fix-reservation-overflow-in-ext4_da_write_begin.patch @@ -0,0 +1,79 @@ +From 0ff8947fc5f700172b37cbca811a38eb9cb81e08 Mon Sep 17 00:00:00 2001 +From: Eric Sandeen <sandeen@redhat.com> +Date: Sat, 11 Oct 2014 19:51:17 -0400 +Subject: ext4: fix reservation overflow in ext4_da_write_begin + +commit 0ff8947fc5f700172b37cbca811a38eb9cb81e08 upstream. 
+ +Delalloc write journal reservations only reserve 1 credit, +to update the inode if necessary. However, it may happen +once in a filesystem's lifetime that a file will cross +the 2G threshold, and require the LARGE_FILE feature to +be set in the superblock as well, if it was not set already. + +This overruns the transaction reservation, and can be +demonstrated simply on any ext4 filesystem without the LARGE_FILE +feature already set: + +dd if=/dev/zero of=testfile bs=1 seek=2147483646 count=1 \ + conv=notrunc of=testfile +sync +dd if=/dev/zero of=testfile bs=1 seek=2147483647 count=1 \ + conv=notrunc of=testfile + +leads to: + +EXT4-fs: ext4_do_update_inode:4296: aborting transaction: error 28 in __ext4_handle_dirty_super +EXT4-fs error (device loop0) in ext4_do_update_inode:4301: error 28 +EXT4-fs error (device loop0) in ext4_reserve_inode_write:4757: Readonly filesystem +EXT4-fs error (device loop0) in ext4_dirty_inode:4876: error 28 +EXT4-fs error (device loop0) in ext4_da_write_end:2685: error 28 + +Adjust the number of credits based on whether the flag is +already set, and whether the current write may extend past the +LARGE_FILE limit. 
+ +Signed-off-by: Eric Sandeen <sandeen@redhat.com> +Signed-off-by: Theodore Ts'o <tytso@mit.edu> +Reviewed-by: Andreas Dilger <adilger@dilger.ca> +[lizf: Backported to 3.4: + - adjust context + - ext4_journal_start() has no parameter type] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + fs/ext4/inode.c | 17 ++++++++++++++++- + 1 file changed, 16 insertions(+), 1 deletion(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -2408,6 +2408,20 @@ static int ext4_nonda_switch(struct supe + return 0; + } + ++/* We always reserve for an inode update; the superblock could be there too */ ++static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len) ++{ ++ if (likely(EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, ++ EXT4_FEATURE_RO_COMPAT_LARGE_FILE))) ++ return 1; ++ ++ if (pos + len <= 0x7fffffffULL) ++ return 1; ++ ++ /* We might need to update the superblock to set LARGE_FILE */ ++ return 2; ++} ++ + static int ext4_da_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +@@ -2434,7 +2448,8 @@ retry: + * to journalling the i_disksize update if writes to the end + * of file which has an already mapped buffer. 
+ */ +- handle = ext4_journal_start(inode, 1); ++ handle = ext4_journal_start(inode, ++ ext4_da_write_credits(inode, pos, len)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; diff --git a/patches/fanotify-enable-close-on-exec-on-events-fd-when-requested-in-fanotify_init.patch b/patches/fanotify-enable-close-on-exec-on-events-fd-when-requested-in-fanotify_init.patch new file mode 100644 index 0000000..725de79 --- /dev/null +++ b/patches/fanotify-enable-close-on-exec-on-events-fd-when-requested-in-fanotify_init.patch @@ -0,0 +1,110 @@ +From 0b37e097a648aa71d4db1ad108001e95b69a2da4 Mon Sep 17 00:00:00 2001 +From: Yann Droneaud <ydroneaud@opteya.com> +Date: Thu, 9 Oct 2014 15:24:40 -0700 +Subject: fanotify: enable close-on-exec on events' fd when requested in + fanotify_init() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +commit 0b37e097a648aa71d4db1ad108001e95b69a2da4 upstream. + +According to commit 80af258867648 ("fanotify: groups can specify their +f_flags for new fd"), file descriptors created as part of file access +notification events inherit flags from the event_f_flags argument passed +to syscall fanotify_init(2)[1]. + +Unfortunately O_CLOEXEC is currently silently ignored. + +Indeed, event_f_flags are only given to dentry_open(), which only seems to +care about O_ACCMODE and O_PATH in do_dentry_open(), O_DIRECT in +open_check_o_direct() and O_LARGEFILE in generic_file_open(). 
+ +It's a pity, since, according to some lookup on various search engines and +http://codesearch.debian.net/, there's already some userspace code which +use O_CLOEXEC: + +- in systemd's readahead[2]: + + fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME); + +- in clsync[3]: + + #define FANOTIFY_EVFLAGS (O_LARGEFILE|O_RDONLY|O_CLOEXEC) + + int fanotify_d = fanotify_init(FANOTIFY_FLAGS, FANOTIFY_EVFLAGS); + +- in examples [4] from "Filesystem monitoring in the Linux + kernel" article[5] by Aleksander Morgado: + + if ((fanotify_fd = fanotify_init (FAN_CLOEXEC, + O_RDONLY | O_CLOEXEC | O_LARGEFILE)) < 0) + +Additionally, since commit 48149e9d3a7e ("fanotify: check file flags +passed in fanotify_init"), having O_CLOEXEC as part of fanotify_init() +second argument is expressly allowed. + +So it seems expected to set close-on-exec flag on the file descriptors if +userspace is allowed to request it with O_CLOEXEC. + +But Andrew Morton raised[6] the concern that enabling now close-on-exec +might break existing applications which ask for O_CLOEXEC but expect the +file descriptor to be inherited across exec(). + +On the other hand, as reported by Mihai Dontu[7] close-on-exec on the file +descriptor returned as part of file access notify can break applications +due to deadlock. So close-on-exec is needed for most applications. + +More, applications asking for close-on-exec are likely expecting it to be +enabled, relying on O_CLOEXEC being effective. If not, it might weaken +their security, as noted by Jan Kara[8]. + +So this patch replaces call to macro get_unused_fd() by a call to function +get_unused_fd_flags() with event_f_flags value as argument. This way +O_CLOEXEC flag in the second argument of fanotify_init(2) syscall is +interpreted and close-on-exec gets enabled when requested.
+ +[1] http://man7.org/linux/man-pages/man2/fanotify_init.2.html +[2] http://cgit.freedesktop.org/systemd/systemd/tree/src/readahead/readahead-collect.c?id=v208#n294 +[3] https://github.com/xaionaro/clsync/blob/v0.2.1/sync.c#L1631 + https://github.com/xaionaro/clsync/blob/v0.2.1/configuration.h#L38 +[4] http://www.lanedo.com/~aleksander/fanotify/fanotify-example.c +[5] http://www.lanedo.com/2013/filesystem-monitoring-linux-kernel/ +[6] http://lkml.kernel.org/r/20141001153621.65e9258e65a6167bf2e4cb50@linux-foundation.org +[7] http://lkml.kernel.org/r/20141002095046.3715eb69@mdontu-l +[8] http://lkml.kernel.org/r/20141002104410.GB19748@quack.suse.cz + +Link: http://lkml.kernel.org/r/cover.1411562410.git.ydroneaud@opteya.com +Signed-off-by: Yann Droneaud <ydroneaud@opteya.com> +Reviewed-by: Jan Kara <jack@suse.cz> +Reviewed-by: Heinrich Schuchardt <xypron.glpk@gmx.de> +Tested-by: Heinrich Schuchardt <xypron.glpk@gmx.de> +Cc: Mihai Donțu <mihai.dontu@gmail.com> +Cc: Pádraig Brady <P@draigBrady.com> +Cc: Heinrich Schuchardt <xypron.glpk@gmx.de> +Cc: Jan Kara <jack@suse.cz> +Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu> +Cc: Michael Kerrisk-manpages <mtk.manpages@gmail.com> +Cc: Lino Sanfilippo <LinoSanfilippo@gmx.de> +Cc: Richard Guy Briggs <rgb@redhat.com> +Cc: Eric Paris <eparis@redhat.com> +Cc: Al Viro <viro@zeniv.linux.org.uk> +Cc: Michael Kerrisk <mtk.manpages@gmail.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + fs/notify/fanotify/fanotify_user.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/notify/fanotify/fanotify_user.c ++++ b/fs/notify/fanotify/fanotify_user.c +@@ -67,7 +67,7 @@ static int create_fd(struct fsnotify_gro + + pr_debug("%s: group=%p event=%p\n", __func__, group, event); + +- client_fd = get_unused_fd(); ++ client_fd = get_unused_fd_flags(group->fanotify_data.f_flags); + if (client_fd < 0) +
return client_fd; + diff --git a/patches/firmware_class-make-sure-fw-requests-contain-a-name.patch b/patches/firmware_class-make-sure-fw-requests-contain-a-name.patch new file mode 100644 index 0000000..873a937 --- /dev/null +++ b/patches/firmware_class-make-sure-fw-requests-contain-a-name.patch @@ -0,0 +1,36 @@ +From 471b095dfe0d693a8d624cbc716d1ee4d74eb437 Mon Sep 17 00:00:00 2001 +From: Kees Cook <keescook@chromium.org> +Date: Thu, 18 Sep 2014 11:25:37 -0700 +Subject: firmware_class: make sure fw requests contain a name + +commit 471b095dfe0d693a8d624cbc716d1ee4d74eb437 upstream. + +An empty firmware request name will trigger warnings when building +device names. Make sure this is caught earlier and rejected. + +The warning was visible via the test_firmware.ko module interface: + +echo -ne "\x00" > /sys/devices/virtual/misc/test_firmware/trigger_request + +Reported-by: Sasha Levin <sasha.levin@oracle.com> +Signed-off-by: Kees Cook <keescook@chromium.org> +Tested-by: Sasha Levin <sasha.levin@oracle.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +[lizf: Backported to 3.4: adjust context] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/base/firmware_class.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/drivers/base/firmware_class.c ++++ b/drivers/base/firmware_class.c +@@ -588,6 +588,9 @@ request_firmware(const struct firmware * + struct firmware_priv *fw_priv; + int ret; + ++ if (!name || name[0] == '\0') ++ return -EINVAL; ++ + fw_priv = _request_firmware_prepare(firmware_p, name, device, true, + false); + if (IS_ERR_OR_NULL(fw_priv)) diff --git a/patches/fix-misuses-of-f_count-in-ppp-and-netlink.patch b/patches/fix-misuses-of-f_count-in-ppp-and-netlink.patch new file mode 100644 index 0000000..9c3f97e --- /dev/null +++ b/patches/fix-misuses-of-f_count-in-ppp-and-netlink.patch @@ -0,0 +1,43 @@ +From 24dff96a37a2ca319e75a74d3929b2de22447ca6 Mon Sep 17 00:00:00 2001 +From: Al Viro <viro@zeniv.linux.org.uk> +Date: Wed, 8 Oct 
2014 23:44:00 -0400 +Subject: fix misuses of f_count() in ppp and netlink + +commit 24dff96a37a2ca319e75a74d3929b2de22447ca6 upstream. + +we used to check for "nobody else could start doing anything with +that opened file" by checking that refcount was 2 or less - one +for descriptor table and one we'd acquired in fget() on the way to +wherever we are. That was race-prone (somebody else might have +had a reference to descriptor table and do fget() just as we'd +been checking) and it had become flat-out incorrect back when +we switched to fget_light() on those codepaths - unlike fget(), +it doesn't grab an extra reference unless the descriptor table +is shared. The same change allowed a race-free check, though - +we are safe exactly when refcount is less than 2. + +It was a long time ago; pre-2.6.12 for ioctl() (the codepath leading +to ppp one) and 2.6.17 for sendmsg() (netlink one). OTOH, +netlink hadn't grown that check until 3.9 and ppp used to live +in drivers/net, not drivers/net/ppp until 3.1. The bug existed +well before that, though, and the same fix used to apply in old +location of file. 
+ +Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> +[lizf: Backported to 3.4: drop the change to netlink_mmap_sendmsg()] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/net/ppp/ppp_generic.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/ppp/ppp_generic.c ++++ b/drivers/net/ppp/ppp_generic.c +@@ -588,7 +588,7 @@ static long ppp_ioctl(struct file *file, + if (file == ppp->owner) + ppp_shutdown_interface(ppp); + } +- if (atomic_long_read(&file->f_count) <= 2) { ++ if (atomic_long_read(&file->f_count) < 2) { + ppp_release(NULL, file); + err = 0; + } else diff --git a/patches/framebuffer-fix-border-color.patch b/patches/framebuffer-fix-border-color.patch new file mode 100644 index 0000000..3b839b3 --- /dev/null +++ b/patches/framebuffer-fix-border-color.patch @@ -0,0 +1,97 @@ +From f74a289b9480648a654e5afd8458c2263c03a1e1 Mon Sep 17 00:00:00 2001 +From: Mikulas Patocka <mpatocka@redhat.com> +Date: Tue, 16 Sep 2014 12:40:26 -0400 +Subject: framebuffer: fix border color + +commit f74a289b9480648a654e5afd8458c2263c03a1e1 upstream. + +The framebuffer code uses the current background color to fill the border +when switching consoles, however, this results in inconsistent behavior. +For example: +- start Midnight Commander +- the border is black +- switch to another console and switch back +- the border is cyan +- type something into the command line in mc +- the border is cyan +- switch to another console and switch back +- the border is black +- press F9 to go to menu +- the border is black +- switch to another console and switch back +- the border is dark blue + +When switching to a console with Midnight Commander, the border is random +color that was left selected by the slang subsystem. + +This patch fixes this inconsistency by always using black as the +background color when switching consoles. 
+ +Signed-off-by: Mikulas Patocka <mpatocka@redhat.com> +Signed-off-by: Tomi Valkeinen <tomi.valkeinen@ti.com> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/video/console/bitblit.c | 3 +-- + drivers/video/console/fbcon_ccw.c | 3 +-- + drivers/video/console/fbcon_cw.c | 3 +-- + drivers/video/console/fbcon_ud.c | 3 +-- + 4 files changed, 4 insertions(+), 8 deletions(-) + +--- a/drivers/video/console/bitblit.c ++++ b/drivers/video/console/bitblit.c +@@ -205,7 +205,6 @@ static void bit_putcs(struct vc_data *vc + static void bit_clear_margins(struct vc_data *vc, struct fb_info *info, + int bottom_only) + { +- int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; + unsigned int cw = vc->vc_font.width; + unsigned int ch = vc->vc_font.height; + unsigned int rw = info->var.xres - (vc->vc_cols*cw); +@@ -214,7 +213,7 @@ static void bit_clear_margins(struct vc_ + unsigned int bs = info->var.yres - bh; + struct fb_fillrect region; + +- region.color = attr_bgcol_ec(bgshift, vc, info); ++ region.color = 0; + region.rop = ROP_COPY; + + if (rw && !bottom_only) { +--- a/drivers/video/console/fbcon_ccw.c ++++ b/drivers/video/console/fbcon_ccw.c +@@ -197,9 +197,8 @@ static void ccw_clear_margins(struct vc_ + unsigned int bh = info->var.xres - (vc->vc_rows*ch); + unsigned int bs = vc->vc_rows*ch; + struct fb_fillrect region; +- int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; + +- region.color = attr_bgcol_ec(bgshift,vc,info); ++ region.color = 0; + region.rop = ROP_COPY; + + if (rw && !bottom_only) { +--- a/drivers/video/console/fbcon_cw.c ++++ b/drivers/video/console/fbcon_cw.c +@@ -181,9 +181,8 @@ static void cw_clear_margins(struct vc_d + unsigned int bh = info->var.xres - (vc->vc_rows*ch); + unsigned int rs = info->var.yres - rw; + struct fb_fillrect region; +- int bgshift = (vc->vc_hi_font_mask) ? 
13 : 12; + +- region.color = attr_bgcol_ec(bgshift,vc,info); ++ region.color = 0; + region.rop = ROP_COPY; + + if (rw && !bottom_only) { +--- a/drivers/video/console/fbcon_ud.c ++++ b/drivers/video/console/fbcon_ud.c +@@ -227,9 +227,8 @@ static void ud_clear_margins(struct vc_d + unsigned int rw = info->var.xres - (vc->vc_cols*cw); + unsigned int bh = info->var.yres - (vc->vc_rows*ch); + struct fb_fillrect region; +- int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; + +- region.color = attr_bgcol_ec(bgshift,vc,info); ++ region.color = 0; + region.rop = ROP_COPY; + + if (rw && !bottom_only) { diff --git a/patches/freezer-do-not-freeze-tasks-killed-by-oom-killer.patch b/patches/freezer-do-not-freeze-tasks-killed-by-oom-killer.patch new file mode 100644 index 0000000..004b683 --- /dev/null +++ b/patches/freezer-do-not-freeze-tasks-killed-by-oom-killer.patch @@ -0,0 +1,51 @@ +From 51fae6da640edf9d266c94f36bc806c63c301991 Mon Sep 17 00:00:00 2001 +From: Cong Wang <xiyou.wangcong@gmail.com> +Date: Tue, 21 Oct 2014 09:27:12 +0200 +Subject: freezer: Do not freeze tasks killed by OOM killer + +commit 51fae6da640edf9d266c94f36bc806c63c301991 upstream. + +Since f660daac474c6f (oom: thaw threads if oom killed thread is frozen +before deferring) OOM killer relies on being able to thaw a frozen task +to handle OOM situation but a3201227f803 (freezer: make freezing() test +freeze conditions in effect instead of TIF_FREEZE) has reorganized the +code and stopped clearing freeze flag in __thaw_task. This means that +the target task only wakes up and goes into the fridge again because the +freezing condition hasn't changed for it. This reintroduces the bug +fixed by f660daac474c6f. + +Fix the issue by checking for TIF_MEMDIE thread flag in +freezing_slow_path and exclude the task from freezing completely. If a +task was already frozen it would get woken by __thaw_task from OOM killer +and get out of freezer after rechecking freezing(). 
+ +Changes since v1 +- put TIF_MEMDIE check into freezing_slowpath rather than in __refrigerator + as per Oleg +- return __thaw_task into oom_scan_process_thread because + oom_kill_process will not wake task in the fridge because it is + sleeping uninterruptible + +[mhocko@suse.cz: rewrote the changelog] +Fixes: a3201227f803 (freezer: make freezing() test freeze conditions in effect instead of TIF_FREEZE) +Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com> +Signed-off-by: Michal Hocko <mhocko@suse.cz> +Acked-by: Oleg Nesterov <oleg@redhat.com> +Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + kernel/freezer.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/kernel/freezer.c ++++ b/kernel/freezer.c +@@ -36,6 +36,9 @@ bool freezing_slow_path(struct task_stru + if (p->flags & PF_NOFREEZE) + return false; + ++ if (test_thread_flag(TIF_MEMDIE)) ++ return false; ++ + if (pm_nosig_freezing || cgroup_freezing(p)) + return true; + diff --git a/patches/fs-fix-theoretical-division-by-0-in-super_cache_scan.patch b/patches/fs-fix-theoretical-division-by-0-in-super_cache_scan.patch new file mode 100644 index 0000000..375a2b1 --- /dev/null +++ b/patches/fs-fix-theoretical-division-by-0-in-super_cache_scan.patch @@ -0,0 +1,35 @@ +From 475d0db742e3755c6b267f48577ff7cbb7dfda0d Mon Sep 17 00:00:00 2001 +From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> +Date: Sat, 17 May 2014 20:56:38 +0900 +Subject: fs: Fix theoretical division by 0 in super_cache_scan(). + +commit 475d0db742e3755c6b267f48577ff7cbb7dfda0d upstream. + +total_objects could be 0 and is used as a denom. + +While total_objects is a "long", total_objects == 0 unlikely happens for +3.12 and later kernels because 32-bit architectures would not be able to +hold (1 << 32) objects. However, total_objects == 0 may happen for kernels +between 3.1 and 3.11 because total_objects in prune_super() was an "int" +and (e.g.) 
x86_64 architecture might be able to hold (1 << 32) objects. + +Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> +Reviewed-by: Christoph Hellwig <hch@lst.de> +Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> +[lizf: Backported to 3.4: adjust context] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + fs/super.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/super.c ++++ b/fs/super.c +@@ -69,6 +69,8 @@ static int prune_super(struct shrinker * + + total_objects = sb->s_nr_dentry_unused + + sb->s_nr_inodes_unused + fs_objects + 1; ++ if (!total_objects) ++ total_objects = 1; + + if (sc->nr_to_scan) { + int dentries; diff --git a/patches/fs-make-cont_expand_zero-interruptible.patch b/patches/fs-make-cont_expand_zero-interruptible.patch new file mode 100644 index 0000000..2306f6e --- /dev/null +++ b/patches/fs-make-cont_expand_zero-interruptible.patch @@ -0,0 +1,39 @@ +From c2ca0fcd202863b14bd041a7fece2e789926c225 Mon Sep 17 00:00:00 2001 +From: Mikulas Patocka <mpatocka@redhat.com> +Date: Sun, 27 Jul 2014 13:00:41 -0400 +Subject: fs: make cont_expand_zero interruptible + +commit c2ca0fcd202863b14bd041a7fece2e789926c225 upstream. + +This patch makes it possible to kill a process looping in +cont_expand_zero. A process may spend a lot of time in this function, so +it is desirable to be able to kill it. + +It happened to me that I wanted to copy a piece data from the disk to a +file. By mistake, I used the "seek" parameter to dd instead of "skip". Due +to the "seek" parameter, dd attempted to extend the file and became stuck +doing so - the only possibility was to reset the machine or wait many +hours until the filesystem runs out of space and cont_expand_zero fails. +We need this patch to be able to terminate the process. 
+ +Signed-off-by: Mikulas Patocka <mpatocka@redhat.com> +Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + fs/buffer.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/fs/buffer.c ++++ b/fs/buffer.c +@@ -2221,6 +2221,11 @@ static int cont_expand_zero(struct file + err = 0; + + balance_dirty_pages_ratelimited(mapping); ++ ++ if (unlikely(fatal_signal_pending(current))) { ++ err = -EINTR; ++ goto out; ++ } + } + + /* page covers the boundary, find the boundary offset */ diff --git a/patches/futex-ensure-get_futex_key_refs-always-implies-a-barrier.patch b/patches/futex-ensure-get_futex_key_refs-always-implies-a-barrier.patch new file mode 100644 index 0000000..9ebd262 --- /dev/null +++ b/patches/futex-ensure-get_futex_key_refs-always-implies-a-barrier.patch @@ -0,0 +1,52 @@ +From 76835b0ebf8a7fe85beb03c75121419a7dec52f0 Mon Sep 17 00:00:00 2001 +From: Catalin Marinas <catalin.marinas@arm.com> +Date: Fri, 17 Oct 2014 17:38:49 +0100 +Subject: futex: Ensure get_futex_key_refs() always implies a barrier + +commit 76835b0ebf8a7fe85beb03c75121419a7dec52f0 upstream. + +Commit b0c29f79ecea (futexes: Avoid taking the hb->lock if there's +nothing to wake up) changes the futex code to avoid taking a lock when +there are no waiters. This code has been subsequently fixed in commit +11d4616bd07f (futex: revert back to the explicit waiter counting code). +Both the original commit and the fix-up rely on get_futex_key_refs() to +always imply a barrier. + +However, for private futexes, none of the cases in the switch statement +of get_futex_key_refs() would be hit and the function completes without +a memory barrier as required before checking the "waiters" in +futex_wake() -> hb_waiters_pending(). The consequence is a race with a +thread waiting on a futex on another CPU, allowing the waker thread to +read "waiters == 0" while the waiter thread to have read "futex_val == +locked" (in kernel). 
+ +Without this fix, the problem (user space deadlocks) can be seen with +Android bionic's mutex implementation on an arm64 multi-cluster system. + +Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> +Reported-by: Matteo Franchin <Matteo.Franchin@arm.com> +Fixes: b0c29f79ecea (futexes: Avoid taking the hb->lock if there's nothing to wake up) +Acked-by: Davidlohr Bueso <dave@stgolabs.net> +Tested-by: Mike Galbraith <umgwanakikbuti@gmail.com> +Cc: Darren Hart <dvhart@linux.intel.com> +Cc: Thomas Gleixner <tglx@linutronix.de> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Ingo Molnar <mingo@kernel.org> +Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> +Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + kernel/futex.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -212,6 +212,8 @@ static void drop_futex_key_refs(union fu + case FUT_OFF_MMSHARED: + mmdrop(key->private.mm); + break; ++ default: ++ smp_mb(); /* explicit MB (B) */ + } + } + diff --git a/patches/input-i8042-add-noloop-quirk-for-asus-x750ln.patch b/patches/input-i8042-add-noloop-quirk-for-asus-x750ln.patch new file mode 100644 index 0000000..e3993a7 --- /dev/null +++ b/patches/input-i8042-add-noloop-quirk-for-asus-x750ln.patch @@ -0,0 +1,34 @@ +From 9ff84a17302aeb8913ff244ecc0d8f9d219fecb5 Mon Sep 17 00:00:00 2001 +From: Hans de Goede <hdegoede@redhat.com> +Date: Sat, 11 Oct 2014 11:27:37 -0700 +Subject: Input: i8042 - add noloop quirk for Asus X750LN + +commit 9ff84a17302aeb8913ff244ecc0d8f9d219fecb5 upstream. + +Without this the aux port does not get detected, and consequently the +touchpad will not work. 
+ +https://bugzilla.redhat.com/show_bug.cgi?id=1110011 + +Signed-off-by: Hans de Goede <hdegoede@redhat.com> +Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/input/serio/i8042-x86ia64io.h | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/drivers/input/serio/i8042-x86ia64io.h ++++ b/drivers/input/serio/i8042-x86ia64io.h +@@ -101,6 +101,12 @@ static const struct dmi_system_id __init + }, + { + .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), ++ DMI_MATCH(DMI_PRODUCT_NAME, "X750LN"), ++ }, ++ }, ++ { ++ .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Compaq"), + DMI_MATCH(DMI_PRODUCT_NAME , "ProLiant"), + DMI_MATCH(DMI_PRODUCT_VERSION, "8500"), diff --git a/patches/input-synaptics-gate-forcepad-support-by-dmi-check.patch b/patches/input-synaptics-gate-forcepad-support-by-dmi-check.patch new file mode 100644 index 0000000..1db08a8 --- /dev/null +++ b/patches/input-synaptics-gate-forcepad-support-by-dmi-check.patch @@ -0,0 +1,96 @@ +From aa972409951e0675e07918620427517cad5090e0 Mon Sep 17 00:00:00 2001 +From: Dmitry Torokhov <dmitry.torokhov@gmail.com> +Date: Tue, 2 Sep 2014 09:49:18 -0700 +Subject: Input: synaptics - gate forcepad support by DMI check + +commit aa972409951e0675e07918620427517cad5090e0 upstream. + +Unfortunately, ForcePad capability is not actually exported over PS/2, so +we have to resort to DMI checks. 
+ +Reported-by: Nicole Faerber <nicole.faerber@kernelconcepts.de> +Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com> +[lizf: Backported to 3.4: adjust context] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/input/mouse/synaptics.c | 22 +++++++++++++++++++++- + drivers/input/mouse/synaptics.h | 8 ++------ + 2 files changed, 23 insertions(+), 7 deletions(-) + +--- a/drivers/input/mouse/synaptics.c ++++ b/drivers/input/mouse/synaptics.c +@@ -506,6 +506,8 @@ static void synaptics_parse_agm(const un + priv->agm_pending = true; + } + ++static bool is_forcepad; ++ + static int synaptics_parse_hw_state(const unsigned char buf[], + struct synaptics_data *priv, + struct synaptics_hw_state *hw) +@@ -535,7 +537,7 @@ static int synaptics_parse_hw_state(cons + hw->left = (buf[0] & 0x01) ? 1 : 0; + hw->right = (buf[0] & 0x02) ? 1 : 0; + +- if (SYN_CAP_FORCEPAD(priv->ext_cap_0c)) { ++ if (is_forcepad) { + /* + * ForcePads, like Clickpads, use middle button + * bits to report primary button clicks. +@@ -1512,6 +1514,18 @@ static const struct dmi_system_id min_ma + { } + }; + ++static const struct dmi_system_id forcepad_dmi_table[] __initconst = { ++#if defined(CONFIG_DMI) && defined(CONFIG_X86) ++ { ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), ++ DMI_MATCH(DMI_PRODUCT_NAME, "HP EliteBook Folio 1040 G1"), ++ }, ++ }, ++#endif ++ { } ++}; ++ + void __init synaptics_module_init(void) + { + const struct dmi_system_id *min_max_dmi; +@@ -1522,6 +1536,12 @@ void __init synaptics_module_init(void) + min_max_dmi = dmi_first_match(min_max_dmi_table); + if (min_max_dmi) + quirk_min_max = min_max_dmi->driver_data; ++ ++ /* ++ * Unfortunately ForcePad capability is not exported over PS/2, ++ * so we have to resort to checking DMI. 
++ */ ++ is_forcepad = dmi_check_system(forcepad_dmi_table); + } + + static int __synaptics_init(struct psmouse *psmouse, bool absolute_mode) +--- a/drivers/input/mouse/synaptics.h ++++ b/drivers/input/mouse/synaptics.h +@@ -76,12 +76,9 @@ + * for noise. + * 2 0x08 image sensor image sensor tracks 5 fingers, but only + * reports 2. ++ * 2 0x01 uniform clickpad whole clickpad moves instead of being ++ * hinged at the top. + * 2 0x20 report min query 0x0f gives min coord reported +- * 2 0x80 forcepad forcepad is a variant of clickpad that +- * does not have physical buttons but rather +- * uses pressure above certain threshold to +- * report primary clicks. Forcepads also have +- * clickpad bit set. + */ + #define SYN_CAP_CLICKPAD(ex0c) ((ex0c) & 0x100000) /* 1-button ClickPad */ + #define SYN_CAP_CLICKPAD2BTN(ex0c) ((ex0c) & 0x000100) /* 2-button ClickPad */ +@@ -90,7 +87,6 @@ + #define SYN_CAP_ADV_GESTURE(ex0c) ((ex0c) & 0x080000) + #define SYN_CAP_REDUCED_FILTERING(ex0c) ((ex0c) & 0x000400) + #define SYN_CAP_IMAGE_SENSOR(ex0c) ((ex0c) & 0x000800) +-#define SYN_CAP_FORCEPAD(ex0c) ((ex0c) & 0x008000) + + /* synaptics modes query bits */ + #define SYN_MODE_ABSOLUTE(m) ((m) & (1 << 7)) diff --git a/patches/introduce-for_each_thread-to-replace-the-buggy-while_each_thread.patch b/patches/introduce-for_each_thread-to-replace-the-buggy-while_each_thread.patch new file mode 100644 index 0000000..4499e91 --- /dev/null +++ b/patches/introduce-for_each_thread-to-replace-the-buggy-while_each_thread.patch @@ -0,0 +1,152 @@ +From 0c740d0afc3bff0a097ad03a1c8df92757516f5c Mon Sep 17 00:00:00 2001 +From: Oleg Nesterov <oleg@redhat.com> +Date: Tue, 21 Jan 2014 15:49:56 -0800 +Subject: introduce for_each_thread() to replace the buggy while_each_thread() + +commit 0c740d0afc3bff0a097ad03a1c8df92757516f5c upstream. + +while_each_thread() and next_thread() should die, almost every lockless +usage is wrong. + +1. Unless g == current, the lockless while_each_thread() is not safe. 
+ + while_each_thread(g, t) can loop forever if g exits, next_thread() + can't reach the unhashed thread in this case. Note that this can + happen even if g is the group leader, it can exec. + +2. Even if while_each_thread() itself was correct, people often use + it wrongly. + + It was never safe to just take rcu_read_lock() and loop unless + you verify that pid_alive(g) == T, even the first next_thread() + can point to the already freed/reused memory. + +This patch adds signal_struct->thread_head and task->thread_node to +create the normal rcu-safe list with the stable head. The new +for_each_thread(g, t) helper is always safe under rcu_read_lock() as +long as this task_struct can't go away. + +Note: of course it is ugly to have both task_struct->thread_node and the +old task_struct->thread_group, we will kill it later, after we change +the users of while_each_thread() to use for_each_thread(). + +Perhaps we can kill it even before we convert all users, we can +reimplement next_thread(t) using the new thread_head/thread_node. But +we can't do this right now because this will lead to subtle behavioural +changes. For example, do/while_each_thread() always sees at least one +task, while for_each_thread() can do nothing if the whole thread group +has died. Or thread_group_empty(), currently its semantics is not clear +unless thread_group_leader(p) and we need to audit the callers before we +can change it. + +So this patch adds the new interface which has to coexist with the old +one for some time, hopefully the next changes will be more or less +straightforward and the old one will go away soon. + +Signed-off-by: Oleg Nesterov <oleg@redhat.com> +Reviewed-by: Sergey Dyasly <dserrg@gmail.com> +Tested-by: Sergey Dyasly <dserrg@gmail.com> +Reviewed-by: Sameer Nanda <snanda@chromium.org> +Acked-by: David Rientjes <rientjes@google.com> +Cc: "Eric W. 
Biederman" <ebiederm@xmission.com> +Cc: Frederic Weisbecker <fweisbec@gmail.com> +Cc: Mandeep Singh Baines <msb@chromium.org> +Cc: "Ma, Xindong" <xindong.ma@intel.com> +Cc: Michal Hocko <mhocko@suse.cz> +Cc: "Tu, Xiaobing" <xiaobing.tu@intel.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + include/linux/init_task.h | 2 ++ + include/linux/sched.h | 12 ++++++++++++ + kernel/exit.c | 1 + + kernel/fork.c | 7 +++++++ + 4 files changed, 22 insertions(+) + +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -38,6 +38,7 @@ extern struct fs_struct init_fs; + + #define INIT_SIGNALS(sig) { \ + .nr_threads = 1, \ ++ .thread_head = LIST_HEAD_INIT(init_task.thread_node), \ + .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\ + .shared_pending = { \ + .list = LIST_HEAD_INIT(sig.shared_pending.list), \ +@@ -202,6 +203,7 @@ extern struct task_group root_task_group + [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \ + }, \ + .thread_group = LIST_HEAD_INIT(tsk.thread_group), \ ++ .thread_node = LIST_HEAD_INIT(init_signals.thread_head), \ + INIT_IDS \ + INIT_PERF_EVENTS(tsk) \ + INIT_TRACE_IRQFLAGS \ +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -534,6 +534,7 @@ struct signal_struct { + atomic_t sigcnt; + atomic_t live; + int nr_threads; ++ struct list_head thread_head; + + wait_queue_head_t wait_chldexit; /* for wait4() */ + +@@ -1394,6 +1395,7 @@ struct task_struct { + /* PID/PID hash table linkage. 
*/ + struct pid_link pids[PIDTYPE_MAX]; + struct list_head thread_group; ++ struct list_head thread_node; + + struct completion *vfork_done; /* for vfork() */ + int __user *set_child_tid; /* CLONE_CHILD_SETTID */ +@@ -2397,6 +2399,16 @@ extern bool current_is_single_threaded(v + #define while_each_thread(g, t) \ + while ((t = next_thread(t)) != g) + ++#define __for_each_thread(signal, t) \ ++ list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node) ++ ++#define for_each_thread(p, t) \ ++ __for_each_thread((p)->signal, t) ++ ++/* Careful: this is a double loop, 'break' won't work as expected. */ ++#define for_each_process_thread(p, t) \ ++ for_each_process(p) for_each_thread(p, t) ++ + static inline int get_nr_threads(struct task_struct *tsk) + { + return tsk->signal->nr_threads; +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -74,6 +74,7 @@ static void __unhash_process(struct task + __this_cpu_dec(process_counts); + } + list_del_rcu(&p->thread_group); ++ list_del_rcu(&p->thread_node); + } + + /* +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -1026,6 +1026,11 @@ static int copy_signal(unsigned long clo + sig->nr_threads = 1; + atomic_set(&sig->live, 1); + atomic_set(&sig->sigcnt, 1); ++ ++ /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ ++ sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); ++ tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head); ++ + init_waitqueue_head(&sig->wait_chldexit); + if (clone_flags & CLONE_NEWPID) + sig->flags |= SIGNAL_UNKILLABLE; +@@ -1433,6 +1438,8 @@ static struct task_struct *copy_process( + p->group_leader = current->group_leader; + list_add_tail_rcu(&p->thread_group, + &p->group_leader->thread_group); ++ list_add_tail_rcu(&p->thread_node, ++ &p->signal->thread_head); + } + attach_pid(p, PIDTYPE_PID, pid); + nr_threads++; diff --git a/patches/kernel-add-support-for-gcc-5.patch b/patches/kernel-add-support-for-gcc-5.patch new file mode 100644 index 0000000..10516d6 --- 
/dev/null +++ b/patches/kernel-add-support-for-gcc-5.patch @@ -0,0 +1,93 @@ +From 71458cfc782eafe4b27656e078d379a34e472adf Mon Sep 17 00:00:00 2001 +From: Sasha Levin <sasha.levin@oracle.com> +Date: Mon, 13 Oct 2014 15:51:05 -0700 +Subject: kernel: add support for gcc 5 + +commit 71458cfc782eafe4b27656e078d379a34e472adf upstream. + +We're missing include/linux/compiler-gcc5.h which is required now +because gcc branched off to v5 in trunk. + +Just copy the relevant bits out of include/linux/compiler-gcc4.h, +no new code is added as of now. + +This fixes a build error when using gcc 5. + +Signed-off-by: Sasha Levin <sasha.levin@oracle.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + include/linux/compiler-gcc5.h | 66 ++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 66 insertions(+) + create mode 100644 include/linux/compiler-gcc5.h + +--- /dev/null ++++ b/include/linux/compiler-gcc5.h +@@ -0,0 +1,66 @@ ++#ifndef __LINUX_COMPILER_H ++#error "Please don't include <linux/compiler-gcc5.h> directly, include <linux/compiler.h> instead." ++#endif ++ ++#define __used __attribute__((__used__)) ++#define __must_check __attribute__((warn_unused_result)) ++#define __compiler_offsetof(a, b) __builtin_offsetof(a, b) ++ ++/* Mark functions as cold. gcc will assume any path leading to a call ++ to them will be unlikely. This means a lot of manual unlikely()s ++ are unnecessary now for any paths leading to the usual suspects ++ like BUG(), printk(), panic() etc. [but let's keep them for now for ++ older compilers] ++ ++ Early snapshots of gcc 4.3 don't support this and we can't detect this ++ in the preprocessor, but we can live with this because they're unreleased. ++ Maketime probing would be overkill here. 
++ ++ gcc also has a __attribute__((__hot__)) to move hot functions into ++ a special section, but I don't see any sense in this right now in ++ the kernel context */ ++#define __cold __attribute__((__cold__)) ++ ++#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) ++ ++#ifndef __CHECKER__ ++# define __compiletime_warning(message) __attribute__((warning(message))) ++# define __compiletime_error(message) __attribute__((error(message))) ++#endif /* __CHECKER__ */ ++ ++/* ++ * Mark a position in code as unreachable. This can be used to ++ * suppress control flow warnings after asm blocks that transfer ++ * control elsewhere. ++ * ++ * Early snapshots of gcc 4.5 don't support this and we can't detect ++ * this in the preprocessor, but we can live with this because they're ++ * unreleased. Really, we need to have autoconf for the kernel. ++ */ ++#define unreachable() __builtin_unreachable() ++ ++/* Mark a function definition as prohibited from being cloned. */ ++#define __noclone __attribute__((__noclone__)) ++ ++/* ++ * Tell the optimizer that something else uses this function or variable. ++ */ ++#define __visible __attribute__((externally_visible)) ++ ++/* ++ * GCC 'asm goto' miscompiles certain code sequences: ++ * ++ * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 ++ * ++ * Work it around via a compiler barrier quirk suggested by Jakub Jelinek. ++ * Fixed in GCC 4.8.2 and later versions. ++ * ++ * (asm goto is automatically volatile - the naming reflects this.) ++ */ ++#define asm_volatile_goto(x...) 
do { asm goto(x); asm (""); } while (0) ++ ++#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP ++#define __HAVE_BUILTIN_BSWAP32__ ++#define __HAVE_BUILTIN_BSWAP64__ ++#define __HAVE_BUILTIN_BSWAP16__ ++#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */ diff --git a/patches/kernel-fork.c-copy_process-unify-clone_thread-or-thread_group_leader-code.patch b/patches/kernel-fork.c-copy_process-unify-clone_thread-or-thread_group_leader-code.patch new file mode 100644 index 0000000..b2403b6 --- /dev/null +++ b/patches/kernel-fork.c-copy_process-unify-clone_thread-or-thread_group_leader-code.patch @@ -0,0 +1,57 @@ +From 80628ca06c5d42929de6bc22c0a41589a834d151 Mon Sep 17 00:00:00 2001 +From: Oleg Nesterov <oleg@redhat.com> +Date: Wed, 3 Jul 2013 15:08:30 -0700 +Subject: kernel/fork.c:copy_process(): unify + CLONE_THREAD-or-thread_group_leader code + +commit 80628ca06c5d42929de6bc22c0a41589a834d151 upstream. + +Cleanup and preparation for the next changes. + +Move the "if (clone_flags & CLONE_THREAD)" code down under "if +(likely(p->pid))" and turn it into the "else" branch. This makes the +process/thread initialization more symmetrical and removes one check. + +Signed-off-by: Oleg Nesterov <oleg@redhat.com> +Cc: "Eric W. 
Biederman" <ebiederm@xmission.com> +Cc: Michal Hocko <mhocko@suse.cz> +Cc: Pavel Emelyanov <xemul@parallels.com> +Cc: Sergey Dyasly <dserrg@gmail.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + kernel/fork.c | 15 +++++++-------- + 1 file changed, 7 insertions(+), 8 deletions(-) + +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -1412,14 +1412,6 @@ static struct task_struct *copy_process( + goto bad_fork_free_pid; + } + +- if (clone_flags & CLONE_THREAD) { +- current->signal->nr_threads++; +- atomic_inc(&current->signal->live); +- atomic_inc(&current->signal->sigcnt); +- p->group_leader = current->group_leader; +- list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); +- } +- + if (likely(p->pid)) { + ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); + +@@ -1434,6 +1426,13 @@ static struct task_struct *copy_process( + list_add_tail(&p->sibling, &p->real_parent->children); + list_add_tail_rcu(&p->tasks, &init_task.tasks); + __this_cpu_inc(process_counts); ++ } else { ++ current->signal->nr_threads++; ++ atomic_inc(&current->signal->live); ++ atomic_inc(&current->signal->sigcnt); ++ p->group_leader = current->group_leader; ++ list_add_tail_rcu(&p->thread_group, ++ &p->group_leader->thread_group); + } + attach_pid(p, PIDTYPE_PID, pid); + nr_threads++; diff --git a/patches/kvm-don-t-take-vcpu-mutex-for-obviously-invalid-vcpu-ioctls.patch b/patches/kvm-don-t-take-vcpu-mutex-for-obviously-invalid-vcpu-ioctls.patch new file mode 100644 index 0000000..d76eaf2 --- /dev/null +++ b/patches/kvm-don-t-take-vcpu-mutex-for-obviously-invalid-vcpu-ioctls.patch @@ -0,0 +1,44 @@ +From 2ea75be3219571d0ec009ce20d9971e54af96e09 Mon Sep 17 00:00:00 2001 +From: David Matlack <dmatlack@google.com> +Date: Fri, 19 Sep 2014 16:03:25 -0700 +Subject: kvm: don't take vcpu mutex for obviously invalid vcpu ioctls + +commit 2ea75be3219571d0ec009ce20d9971e54af96e09 upstream. 
+ +vcpu ioctls can hang the calling thread if issued while a vcpu is running. +However, invalid ioctls can happen when userspace tries to probe the kind +of file descriptors (e.g. isatty() calls ioctl(TCGETS)); in that case, +we know the ioctl is going to be rejected as invalid anyway and we can +fail before trying to take the vcpu mutex. + +This patch does not change functionality, it just makes invalid ioctls +fail faster. + +Signed-off-by: David Matlack <dmatlack@google.com> +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +[lizf: Backported to 3.4: adjust context] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + virt/kvm/kvm_main.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -52,6 +52,7 @@ + + #include <asm/processor.h> + #include <asm/io.h> ++#include <asm/ioctl.h> + #include <asm/uaccess.h> + #include <asm/pgtable.h> + +@@ -1744,6 +1745,9 @@ static long kvm_vcpu_ioctl(struct file * + if (vcpu->kvm->mm != current->mm) + return -EIO; + ++ if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) ++ return -EINVAL; ++ + #if defined(CONFIG_S390) || defined(CONFIG_PPC) + /* + * Special cases: vcpu ioctls that are asynchronous to vcpu execution, diff --git a/patches/kvm-fix-excessive-pages-un-pinning-in-kvm_iommu_map-error-path.patch b/patches/kvm-fix-excessive-pages-un-pinning-in-kvm_iommu_map-error-path.patch new file mode 100644 index 0000000..86021c2 --- /dev/null +++ b/patches/kvm-fix-excessive-pages-un-pinning-in-kvm_iommu_map-error-path.patch @@ -0,0 +1,76 @@ +From 3d32e4dbe71374a6780eaf51d719d76f9a9bf22f Mon Sep 17 00:00:00 2001 +From: Quentin Casasnovas <quentin.casasnovas@oracle.com> +Date: Fri, 17 Oct 2014 22:55:59 +0200 +Subject: kvm: fix excessive pages un-pinning in kvm_iommu_map error path. + +commit 3d32e4dbe71374a6780eaf51d719d76f9a9bf22f upstream. 
+ +The third parameter of kvm_unpin_pages() when called from +kvm_iommu_map_pages() is wrong, it should be the number of pages to un-pin +and not the page size. + +This error was facilitated with an inconsistent API: kvm_pin_pages() takes +a size, but kvm_unpin_pages() takes a number of pages, so fix the problem +by matching the two. + +This was introduced by commit 350b8bd ("kvm: iommu: fix the third parameter +of kvm_iommu_put_pages (CVE-2014-3601)"), which fixes the lack of +un-pinning for pages intended to be un-pinned (i.e. memory leak) but +unfortunately potentially aggravated the number of pages we un-pin that +should have stayed pinned. As far as I understand though, the same +practical mitigations apply. + +This issue was found during review of Red Hat 6.6 patches to prepare +Ksplice rebootless updates. + +Thanks to Vegard for his time on a late Friday evening to help me in +understanding this code. + +Fixes: 350b8bd ("kvm: iommu: fix the third parameter of... (CVE-2014-3601)") +Signed-off-by: Quentin Casasnovas <quentin.casasnovas@oracle.com> +Signed-off-by: Vegard Nossum <vegard.nossum@oracle.com> +Signed-off-by: Jamie Iles <jamie.iles@oracle.com> +Reviewed-by: Sasha Levin <sasha.levin@oracle.com> +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +[lizf: Backported to 3.4: adjust context] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + virt/kvm/iommu.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/virt/kvm/iommu.c ++++ b/virt/kvm/iommu.c +@@ -43,13 +43,13 @@ static void kvm_iommu_put_pages(struct k + gfn_t base_gfn, unsigned long npages); + + static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot, +- gfn_t gfn, unsigned long size) ++ gfn_t gfn, unsigned long npages) + { + gfn_t end_gfn; + pfn_t pfn; + + pfn = gfn_to_pfn_memslot(kvm, slot, gfn); +- end_gfn = gfn + (size >> PAGE_SHIFT); ++ end_gfn = gfn + npages; + gfn += 1; + + if (is_error_pfn(pfn)) +@@ -117,7 +117,7 @@ int kvm_iommu_map_pages(struct kvm 
*kvm, + * Pin all pages we are about to map in memory. This is + * important because we unmap and unpin in 4kb steps later. + */ +- pfn = kvm_pin_pages(kvm, slot, gfn, page_size); ++ pfn = kvm_pin_pages(kvm, slot, gfn, page_size >> PAGE_SHIFT); + if (is_error_pfn(pfn)) { + gfn += 1; + continue; +@@ -129,7 +129,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, + if (r) { + printk(KERN_ERR "kvm_iommu_map_address:" + "iommu failed to map pfn=%llx\n", pfn); +- kvm_unpin_pages(kvm, pfn, page_size); ++ kvm_unpin_pages(kvm, pfn, page_size >> PAGE_SHIFT); + goto unmap_pages; + } + diff --git a/patches/kvm-s390-unintended-fallthrough-for-external-call.patch b/patches/kvm-s390-unintended-fallthrough-for-external-call.patch new file mode 100644 index 0000000..7603736 --- /dev/null +++ b/patches/kvm-s390-unintended-fallthrough-for-external-call.patch @@ -0,0 +1,26 @@ +From f346026e55f1efd3949a67ddd1dcea7c1b9a615e Mon Sep 17 00:00:00 2001 +From: Christian Borntraeger <borntraeger@de.ibm.com> +Date: Wed, 3 Sep 2014 16:21:32 +0200 +Subject: KVM: s390: unintended fallthrough for external call + +commit f346026e55f1efd3949a67ddd1dcea7c1b9a615e upstream. + +We must not fallthrough if the conditions for external call are not met. 
+ +Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com> +Reviewed-by: Thomas Huth <thuth@linux.vnet.ibm.com> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + arch/s390/kvm/interrupt.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/arch/s390/kvm/interrupt.c ++++ b/arch/s390/kvm/interrupt.c +@@ -43,6 +43,7 @@ static int __interrupt_is_deliverable(st + return 0; + if (vcpu->arch.sie_block->gcr[0] & 0x2000ul) + return 1; ++ return 0; + case KVM_S390_INT_EMERGENCY: + if (psw_extint_disabled(vcpu)) + return 0; diff --git a/patches/kvm-vmx-handle-invvpid-vm-exit-gracefully.patch b/patches/kvm-vmx-handle-invvpid-vm-exit-gracefully.patch new file mode 100644 index 0000000..34d3047 --- /dev/null +++ b/patches/kvm-vmx-handle-invvpid-vm-exit-gracefully.patch @@ -0,0 +1,69 @@ +From a642fc305053cc1c6e47e4f4df327895747ab485 Mon Sep 17 00:00:00 2001 +From: Petr Matousek <pmatouse@redhat.com> +Date: Tue, 23 Sep 2014 20:22:30 +0200 +Subject: kvm: vmx: handle invvpid vm exit gracefully + +commit a642fc305053cc1c6e47e4f4df327895747ab485 upstream. + +On systems with invvpid instruction support (corresponding bit in +IA32_VMX_EPT_VPID_CAP MSR is set) guest invocation of invvpid +causes vm exit, which is currently not handled and results in +propagation of unknown exit to userspace. + +Fix this by installing an invvpid vm exit handler. + +This is CVE-2014-3646. 
+ +Signed-off-by: Petr Matousek <pmatouse@redhat.com> +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +[lizf: Backported to 3.4: + - adjust filename + - drop the change to VMX_EXIT_REASON strings] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + arch/x86/include/asm/vmx.h | 1 + + arch/x86/kvm/vmx.c | 9 ++++++++- + 2 files changed, 9 insertions(+), 1 deletion(-) + +--- a/arch/x86/include/asm/vmx.h ++++ b/arch/x86/include/asm/vmx.h +@@ -280,6 +280,7 @@ enum vmcs_field { + #define EXIT_REASON_EPT_VIOLATION 48 + #define EXIT_REASON_EPT_MISCONFIG 49 + #define EXIT_REASON_INVEPT 50 ++#define EXIT_REASON_INVVPID 53 + #define EXIT_REASON_WBINVD 54 + #define EXIT_REASON_XSETBV 55 + +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -5572,6 +5572,12 @@ static int handle_invept(struct kvm_vcpu + return 1; + } + ++static int handle_invvpid(struct kvm_vcpu *vcpu) ++{ ++ kvm_queue_exception(vcpu, UD_VECTOR); ++ return 1; ++} ++ + /* + * The exit handlers return 1 if the exit was handled fully and guest execution + * may resume. Otherwise they set the kvm_run parameter to indicate what needs +@@ -5615,6 +5621,7 @@ static int (*kvm_vmx_exit_handlers[])(st + [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, + [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, + [EXIT_REASON_INVEPT] = handle_invept, ++ [EXIT_REASON_INVVPID] = handle_invvpid, + }; + + static const int kvm_vmx_max_exit_handlers = +@@ -5799,7 +5806,7 @@ static bool nested_vmx_exit_handled(stru + case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: + case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: + case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: +- case EXIT_REASON_INVEPT: ++ case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: + /* + * VMX instructions trap unconditionally. This allows L1 to + * emulate them for its L2 guest, i.e., allows 3-level nesting! 
diff --git a/patches/kvm-x86-check-non-canonical-addresses-upon-wrmsr.patch b/patches/kvm-x86-check-non-canonical-addresses-upon-wrmsr.patch new file mode 100644 index 0000000..29b5ed6 --- /dev/null +++ b/patches/kvm-x86-check-non-canonical-addresses-upon-wrmsr.patch @@ -0,0 +1,140 @@ +From 854e8bb1aa06c578c2c9145fa6bfe3680ef63b23 Mon Sep 17 00:00:00 2001 +From: Nadav Amit <namit@cs.technion.ac.il> +Date: Tue, 16 Sep 2014 03:24:05 +0300 +Subject: KVM: x86: Check non-canonical addresses upon WRMSR + +commit 854e8bb1aa06c578c2c9145fa6bfe3680ef63b23 upstream. + +Upon WRMSR, the CPU should inject #GP if a non-canonical value (address) is +written to certain MSRs. The behavior is "almost" identical for AMD and Intel +(ignoring MSRs that are not implemented in either architecture since they would +anyhow #GP). However, IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if +non-canonical address is written on Intel but not on AMD (which ignores the top +32-bits). + +Accordingly, this patch injects a #GP on the MSRs which behave identically on +Intel and AMD. To eliminate the differences between the architectures, the +value which is written to IA32_SYSENTER_ESP and IA32_SYSENTER_EIP is turned to +canonical value before writing instead of injecting a #GP. + +Some references from Intel and AMD manuals: + +According to Intel SDM description of WRMSR instruction #GP is expected on +WRMSR "If the source register contains a non-canonical address and ECX +specifies one of the following MSRs: IA32_DS_AREA, IA32_FS_BASE, IA32_GS_BASE, +IA32_KERNEL_GS_BASE, IA32_LSTAR, IA32_SYSENTER_EIP, IA32_SYSENTER_ESP." + +According to the AMD instruction manual: +LSTAR/CSTAR (SYSCALL): "The WRMSR instruction loads the target RIP into the +LSTAR and CSTAR registers. If an RIP written by WRMSR is not in canonical +form, a general-protection exception (#GP) occurs." 
+IA32_GS_BASE and IA32_FS_BASE (WRFSBASE/WRGSBASE): "The address written to the +base field must be in canonical form or a #GP fault will occur." +IA32_KERNEL_GS_BASE (SWAPGS): "The address stored in the KernelGSbase MSR must +be in canonical form." + +This patch fixes CVE-2014-3610. + +Signed-off-by: Nadav Amit <namit@cs.technion.ac.il> +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +[lizf: Backported to 3.4: + - adjust context + - s/msr->index/msr_index and s/msr->data/data] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + arch/x86/include/asm/kvm_host.h | 14 ++++++++++++++ + arch/x86/kvm/svm.c | 2 +- + arch/x86/kvm/vmx.c | 2 +- + arch/x86/kvm/x86.c | 27 ++++++++++++++++++++++++++- + 4 files changed, 42 insertions(+), 3 deletions(-) + +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -882,6 +882,20 @@ static inline void kvm_inject_gp(struct + kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); + } + ++static inline u64 get_canonical(u64 la) ++{ ++ return ((int64_t)la << 16) >> 16; ++} ++ ++static inline bool is_noncanonical_address(u64 la) ++{ ++#ifdef CONFIG_X86_64 ++ return get_canonical(la) != la; ++#else ++ return false; ++#endif ++} ++ + #define TSS_IOPB_BASE_OFFSET 0x66 + #define TSS_BASE_SIZE 0x68 + #define TSS_IOPB_SIZE (65536 / 8) +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -3212,7 +3212,7 @@ static int wrmsr_interception(struct vcp + + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; +- if (svm_set_msr(&svm->vcpu, ecx, data)) { ++ if (kvm_set_msr(&svm->vcpu, ecx, data)) { + trace_kvm_msr_write_ex(ecx, data); + kvm_inject_gp(&svm->vcpu, 0); + } else { +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -4545,7 +4545,7 @@ static int handle_wrmsr(struct kvm_vcpu + u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) + | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); + +- if (vmx_set_msr(vcpu, ecx, data) != 0) { ++ if (kvm_set_msr(vcpu, ecx, data) != 0) { + trace_kvm_msr_write_ex(ecx, data); + 
kvm_inject_gp(vcpu, 0); + return 1; +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -858,7 +858,6 @@ void kvm_enable_efer_bits(u64 mask) + } + EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); + +- + /* + * Writes msr value into into the appropriate "register". + * Returns 0 on success, non-0 otherwise. +@@ -866,8 +865,34 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); + */ + int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) + { ++ switch (msr_index) { ++ case MSR_FS_BASE: ++ case MSR_GS_BASE: ++ case MSR_KERNEL_GS_BASE: ++ case MSR_CSTAR: ++ case MSR_LSTAR: ++ if (is_noncanonical_address(data)) ++ return 1; ++ break; ++ case MSR_IA32_SYSENTER_EIP: ++ case MSR_IA32_SYSENTER_ESP: ++ /* ++ * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if ++ * non-canonical address is written on Intel but not on ++ * AMD (which ignores the top 32-bits, because it does ++ * not implement 64-bit SYSENTER). ++ * ++ * 64-bit code should hence be able to write a non-canonical ++ * value on AMD. Making the address canonical ensures that ++ * vmentry does not fail on Intel after writing a non-canonical ++ * value, and that something deterministic happens if the guest ++ * invokes 64-bit SYSENTER. ++ */ ++ data = get_canonical(data); ++ } + return kvm_x86_ops->set_msr(vcpu, msr_index, data); + } ++EXPORT_SYMBOL_GPL(kvm_set_msr); + + /* + * Adapt set_msr() to msr_io()'s calling convention diff --git a/patches/kvm-x86-don-t-kill-guest-on-unknown-exit-reason.patch b/patches/kvm-x86-don-t-kill-guest-on-unknown-exit-reason.patch new file mode 100644 index 0000000..41c539d --- /dev/null +++ b/patches/kvm-x86-don-t-kill-guest-on-unknown-exit-reason.patch @@ -0,0 +1,50 @@ +From 2bc19dc3754fc066c43799659f0d848631c44cfe Mon Sep 17 00:00:00 2001 +From: "Michael S. Tsirkin" <mst@redhat.com> +Date: Thu, 18 Sep 2014 16:21:16 +0300 +Subject: kvm: x86: don't kill guest on unknown exit reason + +commit 2bc19dc3754fc066c43799659f0d848631c44cfe upstream. 
+ +KVM_EXIT_UNKNOWN is a kvm bug, we don't really know whether it was +triggered by a privileged application. Let's not kill the guest: WARN +and inject #UD instead. + +Signed-off-by: Michael S. Tsirkin <mst@redhat.com> +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + arch/x86/kvm/svm.c | 6 +++--- + arch/x86/kvm/vmx.c | 6 +++--- + 2 files changed, 6 insertions(+), 6 deletions(-) + +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -3494,9 +3494,9 @@ static int handle_exit(struct kvm_vcpu * + + if (exit_code >= ARRAY_SIZE(svm_exit_handlers) + || !svm_exit_handlers[exit_code]) { +- kvm_run->exit_reason = KVM_EXIT_UNKNOWN; +- kvm_run->hw.hardware_exit_reason = exit_code; +- return 0; ++ WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_code); ++ kvm_queue_exception(vcpu, UD_VECTOR); ++ return 1; + } + + return svm_exit_handlers[exit_code](svm); +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -5936,10 +5936,10 @@ static int vmx_handle_exit(struct kvm_vc + && kvm_vmx_exit_handlers[exit_reason]) + return kvm_vmx_exit_handlers[exit_reason](vcpu); + else { +- vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; +- vcpu->run->hw.hardware_exit_reason = exit_reason; ++ WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason); ++ kvm_queue_exception(vcpu, UD_VECTOR); ++ return 1; + } +- return 0; + } + + static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) diff --git a/patches/kvm-x86-emulator-fixes-for-eip-canonical-checks-on-near-branches.patch b/patches/kvm-x86-emulator-fixes-for-eip-canonical-checks-on-near-branches.patch new file mode 100644 index 0000000..5590a55 --- /dev/null +++ b/patches/kvm-x86-emulator-fixes-for-eip-canonical-checks-on-near-branches.patch @@ -0,0 +1,234 @@ +From 234f3ce485d54017f15cf5e0699cff4100121601 Mon Sep 17 00:00:00 2001 +From: Nadav Amit <namit@cs.technion.ac.il> +Date: Thu, 18 Sep 2014 22:39:38 +0300 +Subject: KVM: x86: Emulator fixes for eip 
canonical checks on near branches + +commit 234f3ce485d54017f15cf5e0699cff4100121601 upstream. + +Before changing rip (during jmp, call, ret, etc.) the target should be asserted +to be canonical one, as real CPUs do. During sysret, both target rsp and rip +should be canonical. If any of these values is noncanonical, a #GP exception +should occur. The exception to this rule are syscall and sysenter instructions +in which the assigned rip is checked during the assignment to the relevant +MSRs. + +This patch fixes the emulator to behave as real CPUs do for near branches. +Far branches are handled by the next patch. + +This fixes CVE-2014-3647. + +Signed-off-by: Nadav Amit <namit@cs.technion.ac.il> +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +[lizf: Backported to 3.4: + - adjust context + - use ctxt->regs rather than reg_read() and reg_write()] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + arch/x86/kvm/emulate.c | 78 +++++++++++++++++++++++++++++++++---------------- + 1 file changed, 54 insertions(+), 24 deletions(-) + +--- a/arch/x86/kvm/emulate.c ++++ b/arch/x86/kvm/emulate.c +@@ -532,7 +532,8 @@ static int emulate_nm(struct x86_emulate + return emulate_exception(ctxt, NM_VECTOR, 0, false); + } + +-static inline void assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst) ++static inline int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst, ++ int cs_l) + { + switch (ctxt->op_bytes) { + case 2: +@@ -542,16 +543,25 @@ static inline void assign_eip_near(struc + ctxt->_eip = (u32)dst; + break; + case 8: ++ if ((cs_l && is_noncanonical_address(dst)) || ++ (!cs_l && (dst & ~(u32)-1))) ++ return emulate_gp(ctxt, 0); + ctxt->_eip = dst; + break; + default: + WARN(1, "unsupported eip assignment size\n"); + } ++ return X86EMUL_CONTINUE; ++} ++ ++static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst) ++{ ++ return assign_eip_far(ctxt, dst, ctxt->mode == X86EMUL_MODE_PROT64); + } + +-static inline void jmp_rel(struct x86_emulate_ctxt 
*ctxt, int rel) ++static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) + { +- assign_eip_near(ctxt, ctxt->_eip + rel); ++ return assign_eip_near(ctxt, ctxt->_eip + rel); + } + + static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg) +@@ -1802,13 +1812,15 @@ static int em_grp45(struct x86_emulate_c + case 2: /* call near abs */ { + long int old_eip; + old_eip = ctxt->_eip; +- ctxt->_eip = ctxt->src.val; ++ rc = assign_eip_near(ctxt, ctxt->src.val); ++ if (rc != X86EMUL_CONTINUE) ++ break; + ctxt->src.val = old_eip; + rc = em_push(ctxt); + break; + } + case 4: /* jmp abs */ +- ctxt->_eip = ctxt->src.val; ++ rc = assign_eip_near(ctxt, ctxt->src.val); + break; + case 5: /* jmp far */ + rc = em_jmp_far(ctxt); +@@ -1840,10 +1852,14 @@ static int em_cmpxchg8b(struct x86_emula + + static int em_ret(struct x86_emulate_ctxt *ctxt) + { +- ctxt->dst.type = OP_REG; +- ctxt->dst.addr.reg = &ctxt->_eip; +- ctxt->dst.bytes = ctxt->op_bytes; +- return em_pop(ctxt); ++ int rc; ++ unsigned long eip; ++ ++ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); ++ if (rc != X86EMUL_CONTINUE) ++ return rc; ++ ++ return assign_eip_near(ctxt, eip); + } + + static int em_ret_far(struct x86_emulate_ctxt *ctxt) +@@ -2108,7 +2124,7 @@ static int em_sysexit(struct x86_emulate + { + struct x86_emulate_ops *ops = ctxt->ops; + struct desc_struct cs, ss; +- u64 msr_data; ++ u64 msr_data, rcx, rdx; + int usermode; + u16 cs_sel = 0, ss_sel = 0; + +@@ -2124,6 +2140,9 @@ static int em_sysexit(struct x86_emulate + else + usermode = X86EMUL_MODE_PROT32; + ++ rcx = ctxt->regs[VCPU_REGS_RCX]; ++ rdx = ctxt->regs[VCPU_REGS_RDX]; ++ + cs.dpl = 3; + ss.dpl = 3; + ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); +@@ -2141,6 +2160,9 @@ static int em_sysexit(struct x86_emulate + ss_sel = cs_sel + 8; + cs.d = 0; + cs.l = 1; ++ if (is_noncanonical_address(rcx) || ++ is_noncanonical_address(rdx)) ++ return emulate_gp(ctxt, 0); + break; + } + cs_sel |= SELECTOR_RPL_MASK; +@@ -2149,8 
+2171,8 @@ static int em_sysexit(struct x86_emulate + ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); + ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); + +- ctxt->_eip = ctxt->regs[VCPU_REGS_RDX]; +- ctxt->regs[VCPU_REGS_RSP] = ctxt->regs[VCPU_REGS_RCX]; ++ ctxt->_eip = rdx; ++ ctxt->regs[VCPU_REGS_RSP] = rcx; + + return X86EMUL_CONTINUE; + } +@@ -2646,10 +2668,13 @@ static int em_das(struct x86_emulate_ctx + + static int em_call(struct x86_emulate_ctxt *ctxt) + { ++ int rc; + long rel = ctxt->src.val; + + ctxt->src.val = (unsigned long)ctxt->_eip; +- jmp_rel(ctxt, rel); ++ rc = jmp_rel(ctxt, rel); ++ if (rc != X86EMUL_CONTINUE) ++ return rc; + return em_push(ctxt); + } + +@@ -2681,11 +2706,12 @@ static int em_call_far(struct x86_emulat + static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) + { + int rc; ++ unsigned long eip; + +- ctxt->dst.type = OP_REG; +- ctxt->dst.addr.reg = &ctxt->_eip; +- ctxt->dst.bytes = ctxt->op_bytes; +- rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes); ++ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); ++ if (rc != X86EMUL_CONTINUE) ++ return rc; ++ rc = assign_eip_near(ctxt, eip); + if (rc != X86EMUL_CONTINUE) + return rc; + register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], ctxt->src.val); +@@ -2994,20 +3020,24 @@ static int em_lmsw(struct x86_emulate_ct + + static int em_loop(struct x86_emulate_ctxt *ctxt) + { ++ int rc = X86EMUL_CONTINUE; ++ + register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1); + if ((address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) != 0) && + (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags))) +- jmp_rel(ctxt, ctxt->src.val); ++ rc = jmp_rel(ctxt, ctxt->src.val); + +- return X86EMUL_CONTINUE; ++ return rc; + } + + static int em_jcxz(struct x86_emulate_ctxt *ctxt) + { ++ int rc = X86EMUL_CONTINUE; ++ + if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) +- jmp_rel(ctxt, ctxt->src.val); ++ rc = jmp_rel(ctxt, ctxt->src.val); + +- return X86EMUL_CONTINUE; ++ return 
rc; + } + + static int em_in(struct x86_emulate_ctxt *ctxt) +@@ -4185,7 +4215,7 @@ special_insn: + break; + case 0x70 ... 0x7f: /* jcc (short) */ + if (test_cc(ctxt->b, ctxt->eflags)) +- jmp_rel(ctxt, ctxt->src.val); ++ rc = jmp_rel(ctxt, ctxt->src.val); + break; + case 0x8d: /* lea r16/r32, m */ + ctxt->dst.val = ctxt->src.addr.mem.ea; +@@ -4224,7 +4254,7 @@ special_insn: + break; + case 0xe9: /* jmp rel */ + case 0xeb: /* jmp rel short */ +- jmp_rel(ctxt, ctxt->src.val); ++ rc = jmp_rel(ctxt, ctxt->src.val); + ctxt->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xf4: /* hlt */ +@@ -4327,7 +4357,7 @@ twobyte_insn: + break; + case 0x80 ... 0x8f: /* jnz rel, etc*/ + if (test_cc(ctxt->b, ctxt->eflags)) +- jmp_rel(ctxt, ctxt->src.val); ++ rc = jmp_rel(ctxt, ctxt->src.val); + break; + case 0x90 ... 0x9f: /* setcc r/m8 */ + ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); diff --git a/patches/kvm-x86-fix-stale-mmio-cache-bug.patch b/patches/kvm-x86-fix-stale-mmio-cache-bug.patch new file mode 100644 index 0000000..097e290 --- /dev/null +++ b/patches/kvm-x86-fix-stale-mmio-cache-bug.patch @@ -0,0 +1,112 @@ +From 56f17dd3fbc44adcdbc3340fe3988ddb833a47a7 Mon Sep 17 00:00:00 2001 +From: David Matlack <dmatlack@google.com> +Date: Mon, 18 Aug 2014 15:46:07 -0700 +Subject: kvm: x86: fix stale mmio cache bug + +commit 56f17dd3fbc44adcdbc3340fe3988ddb833a47a7 upstream. + +The following events can lead to an incorrect KVM_EXIT_MMIO bubbling +up to userspace: + +(1) Guest accesses gpa X without a memory slot. The gfn is cached in +struct kvm_vcpu_arch (mmio_gfn). On Intel EPT-enabled hosts, KVM sets +the SPTE write-execute-noread so that future accesses cause +EPT_MISCONFIGs. + +(2) Host userspace creates a memory slot via KVM_SET_USER_MEMORY_REGION +covering the page just accessed. + +(3) Guest attempts to read or write to gpa X again. On Intel, this +generates an EPT_MISCONFIG. 
The memory slot generation number that +was incremented in (2) would normally take care of this but we fast +path mmio faults through quickly_check_mmio_pf(), which only checks +the per-vcpu mmio cache. Since we hit the cache, KVM passes a +KVM_EXIT_MMIO up to userspace. + +This patch fixes the issue by using the memslot generation number +to validate the mmio cache. + +Signed-off-by: David Matlack <dmatlack@google.com> +[xiaoguangrong: adjust the code to make it simpler for stable-tree fix.] +Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com> +Reviewed-by: David Matlack <dmatlack@google.com> +Reviewed-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com> +Tested-by: David Matlack <dmatlack@google.com> +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + arch/x86/include/asm/kvm_host.h | 1 + + arch/x86/kvm/mmu.c | 2 +- + arch/x86/kvm/x86.h | 20 +++++++++++++++----- + 3 files changed, 17 insertions(+), 6 deletions(-) + +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -453,6 +453,7 @@ struct kvm_vcpu_arch { + u64 mmio_gva; + unsigned access; + gfn_t mmio_gfn; ++ u64 mmio_gen; + + struct kvm_pmu pmu; + +--- a/arch/x86/kvm/mmu.c ++++ b/arch/x86/kvm/mmu.c +@@ -2842,7 +2842,7 @@ static void mmu_sync_roots(struct kvm_vc + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + +- vcpu_clear_mmio_info(vcpu, ~0ul); ++ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); + kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); + if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; +--- a/arch/x86/kvm/x86.h ++++ b/arch/x86/kvm/x86.h +@@ -78,15 +78,23 @@ static inline void vcpu_cache_mmio_info( + vcpu->arch.mmio_gva = gva & PAGE_MASK; + vcpu->arch.access = access; + vcpu->arch.mmio_gfn = gfn; ++ vcpu->arch.mmio_gen = kvm_memslots(vcpu->kvm)->generation; ++} ++ ++static inline bool vcpu_match_mmio_gen(struct kvm_vcpu *vcpu) ++{ ++ return vcpu->arch.mmio_gen == 
kvm_memslots(vcpu->kvm)->generation; + } + + /* +- * Clear the mmio cache info for the given gva, +- * specially, if gva is ~0ul, we clear all mmio cache info. ++ * Clear the mmio cache info for the given gva. If gva is MMIO_GVA_ANY, we ++ * clear all mmio cache info. + */ ++#define MMIO_GVA_ANY (~(gva_t)0) ++ + static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva) + { +- if (gva != (~0ul) && vcpu->arch.mmio_gva != (gva & PAGE_MASK)) ++ if (gva != MMIO_GVA_ANY && vcpu->arch.mmio_gva != (gva & PAGE_MASK)) + return; + + vcpu->arch.mmio_gva = 0; +@@ -94,7 +102,8 @@ static inline void vcpu_clear_mmio_info( + + static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva) + { +- if (vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK)) ++ if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gva && ++ vcpu->arch.mmio_gva == (gva & PAGE_MASK)) + return true; + + return false; +@@ -102,7 +111,8 @@ static inline bool vcpu_match_mmio_gva(s + + static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) + { +- if (vcpu->arch.mmio_gfn && vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT) ++ if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gfn && ++ vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT) + return true; + + return false; diff --git a/patches/kvm-x86-fix-wrong-masking-on-relative-jump-call.patch b/patches/kvm-x86-fix-wrong-masking-on-relative-jump-call.patch new file mode 100644 index 0000000..4435910 --- /dev/null +++ b/patches/kvm-x86-fix-wrong-masking-on-relative-jump-call.patch @@ -0,0 +1,62 @@ +From 05c83ec9b73c8124555b706f6af777b10adf0862 Mon Sep 17 00:00:00 2001 +From: Nadav Amit <namit@cs.technion.ac.il> +Date: Thu, 18 Sep 2014 22:39:37 +0300 +Subject: KVM: x86: Fix wrong masking on relative jump/call + +commit 05c83ec9b73c8124555b706f6af777b10adf0862 upstream. + +Relative jumps and calls do the masking according to the operand size, and not +according to the address size as the KVM emulator does today. 
+ +This patch fixes KVM behavior. + +Signed-off-by: Nadav Amit <namit@cs.technion.ac.il> +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + arch/x86/kvm/emulate.c | 27 ++++++++++++++++++++++----- + 1 file changed, 22 insertions(+), 5 deletions(-) + +--- a/arch/x86/kvm/emulate.c ++++ b/arch/x86/kvm/emulate.c +@@ -459,11 +459,6 @@ register_address_increment(struct x86_em + *reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt)); + } + +-static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) +-{ +- register_address_increment(ctxt, &ctxt->_eip, rel); +-} +- + static u32 desc_limit_scaled(struct desc_struct *desc) + { + u32 limit = get_desc_limit(desc); +@@ -537,6 +532,28 @@ static int emulate_nm(struct x86_emulate + return emulate_exception(ctxt, NM_VECTOR, 0, false); + } + ++static inline void assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst) ++{ ++ switch (ctxt->op_bytes) { ++ case 2: ++ ctxt->_eip = (u16)dst; ++ break; ++ case 4: ++ ctxt->_eip = (u32)dst; ++ break; ++ case 8: ++ ctxt->_eip = dst; ++ break; ++ default: ++ WARN(1, "unsupported eip assignment size\n"); ++ } ++} ++ ++static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) ++{ ++ assign_eip_near(ctxt, ctxt->_eip + rel); ++} ++ + static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg) + { + u16 selector; diff --git a/patches/kvm-x86-handle-errors-when-rip-is-set-during-far-jumps.patch b/patches/kvm-x86-handle-errors-when-rip-is-set-during-far-jumps.patch new file mode 100644 index 0000000..29cf49e --- /dev/null +++ b/patches/kvm-x86-handle-errors-when-rip-is-set-during-far-jumps.patch @@ -0,0 +1,246 @@ +From d1442d85cc30ea75f7d399474ca738e0bc96f715 Mon Sep 17 00:00:00 2001 +From: Nadav Amit <namit@cs.technion.ac.il> +Date: Thu, 18 Sep 2014 22:39:39 +0300 +Subject: KVM: x86: Handle errors when RIP is set during far jumps + +commit d1442d85cc30ea75f7d399474ca738e0bc96f715 upstream. 
+ +Far jmp/call/ret may fault while loading a new RIP. Currently KVM does not +handle this case, and may result in failed vm-entry once the assignment is +done. The tricky part of doing so is that loading the new CS affects the +VMCS/VMCB state, so if we fail during loading the new RIP, we are left in +an inconsistent state. Therefore, this patch saves on 64-bit the old CS +descriptor and restores it if loading RIP failed. + +This fixes CVE-2014-3647. + +Signed-off-by: Nadav Amit <namit@cs.technion.ac.il> +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +[lizf: Backported to 3.4: + - adjust context + - __load_segment_descriptor() doesn't take in_task_switch parameter] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + arch/x86/kvm/emulate.c | 118 ++++++++++++++++++++++++++++++++++++------------- + 1 file changed, 88 insertions(+), 30 deletions(-) + +--- a/arch/x86/kvm/emulate.c ++++ b/arch/x86/kvm/emulate.c +@@ -1252,7 +1252,9 @@ static int write_segment_descriptor(stru + + /* Does not support long mode */ + static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, +- u16 selector, int seg, u8 cpl) ++ u16 selector, int seg, u8 cpl, ++ bool in_task_switch, ++ struct desc_struct *desc) + { + struct desc_struct seg_desc; + u8 dpl, rpl; +@@ -1362,6 +1364,8 @@ static int __load_segment_descriptor(str + } + load: + ctxt->ops->set_segment(ctxt, selector, &seg_desc, 0, seg); ++ if (desc) ++ *desc = seg_desc; + return X86EMUL_CONTINUE; + exception: + emulate_exception(ctxt, err_vec, err_code, true); +@@ -1372,7 +1376,7 @@ static int load_segment_descriptor(struc + u16 selector, int seg) + { + u8 cpl = ctxt->ops->cpl(ctxt); +- return __load_segment_descriptor(ctxt, selector, seg, cpl); ++ return __load_segment_descriptor(ctxt, selector, seg, cpl, false, NULL); + } + + static void write_register_operand(struct operand *op) +@@ -1714,17 +1718,31 @@ static int em_iret(struct x86_emulate_ct + static int em_jmp_far(struct x86_emulate_ctxt *ctxt) + { + int rc; +- 
unsigned short sel; ++ unsigned short sel, old_sel; ++ struct desc_struct old_desc, new_desc; ++ const struct x86_emulate_ops *ops = ctxt->ops; ++ u8 cpl = ctxt->ops->cpl(ctxt); ++ ++ /* Assignment of RIP may only fail in 64-bit mode */ ++ if (ctxt->mode == X86EMUL_MODE_PROT64) ++ ops->get_segment(ctxt, &old_sel, &old_desc, NULL, ++ VCPU_SREG_CS); + + memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); + +- rc = load_segment_descriptor(ctxt, sel, VCPU_SREG_CS); ++ rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false, ++ &new_desc); + if (rc != X86EMUL_CONTINUE) + return rc; + +- ctxt->_eip = 0; +- memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes); +- return X86EMUL_CONTINUE; ++ rc = assign_eip_far(ctxt, ctxt->src.val, new_desc.l); ++ if (rc != X86EMUL_CONTINUE) { ++ WARN_ON(!ctxt->mode != X86EMUL_MODE_PROT64); ++ /* assigning eip failed; restore the old cs */ ++ ops->set_segment(ctxt, old_sel, &old_desc, 0, VCPU_SREG_CS); ++ return rc; ++ } ++ return rc; + } + + static int em_grp2(struct x86_emulate_ctxt *ctxt) +@@ -1871,17 +1889,30 @@ static int em_ret(struct x86_emulate_ctx + static int em_ret_far(struct x86_emulate_ctxt *ctxt) + { + int rc; +- unsigned long cs; ++ unsigned long eip, cs; ++ u16 old_cs; ++ struct desc_struct old_desc, new_desc; ++ const struct x86_emulate_ops *ops = ctxt->ops; ++ ++ if (ctxt->mode == X86EMUL_MODE_PROT64) ++ ops->get_segment(ctxt, &old_cs, &old_desc, NULL, ++ VCPU_SREG_CS); + +- rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes); ++ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; +- if (ctxt->op_bytes == 4) +- ctxt->_eip = (u32)ctxt->_eip; + rc = emulate_pop(ctxt, &cs, ctxt->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; +- rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS); ++ rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, 0, false, ++ &new_desc); ++ if (rc != X86EMUL_CONTINUE) ++ return rc; ++ rc = assign_eip_far(ctxt, eip, new_desc.l); ++ if 
(rc != X86EMUL_CONTINUE) { ++ WARN_ON(!ctxt->mode != X86EMUL_MODE_PROT64); ++ ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS); ++ } + return rc; + } + +@@ -2296,19 +2327,24 @@ static int load_state_from_tss16(struct + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ +- ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl); ++ ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, ++ true, NULL); + if (ret != X86EMUL_CONTINUE) + return ret; +- ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl); ++ ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, ++ true, NULL); + if (ret != X86EMUL_CONTINUE) + return ret; +- ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl); ++ ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, ++ true, NULL); + if (ret != X86EMUL_CONTINUE) + return ret; +- ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl); ++ ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, ++ true, NULL); + if (ret != X86EMUL_CONTINUE) + return ret; +- ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl); ++ ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, ++ true, NULL); + if (ret != X86EMUL_CONTINUE) + return ret; + +@@ -2434,25 +2470,32 @@ static int load_state_from_tss32(struct + * Now load segment descriptors. 
If fault happenes at this stage + * it is handled in a context of new task + */ +- ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, cpl); ++ ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, ++ cpl, true, NULL); + if (ret != X86EMUL_CONTINUE) + return ret; +- ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl); ++ ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, ++ true, NULL); + if (ret != X86EMUL_CONTINUE) + return ret; +- ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl); ++ ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, ++ true, NULL); + if (ret != X86EMUL_CONTINUE) + return ret; +- ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl); ++ ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, ++ true, NULL); + if (ret != X86EMUL_CONTINUE) + return ret; +- ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl); ++ ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, ++ true, NULL); + if (ret != X86EMUL_CONTINUE) + return ret; +- ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl); ++ ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, ++ true, NULL); + if (ret != X86EMUL_CONTINUE) + return ret; +- ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl); ++ ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, ++ true, NULL); + if (ret != X86EMUL_CONTINUE) + return ret; + +@@ -2689,24 +2732,39 @@ static int em_call_far(struct x86_emulat + u16 sel, old_cs; + ulong old_eip; + int rc; ++ struct desc_struct old_desc, new_desc; ++ const struct x86_emulate_ops *ops = ctxt->ops; ++ int cpl = ctxt->ops->cpl(ctxt); + +- old_cs = get_segment_selector(ctxt, VCPU_SREG_CS); + old_eip = ctxt->_eip; ++ ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS); + + memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); +- if (load_segment_descriptor(ctxt, sel, 
VCPU_SREG_CS)) ++ rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false, ++ &new_desc); ++ if (rc != X86EMUL_CONTINUE) + return X86EMUL_CONTINUE; + +- ctxt->_eip = 0; +- memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes); ++ rc = assign_eip_far(ctxt, ctxt->src.val, new_desc.l); ++ if (rc != X86EMUL_CONTINUE) ++ goto fail; + + ctxt->src.val = old_cs; + rc = em_push(ctxt); + if (rc != X86EMUL_CONTINUE) +- return rc; ++ goto fail; + + ctxt->src.val = old_eip; +- return em_push(ctxt); ++ rc = em_push(ctxt); ++ /* If we failed, we tainted the memory, but the very least we should ++ restore cs */ ++ if (rc != X86EMUL_CONTINUE) ++ goto fail; ++ return rc; ++fail: ++ ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS); ++ return rc; ++ + } + + static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) diff --git a/patches/kvm-x86-improve-thread-safety-in-pit.patch b/patches/kvm-x86-improve-thread-safety-in-pit.patch new file mode 100644 index 0000000..878ae91 --- /dev/null +++ b/patches/kvm-x86-improve-thread-safety-in-pit.patch @@ -0,0 +1,35 @@ +From 2febc839133280d5a5e8e1179c94ea674489dae2 Mon Sep 17 00:00:00 2001 +From: Andy Honig <ahonig@google.com> +Date: Wed, 27 Aug 2014 14:42:54 -0700 +Subject: KVM: x86: Improve thread safety in pit + +commit 2febc839133280d5a5e8e1179c94ea674489dae2 upstream. + +There's a race condition in the PIT emulation code in KVM. In +__kvm_migrate_pit_timer the pit_timer object is accessed without +synchronization. If the race condition occurs at the wrong time this +can crash the host kernel. + +This fixes CVE-2014-3611. 
+ +Signed-off-by: Andrew Honig <ahonig@google.com> +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +[lizf: Backported to 3.4: adjust context] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + arch/x86/kvm/i8254.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/arch/x86/kvm/i8254.c ++++ b/arch/x86/kvm/i8254.c +@@ -263,8 +263,10 @@ void __kvm_migrate_pit_timer(struct kvm_ + return; + + timer = &pit->pit_state.pit_timer.timer; ++ mutex_lock(&pit->pit_state.lock); + if (hrtimer_cancel(timer)) + hrtimer_start_expires(timer, HRTIMER_MODE_ABS); ++ mutex_unlock(&pit->pit_state.lock); + } + + static void destroy_pit_timer(struct kvm_pit *pit) diff --git a/patches/kvm-x86-prevent-host-from-panicking-on-shared-msr-writes.patch b/patches/kvm-x86-prevent-host-from-panicking-on-shared-msr-writes.patch new file mode 100644 index 0000000..efbf941 --- /dev/null +++ b/patches/kvm-x86-prevent-host-from-panicking-on-shared-msr-writes.patch @@ -0,0 +1,85 @@ +From 8b3c3104c3f4f706e99365c3e0d2aa61b95f969f Mon Sep 17 00:00:00 2001 +From: Andy Honig <ahonig@google.com> +Date: Wed, 27 Aug 2014 11:16:44 -0700 +Subject: KVM: x86: Prevent host from panicking on shared MSR writes. + +commit 8b3c3104c3f4f706e99365c3e0d2aa61b95f969f upstream. + +The previous patch blocked invalid writes directly when the MSR +is written. As a precaution, prevent future similar mistakes by +gracefully handling GPs caused by writes to shared MSRs. + +Signed-off-by: Andrew Honig <ahonig@google.com> +[Remove parts obsoleted by Nadav's patch. 
- Paolo] +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +[lizf: Backported to 3.4: + - adjust context + - s/wrmsrl_safe/checking_wrmsrl/] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + arch/x86/include/asm/kvm_host.h | 2 +- + arch/x86/kvm/vmx.c | 7 +++++-- + arch/x86/kvm/x86.c | 11 ++++++++--- + 3 files changed, 14 insertions(+), 6 deletions(-) + +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -954,7 +954,7 @@ int kvm_arch_interrupt_allowed(struct kv + int kvm_cpu_get_interrupt(struct kvm_vcpu *v); + + void kvm_define_shared_msr(unsigned index, u32 msr); +-void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); ++int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); + + bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); + +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -2210,12 +2210,15 @@ static int vmx_set_msr(struct kvm_vcpu * + break; + msr = find_msr_entry(vmx, msr_index); + if (msr) { ++ u64 old_msr_data = msr->data; + msr->data = data; + if (msr - vmx->guest_msrs < vmx->save_nmsrs) { + preempt_disable(); +- kvm_set_shared_msr(msr->index, msr->data, +- msr->mask); ++ ret = kvm_set_shared_msr(msr->index, msr->data, ++ msr->mask); + preempt_enable(); ++ if (ret) ++ msr->data = old_msr_data; + } + break; + } +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -220,19 +220,24 @@ static void kvm_shared_msr_cpu_online(vo + shared_msr_update(i, shared_msrs_global.msrs[i]); + } + +-void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) ++int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) + { + struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); ++ int err; + + if (((value ^ smsr->values[slot].curr) & mask) == 0) +- return; ++ return 0; + smsr->values[slot].curr = value; +- wrmsrl(shared_msrs_global.msrs[slot], value); ++ err = checking_wrmsrl(shared_msrs_global.msrs[slot], value); ++ if (err) ++ return 1; ++ + if (!smsr->registered) { + 
smsr->urn.on_user_return = kvm_on_user_return; + user_return_notifier_register(&smsr->urn); + smsr->registered = true; + } ++ return 0; + } + EXPORT_SYMBOL_GPL(kvm_set_shared_msr); + diff --git a/patches/kvm-x86-use-new-cs.rpl-as-cpl-during-task-switch.patch b/patches/kvm-x86-use-new-cs.rpl-as-cpl-during-task-switch.patch new file mode 100644 index 0000000..01a9b27 --- /dev/null +++ b/patches/kvm-x86-use-new-cs.rpl-as-cpl-during-task-switch.patch @@ -0,0 +1,180 @@ +From 2356aaeb2f58f491679dc0c38bc3f6dbe54e7ded Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini <pbonzini@redhat.com> +Date: Thu, 15 May 2014 17:56:57 +0200 +Subject: KVM: x86: use new CS.RPL as CPL during task switch + +commit 2356aaeb2f58f491679dc0c38bc3f6dbe54e7ded upstream. + +During task switch, all of CS.DPL, CS.RPL, SS.DPL must match (in addition +to all the other requirements) and will be the new CPL. So far this +worked by carefully setting the CS selector and flag before doing the +task switch; setting CS.selector will already change the CPL. + +However, this will not work once we get the CPL from SS.DPL, because +then you will have to set the full segment descriptor cache to change +the CPL. ctxt->ops->cpl(ctxt) will then return the old CPL during the +task switch, and the check that SS.DPL == CPL will fail. + +Temporarily assume that the CPL comes from CS.RPL during task switch +to a protected-mode task. This is the same approach used in QEMU's +emulation code, which (until version 2.0) manually tracks the CPL. 
+ +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +[lizf: Backported to 3.4: adjust context] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + arch/x86/kvm/emulate.c | 60 ++++++++++++++++++++++++++----------------------- + 1 file changed, 33 insertions(+), 27 deletions(-) + +--- a/arch/x86/kvm/emulate.c ++++ b/arch/x86/kvm/emulate.c +@@ -1251,11 +1251,11 @@ static int write_segment_descriptor(stru + } + + /* Does not support long mode */ +-static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, +- u16 selector, int seg) ++static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, ++ u16 selector, int seg, u8 cpl) + { + struct desc_struct seg_desc; +- u8 dpl, rpl, cpl; ++ u8 dpl, rpl; + unsigned err_vec = GP_VECTOR; + u32 err_code = 0; + bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ +@@ -1306,7 +1306,6 @@ static int load_segment_descriptor(struc + + rpl = selector & 3; + dpl = seg_desc.dpl; +- cpl = ctxt->ops->cpl(ctxt); + + switch (seg) { + case VCPU_SREG_SS: +@@ -1369,6 +1368,13 @@ exception: + return X86EMUL_PROPAGATE_FAULT; + } + ++static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, ++ u16 selector, int seg) ++{ ++ u8 cpl = ctxt->ops->cpl(ctxt); ++ return __load_segment_descriptor(ctxt, selector, seg, cpl); ++} ++ + static void write_register_operand(struct operand *op) + { + /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ +@@ -2261,6 +2267,7 @@ static int load_state_from_tss16(struct + struct tss_segment_16 *tss) + { + int ret; ++ u8 cpl; + + ctxt->_eip = tss->ip; + ctxt->eflags = tss->flag | 2; +@@ -2283,23 +2290,25 @@ static int load_state_from_tss16(struct + set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); + set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); + ++ cpl = tss->cs & 3; ++ + /* + * Now load segment descriptors. 
If fault happenes at this stage + * it is handled in a context of new task + */ +- ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR); ++ ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl); + if (ret != X86EMUL_CONTINUE) + return ret; +- ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES); ++ ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl); + if (ret != X86EMUL_CONTINUE) + return ret; +- ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS); ++ ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl); + if (ret != X86EMUL_CONTINUE) + return ret; +- ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS); ++ ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl); + if (ret != X86EMUL_CONTINUE) + return ret; +- ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS); ++ ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl); + if (ret != X86EMUL_CONTINUE) + return ret; + +@@ -2378,6 +2387,7 @@ static int load_state_from_tss32(struct + struct tss_segment_32 *tss) + { + int ret; ++ u8 cpl; + + if (ctxt->ops->set_cr(ctxt, 3, tss->cr3)) + return emulate_gp(ctxt, 0); +@@ -2396,7 +2406,8 @@ static int load_state_from_tss32(struct + + /* + * SDM says that segment selectors are loaded before segment +- * descriptors ++ * descriptors. This is important because CPL checks will ++ * use CS.RPL. + */ + set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); + set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); +@@ -2410,43 +2421,38 @@ static int load_state_from_tss32(struct + * If we're switching between Protected Mode and VM86, we need to make + * sure to update the mode before loading the segment descriptors so + * that the selectors are interpreted correctly. +- * +- * Need to get rflags to the vcpu struct immediately because it +- * influences the CPL which is checked at least when loading the segment +- * descriptors and when pushing an error code to the new kernel stack. 
+- * +- * TODO Introduce a separate ctxt->ops->set_cpl callback + */ +- if (ctxt->eflags & X86_EFLAGS_VM) ++ if (ctxt->eflags & X86_EFLAGS_VM) { + ctxt->mode = X86EMUL_MODE_VM86; +- else ++ cpl = 3; ++ } else { + ctxt->mode = X86EMUL_MODE_PROT32; +- +- ctxt->ops->set_rflags(ctxt, ctxt->eflags); ++ cpl = tss->cs & 3; ++ } + + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ +- ret = load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); ++ ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, cpl); + if (ret != X86EMUL_CONTINUE) + return ret; +- ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES); ++ ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl); + if (ret != X86EMUL_CONTINUE) + return ret; +- ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS); ++ ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl); + if (ret != X86EMUL_CONTINUE) + return ret; +- ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS); ++ ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl); + if (ret != X86EMUL_CONTINUE) + return ret; +- ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS); ++ ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl); + if (ret != X86EMUL_CONTINUE) + return ret; +- ret = load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS); ++ ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl); + if (ret != X86EMUL_CONTINUE) + return ret; +- ret = load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS); ++ ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl); + if (ret != X86EMUL_CONTINUE) + return ret; + diff --git a/patches/libata-sff-fix-controllers-with-no-ctl-port.patch b/patches/libata-sff-fix-controllers-with-no-ctl-port.patch new file mode 100644 index 0000000..98cebee --- /dev/null +++ b/patches/libata-sff-fix-controllers-with-no-ctl-port.patch @@ -0,0 +1,107 @@ +From 
6d8ca28fa688a9354bc9fbc935bdaeb3651b6677 Mon Sep 17 00:00:00 2001 +From: Ondrej Zary <linux@rainbow-software.org> +Date: Sat, 27 Sep 2014 00:04:46 +0200 +Subject: libata-sff: Fix controllers with no ctl port + +commit 6d8ca28fa688a9354bc9fbc935bdaeb3651b6677 upstream. + +Currently, ata_sff_softreset is skipped for controllers with no ctl port. +But that also skips ata_sff_dev_classify required for device detection. +This means that libata is currently broken on controllers with no ctl port. + +No device connected: +[ 1.872480] pata_isapnp 01:01.02: activated +[ 1.889823] scsi2 : pata_isapnp +[ 1.890109] ata3: PATA max PIO0 cmd 0x1e8 ctl 0x0 irq 11 +[ 6.888110] ata3.01: qc timeout (cmd 0xec) +[ 6.888179] ata3.01: failed to IDENTIFY (I/O error, err_mask=0x5) +[ 16.888085] ata3.01: qc timeout (cmd 0xec) +[ 16.888147] ata3.01: failed to IDENTIFY (I/O error, err_mask=0x5) +[ 46.888086] ata3.01: qc timeout (cmd 0xec) +[ 46.888148] ata3.01: failed to IDENTIFY (I/O error, err_mask=0x5) +[ 51.888100] ata3.00: qc timeout (cmd 0xec) +[ 51.888160] ata3.00: failed to IDENTIFY (I/O error, err_mask=0x5) +[ 61.888079] ata3.00: qc timeout (cmd 0xec) +[ 61.888141] ata3.00: failed to IDENTIFY (I/O error, err_mask=0x5) +[ 91.888089] ata3.00: qc timeout (cmd 0xec) +[ 91.888152] ata3.00: failed to IDENTIFY (I/O error, err_mask=0x5) + +ATAPI device connected: +[ 1.882061] pata_isapnp 01:01.02: activated +[ 1.893430] scsi2 : pata_isapnp +[ 1.893719] ata3: PATA max PIO0 cmd 0x1e8 ctl 0x0 irq 11 +[ 6.892107] ata3.01: qc timeout (cmd 0xec) +[ 6.892171] ata3.01: failed to IDENTIFY (I/O error, err_mask=0x5) +[ 16.892079] ata3.01: qc timeout (cmd 0xec) +[ 16.892138] ata3.01: failed to IDENTIFY (I/O error, err_mask=0x5) +[ 46.892079] ata3.01: qc timeout (cmd 0xec) +[ 46.892138] ata3.01: failed to IDENTIFY (I/O error, err_mask=0x5) +[ 46.908586] ata3.00: ATAPI: ACER CD-767E/O, V1.5X, max PIO2, CDB intr +[ 46.924570] ata3.00: configured for PIO0 (device error ignored) +[ 46.926295] scsi 2:0:0:0: 
CD-ROM ACER CD-767E/O 1.5X PQ: 0 ANSI: 5 +[ 46.984519] sr0: scsi3-mmc drive: 6x/6x xa/form2 tray +[ 46.984592] cdrom: Uniform CD-ROM driver Revision: 3.20 + +So don't skip ata_sff_softreset, just skip the reset part of ata_bus_softreset +if the ctl port is not available. + +This makes IDE port on ES968 behave correctly: + +No device connected: +[ 4.670888] pata_isapnp 01:01.02: activated +[ 4.673207] scsi host2: pata_isapnp +[ 4.673675] ata3: PATA max PIO0 cmd 0x1e8 ctl 0x0 irq 11 +[ 7.081840] Adding 2541652k swap on /dev/sda2. Priority:-1 extents:1 across:2541652k + +ATAPI device connected: +[ 4.704362] pata_isapnp 01:01.02: activated +[ 4.706620] scsi host2: pata_isapnp +[ 4.706877] ata3: PATA max PIO0 cmd 0x1e8 ctl 0x0 irq 11 +[ 4.872782] ata3.00: ATAPI: ACER CD-767E/O, V1.5X, max PIO2, CDB intr +[ 4.888673] ata3.00: configured for PIO0 (device error ignored) +[ 4.893984] scsi 2:0:0:0: CD-ROM ACER CD-767E/O 1.5X PQ: 0 ANSI: 5 +[ 7.015578] Adding 2541652k swap on /dev/sda2. Priority:-1 extents:1 across:2541652k + +Signed-off-by: Ondrej Zary <linux@rainbow-software.org> +Signed-off-by: Tejun Heo <tj@kernel.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/ata/libata-sff.c | 20 +++++++++----------- + 1 file changed, 9 insertions(+), 11 deletions(-) + +--- a/drivers/ata/libata-sff.c ++++ b/drivers/ata/libata-sff.c +@@ -2008,13 +2008,15 @@ static int ata_bus_softreset(struct ata_ + + DPRINTK("ata%u: bus reset via SRST\n", ap->print_id); + +- /* software reset. causes dev0 to be selected */ +- iowrite8(ap->ctl, ioaddr->ctl_addr); +- udelay(20); /* FIXME: flush */ +- iowrite8(ap->ctl | ATA_SRST, ioaddr->ctl_addr); +- udelay(20); /* FIXME: flush */ +- iowrite8(ap->ctl, ioaddr->ctl_addr); +- ap->last_ctl = ap->ctl; ++ if (ap->ioaddr.ctl_addr) { ++ /* software reset. 
causes dev0 to be selected */ ++ iowrite8(ap->ctl, ioaddr->ctl_addr); ++ udelay(20); /* FIXME: flush */ ++ iowrite8(ap->ctl | ATA_SRST, ioaddr->ctl_addr); ++ udelay(20); /* FIXME: flush */ ++ iowrite8(ap->ctl, ioaddr->ctl_addr); ++ ap->last_ctl = ap->ctl; ++ } + + /* wait the port to become ready */ + return ata_sff_wait_after_reset(&ap->link, devmask, deadline); +@@ -2215,10 +2217,6 @@ void ata_sff_error_handler(struct ata_po + + spin_unlock_irqrestore(ap->lock, flags); + +- /* ignore ata_sff_softreset if ctl isn't accessible */ +- if (softreset == ata_sff_softreset && !ap->ioaddr.ctl_addr) +- softreset = NULL; +- + /* ignore built-in hardresets if SCR access is not available */ + if ((hardreset == sata_std_hardreset || + hardreset == sata_sff_hardreset) && !sata_scr_valid(&ap->link)) diff --git a/patches/lockd-try-to-reconnect-if-statd-has-moved.patch b/patches/lockd-try-to-reconnect-if-statd-has-moved.patch new file mode 100644 index 0000000..3f1a261 --- /dev/null +++ b/patches/lockd-try-to-reconnect-if-statd-has-moved.patch @@ -0,0 +1,33 @@ +From 173b3afceebe76fa2205b2c8808682d5b541fe3c Mon Sep 17 00:00:00 2001 +From: Benjamin Coddington <bcodding@redhat.com> +Date: Tue, 23 Sep 2014 12:26:20 -0400 +Subject: lockd: Try to reconnect if statd has moved + +commit 173b3afceebe76fa2205b2c8808682d5b541fe3c upstream. + +If rpc.statd is restarted, upcalls to monitor hosts can fail with +ECONNREFUSED. In that case force a lookup of statd's new port and retry the +upcall. 
+ +Signed-off-by: Benjamin Coddington <bcodding@redhat.com> +Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + fs/lockd/mon.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/fs/lockd/mon.c ++++ b/fs/lockd/mon.c +@@ -114,6 +114,12 @@ static int nsm_mon_unmon(struct nsm_hand + + msg.rpc_proc = &clnt->cl_procinfo[proc]; + status = rpc_call_sync(clnt, &msg, 0); ++ if (status == -ECONNREFUSED) { ++ dprintk("lockd: NSM upcall RPC failed, status=%d, forcing rebind\n", ++ status); ++ rpc_force_rebind(clnt); ++ status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN); ++ } + if (status < 0) + dprintk("lockd: NSM upcall RPC failed, status=%d\n", + status); diff --git a/patches/lzo-check-for-length-overrun-in-variable-length-encoding.patch b/patches/lzo-check-for-length-overrun-in-variable-length-encoding.patch new file mode 100644 index 0000000..70d781f --- /dev/null +++ b/patches/lzo-check-for-length-overrun-in-variable-length-encoding.patch @@ -0,0 +1,122 @@ +From 72cf90124e87d975d0b2114d930808c58b4c05e4 Mon Sep 17 00:00:00 2001 +From: Willy Tarreau <w@1wt.eu> +Date: Sat, 27 Sep 2014 12:31:37 +0200 +Subject: lzo: check for length overrun in variable length encoding. + +commit 72cf90124e87d975d0b2114d930808c58b4c05e4 upstream. + +This fix ensures that we never meet an integer overflow while adding +255 while parsing a variable length encoding. It works differently from +commit 206a81c ("lzo: properly check for overruns") because instead of +ensuring that we don't overrun the input, which is tricky to guarantee +due to many assumptions in the code, it simply checks that the cumulated +number of 255 read cannot overflow by bounding this number. + +The MAX_255_COUNT is the maximum number of times we can add 255 to a base +count without overflowing an integer. The multiply will overflow when +multiplying 255 by more than MAXINT/255. The sum will overflow earlier +depending on the base count. 
Since the base count is taken from a u8 +and a few bits, it is safe to assume that it will always be lower than +or equal to 2*255, thus we can always prevent any overflow by accepting +two less 255 steps. + +This patch also reduces the CPU overhead and actually increases performance +by 1.1% compared to the initial code, while the previous fix costs 3.1% +(measured on x86_64). + +The fix needs to be backported to all currently supported stable kernels. + +Reported-by: Willem Pinckaers <willem@lekkertech.net> +Cc: "Don A. Bailey" <donb@securitymouse.com> +Signed-off-by: Willy Tarreau <w@1wt.eu> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + lib/lzo/lzo1x_decompress_safe.c | 43 ++++++++++++++++++++++++++++++++++------ + 1 file changed, 37 insertions(+), 6 deletions(-) + +--- a/lib/lzo/lzo1x_decompress_safe.c ++++ b/lib/lzo/lzo1x_decompress_safe.c +@@ -25,6 +25,16 @@ + #define NEED_OP(x) if (!HAVE_OP(x)) goto output_overrun + #define TEST_LB(m_pos) if ((m_pos) < out) goto lookbehind_overrun + ++/* This MAX_255_COUNT is the maximum number of times we can add 255 to a base ++ * count without overflowing an integer. The multiply will overflow when ++ * multiplying 255 by more than MAXINT/255. The sum will overflow earlier ++ * depending on the base count. Since the base count is taken from a u8 ++ * and a few bits, it is safe to assume that it will always be lower than ++ * or equal to 2*255, thus we can always prevent any overflow by accepting ++ * two less 255 steps. See Documentation/lzo.txt for more information. 
++ */ ++#define MAX_255_COUNT ((((size_t)~0) / 255) - 2) ++ + int lzo1x_decompress_safe(const unsigned char *in, size_t in_len, + unsigned char *out, size_t *out_len) + { +@@ -55,12 +65,19 @@ int lzo1x_decompress_safe(const unsigned + if (t < 16) { + if (likely(state == 0)) { + if (unlikely(t == 0)) { ++ size_t offset; ++ const unsigned char *ip_last = ip; ++ + while (unlikely(*ip == 0)) { +- t += 255; + ip++; + NEED_IP(1); + } +- t += 15 + *ip++; ++ offset = ip - ip_last; ++ if (unlikely(offset > MAX_255_COUNT)) ++ return LZO_E_ERROR; ++ ++ offset = (offset << 8) - offset; ++ t += offset + 15 + *ip++; + } + t += 3; + copy_literal_run: +@@ -116,12 +133,19 @@ copy_literal_run: + } else if (t >= 32) { + t = (t & 31) + (3 - 1); + if (unlikely(t == 2)) { ++ size_t offset; ++ const unsigned char *ip_last = ip; ++ + while (unlikely(*ip == 0)) { +- t += 255; + ip++; + NEED_IP(1); + } +- t += 31 + *ip++; ++ offset = ip - ip_last; ++ if (unlikely(offset > MAX_255_COUNT)) ++ return LZO_E_ERROR; ++ ++ offset = (offset << 8) - offset; ++ t += offset + 31 + *ip++; + NEED_IP(2); + } + m_pos = op - 1; +@@ -134,12 +158,19 @@ copy_literal_run: + m_pos -= (t & 8) << 11; + t = (t & 7) + (3 - 1); + if (unlikely(t == 2)) { ++ size_t offset; ++ const unsigned char *ip_last = ip; ++ + while (unlikely(*ip == 0)) { +- t += 255; + ip++; + NEED_IP(1); + } +- t += 7 + *ip++; ++ offset = ip - ip_last; ++ if (unlikely(offset > MAX_255_COUNT)) ++ return LZO_E_ERROR; ++ ++ offset = (offset << 8) - offset; ++ t += offset + 7 + *ip++; + NEED_IP(2); + } + next = get_unaligned_le16(ip); diff --git a/patches/m68k-disable-restore-interrupts-in-hwreg_present-hwreg_write.patch b/patches/m68k-disable-restore-interrupts-in-hwreg_present-hwreg_write.patch new file mode 100644 index 0000000..f701dc1 --- /dev/null +++ b/patches/m68k-disable-restore-interrupts-in-hwreg_present-hwreg_write.patch @@ -0,0 +1,74 @@ +From e4dc601bf99ccd1c95b7e6eef1d3cf3c4b0d4961 Mon Sep 17 00:00:00 2001 +From: Geert Uytterhoeven 
<geert@linux-m68k.org> +Date: Sun, 28 Sep 2014 10:50:06 +0200 +Subject: m68k: Disable/restore interrupts in hwreg_present()/hwreg_write() + +commit e4dc601bf99ccd1c95b7e6eef1d3cf3c4b0d4961 upstream. + +hwreg_present() and hwreg_write() temporarily change the VBR register to +another vector table. This table contains a valid bus error handler +only, all other entries point to arbitrary addresses. + +If an interrupt comes in while the temporary table is active, the +processor will start executing at such an arbitrary address, and the +kernel will crash. + +While most callers run early, before interrupts are enabled, or +explicitly disable interrupts, Finn Thain pointed out that macsonic has +one callsite that doesn't, causing intermittent boot crashes. +There's another unsafe callsite in hilkbd. + +Fix this for good by disabling and restoring interrupts inside +hwreg_present() and hwreg_write(). + +Explicitly disabling interrupts can be removed from the callsites later. + +Reported-by: Finn Thain <fthain@telegraphics.com.au> +Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + arch/m68k/mm/hwtest.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/arch/m68k/mm/hwtest.c ++++ b/arch/m68k/mm/hwtest.c +@@ -28,9 +28,11 @@ + int hwreg_present( volatile void *regp ) + { + int ret = 0; ++ unsigned long flags; + long save_sp, save_vbr; + long tmp_vectors[3]; + ++ local_irq_save(flags); + __asm__ __volatile__ + ( "movec %/vbr,%2\n\t" + "movel #Lberr1,%4@(8)\n\t" +@@ -46,6 +48,7 @@ int hwreg_present( volatile void *regp ) + : "=&d" (ret), "=&r" (save_sp), "=&r" (save_vbr) + : "a" (regp), "a" (tmp_vectors) + ); ++ local_irq_restore(flags); + + return( ret ); + } +@@ -58,9 +61,11 @@ EXPORT_SYMBOL(hwreg_present); + int hwreg_write( volatile void *regp, unsigned short val ) + { + int ret; ++ unsigned long flags; + long save_sp, save_vbr; + long tmp_vectors[3]; + ++ local_irq_save(flags); + __asm__ __volatile__ + ( "movec 
%/vbr,%2\n\t" + "movel #Lberr2,%4@(8)\n\t" +@@ -78,6 +83,7 @@ int hwreg_write( volatile void *regp, un + : "=&d" (ret), "=&r" (save_sp), "=&r" (save_vbr) + : "a" (regp), "a" (tmp_vectors), "g" (val) + ); ++ local_irq_restore(flags); + + return( ret ); + } diff --git a/patches/mips-tlbex-fix-a-missing-statement-for-hugetlb.patch b/patches/mips-tlbex-fix-a-missing-statement-for-hugetlb.patch new file mode 100644 index 0000000..5fcf137 --- /dev/null +++ b/patches/mips-tlbex-fix-a-missing-statement-for-hugetlb.patch @@ -0,0 +1,37 @@ +From 8393c524a25609a30129e4a8975cf3b91f6c16a5 Mon Sep 17 00:00:00 2001 +From: Huacai Chen <chenhc@lemote.com> +Date: Tue, 29 Jul 2014 14:54:40 +0800 +Subject: MIPS: tlbex: Fix a missing statement for HUGETLB + +commit 8393c524a25609a30129e4a8975cf3b91f6c16a5 upstream. + +In commit 2c8c53e28f1 (MIPS: Optimize TLB handlers for Octeon CPUs) +build_r4000_tlb_refill_handler() is modified. But it isn't compatible +with the original code in the HUGETLB case, because there is a copy & paste +error and one line of code is missing. It is very easy to produce a bug +with LTP's hugemmap05 test. + +Signed-off-by: Huacai Chen <chenhc@lemote.com> +Signed-off-by: Binbin Zhou <zhoubb@lemote.com> +Cc: John Crispin <john@phrozen.org> +Cc: Steven J. 
Hill <Steven.Hill@imgtec.com> +Cc: linux-mips@linux-mips.org +Cc: Fuxin Zhang <zhangfx@lemote.com> +Cc: Zhangjin Wu <wuzhangjin@gmail.com> +Patchwork: https://patchwork.linux-mips.org/patch/7496/ +Signed-off-by: Ralf Baechle <ralf@linux-mips.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + arch/mips/mm/tlbex.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/arch/mips/mm/tlbex.c ++++ b/arch/mips/mm/tlbex.c +@@ -1283,6 +1283,7 @@ static void __cpuinit build_r4000_tlb_re + } + #ifdef CONFIG_HUGETLB_PAGE + uasm_l_tlb_huge_update(&l, p); ++ UASM_i_LW(&p, K0, 0, K1); + build_huge_update_entries(&p, htlb_info.huge_pte, K1); + build_huge_tlb_write_entry(&p, &l, &r, K0, tlb_random, + htlb_info.restore_scratch); diff --git a/patches/mips-tlbex-properly-fix-huge-tlb-refill-exception-handler.patch b/patches/mips-tlbex-properly-fix-huge-tlb-refill-exception-handler.patch new file mode 100644 index 0000000..b5fbb71 --- /dev/null +++ b/patches/mips-tlbex-properly-fix-huge-tlb-refill-exception-handler.patch @@ -0,0 +1,87 @@ +From 9e0f162a36914937a937358fcb45e0609ef2bfc4 Mon Sep 17 00:00:00 2001 +From: David Daney <david.daney@cavium.com> +Date: Mon, 20 Oct 2014 15:34:23 -0700 +Subject: MIPS: tlbex: Properly fix HUGE TLB Refill exception handler + +commit 9e0f162a36914937a937358fcb45e0609ef2bfc4 upstream. + +In commit 8393c524a25609 (MIPS: tlbex: Fix a missing statement for +HUGETLB), the TLB Refill handler was fixed so that non-OCTEON targets +would work properly with huge pages. The change was incorrect in that +it broke the OCTEON case. + +The problem is shown here: + + xxx0: df7a0000 ld k0,0(k1) + . + . + . + xxxc0: df610000 ld at,0(k1) + xxxc4: 335a0ff0 andi k0,k0,0xff0 + xxxc8: e825ffcd bbit1 at,0x5,0x0 + xxxcc: 003ad82d daddu k1,at,k0 + . + . + . + +In the non-octeon case there is a destructive test for the huge PTE +bit, and then at 0, $k0 is reloaded (that is what the 8393c524a25609 +patch added). 
+ +In the octeon case, we modify k1 in the branch delay slot, but we +never need k0 again, so the new load is not needed, but since k1 is +modified, if we do the load, we load from a garbage location and then +get a nested TLB Refill, which is seen in userspace as either SIGBUS +or SIGSEGV (depending on the garbage). + +The real fix is to only do this reloading if it is needed, and never +where it is harmful. + +Signed-off-by: David Daney <david.daney@cavium.com> +Cc: Huacai Chen <chenhc@lemote.com> +Cc: Fuxin Zhang <zhangfx@lemote.com> +Cc: Zhangjin Wu <wuzhangjin@gmail.com> +Cc: linux-mips@linux-mips.org +Patchwork: https://patchwork.linux-mips.org/patch/8151/ +Signed-off-by: Ralf Baechle <ralf@linux-mips.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + arch/mips/mm/tlbex.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/arch/mips/mm/tlbex.c ++++ b/arch/mips/mm/tlbex.c +@@ -1041,6 +1041,7 @@ static void __cpuinit build_update_entri + struct mips_huge_tlb_info { + int huge_pte; + int restore_scratch; ++ bool need_reload_pte; + }; + + static struct mips_huge_tlb_info __cpuinit +@@ -1055,6 +1056,7 @@ build_fast_tlb_refill_handler (u32 **p, + + rv.huge_pte = scratch; + rv.restore_scratch = 0; ++ rv.need_reload_pte = false; + + if (check_for_high_segbits) { + UASM_i_MFC0(p, tmp, C0_BADVADDR); +@@ -1247,6 +1249,7 @@ static void __cpuinit build_r4000_tlb_re + } else { + htlb_info.huge_pte = K0; + htlb_info.restore_scratch = 0; ++ htlb_info.need_reload_pte = true; + vmalloc_mode = refill_noscratch; + /* + * create the plain linear handler +@@ -1283,7 +1286,8 @@ static void __cpuinit build_r4000_tlb_re + } + #ifdef CONFIG_HUGETLB_PAGE + uasm_l_tlb_huge_update(&l, p); +- UASM_i_LW(&p, K0, 0, K1); ++ if (htlb_info.need_reload_pte) ++ UASM_i_LW(&p, htlb_info.huge_pte, 0, K1); + build_huge_update_entries(&p, htlb_info.huge_pte, K1); + build_huge_tlb_write_entry(&p, &l, &r, K0, tlb_random, + htlb_info.restore_scratch); diff --git 
a/patches/mnt-prevent-pivot_root-from-creating-a-loop-in-the-mount-tree.patch b/patches/mnt-prevent-pivot_root-from-creating-a-loop-in-the-mount-tree.patch new file mode 100644 index 0000000..3fceb45 --- /dev/null +++ b/patches/mnt-prevent-pivot_root-from-creating-a-loop-in-the-mount-tree.patch @@ -0,0 +1,45 @@ +From 0d0826019e529f21c84687521d03f60cd241ca7d Mon Sep 17 00:00:00 2001 +From: "Eric W. Biederman" <ebiederm@xmission.com> +Date: Wed, 8 Oct 2014 10:42:27 -0700 +Subject: mnt: Prevent pivot_root from creating a loop in the mount tree + +commit 0d0826019e529f21c84687521d03f60cd241ca7d upstream. + +Andy Lutomirski recently demonstrated that when chroot is used to set +the root path below the path for the new ``root'' passed to pivot_root +the pivot_root system call succeeds and leaks mounts. + +In examining the code I see that starting with a new root that is +below the current root in the mount tree will result in a loop in the +mount tree after the mounts are detached and then reattached to one +another. Resulting in all kinds of ugliness including a leak of that +mounts involved in the leak of the mount loop. + +Prevent this problem by ensuring that the new mount is reachable from +the current root of the mount tree. + +[Added stable cc. Fixes CVE-2014-7970. --Andy] + +Reported-by: Andy Lutomirski <luto@amacapital.net> +Reviewed-by: Andy Lutomirski <luto@amacapital.net> +Link: http://lkml.kernel.org/r/87bnpmihks.fsf@x220.int.ebiederm.org +Signed-off-by: "Eric W. 
Biederman" <ebiederm@xmission.com> +Signed-off-by: Andy Lutomirski <luto@amacapital.net> +[lizf: Backported to 3.4: adjust context] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + fs/namespace.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -2508,6 +2508,9 @@ SYSCALL_DEFINE2(pivot_root, const char _ + /* make sure we can reach put_old from new_root */ + if (!is_path_reachable(real_mount(old.mnt), old.dentry, &new)) + goto out4; ++ /* make certain new is below the root */ ++ if (!is_path_reachable(new_mnt, new.dentry, &root)) ++ goto out4; + br_write_lock(vfsmount_lock); + detach_mnt(new_mnt, &parent_path); + detach_mnt(root_mnt, &root_parent); diff --git a/patches/mpc85xx_edac-make-l2-interrupt-shared-too.patch b/patches/mpc85xx_edac-make-l2-interrupt-shared-too.patch new file mode 100644 index 0000000..dff5c38 --- /dev/null +++ b/patches/mpc85xx_edac-make-l2-interrupt-shared-too.patch @@ -0,0 +1,73 @@ +From a18c3f16a907b8977ef65fc8dd71ed3f7b751748 Mon Sep 17 00:00:00 2001 +From: Borislav Petkov <bp@suse.de> +Date: Tue, 30 Sep 2014 12:55:41 +0200 +Subject: mpc85xx_edac: Make L2 interrupt shared too + +commit a18c3f16a907b8977ef65fc8dd71ed3f7b751748 upstream. + +The other two interrupt handlers in this driver are shared, except this +one. When loading the driver, it fails like this. + +So make the IRQ line shared. + +Freescale(R) MPC85xx EDAC driver, (C) 2006 Montavista Software +mpc85xx_mc_err_probe: No ECC DIMMs discovered +EDAC DEVICE0: Giving out device to module MPC85xx_edac controller mpc85xx_l2_err: DEV mpc85xx_l2_err (INTERRUPT) +genirq: Flags mismatch irq 16. 00000000 ([EDAC] L2 err) vs. 
00000080 ([EDAC] PCI err) +mpc85xx_l2_err_probe: Unable to request irq 16 for MPC85xx L2 err +remove_proc_entry: removing non-empty directory 'irq/16', leaking at least 'aerdrv' +------------[ cut here ]------------ +WARNING: at fs/proc/generic.c:521 +Modules linked in: +CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.17.0-rc5-dirty #1 +task: ee058000 ti: ee046000 task.ti: ee046000 +NIP: c016c0c4 LR: c016c0c4 CTR: c037b51c +REGS: ee047c10 TRAP: 0700 Not tainted (3.17.0-rc5-dirty) +MSR: 00029000 <CE,EE,ME> CR: 22008022 XER: 20000000 + +GPR00: c016c0c4 ee047cc0 ee058000 00000053 00029000 00000000 c037c744 00000003 +GPR08: c09aab28 c09aab24 c09aab28 00000156 20008028 00000000 c0002ac8 00000000 +GPR16: 00000000 00000000 00000000 00000000 00000000 00000000 00000139 c0950394 +GPR24: c09f0000 ee5585b0 ee047d08 c0a10000 ee047d08 ee15f808 00000002 ee03f660 +NIP [c016c0c4] remove_proc_entry +LR [c016c0c4] remove_proc_entry +Call Trace: +remove_proc_entry (unreliable) +unregister_irq_proc +free_desc +irq_free_descs +mpc85xx_l2_err_probe +platform_drv_probe +really_probe +__driver_attach +bus_for_each_dev +bus_add_driver +driver_register +mpc85xx_mc_init +do_one_initcall +kernel_init_freeable +kernel_init +ret_from_kernel_thread +Instruction dump: ... 
+ +Reported-and-tested-by: <lpb_098@163.com> +Acked-by: Johannes Thumshirn <johannes.thumshirn@men.de> +Signed-off-by: Borislav Petkov <bp@suse.de> +[lizf: Backported to 3.4: IRQF_DISABLED hasn't been removed in 3.4] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/edac/mpc85xx_edac.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/drivers/edac/mpc85xx_edac.c ++++ b/drivers/edac/mpc85xx_edac.c +@@ -577,7 +577,8 @@ static int __devinit mpc85xx_l2_err_prob + if (edac_op_state == EDAC_OPSTATE_INT) { + pdata->irq = irq_of_parse_and_map(op->dev.of_node, 0); + res = devm_request_irq(&op->dev, pdata->irq, +- mpc85xx_l2_isr, IRQF_DISABLED, ++ mpc85xx_l2_isr, ++ IRQF_DISABLED | IRQF_SHARED, + "[EDAC] L2 err", edac_dev); + if (res < 0) { + printk(KERN_ERR diff --git a/patches/nept-nested-invept.patch b/patches/nept-nested-invept.patch new file mode 100644 index 0000000..f535c37 --- /dev/null +++ b/patches/nept-nested-invept.patch @@ -0,0 +1,76 @@ +From 02a988e6e4511b1f6d83525710a12db9c5a45149 Mon Sep 17 00:00:00 2001 +From: Nadav Har'El <nyh@il.ibm.com> +Date: Mon, 5 Aug 2013 11:07:17 +0300 +Subject: nEPT: Nested INVEPT + +commit bfd0a56b90005f8c8a004baf407ad90045c2b11e upstream. + +If we let L1 use EPT, we should probably also support the INVEPT instruction. + +In our current nested EPT implementation, when L1 changes its EPT table +for L2 (i.e., EPT12), L0 modifies the shadow EPT table (EPT02), and in +the course of this modification already calls INVEPT. But if last level +of shadow page is unsync not all L1's changes to EPT12 are intercepted, +which means roots need to be synced when L1 calls INVEPT. Global INVEPT +should not be different since roots are synced by kvm_mmu_load() each +time EPTP02 changes. 
+ +Reviewed-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com> +Signed-off-by: Nadav Har'El <nyh@il.ibm.com> +Signed-off-by: Jun Nakajima <jun.nakajima@intel.com> +Signed-off-by: Xinhao Xu <xinhao.xu@intel.com> +Signed-off-by: Yang Zhang <yang.z.zhang@Intel.com> +Signed-off-by: Gleb Natapov <gleb@redhat.com> +Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> +[bwh: Backported to 3.2: + - Adjust context, filename + - Simplify handle_invept() as recommended by Paolo - nEPT is not + supported so we always raise #UD] +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + arch/x86/include/asm/vmx.h | 1 + + arch/x86/kvm/vmx.c | 8 ++++++++ + 2 files changed, 9 insertions(+) + +--- a/arch/x86/include/asm/vmx.h ++++ b/arch/x86/include/asm/vmx.h +@@ -279,6 +279,7 @@ enum vmcs_field { + #define EXIT_REASON_APIC_ACCESS 44 + #define EXIT_REASON_EPT_VIOLATION 48 + #define EXIT_REASON_EPT_MISCONFIG 49 ++#define EXIT_REASON_INVEPT 50 + #define EXIT_REASON_WBINVD 54 + #define EXIT_REASON_XSETBV 55 + +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -5566,6 +5566,12 @@ static int handle_vmptrst(struct kvm_vcp + return 1; + } + ++static int handle_invept(struct kvm_vcpu *vcpu) ++{ ++ kvm_queue_exception(vcpu, UD_VECTOR); ++ return 1; ++} ++ + /* + * The exit handlers return 1 if the exit was handled fully and guest execution + * may resume. 
Otherwise they set the kvm_run parameter to indicate what needs +@@ -5608,6 +5614,7 @@ static int (*kvm_vmx_exit_handlers[])(st + [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, + [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, + [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, ++ [EXIT_REASON_INVEPT] = handle_invept, + }; + + static const int kvm_vmx_max_exit_handlers = +@@ -5792,6 +5799,7 @@ static bool nested_vmx_exit_handled(stru + case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: + case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: + case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: ++ case EXIT_REASON_INVEPT: + /* + * VMX instructions trap unconditionally. This allows L1 to + * emulate them for its L2 guest, i.e., allows 3-level nesting! diff --git a/patches/nfsv4-fix-open-lock-state-recovery-error-handling.patch b/patches/nfsv4-fix-open-lock-state-recovery-error-handling.patch new file mode 100644 index 0000000..260e0e0 --- /dev/null +++ b/patches/nfsv4-fix-open-lock-state-recovery-error-handling.patch @@ -0,0 +1,67 @@ +From df817ba35736db2d62b07de6f050a4db53492ad8 Mon Sep 17 00:00:00 2001 +From: Trond Myklebust <trond.myklebust@primarydata.com> +Date: Sat, 27 Sep 2014 17:41:51 -0400 +Subject: NFSv4: fix open/lock state recovery error handling + +commit df817ba35736db2d62b07de6f050a4db53492ad8 upstream. + +The current open/lock state recovery unfortunately does not handle errors +such as NFS4ERR_CONN_NOT_BOUND_TO_SESSION correctly. Instead of looping, +just proceeds as if the state manager is finished recovering. +This patch ensures that we loop back, handle higher priority errors +and complete the open/lock state recovery. 
+ +Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + fs/nfs/nfs4state.c | 16 ++++++---------- + 1 file changed, 6 insertions(+), 10 deletions(-) + +--- a/fs/nfs/nfs4state.c ++++ b/fs/nfs/nfs4state.c +@@ -1515,7 +1515,8 @@ restart: + if (status < 0) { + set_bit(ops->owner_flag_bit, &sp->so_flags); + nfs4_put_state_owner(sp); +- return nfs4_recovery_handle_error(clp, status); ++ status = nfs4_recovery_handle_error(clp, status); ++ return (status != 0) ? status : -EAGAIN; + } + + nfs4_put_state_owner(sp); +@@ -1524,7 +1525,7 @@ restart: + spin_unlock(&clp->cl_lock); + } + rcu_read_unlock(); +- return status; ++ return 0; + } + + static int nfs4_check_lease(struct nfs_client *clp) +@@ -1796,23 +1797,18 @@ static void nfs4_state_manager(struct nf + if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { + status = nfs4_do_reclaim(clp, + clp->cl_mvops->reboot_recovery_ops); +- if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || +- test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) +- continue; +- nfs4_state_end_reclaim_reboot(clp); +- if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) ++ if (status == -EAGAIN) + continue; + if (status < 0) + goto out_error; ++ nfs4_state_end_reclaim_reboot(clp); + } + + /* Now recover expired state... 
*/ + if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { + status = nfs4_do_reclaim(clp, + clp->cl_mvops->nograce_recovery_ops); +- if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || +- test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || +- test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) ++ if (status == -EAGAIN) + continue; + if (status < 0) + goto out_error; diff --git a/patches/nfsv4.1-fix-an-nfsv4.1-state-renewal-regression.patch b/patches/nfsv4.1-fix-an-nfsv4.1-state-renewal-regression.patch new file mode 100644 index 0000000..a4f703c --- /dev/null +++ b/patches/nfsv4.1-fix-an-nfsv4.1-state-renewal-regression.patch @@ -0,0 +1,89 @@ +From d1f456b0b9545f1606a54cd17c20775f159bd2ce Mon Sep 17 00:00:00 2001 +From: Andy Adamson <andros@netapp.com> +Date: Mon, 29 Sep 2014 12:31:57 -0400 +Subject: NFSv4.1: Fix an NFSv4.1 state renewal regression + +commit d1f456b0b9545f1606a54cd17c20775f159bd2ce upstream. + +Commit 2f60ea6b8ced ("NFSv4: The NFSv4.0 client must send RENEW calls if it holds a delegation") set the NFS4_RENEW_TIMEOUT flag in nfs4_renew_state, and does +not put an nfs41_proc_async_sequence call, the NFSv4.1 lease renewal heartbeat +call, on the wire to renew the NFSv4.1 state if the flag was not set. + +The NFS4_RENEW_TIMEOUT flag is set when "now" is after the last renewal +(cl_last_renewal) plus the lease time divided by 3. This is arbitrary and +sometimes does the following: + +In normal operation, the only way a future state renewal call is put on the +wire is via a call to nfs4_schedule_state_renewal, which schedules a +nfs4_renew_state workqueue task. nfs4_renew_state determines if the +NFS4_RENEW_TIMEOUT should be set, and the calls nfs41_proc_async_sequence, +which only gets sent if the NFS4_RENEW_TIMEOUT flag is set. +Then the nfs41_proc_async_sequence rpc_release function schedules +another state remewal via nfs4_schedule_state_renewal. 
+ +Without this change we can get into a state where an application stops +accessing the NFSv4.1 share, state renewal calls stop due to the +NFS4_RENEW_TIMEOUT flag _not_ being set. The only way to recover +from this situation is with a clientid re-establishment, once the application +resumes and the server has timed out the lease and so returns +NFS4ERR_BAD_SESSION on the subsequent SEQUENCE operation. + +An example application: +open, lock, write a file. + +sleep for 6 * lease (could be less) + +ulock, close. + +In the above example with NFSv4.1 delegations enabled, without this change, +there are no OP_SEQUENCE state renewal calls during the sleep, and the +clientid is recovered due to lease expiration on the close. + +This issue does not occur with NFSv4.1 delegations disabled, nor with +NFSv4.0, with or without delegations enabled. + +Signed-off-by: Andy Adamson <andros@netapp.com> +Link: http://lkml.kernel.org/r/1411486536-23401-1-git-send-email-andros@netapp.com +Fixes: 2f60ea6b8ced (NFSv4: The NFSv4.0 client must send RENEW calls...) +Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com> +[lizf: Backported to 3.4: adjust context] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + fs/nfs/nfs4proc.c | 2 +- + fs/nfs/nfs4renewd.c | 12 ++++++++++-- + 2 files changed, 11 insertions(+), 3 deletions(-) + +--- a/fs/nfs/nfs4proc.c ++++ b/fs/nfs/nfs4proc.c +@@ -5796,7 +5796,7 @@ static int nfs41_proc_async_sequence(str + int ret = 0; + + if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0) +- return 0; ++ return -EAGAIN; + task = _nfs41_proc_sequence(clp, cred, &nfs41_sequence_ops); + if (IS_ERR(task)) + ret = PTR_ERR(task); +--- a/fs/nfs/nfs4renewd.c ++++ b/fs/nfs/nfs4renewd.c +@@ -88,10 +88,18 @@ nfs4_renew_state(struct work_struct *wor + } + nfs_expire_all_delegations(clp); + } else { ++ int ret; ++ + /* Queue an asynchronous RENEW. 
*/ +- ops->sched_state_renewal(clp, cred, renew_flags); ++ ret = ops->sched_state_renewal(clp, cred, renew_flags); + put_rpccred(cred); +- goto out_exp; ++ switch (ret) { ++ default: ++ goto out_exp; ++ case -EAGAIN: ++ case -ENOMEM: ++ break; ++ } + } + } else { + dprintk("%s: failed to call renewd. Reason: lease not expired \n", diff --git a/patches/oom-pm-oom-killed-task-shouldn-t-escape-pm-suspend.patch b/patches/oom-pm-oom-killed-task-shouldn-t-escape-pm-suspend.patch new file mode 100644 index 0000000..01da460 --- /dev/null +++ b/patches/oom-pm-oom-killed-task-shouldn-t-escape-pm-suspend.patch @@ -0,0 +1,169 @@ +From 5695be142e203167e3cb515ef86a88424f3524eb Mon Sep 17 00:00:00 2001 +From: Michal Hocko <mhocko@suse.cz> +Date: Mon, 20 Oct 2014 18:12:32 +0200 +Subject: OOM, PM: OOM killed task shouldn't escape PM suspend + +commit 5695be142e203167e3cb515ef86a88424f3524eb upstream. + +PM freezer relies on having all tasks frozen by the time devices are +getting frozen so that no task will touch them while they are getting +frozen. But OOM killer is allowed to kill an already frozen task in +order to handle OOM situtation. In order to protect from late wake ups +OOM killer is disabled after all tasks are frozen. This, however, still +keeps a window open when a killed task didn't manage to die by the time +freeze_processes finishes. + +Reduce the race window by checking all tasks after OOM killer has been +disabled. This is still not race free completely unfortunately because +oom_killer_disable cannot stop an already ongoing OOM killer so a task +might still wake up from the fridge and get killed without +freeze_processes noticing. Full synchronization of OOM and freezer is, +however, too heavy weight for this highly unlikely case. + +Introduce and check oom_kills counter which gets incremented early when +the allocator enters __alloc_pages_may_oom path and only check all the +tasks if the counter changes during the freezing attempt. 
The counter +is updated so early to reduce the race window since allocator checked +oom_killer_disabled which is set by PM-freezing code. A false positive +will push the PM-freezer into a slow path but that is not a big deal. + +Changes since v1 +- push the re-check loop out of freeze_processes into + check_frozen_processes and invert the condition to make the code more + readable as per Rafael + +Fixes: f660daac474c6f (oom: thaw threads if oom killed thread is frozen before deferring) +Signed-off-by: Michal Hocko <mhocko@suse.cz> +Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + include/linux/oom.h | 4 ++++ + kernel/power/process.c | 40 +++++++++++++++++++++++++++++++++++++++- + mm/oom_kill.c | 17 +++++++++++++++++ + mm/page_alloc.c | 8 ++++++++ + 4 files changed, 68 insertions(+), 1 deletion(-) + +--- a/include/linux/oom.h ++++ b/include/linux/oom.h +@@ -45,6 +45,10 @@ extern int test_set_oom_score_adj(int ne + + extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, + const nodemask_t *nodemask, unsigned long totalpages); ++ ++extern int oom_kills_count(void); ++extern void note_oom_kill(void); ++ + extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); + extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); + +--- a/kernel/power/process.c ++++ b/kernel/power/process.c +@@ -114,6 +114,28 @@ static int try_to_freeze_tasks(bool user + return todo ? 
-EBUSY : 0; + } + ++/* ++ * Returns true if all freezable tasks (except for current) are frozen already ++ */ ++static bool check_frozen_processes(void) ++{ ++ struct task_struct *g, *p; ++ bool ret = true; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ if (p != current && !freezer_should_skip(p) && ++ !frozen(p)) { ++ ret = false; ++ goto done; ++ } ++ } ++done: ++ read_unlock(&tasklist_lock); ++ ++ return ret; ++} ++ + /** + * freeze_processes - Signal user space processes to enter the refrigerator. + * +@@ -122,6 +144,7 @@ static int try_to_freeze_tasks(bool user + int freeze_processes(void) + { + int error; ++ int oom_kills_saved; + + error = __usermodehelper_disable(UMH_FREEZING); + if (error) +@@ -132,12 +155,27 @@ int freeze_processes(void) + + printk("Freezing user space processes ... "); + pm_freezing = true; ++ oom_kills_saved = oom_kills_count(); + error = try_to_freeze_tasks(true); + if (!error) { +- printk("done."); + __usermodehelper_set_disable_depth(UMH_DISABLED); + oom_killer_disable(); ++ ++ /* ++ * There might have been an OOM kill while we were ++ * freezing tasks and the killed task might be still ++ * on the way out so we have to double check for race. ++ */ ++ if (oom_kills_count() != oom_kills_saved && ++ !check_frozen_processes()) { ++ __usermodehelper_set_disable_depth(UMH_ENABLED); ++ printk("OOM in progress."); ++ error = -EBUSY; ++ goto done; ++ } ++ printk("done."); + } ++done: + printk("\n"); + BUG_ON(in_atomic()); + +--- a/mm/oom_kill.c ++++ b/mm/oom_kill.c +@@ -435,6 +435,23 @@ static void dump_header(struct task_stru + dump_tasks(memcg, nodemask); + } + ++/* ++ * Number of OOM killer invocations (including memcg OOM killer). ++ * Primarily used by PM freezer to check for potential races with ++ * OOM killed frozen task. 
++ */ ++static atomic_t oom_kills = ATOMIC_INIT(0); ++ ++int oom_kills_count(void) ++{ ++ return atomic_read(&oom_kills); ++} ++ ++void note_oom_kill(void) ++{ ++ atomic_inc(&oom_kills); ++} ++ + #define K(x) ((x) << (PAGE_SHIFT-10)) + static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, + unsigned int points, unsigned long totalpages, +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1982,6 +1982,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, un + } + + /* ++ * PM-freezer should be notified that there might be an OOM killer on ++ * its way to kill and wake somebody up. This is too early and we might ++ * end up not killing anything but false positives are acceptable. ++ * See freeze_processes. ++ */ ++ note_oom_kill(); ++ ++ /* + * Go through the zonelist yet one more time, keep very high watermark + * here, this is only to catch a parallel oom killing, we must fail if + * we're still under heavy pressure. diff --git a/patches/pata_serverworks-disable-64-kb-dma-transfers-on-broadcom-osb4-ide-controller.patch b/patches/pata_serverworks-disable-64-kb-dma-transfers-on-broadcom-osb4-ide-controller.patch new file mode 100644 index 0000000..edb7216 --- /dev/null +++ b/patches/pata_serverworks-disable-64-kb-dma-transfers-on-broadcom-osb4-ide-controller.patch @@ -0,0 +1,160 @@ +From 37017ac6849e772e67dd187ba2fbd056c4afa533 Mon Sep 17 00:00:00 2001 +From: Scott Carter <ccscott@funsoft.com> +Date: Wed, 24 Sep 2014 18:13:09 -0700 +Subject: pata_serverworks: disable 64-KB DMA transfers on Broadcom OSB4 IDE + Controller + +commit 37017ac6849e772e67dd187ba2fbd056c4afa533 upstream. + +The Broadcom OSB4 IDE Controller (vendor and device IDs: 1166:0211) +does not support 64-KB DMA transfers. 
+Whenever a 64-KB DMA transfer is attempted, +the transfer fails and messages similar to the following +are written to the console log: + + [ 2431.851125] sr 0:0:0:0: [sr0] Unhandled sense code + [ 2431.851139] sr 0:0:0:0: [sr0] Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE + [ 2431.851152] sr 0:0:0:0: [sr0] Sense Key : Hardware Error [current] + [ 2431.851166] sr 0:0:0:0: [sr0] Add. Sense: Logical unit communication time-out + [ 2431.851182] sr 0:0:0:0: [sr0] CDB: Read(10): 28 00 00 00 76 f4 00 00 40 00 + [ 2431.851210] end_request: I/O error, dev sr0, sector 121808 + +When the libata and pata_serverworks modules +are recompiled with ATA_DEBUG and ATA_VERBOSE_DEBUG defined in libata.h, +the 64-KB transfer size in the scatter-gather list can be seen +in the console log: + + [ 2664.897267] sr 9:0:0:0: [sr0] Send: + [ 2664.897274] 0xf63d85e0 + [ 2664.897283] sr 9:0:0:0: [sr0] CDB: + [ 2664.897288] Read(10): 28 00 00 00 7f b4 00 00 40 00 + [ 2664.897319] buffer = 0xf6d6fbc0, bufflen = 131072, queuecommand 0xf81b7700 + [ 2664.897331] ata_scsi_dump_cdb: CDB (1:0,0,0) 28 00 00 00 7f b4 00 00 40 + [ 2664.897338] ata_scsi_translate: ENTER + [ 2664.897345] ata_sg_setup: ENTER, ata1 + [ 2664.897356] ata_sg_setup: 3 sg elements mapped + [ 2664.897364] ata_bmdma_fill_sg: PRD[0] = (0x66FD2000, 0xE000) + [ 2664.897371] ata_bmdma_fill_sg: PRD[1] = (0x65000000, 0x10000) + ------------------------------------------------------> ======= + [ 2664.897378] ata_bmdma_fill_sg: PRD[2] = (0x66A10000, 0x2000) + [ 2664.897386] ata1: ata_dev_select: ENTER, device 0, wait 1 + [ 2664.897422] ata_sff_tf_load: feat 0x1 nsect 0x0 lba 0x0 0x0 0xFC + [ 2664.897428] ata_sff_tf_load: device 0xA0 + [ 2664.897448] ata_sff_exec_command: ata1: cmd 0xA0 + [ 2664.897457] ata_scsi_translate: EXIT + [ 2664.897462] leaving scsi_dispatch_cmnd() + [ 2664.897497] Doing sr request, dev = sr0, block = 0 + [ 2664.897507] sr0 : reading 64/256 512 byte blocks. 
+ [ 2664.897553] ata_sff_hsm_move: ata1: protocol 7 task_state 1 (dev_stat 0x58) + [ 2664.897560] atapi_send_cdb: send cdb + [ 2666.910058] ata_bmdma_port_intr: ata1: host_stat 0x64 + [ 2666.910079] __ata_sff_port_intr: ata1: protocol 7 task_state 3 + [ 2666.910093] ata_sff_hsm_move: ata1: protocol 7 task_state 3 (dev_stat 0x51) + [ 2666.910101] ata_sff_hsm_move: ata1: protocol 7 task_state 4 (dev_stat 0x51) + [ 2666.910129] sr 9:0:0:0: [sr0] Done: + [ 2666.910136] 0xf63d85e0 TIMEOUT + +lspci shows that the driver used for the Broadcom OSB4 IDE Controller is +pata_serverworks: + + 00:0f.1 IDE interface: Broadcom OSB4 IDE Controller (prog-if 8e [Master SecP SecO PriP]) + Flags: bus master, medium devsel, latency 64 + [virtual] Memory at 000001f0 (32-bit, non-prefetchable) [size=8] + [virtual] Memory at 000003f0 (type 3, non-prefetchable) [size=1] + I/O ports at 0170 [size=8] + I/O ports at 0374 [size=4] + I/O ports at 1440 [size=16] + Kernel driver in use: pata_serverworks + +The pata_serverworks driver supports five distinct device IDs, +one being the OSB4 and the other four belonging to the CSB series. +The CSB series appears to support 64-KB DMA transfers, +as tests on a machine with an SAI2 motherboard +containing a Broadcom CSB5 IDE Controller (vendor and device IDs: 1166:0212) +showed no problems with 64-KB DMA transfers. + +This problem was first discovered when attempting to install openSUSE +from a DVD on a machine with an STL2 motherboard. +Using the pata_serverworks module, +older releases of openSUSE will not install at all due to the timeouts. +Releases of openSUSE prior to 11.3 can be installed by disabling +the pata_serverworks module using the brokenmodules boot parameter, +which causes the serverworks module to be used instead. +Recent releases of openSUSE (12.2 and later) include better error recovery and +will install, though very slowly. 
+On all openSUSE releases, the problem can be recreated +on a machine containing a Broadcom OSB4 IDE Controller +by mounting an install DVD and running a command similar to the following: + + find /mnt -type f -print | xargs cat > /dev/null + +The patch below corrects the problem. +Similar to the other ATA drivers that do not support 64-KB DMA transfers, +the patch changes the ata_port_operations qc_prep vector to point to a routine +that breaks any 64-KB segment into two 32-KB segments and +changes the scsi_host_template sg_tablesize element to reduce by half +the number of scatter/gather elements allowed. +These two changes affect only the OSB4. + +Signed-off-by: Scott Carter <ccscott@funsoft.com> +Signed-off-by: Tejun Heo <tj@kernel.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/ata/pata_serverworks.c | 13 +++++++++++-- + 1 file changed, 11 insertions(+), 2 deletions(-) + +--- a/drivers/ata/pata_serverworks.c ++++ b/drivers/ata/pata_serverworks.c +@@ -252,12 +252,18 @@ static void serverworks_set_dmamode(stru + pci_write_config_byte(pdev, 0x54, ultra_cfg); + } + +-static struct scsi_host_template serverworks_sht = { ++static struct scsi_host_template serverworks_osb4_sht = { ++ ATA_BMDMA_SHT(DRV_NAME), ++ .sg_tablesize = LIBATA_DUMB_MAX_PRD, ++}; ++ ++static struct scsi_host_template serverworks_csb_sht = { + ATA_BMDMA_SHT(DRV_NAME), + }; + + static struct ata_port_operations serverworks_osb4_port_ops = { + .inherits = &ata_bmdma_port_ops, ++ .qc_prep = ata_bmdma_dumb_qc_prep, + .cable_detect = serverworks_cable_detect, + .mode_filter = serverworks_osb4_filter, + .set_piomode = serverworks_set_piomode, +@@ -266,6 +272,7 @@ static struct ata_port_operations server + + static struct ata_port_operations serverworks_csb_port_ops = { + .inherits = &serverworks_osb4_port_ops, ++ .qc_prep = ata_bmdma_qc_prep, + .mode_filter = serverworks_csb_filter, + }; + +@@ -405,6 +412,7 @@ static int serverworks_init_one(struct p + } + }; + const struct 
ata_port_info *ppi[] = { &info[id->driver_data], NULL }; ++ struct scsi_host_template *sht = &serverworks_csb_sht; + int rc; + + rc = pcim_enable_device(pdev); +@@ -418,6 +426,7 @@ static int serverworks_init_one(struct p + /* Select non UDMA capable OSB4 if we can't do fixups */ + if (rc < 0) + ppi[0] = &info[1]; ++ sht = &serverworks_osb4_sht; + } + /* setup CSB5/CSB6 : South Bridge and IDE option RAID */ + else if ((pdev->device == PCI_DEVICE_ID_SERVERWORKS_CSB5IDE) || +@@ -434,7 +443,7 @@ static int serverworks_init_one(struct p + ppi[1] = &ata_dummy_port_info; + } + +- return ata_pci_bmdma_init_one(pdev, ppi, &serverworks_sht, NULL, 0); ++ return ata_pci_bmdma_init_one(pdev, ppi, sht, NULL, 0); + } + + #ifdef CONFIG_PM diff --git a/patches/pci-generate-uppercase-hex-for-modalias-interface-class.patch b/patches/pci-generate-uppercase-hex-for-modalias-interface-class.patch new file mode 100644 index 0000000..e4ce959 --- /dev/null +++ b/patches/pci-generate-uppercase-hex-for-modalias-interface-class.patch @@ -0,0 +1,39 @@ +From 89ec3dcf17fd3fa009ecf8faaba36828dd6bc416 Mon Sep 17 00:00:00 2001 +From: Ricardo Ribalda Delgado <ricardo.ribalda@gmail.com> +Date: Wed, 27 Aug 2014 14:57:57 +0200 +Subject: PCI: Generate uppercase hex for modalias interface class + +commit 89ec3dcf17fd3fa009ecf8faaba36828dd6bc416 upstream. + +Some implementations of modprobe fail to load the driver for a PCI device +automatically because the "interface" part of the modalias from the kernel +is lowercase, and the modalias from file2alias is uppercase. + +The "interface" is the low-order byte of the Class Code, defined in PCI +r3.0, Appendix D. Most interface types defined in the spec do not use +alpha characters, so they won't be affected. For example, 00h, 01h, 10h, +20h, etc. are unaffected. + +Print the "interface" byte of the Class Code in uppercase hex, as we +already do for the Vendor ID, Device ID, Class, etc. 
+ +[bhelgaas: changelog] +Signed-off-by: Ricardo Ribalda Delgado <ricardo.ribalda@gmail.com> +Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> +Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/pci/pci-sysfs.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/pci/pci-sysfs.c ++++ b/drivers/pci/pci-sysfs.c +@@ -173,7 +173,7 @@ static ssize_t modalias_show(struct devi + { + struct pci_dev *pci_dev = to_pci_dev(dev); + +- return sprintf(buf, "pci:v%08Xd%08Xsv%08Xsd%08Xbc%02Xsc%02Xi%02x\n", ++ return sprintf(buf, "pci:v%08Xd%08Xsv%08Xsd%08Xbc%02Xsc%02Xi%02X\n", + pci_dev->vendor, pci_dev->device, + pci_dev->subsystem_vendor, pci_dev->subsystem_device, + (u8)(pci_dev->class >> 16), (u8)(pci_dev->class >> 8), diff --git a/patches/pci-increase-ibm-ipr-sas-crocodile-bars-to-at-least-system-page-size.patch b/patches/pci-increase-ibm-ipr-sas-crocodile-bars-to-at-least-system-page-size.patch new file mode 100644 index 0000000..41d056a --- /dev/null +++ b/patches/pci-increase-ibm-ipr-sas-crocodile-bars-to-at-least-system-page-size.patch @@ -0,0 +1,62 @@ +From 9fe373f9997b48fcd6222b95baf4a20c134b587a Mon Sep 17 00:00:00 2001 +From: Douglas Lehr <dllehr@us.ibm.com> +Date: Thu, 21 Aug 2014 09:26:52 +1000 +Subject: PCI: Increase IBM ipr SAS Crocodile BARs to at least system page size + +commit 9fe373f9997b48fcd6222b95baf4a20c134b587a upstream. + +The Crocodile chip occasionally comes up with 4k and 8k BAR sizes. Due to +an erratum, setting the SR-IOV page size causes the physical function BARs +to expand to the system page size. Since ppc64 uses 64k pages, when Linux +tries to assign the smaller resource sizes to the now 64k BARs the address +will be truncated and the BARs will overlap. + +Force Linux to allocate the resource as a full page, which avoids the +overlap. 
+ +[bhelgaas: print expanded resource, too] +Signed-off-by: Douglas Lehr <dllehr@us.ibm.com> +Signed-off-by: Anton Blanchard <anton@samba.org> +Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> +Acked-by: Milton Miller <miltonm@us.ibm.com> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/pci/quirks.c | 20 ++++++++++++++++++++ + 1 file changed, 20 insertions(+) + +--- a/drivers/pci/quirks.c ++++ b/drivers/pci/quirks.c +@@ -28,6 +28,7 @@ + #include <linux/ioport.h> + #include <linux/sched.h> + #include <linux/ktime.h> ++#include <linux/mm.h> + #include <asm/dma.h> /* isa_dma_bridge_buggy */ + #include "pci.h" + +@@ -291,6 +292,25 @@ static void __devinit quirk_citrine(stru + } + DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_CITRINE, quirk_citrine); + ++/* On IBM Crocodile ipr SAS adapters, expand BAR to system page size */ ++static void quirk_extend_bar_to_page(struct pci_dev *dev) ++{ ++ int i; ++ ++ for (i = 0; i < PCI_STD_RESOURCE_END; i++) { ++ struct resource *r = &dev->resource[i]; ++ ++ if (r->flags & IORESOURCE_MEM && resource_size(r) < PAGE_SIZE) { ++ r->end = PAGE_SIZE - 1; ++ r->start = 0; ++ r->flags |= IORESOURCE_UNSET; ++ dev_info(&dev->dev, "expanded BAR %d to page size: %pR\n", ++ i, r); ++ } ++ } ++} ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_IBM, 0x034a, quirk_extend_bar_to_page); ++ + /* + * S3 868 and 968 chips report region size equal to 32M, but they decode 64M. + * If it's needed, re-allocate the region. 
diff --git a/patches/pci-pciehp-prevent-null-dereference-during-probe.patch b/patches/pci-pciehp-prevent-null-dereference-during-probe.patch new file mode 100644 index 0000000..f414a9c --- /dev/null +++ b/patches/pci-pciehp-prevent-null-dereference-during-probe.patch @@ -0,0 +1,37 @@ +From bceee4a97eb58bd0e80e39eff11b506ddd9e7ad3 Mon Sep 17 00:00:00 2001 +From: Andreas Noever <andreas.noever@gmail.com> +Date: Tue, 16 Sep 2014 15:16:02 -0600 +Subject: PCI: pciehp: Prevent NULL dereference during probe + +commit bceee4a97eb58bd0e80e39eff11b506ddd9e7ad3 upstream. + +pciehp assumes that dev->subordinate, the struct pci_bus for a bridge's +secondary bus, exists. But we do not create that bus if we run out of bus +numbers during enumeration. This leads to a NULL dereference in +init_slot() (and other places). + +Change pciehp_probe() to return -ENODEV when no secondary bus is present. + +Signed-off-by: Andreas Noever <andreas.noever@gmail.com> +Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/pci/hotplug/pciehp_core.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +--- a/drivers/pci/hotplug/pciehp_core.c ++++ b/drivers/pci/hotplug/pciehp_core.c +@@ -237,6 +237,13 @@ static int pciehp_probe(struct pcie_devi + else if (pciehp_acpi_slot_detection_check(dev->port)) + goto err_out_none; + ++ if (!dev->port->subordinate) { ++ /* Can happen if we run out of bus numbers during probe */ ++ dev_err(&dev->device, ++ "Hotplug bridge without secondary bus, ignoring\n"); ++ goto err_out_none; ++ } ++ + ctrl = pcie_init(dev); + if (!ctrl) { + dev_err(&dev->device, "Controller initialization failed\n"); diff --git a/patches/power-charger-manager-fix-null-pointer-exception-with-missing-cm-fuel-gauge.patch b/patches/power-charger-manager-fix-null-pointer-exception-with-missing-cm-fuel-gauge.patch new file mode 100644 index 0000000..2db68c0 --- /dev/null +++ 
b/patches/power-charger-manager-fix-null-pointer-exception-with-missing-cm-fuel-gauge.patch @@ -0,0 +1,85 @@ +From 661a88860274e059fdb744dfaa98c045db7b5d1d Mon Sep 17 00:00:00 2001 +From: Krzysztof Kozlowski <k.kozlowski@samsung.com> +Date: Fri, 26 Sep 2014 13:27:03 +0200 +Subject: power: charger-manager: Fix NULL pointer exception with missing + cm-fuel-gauge + +commit 661a88860274e059fdb744dfaa98c045db7b5d1d upstream. + +NULL pointer exception happens during charger-manager probe if +'cm-fuel-gauge' property is not present. + +[ 2.448536] Unable to handle kernel NULL pointer dereference at virtual address 00000000 +[ 2.456572] pgd = c0004000 +[ 2.459217] [00000000] *pgd=00000000 +[ 2.462759] Internal error: Oops: 5 [#1] PREEMPT SMP ARM +[ 2.468047] Modules linked in: +[ 2.471089] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.17.0-rc6-00251-ge44cf96cd525-dirty #969 +[ 2.479765] task: ea890000 ti: ea87a000 task.ti: ea87a000 +[ 2.485161] PC is at strcmp+0x4/0x30 +[ 2.488719] LR is at power_supply_match_device_by_name+0x10/0x1c +[ 2.494695] pc : [<c01f4220>] lr : [<c030fe38>] psr: a0000113 +[ 2.494695] sp : ea87bde0 ip : 00000000 fp : eaa97010 +[ 2.506150] r10: 00000004 r9 : ea97269c r8 : ea3bbfd0 +[ 2.511360] r7 : eaa97000 r6 : c030fe28 r5 : 00000000 r4 : ea3b0000 +[ 2.517869] r3 : 0000006d r2 : 00000000 r1 : 00000000 r0 : c057c195 +[ 2.524381] Flags: NzCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment kernel +[ 2.531671] Control: 10c5387d Table: 4000404a DAC: 00000015 +[ 2.537399] Process swapper/0 (pid: 1, stack limit = 0xea87a240) +[ 2.543388] Stack: (0xea87bde0 to 0xea87c000) +[ 2.547733] bde0: ea3b0210 c026b1c8 eaa97010 eaa97000 eaa97010 eabb60a8 ea3b0210 00000000 +[ 2.555891] be00: 00000008 ea2db210 ea1a3410 c030fee0 ea3bbf90 c03138fc c068969c c013526c +[ 2.564050] be20: eaa040c0 00000000 c068969c 00000000 eaa040c0 ea2da300 00000002 00000000 +[ 2.572208] be40: 00000001 ea2da3c0 00000000 00000001 00000000 eaa97010 c068969c 00000000 +[ 2.580367] be60: 00000000 
c068969c 00000000 00000002 00000000 c026b71c c026b6f0 eaa97010 +[ 2.588527] be80: c0e82530 c026a330 00000000 eaa97010 c068969c eaa97044 00000000 c061df50 +[ 2.596686] bea0: ea87a000 c026a4dc 00000000 c068969c c026a448 c0268b5c ea8054a8 eaa8fd50 +[ 2.604845] bec0: c068969c ea2db180 c06801f8 c0269b18 c0590f68 c068969c c0656c98 c068969c +[ 2.613004] bee0: c0656c98 ea3bbe40 c06988c0 c026aaf0 00000000 c0656c98 c0656c98 c00088a4 +[ 2.621163] bf00: 00000000 c0055f48 00000000 00000004 00000000 ea890000 c05dbc54 c062c178 +[ 2.629323] bf20: c0603518 c005f674 00000001 ea87a000 eb7ff83b c0476440 00000091 c003d41c +[ 2.637482] bf40: c05db344 00000007 eb7ff858 00000007 c065a76c c0647d24 00000007 c062c170 +[ 2.645642] bf60: c06988c0 00000091 c062c178 c0603518 00000000 c0603cc4 00000007 00000007 +[ 2.653801] bf80: c0603518 c0c0c0c0 00000000 c0453948 00000000 00000000 00000000 00000000 +[ 2.661959] bfa0: 00000000 c0453950 00000000 c000e728 00000000 00000000 00000000 00000000 +[ 2.670118] bfc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 +[ 2.678277] bfe0: 00000000 00000000 00000000 00000000 00000013 00000000 c0c0c0c0 c0c0c0c0 +[ 2.686454] [<c01f4220>] (strcmp) from [<c030fe38>] (power_supply_match_device_by_name+0x10/0x1c) +[ 2.695303] [<c030fe38>] (power_supply_match_device_by_name) from [<c026b1c8>] (class_find_device+0x54/0xac) +[ 2.705106] [<c026b1c8>] (class_find_device) from [<c030fee0>] (power_supply_get_by_name+0x1c/0x30) +[ 2.714137] [<c030fee0>] (power_supply_get_by_name) from [<c03138fc>] (charger_manager_probe+0x3d8/0xe58) +[ 2.723683] [<c03138fc>] (charger_manager_probe) from [<c026b71c>] (platform_drv_probe+0x2c/0x5c) +[ 2.732532] [<c026b71c>] (platform_drv_probe) from [<c026a330>] (driver_probe_device+0x10c/0x224) +[ 2.741384] [<c026a330>] (driver_probe_device) from [<c026a4dc>] (__driver_attach+0x94/0x98) +[ 2.749813] [<c026a4dc>] (__driver_attach) from [<c0268b5c>] (bus_for_each_dev+0x54/0x88) +[ 2.757969] [<c0268b5c>] (bus_for_each_dev) 
from [<c0269b18>] (bus_add_driver+0xd4/0x1d0) +[ 2.766123] [<c0269b18>] (bus_add_driver) from [<c026aaf0>] (driver_register+0x78/0xf4) +[ 2.774110] [<c026aaf0>] (driver_register) from [<c00088a4>] (do_one_initcall+0x80/0x1bc) +[ 2.782276] [<c00088a4>] (do_one_initcall) from [<c0603cc4>] (kernel_init_freeable+0x100/0x1cc) +[ 2.790952] [<c0603cc4>] (kernel_init_freeable) from [<c0453950>] (kernel_init+0x8/0xec) +[ 2.799029] [<c0453950>] (kernel_init) from [<c000e728>] (ret_from_fork+0x14/0x2c) +[ 2.806572] Code: e12fff1e e1a03000 eafffff7 e4d03001 (e4d12001) +[ 2.812832] ---[ end trace 7f12556111b9e7ef ]--- + +Signed-off-by: Krzysztof Kozlowski <k.kozlowski@samsung.com> +Fixes: 856ee6115e2d ("charger-manager: Support deivce tree in charger manager driver") +Signed-off-by: Sebastian Reichel <sre@kernel.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/power/charger-manager.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/drivers/power/charger-manager.c ++++ b/drivers/power/charger-manager.c +@@ -808,6 +808,11 @@ static int charger_manager_probe(struct + goto err_no_charger_stat; + } + ++ if (!desc->psy_fuel_gauge) { ++ dev_err(&pdev->dev, "No fuel gauge power supply defined\n"); ++ return -EINVAL; ++ } ++ + /* Counting index only */ + while (desc->psy_charger_stat[i]) + i++; diff --git a/patches/random-add-and-use-memzero_explicit-for-clearing-data.patch b/patches/random-add-and-use-memzero_explicit-for-clearing-data.patch new file mode 100644 index 0000000..bcac96f --- /dev/null +++ b/patches/random-add-and-use-memzero_explicit-for-clearing-data.patch @@ -0,0 +1,118 @@ +From d4c5efdb97773f59a2b711754ca0953f24516739 Mon Sep 17 00:00:00 2001 +From: Daniel Borkmann <dborkman@redhat.com> +Date: Tue, 26 Aug 2014 23:16:35 -0400 +Subject: random: add and use memzero_explicit() for clearing data + +commit d4c5efdb97773f59a2b711754ca0953f24516739 upstream. 
+ +zatimend has reported that in his environment (3.16/gcc4.8.3/corei7) +memset() calls which clear out sensitive data in extract_{buf,entropy, +entropy_user}() in random driver are being optimized away by gcc. + +Add a helper memzero_explicit() (similarly as explicit_bzero() variants) +that can be used in such cases where a variable with sensitive data is +being cleared out in the end. Other use cases might also be in crypto +code. [ I have put this into lib/string.c though, as it's always built-in +and doesn't need any dependencies then. ] + +Fixes kernel bugzilla: 82041 + +Reported-by: zatimend@hotmail.co.uk +Signed-off-by: Daniel Borkmann <dborkman@redhat.com> +Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org> +Cc: Alexey Dobriyan <adobriyan@gmail.com> +Signed-off-by: Theodore Ts'o <tytso@mit.edu> +[lizf: Backported to 3.4: + - adjust context + - another memset() in extract_buf() needs to be converted] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/char/random.c | 10 +++++----- + include/linux/string.h | 4 +++- + lib/string.c | 16 ++++++++++++++++ + 3 files changed, 24 insertions(+), 6 deletions(-) + +--- a/drivers/char/random.c ++++ b/drivers/char/random.c +@@ -932,8 +932,8 @@ static void extract_buf(struct entropy_s + * pool while mixing, and hash one final time. 
+ */ + sha_transform(hash.w, extract, workspace); +- memset(extract, 0, sizeof(extract)); +- memset(workspace, 0, sizeof(workspace)); ++ memzero_explicit(extract, sizeof(extract)); ++ memzero_explicit(workspace, sizeof(workspace)); + + /* + * In case the hash function has some recognizable output +@@ -956,7 +956,7 @@ static void extract_buf(struct entropy_s + } + + memcpy(out, &hash, EXTRACT_SIZE); +- memset(&hash, 0, sizeof(hash)); ++ memzero_explicit(&hash, sizeof(hash)); + } + + static ssize_t extract_entropy(struct entropy_store *r, void *buf, +@@ -989,7 +989,7 @@ static ssize_t extract_entropy(struct en + } + + /* Wipe data just returned from memory */ +- memset(tmp, 0, sizeof(tmp)); ++ memzero_explicit(tmp, sizeof(tmp)); + + return ret; + } +@@ -1027,7 +1027,7 @@ static ssize_t extract_entropy_user(stru + } + + /* Wipe data just returned from memory */ +- memset(tmp, 0, sizeof(tmp)); ++ memzero_explicit(tmp, sizeof(tmp)); + + return ret; + } +--- a/include/linux/string.h ++++ b/include/linux/string.h +@@ -133,7 +133,7 @@ int bprintf(u32 *bin_buf, size_t size, c + #endif + + extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, +- const void *from, size_t available); ++ const void *from, size_t available); + + /** + * strstarts - does @str start with @prefix? +@@ -144,5 +144,7 @@ static inline bool strstarts(const char + { + return strncmp(str, prefix, strlen(prefix)) == 0; + } ++ ++void memzero_explicit(void *s, size_t count); + #endif + #endif /* _LINUX_STRING_H_ */ +--- a/lib/string.c ++++ b/lib/string.c +@@ -586,6 +586,22 @@ void *memset(void *s, int c, size_t coun + EXPORT_SYMBOL(memset); + #endif + ++/** ++ * memzero_explicit - Fill a region of memory (e.g. sensitive ++ * keying data) with 0s. ++ * @s: Pointer to the start of the area. ++ * @count: The size of the area. ++ * ++ * memzero_explicit() doesn't need an arch-specific version as ++ * it just invokes the one of memset() implicitly. 
++ */ ++void memzero_explicit(void *s, size_t count) ++{ ++ memset(s, 0, count); ++ OPTIMIZER_HIDE_VAR(s); ++} ++EXPORT_SYMBOL(memzero_explicit); ++ + #ifndef __HAVE_ARCH_MEMCPY + /** + * memcpy - Copy one area of memory to another diff --git a/patches/regmap-debugfs-fix-possbile-null-pointer-dereference.patch b/patches/regmap-debugfs-fix-possbile-null-pointer-dereference.patch new file mode 100644 index 0000000..9f34808 --- /dev/null +++ b/patches/regmap-debugfs-fix-possbile-null-pointer-dereference.patch @@ -0,0 +1,39 @@ +From 2c98e0c1cc6b8e86f1978286c3d4e0769ee9d733 Mon Sep 17 00:00:00 2001 +From: Xiubo Li <Li.Xiubo@freescale.com> +Date: Sun, 28 Sep 2014 11:35:25 +0800 +Subject: regmap: debugfs: fix possbile NULL pointer dereference + +commit 2c98e0c1cc6b8e86f1978286c3d4e0769ee9d733 upstream. + +If 'map->dev' is NULL and there will lead dev_name() to be NULL pointer +dereference. So before dev_name(), we need to have check of the map->dev +pionter. + +We also should make sure that the 'name' pointer shouldn't be NULL for +debugfs_create_dir(). So here using one default "dummy" debugfs name when +the 'name' pointer and 'map->dev' are both NULL. 
+ +Signed-off-by: Xiubo Li <Li.Xiubo@freescale.com> +Signed-off-by: Mark Brown <broonie@kernel.org> +[lizf: Backported to 3.4: dev_name() is passed to debugfs_create_dir() in 3.4] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/base/regmap/regmap-debugfs.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +--- a/drivers/base/regmap/regmap-debugfs.c ++++ b/drivers/base/regmap/regmap-debugfs.c +@@ -244,7 +244,12 @@ static const struct file_operations regm + + void regmap_debugfs_init(struct regmap *map) + { +- map->debugfs = debugfs_create_dir(dev_name(map->dev), ++ const char *devname = "dummy"; ++ ++ if (map->dev) ++ devname = dev_name(map->dev); ++ ++ map->debugfs = debugfs_create_dir(devname, + regmap_debugfs_root); + if (!map->debugfs) { + dev_warn(map->dev, "Failed to create debugfs directory\n"); diff --git a/patches/regmap-fix-possible-zero_size_ptr-pointer-dereferencing-error.patch b/patches/regmap-fix-possible-zero_size_ptr-pointer-dereferencing-error.patch new file mode 100644 index 0000000..68c5646 --- /dev/null +++ b/patches/regmap-fix-possible-zero_size_ptr-pointer-dereferencing-error.patch @@ -0,0 +1,36 @@ +From d6b41cb06044a7d895db82bdd54f6e4219970510 Mon Sep 17 00:00:00 2001 +From: Xiubo Li <Li.Xiubo@freescale.com> +Date: Sun, 28 Sep 2014 17:09:54 +0800 +Subject: regmap: fix possible ZERO_SIZE_PTR pointer dereferencing error. + +commit d6b41cb06044a7d895db82bdd54f6e4219970510 upstream. + +Since we cannot make sure the 'val_count' will always be none zero +here, and then if it equals to zero, the kmemdup() will return +ZERO_SIZE_PTR, which equals to ((void *)16). + +So this patch fix this with just doing the zero check before calling +kmemdup(). 
+ +Signed-off-by: Xiubo Li <Li.Xiubo@freescale.com> +Signed-off-by: Mark Brown <broonie@kernel.org> +[lizf: Backported to 3.4: release mutex before returning EINVAL] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/base/regmap/regmap.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/drivers/base/regmap/regmap.c ++++ b/drivers/base/regmap/regmap.c +@@ -600,6 +600,11 @@ int regmap_bulk_write(struct regmap *map + if (val_bytes == 1) { + wval = (void *)val; + } else { ++ if (!val_count) { ++ ret = -EINVAL; ++ goto out; ++ } ++ + wval = kmemdup(val, val_count * val_bytes, GFP_KERNEL); + if (!wval) { + ret = -ENOMEM; diff --git a/patches/revert-lzo-properly-check-for-overruns.patch b/patches/revert-lzo-properly-check-for-overruns.patch new file mode 100644 index 0000000..f40ed40 --- /dev/null +++ b/patches/revert-lzo-properly-check-for-overruns.patch @@ -0,0 +1,179 @@ +From af958a38a60c7ca3d8a39c918c1baa2ff7b6b233 Mon Sep 17 00:00:00 2001 +From: Willy Tarreau <w@1wt.eu> +Date: Sat, 27 Sep 2014 12:31:36 +0200 +Subject: Revert "lzo: properly check for overruns" + +commit af958a38a60c7ca3d8a39c918c1baa2ff7b6b233 upstream. + +This reverts commit 206a81c ("lzo: properly check for overruns"). + +As analysed by Willem Pinckaers, this fix is still incomplete on +certain rare corner cases, and it is easier to restart from the +original code. + +Reported-by: Willem Pinckaers <willem@lekkertech.net> +Cc: "Don A. 
Bailey" <donb@securitymouse.com> +Signed-off-by: Willy Tarreau <w@1wt.eu> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + lib/lzo/lzo1x_decompress_safe.c | 62 +++++++++++++--------------------------- + 1 file changed, 21 insertions(+), 41 deletions(-) + +--- a/lib/lzo/lzo1x_decompress_safe.c ++++ b/lib/lzo/lzo1x_decompress_safe.c +@@ -19,31 +19,11 @@ + #include <linux/lzo.h> + #include "lzodefs.h" + +-#define HAVE_IP(t, x) \ +- (((size_t)(ip_end - ip) >= (size_t)(t + x)) && \ +- (((t + x) >= t) && ((t + x) >= x))) +- +-#define HAVE_OP(t, x) \ +- (((size_t)(op_end - op) >= (size_t)(t + x)) && \ +- (((t + x) >= t) && ((t + x) >= x))) +- +-#define NEED_IP(t, x) \ +- do { \ +- if (!HAVE_IP(t, x)) \ +- goto input_overrun; \ +- } while (0) +- +-#define NEED_OP(t, x) \ +- do { \ +- if (!HAVE_OP(t, x)) \ +- goto output_overrun; \ +- } while (0) +- +-#define TEST_LB(m_pos) \ +- do { \ +- if ((m_pos) < out) \ +- goto lookbehind_overrun; \ +- } while (0) ++#define HAVE_IP(x) ((size_t)(ip_end - ip) >= (size_t)(x)) ++#define HAVE_OP(x) ((size_t)(op_end - op) >= (size_t)(x)) ++#define NEED_IP(x) if (!HAVE_IP(x)) goto input_overrun ++#define NEED_OP(x) if (!HAVE_OP(x)) goto output_overrun ++#define TEST_LB(m_pos) if ((m_pos) < out) goto lookbehind_overrun + + int lzo1x_decompress_safe(const unsigned char *in, size_t in_len, + unsigned char *out, size_t *out_len) +@@ -78,14 +58,14 @@ int lzo1x_decompress_safe(const unsigned + while (unlikely(*ip == 0)) { + t += 255; + ip++; +- NEED_IP(1, 0); ++ NEED_IP(1); + } + t += 15 + *ip++; + } + t += 3; + copy_literal_run: + #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +- if (likely(HAVE_IP(t, 15) && HAVE_OP(t, 15))) { ++ if (likely(HAVE_IP(t + 15) && HAVE_OP(t + 15))) { + const unsigned char *ie = ip + t; + unsigned char *oe = op + t; + do { +@@ -101,8 +81,8 @@ copy_literal_run: + } else + #endif + { +- NEED_OP(t, 0); +- NEED_IP(t, 3); ++ NEED_OP(t); ++ NEED_IP(t + 
3); + do { + *op++ = *ip++; + } while (--t > 0); +@@ -115,7 +95,7 @@ copy_literal_run: + m_pos -= t >> 2; + m_pos -= *ip++ << 2; + TEST_LB(m_pos); +- NEED_OP(2, 0); ++ NEED_OP(2); + op[0] = m_pos[0]; + op[1] = m_pos[1]; + op += 2; +@@ -139,10 +119,10 @@ copy_literal_run: + while (unlikely(*ip == 0)) { + t += 255; + ip++; +- NEED_IP(1, 0); ++ NEED_IP(1); + } + t += 31 + *ip++; +- NEED_IP(2, 0); ++ NEED_IP(2); + } + m_pos = op - 1; + next = get_unaligned_le16(ip); +@@ -157,10 +137,10 @@ copy_literal_run: + while (unlikely(*ip == 0)) { + t += 255; + ip++; +- NEED_IP(1, 0); ++ NEED_IP(1); + } + t += 7 + *ip++; +- NEED_IP(2, 0); ++ NEED_IP(2); + } + next = get_unaligned_le16(ip); + ip += 2; +@@ -174,7 +154,7 @@ copy_literal_run: + #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) + if (op - m_pos >= 8) { + unsigned char *oe = op + t; +- if (likely(HAVE_OP(t, 15))) { ++ if (likely(HAVE_OP(t + 15))) { + do { + COPY8(op, m_pos); + op += 8; +@@ -184,7 +164,7 @@ copy_literal_run: + m_pos += 8; + } while (op < oe); + op = oe; +- if (HAVE_IP(6, 0)) { ++ if (HAVE_IP(6)) { + state = next; + COPY4(op, ip); + op += next; +@@ -192,7 +172,7 @@ copy_literal_run: + continue; + } + } else { +- NEED_OP(t, 0); ++ NEED_OP(t); + do { + *op++ = *m_pos++; + } while (op < oe); +@@ -201,7 +181,7 @@ copy_literal_run: + #endif + { + unsigned char *oe = op + t; +- NEED_OP(t, 0); ++ NEED_OP(t); + op[0] = m_pos[0]; + op[1] = m_pos[1]; + op += 2; +@@ -214,15 +194,15 @@ match_next: + state = next; + t = next; + #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +- if (likely(HAVE_IP(6, 0) && HAVE_OP(4, 0))) { ++ if (likely(HAVE_IP(6) && HAVE_OP(4))) { + COPY4(op, ip); + op += t; + ip += t; + } else + #endif + { +- NEED_IP(t, 3); +- NEED_OP(t, 0); ++ NEED_IP(t + 3); ++ NEED_OP(t); + while (t > 0) { + *op++ = *ip++; + t--; diff --git a/patches/revert-percpu-free-percpu-allocation-info-for-uniprocessor-system.patch b/patches/revert-percpu-free-percpu-allocation-info-for-uniprocessor-system.patch new 
file mode 100644 index 0000000..490a5ad --- /dev/null +++ b/patches/revert-percpu-free-percpu-allocation-info-for-uniprocessor-system.patch @@ -0,0 +1,35 @@ +From bb2e226b3bef596dd56be97df655d857b4603923 Mon Sep 17 00:00:00 2001 +From: Guenter Roeck <linux@roeck-us.net> +Date: Sun, 21 Sep 2014 15:04:53 -0700 +Subject: Revert "percpu: free percpu allocation info for uniprocessor system" + +commit bb2e226b3bef596dd56be97df655d857b4603923 upstream. + +This reverts commit 3189eddbcafc ("percpu: free percpu allocation info for +uniprocessor system"). + +The commit causes a hang with a crisv32 image. This may be an architecture +problem, but at least for now the revert is necessary to be able to boot a +crisv32 image. + +Cc: Tejun Heo <tj@kernel.org> +Cc: Honggang Li <enjoymindful@gmail.com> +Signed-off-by: Guenter Roeck <linux@roeck-us.net> +Signed-off-by: Tejun Heo <tj@kernel.org> +Fixes: 3189eddbcafc ("percpu: free percpu allocation info for uniprocessor system") +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + mm/percpu.c | 2 -- + 1 file changed, 2 deletions(-) + +--- a/mm/percpu.c ++++ b/mm/percpu.c +@@ -1907,8 +1907,6 @@ void __init setup_per_cpu_areas(void) + + if (pcpu_setup_first_chunk(ai, fc) < 0) + panic("Failed to initialize percpu areas."); +- +- pcpu_free_alloc_info(ai); + } + + #endif /* CONFIG_SMP */ diff --git a/patches/rt2800-correct-bbp1_tx_power_ctrl-mask.patch b/patches/rt2800-correct-bbp1_tx_power_ctrl-mask.patch new file mode 100644 index 0000000..0e13343 --- /dev/null +++ b/patches/rt2800-correct-bbp1_tx_power_ctrl-mask.patch @@ -0,0 +1,29 @@ +From 01f7feeaf4528bec83798316b3c811701bac5d3e Mon Sep 17 00:00:00 2001 +From: Stanislaw Gruszka <sgruszka@redhat.com> +Date: Wed, 24 Sep 2014 11:24:54 +0200 +Subject: rt2800: correct BBP1_TX_POWER_CTRL mask + +commit 01f7feeaf4528bec83798316b3c811701bac5d3e upstream. + +Two bits control TX power on BBP_R1 register. 
Correct the mask, +otherwise we clear additional bit on BBP_R1 register, what can have +unknown, possible negative effect. + +Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com> +Signed-off-by: John W. Linville <linville@tuxdriver.com> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/net/wireless/rt2x00/rt2800.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/wireless/rt2x00/rt2800.h ++++ b/drivers/net/wireless/rt2x00/rt2800.h +@@ -1751,7 +1751,7 @@ struct mac_iveiv_entry { + * 2 - drop tx power by 12dBm, + * 3 - increase tx power by 6dBm + */ +-#define BBP1_TX_POWER_CTRL FIELD8(0x07) ++#define BBP1_TX_POWER_CTRL FIELD8(0x03) + #define BBP1_TX_ANTENNA FIELD8(0x18) + + /* diff --git a/patches/selinux-fix-inode-security-list-corruption.patch b/patches/selinux-fix-inode-security-list-corruption.patch new file mode 100644 index 0000000..176610e --- /dev/null +++ b/patches/selinux-fix-inode-security-list-corruption.patch @@ -0,0 +1,56 @@ +From 923190d32de4428afbea5e5773be86bea60a9925 Mon Sep 17 00:00:00 2001 +From: Stephen Smalley <sds@tycho.nsa.gov> +Date: Mon, 6 Oct 2014 16:32:52 -0400 +Subject: selinux: fix inode security list corruption + +commit 923190d32de4428afbea5e5773be86bea60a9925 upstream. + +sb_finish_set_opts() can race with inode_free_security() +when initializing inode security structures for inodes +created prior to initial policy load or by the filesystem +during ->mount(). This appears to have always been +a possible race, but commit 3dc91d4 ("SELinux: Fix possible +NULL pointer dereference in selinux_inode_permission()") +made it more evident by immediately reusing the unioned +list/rcu element of the inode security structure for call_rcu() +upon an inode_free_security(). But the underlying issue +was already present before that commit as a possible use-after-free +of isec. 
+ +Shivnandan Kumar reported the list corruption and proposed +a patch to split the list and rcu elements out of the union +as separate fields of the inode_security_struct so that setting +the rcu element would not affect the list element. However, +this would merely hide the issue and not truly fix the code. + +This patch instead moves up the deletion of the list entry +prior to dropping the sbsec->isec_lock initially. Then, +if the inode is dropped subsequently, there will be no further +references to the isec. + +Reported-by: Shivnandan Kumar <shivnandan.k@samsung.com> +Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov> +Signed-off-by: Paul Moore <pmoore@redhat.com> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + security/selinux/hooks.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/security/selinux/hooks.c ++++ b/security/selinux/hooks.c +@@ -436,6 +436,7 @@ next_inode: + list_entry(sbsec->isec_head.next, + struct inode_security_struct, list); + struct inode *inode = isec->inode; ++ list_del_init(&isec->list); + spin_unlock(&sbsec->isec_lock); + inode = igrab(inode); + if (inode) { +@@ -444,7 +445,6 @@ next_inode: + iput(inode); + } + spin_lock(&sbsec->isec_lock); +- list_del_init(&isec->list); + goto next_inode; + } + spin_unlock(&sbsec->isec_lock); diff --git a/patches/serial-8250-add-quark-x1000-to-8250_pci.c.patch b/patches/serial-8250-add-quark-x1000-to-8250_pci.c.patch new file mode 100644 index 0000000..10bdca4 --- /dev/null +++ b/patches/serial-8250-add-quark-x1000-to-8250_pci.c.patch @@ -0,0 +1,84 @@ +From 1ede7dcca3c4fa15a518ab0473126f9c3e621e4c Mon Sep 17 00:00:00 2001 +From: Bryan O'Donoghue <pure.logic@nexus-software.ie> +Date: Tue, 23 Sep 2014 01:21:11 +0100 +Subject: serial: 8250: Add Quark X1000 to 8250_pci.c + +commit 1ede7dcca3c4fa15a518ab0473126f9c3e621e4c upstream. + +Quark X1000 contains two designware derived 8250 serial ports. 
+Each port has a unique PCI configuration space consisting of +BAR0:UART BAR1:DMA respectively. + +Unlike the standard 8250 the register width is 32 bits for RHR,IER etc +The Quark UART has a fundamental clock @ 44.2368 MHz allowing for a +bitrate of up to about 2.76 megabits per second. + +This patch enables standard 8250 mode + +Signed-off-by: Bryan O'Donoghue <pure.logic@nexus-software.ie> +Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +[lizf: Backported to 3.4: adjust context] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/tty/serial/8250/8250_pci.c | 21 +++++++++++++++++++++ + 1 file changed, 21 insertions(+) + +--- a/drivers/tty/serial/8250/8250_pci.c ++++ b/drivers/tty/serial/8250/8250_pci.c +@@ -1164,6 +1164,7 @@ pci_xr17c154_setup(struct serial_private + #define PCI_DEVICE_ID_PLX_CRONYX_OMEGA 0xc001 + #define PCI_DEVICE_ID_INTEL_PATSBURG_KT 0x1d3d + #define PCI_DEVICE_ID_BROADCOM_TRUMANAGE 0x160a ++#define PCI_DEVICE_ID_INTEL_QRK_UART 0x0936 + + /* Unknown vendors/cards - this should not be in linux/pci_ids.h */ + #define PCI_SUBDEVICE_ID_UNKNOWN_0x1584 0x1584 +@@ -1686,6 +1687,13 @@ static struct pci_serial_quirk pci_seria + .init = pci_eg20t_init, + .setup = pci_default_setup, + }, ++ { ++ .vendor = PCI_VENDOR_ID_INTEL, ++ .device = PCI_DEVICE_ID_INTEL_QRK_UART, ++ .subvendor = PCI_ANY_ID, ++ .subdevice = PCI_ANY_ID, ++ .setup = pci_default_setup, ++ }, + /* + * Cronyx Omega PCI (PLX-chip based) + */ +@@ -1894,6 +1902,7 @@ enum pci_board_num_t { + pbn_ADDIDATA_PCIe_4_3906250, + pbn_ADDIDATA_PCIe_8_3906250, + pbn_ce4100_1_115200, ++ pbn_qrk, + pbn_omegapci, + pbn_NETMOS9900_2s_115200, + pbn_brcm_trumanage, +@@ -2592,6 +2601,12 @@ static struct pciserial_board pci_boards + .base_baud = 921600, + .reg_shift = 2, + }, ++ [pbn_qrk] = { ++ .flags = FL_BASE0, ++ .num_ports = 1, ++ .base_baud = 2764800, ++ .reg_shift = 2, ++ }, + [pbn_omegapci] = { + .flags = FL_BASE0, 
+ .num_ports = 8, +@@ -4164,6 +4179,12 @@ static struct pci_device_id serial_pci_t + pbn_ce4100_1_115200 }, + + /* ++ * Intel Quark x1000 ++ */ ++ { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_QRK_UART, ++ PCI_ANY_ID, PCI_ANY_ID, 0, 0, ++ pbn_qrk }, ++ /* + * Cronyx Omega PCI + */ + { PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_CRONYX_OMEGA, diff --git a/patches/series b/patches/series index e69de29..c63d71c 100644 --- a/patches/series +++ b/patches/series @@ -0,0 +1,87 @@ +kvm-x86-fix-stale-mmio-cache-bug.patch +ubifs-remove-mst_mutex.patch +ubifs-fix-a-race-condition.patch +ubifs-fix-free-log-space-calculation.patch +bluetooth-fix-issue-with-usb-suspend-in-btusb-driver.patch +kvm-s390-unintended-fallthrough-for-external-call.patch +pci-pciehp-prevent-null-dereference-during-probe.patch +pci-increase-ibm-ipr-sas-crocodile-bars-to-at-least-system-page-size.patch +bluetooth-fix-setting-correct-security-level-when-initiating-smp.patch +revert-percpu-free-percpu-allocation-info-for-uniprocessor-system.patch +usb-serial-cp210x-added-ketra-n1-wireless-interface-support.patch +usb-cp210x-add-support-for-seluxit-usb-dongle.patch +pci-generate-uppercase-hex-for-modalias-interface-class.patch +usb-add-device-quirk-for-asus-t100-base-station-keyboard.patch +firmware_class-make-sure-fw-requests-contain-a-name.patch +drivers-hv-vmbus-cleanup-vmbus_post_msg.patch +drivers-hv-vmbus-cleanup-vmbus_teardown_gpadl.patch +drivers-hv-vmbus-cleanup-vmbus_establish_gpadl.patch +drivers-hv-vmbus-fix-a-bug-in-vmbus_open.patch +drivers-hv-vmbus-cleanup-vmbus_close_internal.patch +spi-dw-mid-respect-8-bit-mode.patch +spi-dw-mid-terminate-ongoing-transfers-at-exit.patch +kvm-don-t-take-vcpu-mutex-for-obviously-invalid-vcpu-ioctls.patch +x86-intel-quark-switch-off-cr4.pge-so-tlb-flush-uses-cr3-instead.patch +lockd-try-to-reconnect-if-statd-has-moved.patch +power-charger-manager-fix-null-pointer-exception-with-missing-cm-fuel-gauge.patch +rt2800-correct-bbp1_tx_power_ctrl-mask.patch 
+documentation-lzo-document-part-of-the-encoding.patch +revert-lzo-properly-check-for-overruns.patch +lzo-check-for-length-overrun-in-variable-length-encoding.patch +regmap-debugfs-fix-possbile-null-pointer-dereference.patch +regmap-fix-possible-zero_size_ptr-pointer-dereferencing-error.patch +libata-sff-fix-controllers-with-no-ctl-port.patch +nfsv4-fix-open-lock-state-recovery-error-handling.patch +serial-8250-add-quark-x1000-to-8250_pci.c.patch +framebuffer-fix-border-color.patch +mpc85xx_edac-make-l2-interrupt-shared-too.patch +nfsv4.1-fix-an-nfsv4.1-state-renewal-regression.patch +m68k-disable-restore-interrupts-in-hwreg_present-hwreg_write.patch +dm-bufio-update-last_accessed-when-relinking-a-buffer.patch +dm-log-userspace-fix-memory-leak-in-dm_ulog_tfr_init-failure-path.patch +ecryptfs-avoid-to-access-null-pointer-when-write-metadata-in-xattr.patch +pata_serverworks-disable-64-kb-dma-transfers-on-broadcom-osb4-ide-controller.patch +x86-reject-x32-executables-if-x32-abi-not-supported.patch +fs-fix-theoretical-division-by-0-in-super_cache_scan.patch +fs-make-cont_expand_zero-interruptible.patch +fix-misuses-of-f_count-in-ppp-and-netlink.patch +block-fix-alignment_offset-math-that-assumes-io_min-is-a-power-of-2.patch +fanotify-enable-close-on-exec-on-events-fd-when-requested-in-fanotify_init.patch +input-synaptics-gate-forcepad-support-by-dmi-check.patch +input-i8042-add-noloop-quirk-for-asus-x750ln.patch +kernel-add-support-for-gcc-5.patch +alsa-emu10k1-fix-deadlock-in-synth-voice-lookup.patch +mnt-prevent-pivot_root-from-creating-a-loop-in-the-mount-tree.patch +virtio_pci-fix-virtio-spec-compliance-on-restore.patch +selinux-fix-inode-security-list-corruption.patch +futex-ensure-get_futex_key_refs-always-implies-a-barrier.patch +x86-kvm-vmx-preserve-cr4-across-vm-entry.patch +ext4-check-ea-value-offset-when-loading.patch +ext4-don-t-check-quota-format-when-there-are-no-quota-files.patch 
+target-fix-queue-full-status-null-pointer-for-scf_transport_task_sense.patch +vfs-fix-data-corruption-when-blocksize-pagesize-for-mmaped-data.patch +ext4-don-t-orphan-or-truncate-the-boot-loader-inode.patch +ext4-add-ext4_iget_normal-which-is-to-be-used-for-dir-tree-lookups.patch +ext4-fix-reservation-overflow-in-ext4_da_write_begin.patch +crypto-more-robust-crypto_memneq.patch +random-add-and-use-memzero_explicit-for-clearing-data.patch +alsa-pcm-use-the-same-dma-mmap-codepath-both-for-arm-and-arm64.patch +alsa-usb-audio-add-support-for-steinberg-ur22-usb-interface.patch +freezer-do-not-freeze-tasks-killed-by-oom-killer.patch +kernel-fork.c-copy_process-unify-clone_thread-or-thread_group_leader-code.patch +introduce-for_each_thread-to-replace-the-buggy-while_each_thread.patch +oom-pm-oom-killed-task-shouldn-t-escape-pm-suspend.patch +mips-tlbex-fix-a-missing-statement-for-hugetlb.patch +mips-tlbex-properly-fix-huge-tlb-refill-exception-handler.patch +cpufreq-expose-scaling_cur_freq-sysfs-file-for-set_policy-drivers.patch +kvm-x86-check-non-canonical-addresses-upon-wrmsr.patch +kvm-x86-prevent-host-from-panicking-on-shared-msr-writes.patch +kvm-x86-improve-thread-safety-in-pit.patch +kvm-x86-fix-wrong-masking-on-relative-jump-call.patch +kvm-x86-emulator-fixes-for-eip-canonical-checks-on-near-branches.patch +kvm-x86-use-new-cs.rpl-as-cpl-during-task-switch.patch +kvm-x86-handle-errors-when-rip-is-set-during-far-jumps.patch +nept-nested-invept.patch +kvm-vmx-handle-invvpid-vm-exit-gracefully.patch +kvm-x86-don-t-kill-guest-on-unknown-exit-reason.patch +kvm-fix-excessive-pages-un-pinning-in-kvm_iommu_map-error-path.patch diff --git a/patches/spi-dw-mid-respect-8-bit-mode.patch b/patches/spi-dw-mid-respect-8-bit-mode.patch new file mode 100644 index 0000000..3eb8ba4 --- /dev/null +++ b/patches/spi-dw-mid-respect-8-bit-mode.patch @@ -0,0 +1,37 @@ +From b41583e7299046abdc578c33f25ed83ee95b9b31 Mon Sep 17 00:00:00 2001 +From: Andy Shevchenko 
<andriy.shevchenko@linux.intel.com> +Date: Thu, 18 Sep 2014 20:08:51 +0300 +Subject: spi: dw-mid: respect 8 bit mode + +commit b41583e7299046abdc578c33f25ed83ee95b9b31 upstream. + +In case of 8 bit mode and DMA usage we end up with every second byte written as +0. We have to respect bits_per_word settings what this patch actually does. + +Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com> +Signed-off-by: Mark Brown <broonie@kernel.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/spi/spi-dw-mid.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/spi/spi-dw-mid.c ++++ b/drivers/spi/spi-dw-mid.c +@@ -136,7 +136,7 @@ static int mid_spi_dma_transfer(struct d + txconf.dst_addr = dws->dma_addr; + txconf.dst_maxburst = LNW_DMA_MSIZE_16; + txconf.src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; +- txconf.dst_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES; ++ txconf.dst_addr_width = dws->dma_width; + txconf.device_fc = false; + + txchan->device->device_control(txchan, DMA_SLAVE_CONFIG, +@@ -159,7 +159,7 @@ static int mid_spi_dma_transfer(struct d + rxconf.src_addr = dws->dma_addr; + rxconf.src_maxburst = LNW_DMA_MSIZE_16; + rxconf.dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; +- rxconf.src_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES; ++ rxconf.src_addr_width = dws->dma_width; + rxconf.device_fc = false; + + rxchan->device->device_control(rxchan, DMA_SLAVE_CONFIG, diff --git a/patches/spi-dw-mid-terminate-ongoing-transfers-at-exit.patch b/patches/spi-dw-mid-terminate-ongoing-transfers-at-exit.patch new file mode 100644 index 0000000..4f55fcf --- /dev/null +++ b/patches/spi-dw-mid-terminate-ongoing-transfers-at-exit.patch @@ -0,0 +1,30 @@ +From 8e45ef682cb31fda62ed4eeede5d9745a0a1b1e2 Mon Sep 17 00:00:00 2001 +From: Andy Shevchenko <andriy.shevchenko@linux.intel.com> +Date: Thu, 18 Sep 2014 20:08:53 +0300 +Subject: spi: dw-mid: terminate ongoing transfers at exit + +commit 8e45ef682cb31fda62ed4eeede5d9745a0a1b1e2 upstream. 
+ +Do full clean up at exit, means terminate all ongoing DMA transfers. + +Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com> +Signed-off-by: Mark Brown <broonie@kernel.org> +[lizf: Backported to 3.4: adjust context] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/spi/spi-dw-mid.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/drivers/spi/spi-dw-mid.c ++++ b/drivers/spi/spi-dw-mid.c +@@ -89,7 +89,10 @@ err_exit: + + static void mid_spi_dma_exit(struct dw_spi *dws) + { ++ dmaengine_terminate_all(dws->txchan); + dma_release_channel(dws->txchan); ++ ++ dmaengine_terminate_all(dws->rxchan); + dma_release_channel(dws->rxchan); + } + diff --git a/patches/target-fix-queue-full-status-null-pointer-for-scf_transport_task_sense.patch b/patches/target-fix-queue-full-status-null-pointer-for-scf_transport_task_sense.patch new file mode 100644 index 0000000..232e66f --- /dev/null +++ b/patches/target-fix-queue-full-status-null-pointer-for-scf_transport_task_sense.patch @@ -0,0 +1,43 @@ +From 082f58ac4a48d3f5cb4597232cb2ac6823a96f43 Mon Sep 17 00:00:00 2001 +From: Quinn Tran <quinn.tran@qlogic.com> +Date: Thu, 25 Sep 2014 06:22:28 -0400 +Subject: target: Fix queue full status NULL pointer for + SCF_TRANSPORT_TASK_SENSE + +commit 082f58ac4a48d3f5cb4597232cb2ac6823a96f43 upstream. + +During temporary resource starvation at lower transport layer, command +is placed on queue full retry path, which expose this problem. The TCM +queue full handling of SCF_TRANSPORT_TASK_SENSE currently sends the same +cmd twice to lower layer. The 1st time led to cmd normal free path. +The 2nd time cause Null pointer access. 
+ +This regression bug was originally introduced v3.1-rc code in the +following commit: + +commit e057f53308a5f071556ee80586b99ee755bf07f5 +Author: Christoph Hellwig <hch@infradead.org> +Date: Mon Oct 17 13:56:41 2011 -0400 + + target: remove the transport_qf_callback se_cmd callback + +Signed-off-by: Quinn Tran <quinn.tran@qlogic.com> +Signed-off-by: Saurav Kashyap <saurav.kashyap@qlogic.com> +Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/target/target_core_transport.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/drivers/target/target_core_transport.c ++++ b/drivers/target/target_core_transport.c +@@ -3284,8 +3284,7 @@ static void transport_complete_qf(struct + + if (cmd->se_cmd_flags & SCF_TRANSPORT_TASK_SENSE) { + ret = cmd->se_tfo->queue_status(cmd); +- if (ret) +- goto out; ++ goto out; + } + + switch (cmd->data_direction) { diff --git a/patches/ubifs-fix-a-race-condition.patch b/patches/ubifs-fix-a-race-condition.patch new file mode 100644 index 0000000..89fccac --- /dev/null +++ b/patches/ubifs-fix-a-race-condition.patch @@ -0,0 +1,121 @@ +From 052c28073ff26f771d44ef33952a41d18dadd255 Mon Sep 17 00:00:00 2001 +From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com> +Date: Sun, 29 Jun 2014 17:00:45 +0300 +Subject: UBIFS: fix a race condition + +commit 052c28073ff26f771d44ef33952a41d18dadd255 upstream. + +Hu (hujianyang@huawei.com) discovered a race condition which may lead to a +situation when UBIFS is unable to mount the file-system after an unclean +reboot. The problem is theoretical, though. + +In UBIFS, we have the log, which basically a set of LEBs in a certain area. The +log has the tail and the head. + +Every time user writes data to the file-system, the UBIFS journal grows, and +the log grows as well, because we append new reference nodes to the head of the +log. So the head moves forward all the time, while the log tail stays at the +same position. 
+ +At any time, the UBIFS master node points to the tail of the log. When we mount +the file-system, we scan the log, and we always start from its tail, because +this is where the master node points to. The only occasion when the tail of the +log changes is the commit operation. + +The commit operation has 2 phases - "commit start" and "commit end". The former +is relatively short, and does not involve much I/O. During this phase we mostly +just build various in-memory lists of the things which have to be written to +the flash media during "commit end" phase. + +During the commit start phase, what we do is we "clean" the log. Indeed, the +commit operation will index all the data in the journal, so the entire journal +"disappears", and therefore the data in the log become unneeded. So we just +move the head of the log to the next LEB, and write the CS node there. This LEB +will be the tail of the new log when the commit operation finishes. + +When the "commit start" phase finishes, users may write more data to the +file-system, in parallel with the ongoing "commit end" operation. At this point +the log tail was not changed yet, it is the same as it had been before we +started the commit. The log head keeps moving forward, though. + +The commit operation now needs to write the new master node, and the new master +node should point to the new log tail. After this the LEBs between the old log +tail and the new log tail can be unmapped and re-used again. + +And here is the possible problem. We do 2 operations: (a) We first update the +log tail position in memory (see 'ubifs_log_end_commit()'). (b) And then we +write the master node (see the big block of code in 'do_commit()'). + +But nothing prevents the log head from moving forward between (a) and (b), and +the log head may "wrap" now to the old log tail. And when the "wrap" happens, +the contents of the log tail get erased. Now a power cut happens and we are in +trouble. 
We end up with the old master node pointing to the old tail, which was +erased. And replay fails because it expects the master node to point to the +correct log tail at all times. + +This patch merges the abovementioned (a) and (b) operations by moving the master +node change code to the 'ubifs_log_end_commit()' function, so that it runs with +the log mutex locked, which will prevent the log from being changed between +operations (a) and (b). + +Reported-by: hujianyang <hujianyang@huawei.com> +Tested-by: hujianyang <hujianyang@huawei.com> +Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + fs/ubifs/commit.c | 8 +++----- + fs/ubifs/log.c | 11 ++++++++--- + 2 files changed, 11 insertions(+), 8 deletions(-) + +--- a/fs/ubifs/commit.c ++++ b/fs/ubifs/commit.c +@@ -166,10 +166,6 @@ static int do_commit(struct ubifs_info * + err = ubifs_orphan_end_commit(c); + if (err) + goto out; +- old_ltail_lnum = c->ltail_lnum; +- err = ubifs_log_end_commit(c, new_ltail_lnum); +- if (err) +- goto out; + err = dbg_check_old_index(c, &zroot); + if (err) + goto out; +@@ -202,7 +198,9 @@ static int do_commit(struct ubifs_info * + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); + else + c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS); +- err = ubifs_write_master(c); ++ ++ old_ltail_lnum = c->ltail_lnum; ++ err = ubifs_log_end_commit(c, new_ltail_lnum); + if (err) + goto out; + +--- a/fs/ubifs/log.c ++++ b/fs/ubifs/log.c +@@ -453,9 +453,9 @@ out: + * @ltail_lnum: new log tail LEB number + * + * This function is called on when the commit operation was finished. It +- * moves log tail to new position and unmaps LEBs which contain obsolete data. +- * Returns zero in case of success and a negative error code in case of +- * failure. ++ * moves log tail to new position and updates the master node so that it stores ++ * the new log tail LEB number. 
Returns zero in case of success and a negative ++ * error code in case of failure. + */ + int ubifs_log_end_commit(struct ubifs_info *c, int ltail_lnum) + { +@@ -483,7 +483,12 @@ int ubifs_log_end_commit(struct ubifs_in + spin_unlock(&c->buds_lock); + + err = dbg_check_bud_bytes(c); ++ if (err) ++ goto out; + ++ err = ubifs_write_master(c); ++ ++out: + mutex_unlock(&c->log_mutex); + return err; + } diff --git a/patches/ubifs-fix-free-log-space-calculation.patch b/patches/ubifs-fix-free-log-space-calculation.patch new file mode 100644 index 0000000..c1d84e2 --- /dev/null +++ b/patches/ubifs-fix-free-log-space-calculation.patch @@ -0,0 +1,48 @@ +From ba29e721eb2df6df8f33c1f248388bb037a47914 Mon Sep 17 00:00:00 2001 +From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com> +Date: Wed, 16 Jul 2014 15:22:29 +0300 +Subject: UBIFS: fix free log space calculation + +commit ba29e721eb2df6df8f33c1f248388bb037a47914 upstream. + +Hu (hujianyang <hujianyang@huawei.com>) discovered an issue in the +'empty_log_bytes()' function, which calculates how many bytes are left in the +log: + +" +If 'c->lhead_lnum + 1 == c->ltail_lnum' and 'c->lhead_offs == c->leb_size', 'h' +would equalent to 't' and 'empty_log_bytes()' would return 'c->log_bytes' +instead of 0. +" + +At this point it is not clear what would be the consequences of this, and +whether this may lead to any problems, but this patch addresses the issue just +in case. 
+ +Tested-by: hujianyang <hujianyang@huawei.com> +Reported-by: hujianyang <hujianyang@huawei.com> +Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + fs/ubifs/log.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +--- a/fs/ubifs/log.c ++++ b/fs/ubifs/log.c +@@ -110,10 +110,14 @@ static inline long long empty_log_bytes( + h = (long long)c->lhead_lnum * c->leb_size + c->lhead_offs; + t = (long long)c->ltail_lnum * c->leb_size; + +- if (h >= t) ++ if (h > t) + return c->log_bytes - h + t; +- else ++ else if (h != t) + return t - h; ++ else if (c->lhead_lnum != c->ltail_lnum) ++ return 0; ++ else ++ return c->log_bytes; + } + + /** diff --git a/patches/ubifs-remove-mst_mutex.patch b/patches/ubifs-remove-mst_mutex.patch new file mode 100644 index 0000000..f6c94f9 --- /dev/null +++ b/patches/ubifs-remove-mst_mutex.patch @@ -0,0 +1,83 @@ +From 07e19dff63e3d5d6500d831e36554ac9b1b0560e Mon Sep 17 00:00:00 2001 +From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com> +Date: Sun, 29 Jun 2014 16:55:02 +0300 +Subject: UBIFS: remove mst_mutex + +commit 07e19dff63e3d5d6500d831e36554ac9b1b0560e upstream. + +The 'mst_mutex' is not needed because 'ubifs_write_master()' is only +called on the mount path and commit path. The mount path is sequential and +there is no parallelism, and the commit path is also serialized - there is only +one commit going on at a time. 
+ +Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + fs/ubifs/commit.c | 2 -- + fs/ubifs/master.c | 7 +++---- + fs/ubifs/super.c | 1 - + fs/ubifs/ubifs.h | 2 -- + 4 files changed, 3 insertions(+), 9 deletions(-) + +--- a/fs/ubifs/commit.c ++++ b/fs/ubifs/commit.c +@@ -174,7 +174,6 @@ static int do_commit(struct ubifs_info * + if (err) + goto out; + +- mutex_lock(&c->mst_mutex); + c->mst_node->cmt_no = cpu_to_le64(c->cmt_no); + c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum); + c->mst_node->root_lnum = cpu_to_le32(zroot.lnum); +@@ -204,7 +203,6 @@ static int do_commit(struct ubifs_info * + else + c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS); + err = ubifs_write_master(c); +- mutex_unlock(&c->mst_mutex); + if (err) + goto out; + +--- a/fs/ubifs/master.c ++++ b/fs/ubifs/master.c +@@ -352,10 +352,9 @@ int ubifs_read_master(struct ubifs_info + * ubifs_write_master - write master node. + * @c: UBIFS file-system description object + * +- * This function writes the master node. The caller has to take the +- * @c->mst_mutex lock before calling this function. Returns zero in case of +- * success and a negative error code in case of failure. The master node is +- * written twice to enable recovery. ++ * This function writes the master node. Returns zero in case of success and a ++ * negative error code in case of failure. The master node is written twice to ++ * enable recovery. 
+ */ + int ubifs_write_master(struct ubifs_info *c) + { +--- a/fs/ubifs/super.c ++++ b/fs/ubifs/super.c +@@ -1984,7 +1984,6 @@ static struct ubifs_info *alloc_ubifs_in + mutex_init(&c->lp_mutex); + mutex_init(&c->tnc_mutex); + mutex_init(&c->log_mutex); +- mutex_init(&c->mst_mutex); + mutex_init(&c->umount_mutex); + mutex_init(&c->bu_mutex); + mutex_init(&c->write_reserve_mutex); +--- a/fs/ubifs/ubifs.h ++++ b/fs/ubifs/ubifs.h +@@ -1041,7 +1041,6 @@ struct ubifs_debug_info; + * + * @mst_node: master node + * @mst_offs: offset of valid master node +- * @mst_mutex: protects the master node area, @mst_node, and @mst_offs + * + * @max_bu_buf_len: maximum bulk-read buffer length + * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu +@@ -1281,7 +1280,6 @@ struct ubifs_info { + + struct ubifs_mst_node *mst_node; + int mst_offs; +- struct mutex mst_mutex; + + int max_bu_buf_len; + struct mutex bu_mutex; diff --git a/patches/usb-add-device-quirk-for-asus-t100-base-station-keyboard.patch b/patches/usb-add-device-quirk-for-asus-t100-base-station-keyboard.patch new file mode 100644 index 0000000..4ff0033 --- /dev/null +++ b/patches/usb-add-device-quirk-for-asus-t100-base-station-keyboard.patch @@ -0,0 +1,66 @@ +From ddbe1fca0bcb87ca8c199ea873a456ca8a948567 Mon Sep 17 00:00:00 2001 +From: Lu Baolu <baolu.lu@linux.intel.com> +Date: Fri, 19 Sep 2014 10:13:50 +0800 +Subject: USB: Add device quirk for ASUS T100 Base Station keyboard + +commit ddbe1fca0bcb87ca8c199ea873a456ca8a948567 upstream. + +This full-speed USB device generates spurious remote wakeup event +as soon as USB_DEVICE_REMOTE_WAKEUP feature is set. As the result, +Linux can't enter system suspend and S0ix power saving modes once +this keyboard is used. + +This patch tries to introduce USB_QUIRK_IGNORE_REMOTE_WAKEUP quirk. +With this quirk set, wakeup capability will be ignored during +device configure. + +This patch could be back-ported to kernels as old as 2.6.39. 
+ +Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com> +Acked-by: Alan Stern <stern@rowland.harvard.edu> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/usb/core/hub.c | 6 ++++-- + drivers/usb/core/quirks.c | 4 ++++ + include/linux/usb/quirks.h | 3 +++ + 3 files changed, 11 insertions(+), 2 deletions(-) + +--- a/drivers/usb/core/hub.c ++++ b/drivers/usb/core/hub.c +@@ -1638,8 +1638,10 @@ void usb_set_device_state(struct usb_dev + || new_state == USB_STATE_SUSPENDED) + ; /* No change to wakeup settings */ + else if (new_state == USB_STATE_CONFIGURED) +- wakeup = udev->actconfig->desc.bmAttributes +- & USB_CONFIG_ATT_WAKEUP; ++ wakeup = (udev->quirks & ++ USB_QUIRK_IGNORE_REMOTE_WAKEUP) ? 0 : ++ udev->actconfig->desc.bmAttributes & ++ USB_CONFIG_ATT_WAKEUP; + else + wakeup = 0; + } +--- a/drivers/usb/core/quirks.c ++++ b/drivers/usb/core/quirks.c +@@ -158,6 +158,10 @@ static const struct usb_device_id usb_in + { USB_VENDOR_AND_INTERFACE_INFO(0x046d, USB_CLASS_VIDEO, 1, 0), + .driver_info = USB_QUIRK_RESET_RESUME }, + ++ /* ASUS Base Station(T100) */ ++ { USB_DEVICE(0x0b05, 0x17e0), .driver_info = ++ USB_QUIRK_IGNORE_REMOTE_WAKEUP }, ++ + { } /* terminating entry must be last */ + }; + +--- a/include/linux/usb/quirks.h ++++ b/include/linux/usb/quirks.h +@@ -30,4 +30,7 @@ + descriptor */ + #define USB_QUIRK_DELAY_INIT 0x00000040 + ++/* device generates spurious wakeup, ignore remote wakeup capability */ ++#define USB_QUIRK_IGNORE_REMOTE_WAKEUP 0x00000200 ++ + #endif /* __LINUX_USB_QUIRKS_H */ diff --git a/patches/usb-cp210x-add-support-for-seluxit-usb-dongle.patch b/patches/usb-cp210x-add-support-for-seluxit-usb-dongle.patch new file mode 100644 index 0000000..b8fee71 --- /dev/null +++ b/patches/usb-cp210x-add-support-for-seluxit-usb-dongle.patch @@ -0,0 +1,26 @@ +From dee80ad12d2b1b304286a707fde7ab05d1fc7bab Mon Sep 17 00:00:00 2001 +From: Andreas Bomholtz <andreas@seluxit.com> +Date: Mon, 22 
Sep 2014 09:50:43 +0200 +Subject: USB: cp210x: add support for Seluxit USB dongle + +commit dee80ad12d2b1b304286a707fde7ab05d1fc7bab upstream. + +Added the Seluxit ApS USB Serial Dongle to cp210x driver. + +Signed-off-by: Andreas Bomholtz <andreas@seluxit.com> +Signed-off-by: Johan Hovold <johan@kernel.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/usb/serial/cp210x.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/usb/serial/cp210x.c ++++ b/drivers/usb/serial/cp210x.c +@@ -162,6 +162,7 @@ static const struct usb_device_id id_tab + { USB_DEVICE(0x1ADB, 0x0001) }, /* Schweitzer Engineering C662 Cable */ + { USB_DEVICE(0x1B1C, 0x1C00) }, /* Corsair USB Dongle */ + { USB_DEVICE(0x1BE3, 0x07A6) }, /* WAGO 750-923 USB Service Cable */ ++ { USB_DEVICE(0x1D6F, 0x0010) }, /* Seluxit ApS RF Dongle */ + { USB_DEVICE(0x1E29, 0x0102) }, /* Festo CPX-USB */ + { USB_DEVICE(0x1E29, 0x0501) }, /* Festo CMSP */ + { USB_DEVICE(0x1FB9, 0x0100) }, /* Lake Shore Model 121 Current Source */ diff --git a/patches/usb-serial-cp210x-added-ketra-n1-wireless-interface-support.patch b/patches/usb-serial-cp210x-added-ketra-n1-wireless-interface-support.patch new file mode 100644 index 0000000..2a2fcca --- /dev/null +++ b/patches/usb-serial-cp210x-added-ketra-n1-wireless-interface-support.patch @@ -0,0 +1,27 @@ +From bfc2d7dfdd761ae3beccdb26abebe03cef042f46 Mon Sep 17 00:00:00 2001 +From: Joe Savage <joe.savage@goketra.com> +Date: Sat, 20 Sep 2014 08:01:16 -0500 +Subject: USB: serial: cp210x: added Ketra N1 wireless interface support + +commit bfc2d7dfdd761ae3beccdb26abebe03cef042f46 upstream. + +Added support for Ketra N1 wireless interface, which uses the +Silicon Labs' CP2104 USB to UART bridge with customized PID 8946. 
+ +Signed-off-by: Joe Savage <joe.savage@goketra.com> +Signed-off-by: Johan Hovold <johan@kernel.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/usb/serial/cp210x.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/usb/serial/cp210x.c ++++ b/drivers/usb/serial/cp210x.c +@@ -128,6 +128,7 @@ static const struct usb_device_id id_tab + { USB_DEVICE(0x10C4, 0x8665) }, /* AC-Services OBD-IF */ + { USB_DEVICE(0x10C4, 0x88A4) }, /* MMB Networks ZigBee USB Device */ + { USB_DEVICE(0x10C4, 0x88A5) }, /* Planet Innovation Ingeni ZigBee USB Device */ ++ { USB_DEVICE(0x10C4, 0x8946) }, /* Ketra N1 Wireless Interface */ + { USB_DEVICE(0x10C4, 0xEA60) }, /* Silicon Labs factory default */ + { USB_DEVICE(0x10C4, 0xEA61) }, /* Silicon Labs factory default */ + { USB_DEVICE(0x10C4, 0xEA70) }, /* Silicon Labs factory default */ diff --git a/patches/vfs-fix-data-corruption-when-blocksize-pagesize-for-mmaped-data.patch b/patches/vfs-fix-data-corruption-when-blocksize-pagesize-for-mmaped-data.patch new file mode 100644 index 0000000..9625ed8 --- /dev/null +++ b/patches/vfs-fix-data-corruption-when-blocksize-pagesize-for-mmaped-data.patch @@ -0,0 +1,159 @@ +From 90a8020278c1598fafd071736a0846b38510309c Mon Sep 17 00:00:00 2001 +From: Jan Kara <jack@suse.cz> +Date: Wed, 1 Oct 2014 21:49:18 -0400 +Subject: vfs: fix data corruption when blocksize < pagesize for mmaped data + +commit 90a8020278c1598fafd071736a0846b38510309c upstream. + +->page_mkwrite() is used by filesystems to allocate blocks under a page +which is becoming writeably mmapped in some process' address space. This +allows a filesystem to return a page fault if there is not enough space +available, user exceeds quota or similar problem happens, rather than +silently discarding data later when writepage is called. + +However VFS fails to call ->page_mkwrite() in all the cases where +filesystems need it when blocksize < pagesize. 
For example when +blocksize = 1024, pagesize = 4096 the following is problematic: + ftruncate(fd, 0); + pwrite(fd, buf, 1024, 0); + map = mmap(NULL, 1024, PROT_WRITE, MAP_SHARED, fd, 0); + map[0] = 'a'; ----> page_mkwrite() for index 0 is called + ftruncate(fd, 10000); /* or even pwrite(fd, buf, 1, 10000) */ + mremap(map, 1024, 10000, 0); + map[4095] = 'a'; ----> no page_mkwrite() called + +At the moment ->page_mkwrite() is called, filesystem can allocate only +one block for the page because i_size == 1024. Otherwise it would create +blocks beyond i_size which is generally undesirable. But later at +->writepage() time, we also need to store data at offset 4095 but we +don't have block allocated for it. + +This patch introduces a helper function filesystems can use to have +->page_mkwrite() called at all the necessary moments. + +Signed-off-by: Jan Kara <jack@suse.cz> +Signed-off-by: Theodore Ts'o <tytso@mit.edu> +[lizf: Backported to 3.4: + - adjust context + - truncate_setsize() already has an oldsize variable] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + fs/buffer.c | 3 ++ + include/linux/mm.h | 1 + mm/truncate.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++--- + 3 files changed, 63 insertions(+), 3 deletions(-) + +--- a/fs/buffer.c ++++ b/fs/buffer.c +@@ -1982,6 +1982,7 @@ int generic_write_end(struct file *file, + struct page *page, void *fsdata) + { + struct inode *inode = mapping->host; ++ loff_t old_size = inode->i_size; + int i_size_changed = 0; + + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); +@@ -2001,6 +2002,8 @@ int generic_write_end(struct file *file, + unlock_page(page); + page_cache_release(page); + ++ if (old_size < pos) ++ pagecache_isize_extended(inode, old_size, pos); + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. 
Second, it forces lock +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -953,6 +953,7 @@ static inline void unmap_shared_mapping_ + + extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new); + extern void truncate_setsize(struct inode *inode, loff_t newsize); ++void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); + extern int vmtruncate(struct inode *inode, loff_t offset); + extern int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end); + void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); +--- a/mm/truncate.c ++++ b/mm/truncate.c +@@ -20,6 +20,7 @@ + #include <linux/buffer_head.h> /* grr. try_to_release_page, + do_invalidatepage */ + #include <linux/cleancache.h> ++#include <linux/rmap.h> + #include "internal.h" + + +@@ -571,16 +572,71 @@ EXPORT_SYMBOL(truncate_pagecache); + */ + void truncate_setsize(struct inode *inode, loff_t newsize) + { +- loff_t oldsize; +- +- oldsize = inode->i_size; ++ loff_t oldsize = inode->i_size; + i_size_write(inode, newsize); + ++ if (newsize > oldsize) ++ pagecache_isize_extended(inode, oldsize, newsize); + truncate_pagecache(inode, oldsize, newsize); + } + EXPORT_SYMBOL(truncate_setsize); + + /** ++ * pagecache_isize_extended - update pagecache after extension of i_size ++ * @inode: inode for which i_size was extended ++ * @from: original inode size ++ * @to: new inode size ++ * ++ * Handle extension of inode size either caused by extending truncate or by ++ * write starting after current i_size. We mark the page straddling current ++ * i_size RO so that page_mkwrite() is called on the nearest write access to ++ * the page. This way filesystem can be sure that page_mkwrite() is called on ++ * the page before user writes to the page via mmap after the i_size has been ++ * changed. ++ * ++ * The function must be called after i_size is updated so that page fault ++ * coming after we unlock the page will already see the new i_size. 
++ * The function must be called while we still hold i_mutex - this not only ++ * makes sure i_size is stable but also that userspace cannot observe new ++ * i_size value before we are prepared to store mmap writes at new inode size. ++ */ ++void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) ++{ ++ int bsize = 1 << inode->i_blkbits; ++ loff_t rounded_from; ++ struct page *page; ++ pgoff_t index; ++ ++ WARN_ON(!mutex_is_locked(&inode->i_mutex)); ++ WARN_ON(to > inode->i_size); ++ ++ if (from >= to || bsize == PAGE_CACHE_SIZE) ++ return; ++ /* Page straddling @from will not have any hole block created? */ ++ rounded_from = round_up(from, bsize); ++ if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1))) ++ return; ++ ++ index = from >> PAGE_CACHE_SHIFT; ++ page = find_lock_page(inode->i_mapping, index); ++ /* Page not cached? Nothing to do */ ++ if (!page) ++ return; ++ /* ++ * See clear_page_dirty_for_io() for details why set_page_dirty() ++ * is needed. ++ */ ++ if (page_mkclean(page)) ++ set_page_dirty(page); ++ unlock_page(page); ++ page_cache_release(page); ++} ++EXPORT_SYMBOL(pagecache_isize_extended); ++ ++/** ++ * truncate_pagecache_range - unmap and remove pagecache that is hole-punched ++ * @inode: inode ++ * @lstart: offset of beginning of hole + * vmtruncate - unmap mappings "freed" by truncate() syscall + * @inode: inode of the file used + * @newsize: file offset to start truncating diff --git a/patches/virtio_pci-fix-virtio-spec-compliance-on-restore.patch b/patches/virtio_pci-fix-virtio-spec-compliance-on-restore.patch new file mode 100644 index 0000000..e0e8c55 --- /dev/null +++ b/patches/virtio_pci-fix-virtio-spec-compliance-on-restore.patch @@ -0,0 +1,85 @@ +From 6fbc198cf623944ab60a1db6d306a4d55cdd820d Mon Sep 17 00:00:00 2001 +From: "Michael S. 
Tsirkin" <mst@redhat.com> +Date: Tue, 14 Oct 2014 10:40:29 +1030 +Subject: virtio_pci: fix virtio spec compliance on restore + +commit 6fbc198cf623944ab60a1db6d306a4d55cdd820d upstream. + +On restore, virtio pci does the following: ++ set features ++ init vqs etc - device can be used at this point! ++ set ACKNOWLEDGE,DRIVER and DRIVER_OK status bits + +This is in violation of the virtio spec, which +requires the following order: +- ACKNOWLEDGE +- DRIVER +- init vqs +- DRIVER_OK + +This behaviour will break with hypervisors that assume spec compliant +behaviour. It seems like a good idea to have this patch applied to +stable branches to reduce the support burden for the hypervisors. + +Cc: Amit Shah <amit.shah@redhat.com> +Signed-off-by: Michael S. Tsirkin <mst@redhat.com> +Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + drivers/virtio/virtio_pci.c | 33 ++++++++++++++++++++++++++++++--- + 1 file changed, 30 insertions(+), 3 deletions(-) + +--- a/drivers/virtio/virtio_pci.c ++++ b/drivers/virtio/virtio_pci.c +@@ -745,6 +745,7 @@ static int virtio_pci_restore(struct dev + struct pci_dev *pci_dev = to_pci_dev(dev); + struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); + struct virtio_driver *drv; ++ unsigned status = 0; + int ret; + + drv = container_of(vp_dev->vdev.dev.driver, +@@ -755,14 +756,40 @@ static int virtio_pci_restore(struct dev + return ret; + + pci_set_master(pci_dev); ++ /* We always start by resetting the device, in case a previous ++ * driver messed it up. */ ++ vp_reset(&vp_dev->vdev); ++ ++ /* Acknowledge that we've seen the device. */ ++ status |= VIRTIO_CONFIG_S_ACKNOWLEDGE; ++ vp_set_status(&vp_dev->vdev, status); ++ ++ /* Maybe driver failed before freeze. ++ * Restore the failed status, for debugging. */ ++ status |= vp_dev->saved_status & VIRTIO_CONFIG_S_FAILED; ++ vp_set_status(&vp_dev->vdev, status); ++ ++ if (!drv) ++ return 0; ++ ++ /* We have a driver! 
*/ ++ status |= VIRTIO_CONFIG_S_DRIVER; ++ vp_set_status(&vp_dev->vdev, status); ++ + vp_finalize_features(&vp_dev->vdev); + +- if (drv && drv->restore) ++ if (drv->restore) { + ret = drv->restore(&vp_dev->vdev); ++ if (ret) { ++ status |= VIRTIO_CONFIG_S_FAILED; ++ vp_set_status(&vp_dev->vdev, status); ++ return ret; ++ } ++ } + + /* Finally, tell the device we're all set */ +- if (!ret) +- vp_set_status(&vp_dev->vdev, vp_dev->saved_status); ++ status |= VIRTIO_CONFIG_S_DRIVER_OK; ++ vp_set_status(&vp_dev->vdev, status); + + return ret; + } diff --git a/patches/x86-intel-quark-switch-off-cr4.pge-so-tlb-flush-uses-cr3-instead.patch b/patches/x86-intel-quark-switch-off-cr4.pge-so-tlb-flush-uses-cr3-instead.patch new file mode 100644 index 0000000..b888b56 --- /dev/null +++ b/patches/x86-intel-quark-switch-off-cr4.pge-so-tlb-flush-uses-cr3-instead.patch @@ -0,0 +1,50 @@ +From ee1b5b165c0a2f04d2107e634e51f05d0eb107de Mon Sep 17 00:00:00 2001 +From: Bryan O'Donoghue <pure.logic@nexus-software.ie> +Date: Wed, 24 Sep 2014 00:26:24 +0100 +Subject: x86/intel/quark: Switch off CR4.PGE so TLB flush uses CR3 instead + +commit ee1b5b165c0a2f04d2107e634e51f05d0eb107de upstream. + +Quark x1000 advertises PGE via the standard CPUID method +PGE bits exist in Quark X1000's PTEs. In order to flush +an individual PTE it is necessary to reload CR3 irrespective +of the PTE.PGE bit. + +See Quark Core_DevMan_001.pdf section 6.4.11 + +This bug was fixed in Galileo kernels, unfixed vanilla kernels are expected to +crash and burn on this platform. 
+ +Signed-off-by: Bryan O'Donoghue <pure.logic@nexus-software.ie> +Cc: Borislav Petkov <bp@alien8.de> +Link: http://lkml.kernel.org/r/1411514784-14885-1-git-send-email-pure.logic@nexus-software.ie +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + arch/x86/kernel/cpu/intel.c | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +--- a/arch/x86/kernel/cpu/intel.c ++++ b/arch/x86/kernel/cpu/intel.c +@@ -143,6 +143,21 @@ static void __cpuinit early_init_intel(s + setup_clear_cpu_cap(X86_FEATURE_ERMS); + } + } ++ ++ /* ++ * Intel Quark Core DevMan_001.pdf section 6.4.11 ++ * "The operating system also is required to invalidate (i.e., flush) ++ * the TLB when any changes are made to any of the page table entries. ++ * The operating system must reload CR3 to cause the TLB to be flushed" ++ * ++ * As a result cpu_has_pge() in arch/x86/include/asm/tlbflush.h should ++ * be false so that __flush_tlb_all() causes CR3 instead of CR4.PGE ++ * to be modified ++ */ ++ if (c->x86 == 5 && c->x86_model == 9) { ++ pr_info("Disabling PGE capability bit\n"); ++ setup_clear_cpu_cap(X86_FEATURE_PGE); ++ } + } + + #ifdef CONFIG_X86_32 diff --git a/patches/x86-kvm-vmx-preserve-cr4-across-vm-entry.patch b/patches/x86-kvm-vmx-preserve-cr4-across-vm-entry.patch new file mode 100644 index 0000000..1835798 --- /dev/null +++ b/patches/x86-kvm-vmx-preserve-cr4-across-vm-entry.patch @@ -0,0 +1,105 @@ +From d974baa398f34393db76be45f7d4d04fbdbb4a0a Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski <luto@amacapital.net> +Date: Wed, 8 Oct 2014 09:02:13 -0700 +Subject: x86,kvm,vmx: Preserve CR4 across VM entry + +commit d974baa398f34393db76be45f7d4d04fbdbb4a0a upstream. + +CR4 isn't constant; at least the TSD and PCE bits can vary. + +TBH, treating CR0 and CR3 as constant scares me a bit, too, but it looks +like it's correct. + +This adds a branch and a read from cr4 to each vm entry. 
Because it is +extremely likely that consecutive entries into the same vcpu will have +the same host cr4 value, this fixes up the vmcs instead of restoring cr4 +after the fact. A subsequent patch will add a kernel-wide cr4 shadow, +reducing the overhead in the common case to just two memory reads and a +branch. + +Signed-off-by: Andy Lutomirski <luto@amacapital.net> +Acked-by: Paolo Bonzini <pbonzini@redhat.com> +Cc: Petr Matousek <pmatouse@redhat.com> +Cc: Gleb Natapov <gleb@kernel.org> +Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> +[lizf: Backported to 3.4: + - adjust context + - add parameter struct vcpu_vmx *vmx to vmx_set_constant_host_state()] +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + arch/x86/kvm/vmx.c | 21 +++++++++++++++++---- + 1 file changed, 17 insertions(+), 4 deletions(-) + +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -388,6 +388,7 @@ struct vcpu_vmx { + u16 fs_sel, gs_sel, ldt_sel; + int gs_ldt_reload_needed; + int fs_reload_needed; ++ unsigned long vmcs_host_cr4; /* May not match real cr4 */ + } host_state; + struct { + int vm86_active; +@@ -3622,16 +3623,21 @@ static void vmx_disable_intercept_for_ms + * Note that host-state that does change is set elsewhere. E.g., host-state + * that is set differently for each CPU is set in vmx_vcpu_load(), not here. + */ +-static void vmx_set_constant_host_state(void) ++static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) + { + u32 low32, high32; + unsigned long tmpl; + struct desc_ptr dt; ++ unsigned long cr4; + + vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */ +- vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ + vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ + ++ /* Save the most likely value for this task's CR4 in the VMCS. 
*/ ++ cr4 = read_cr4(); ++ vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ ++ vmx->host_state.vmcs_host_cr4 = cr4; ++ + vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ + vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ + vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ +@@ -3753,7 +3759,7 @@ static int vmx_vcpu_setup(struct vcpu_vm + + vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ + vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ +- vmx_set_constant_host_state(); ++ vmx_set_constant_host_state(vmx); + #ifdef CONFIG_X86_64 + rdmsrl(MSR_FS_BASE, a); + vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ +@@ -6101,6 +6107,7 @@ static void atomic_switch_perf_msrs(stru + static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) + { + struct vcpu_vmx *vmx = to_vmx(vcpu); ++ unsigned long cr4; + + if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); +@@ -6131,6 +6138,12 @@ static void __noclone vmx_vcpu_run(struc + if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) + vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + ++ cr4 = read_cr4(); ++ if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { ++ vmcs_writel(HOST_CR4, cr4); ++ vmx->host_state.vmcs_host_cr4 = cr4; ++ } ++ + /* When single-stepping over STI and MOV SS, we must clear the + * corresponding interruptibility bits in the guest state. Otherwise + * vmentry fails as it then expects bit 14 (BS) in pending debug +@@ -6589,7 +6602,7 @@ static void prepare_vmcs02(struct kvm_vc + * Other fields are different per CPU, and will be set later when + * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. 
+ */ +- vmx_set_constant_host_state(); ++ vmx_set_constant_host_state(vmx); + + /* + * HOST_RSP is normally set correctly in vmx_vcpu_run() just before diff --git a/patches/x86-reject-x32-executables-if-x32-abi-not-supported.patch b/patches/x86-reject-x32-executables-if-x32-abi-not-supported.patch new file mode 100644 index 0000000..ed2760e --- /dev/null +++ b/patches/x86-reject-x32-executables-if-x32-abi-not-supported.patch @@ -0,0 +1,36 @@ +From 0e6d3112a4e95d55cf6dca88f298d5f4b8f29bd1 Mon Sep 17 00:00:00 2001 +From: Ben Hutchings <ben@decadent.org.uk> +Date: Sun, 7 Sep 2014 21:05:05 +0100 +Subject: x86: Reject x32 executables if x32 ABI not supported + +commit 0e6d3112a4e95d55cf6dca88f298d5f4b8f29bd1 upstream. + +It is currently possible to execve() an x32 executable on an x86_64 +kernel that has only ia32 compat enabled. However all its syscalls +will fail, even _exit(). This usually causes it to segfault. + +Change the ELF compat architecture check so that x32 executables are +rejected if we don't support the x32 ABI. + +Signed-off-by: Ben Hutchings <ben@decadent.org.uk> +Link: http://lkml.kernel.org/r/1410120305.6822.9.camel@decadent.org.uk +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Zefan Li <lizefan@huawei.com> +--- + arch/x86/include/asm/elf.h | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/arch/x86/include/asm/elf.h ++++ b/arch/x86/include/asm/elf.h +@@ -155,8 +155,9 @@ do { \ + #define elf_check_arch(x) \ + ((x)->e_machine == EM_X86_64) + +-#define compat_elf_check_arch(x) \ +- (elf_check_arch_ia32(x) || (x)->e_machine == EM_X86_64) ++#define compat_elf_check_arch(x) \ ++ (elf_check_arch_ia32(x) || \ ++ (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64)) + + #if __USER32_DS != __USER_DS + # error "The following code assumes __USER32_DS == __USER_DS" |