Merge remote-tracking branch 'tip/auto-latest'

Conflicts: drivers/clocksource/Makefile include/linux/intel-iommu.h include/linux/rcupdate.h
author: Michael Ellerman <mpe@ellerman.id.au> 2015-06-05 18:27:08 +1000
committer: Michael Ellerman <mpe@ellerman.id.au> 2015-06-05 18:27:11 +1000
commit: 123ed28d0811617e5a0a9f96e650720768ff80d6 (patch)
tree: c5d4b8ca156214b7b0e3d3460ff6764db6427fc4
parent: e36a281282a20be3e51adccc9c9d0be8262e3a7d (diff)
parent: 7a6a010964a6fa86c5d41159ac1893cc918589fc (diff)
download: linux-next-123ed28d0811617e5a0a9f96e650720768ff80d6.tar.gz
748 files changed, 23088 insertions, 12770 deletions
diff --git a/Documentation/ABI/testing/sysfs-firmware-efi b/Documentation/ABI/testing/sysfs-firmware-efi
index 05874da7ce8030..e794eac32a90a5 100644
--- a/Documentation/ABI/testing/sysfs-firmware-efi
+++ b/Documentation/ABI/testing/sysfs-firmware-efi
@@ -18,3 +18,13 @@ Contact:	Dave Young <dyoung@redhat.com>
 Description:	It shows the physical address of config table entry in the EFI
 		system table.
 Users:		Kexec
+
+What:		/sys/firmware/efi/systab
+Date:		April 2005
+Contact:	linux-efi@vger.kernel.org
+Description:	Displays the physical addresses of all EFI Configuration
+		Tables found via the EFI System Table. The order in
+		which the tables are printed forms an ABI and newer
+		versions are always printed first, i.e. ACPI20 comes
+		before ACPI.
+Users:		dmidecode
diff --git a/Documentation/ABI/testing/sysfs-firmware-efi-esrt b/Documentation/ABI/testing/sysfs-firmware-efi-esrt
new file mode 100644
index 00000000000000..6e431d1a4e7976
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-firmware-efi-esrt
@@ -0,0 +1,81 @@
+What:		/sys/firmware/efi/esrt/
+Date:		February 2015
+Contact:	Peter Jones <pjones@redhat.com>
+Description:	Provides userland access to read the EFI System Resource Table
+		(ESRT), a catalog of firmware for which can be updated with
+		the UEFI UpdateCapsule mechanism described in section 7.5 of
+		the UEFI Standard.
+Users:		fwupdate - https://github.com/rhinstaller/fwupdate
+
+What:		/sys/firmware/efi/esrt/fw_resource_count
+Date:		February 2015
+Contact:	Peter Jones <pjones@redhat.com>
+Description:	The number of entries in the ESRT
+
+What:		/sys/firmware/efi/esrt/fw_resource_count_max
+Date:		February 2015
+Contact:	Peter Jones <pjones@redhat.com>
+Description:	The maximum number of entries that /could/ be registered
+		in the allocation the table is currently in.  This is
+		really only useful to the system firmware itself.
+
+What:		/sys/firmware/efi/esrt/fw_resource_version
+Date:		February 2015
+Contact:	Peter Jones <pjones@redhat.com>
+Description:	The version of the ESRT structure provided by the firmware.
+
+What:		/sys/firmware/efi/esrt/entries/entry$N/
+Date:		February 2015
+Contact:	Peter Jones <pjones@redhat.com>
+Description:	Each ESRT entry is identified by a GUID, and each gets a
+		subdirectory under entries/ .
+		example: /sys/firmware/efi/esrt/entries/entry0/
+
+What:		/sys/firmware/efi/esrt/entries/entry$N/fw_type
+Date:		February 2015
+Contact:	Peter Jones <pjones@redhat.com>
+Description:	What kind of firmware entry this is:
+		0 - Unknown
+		1 - System Firmware
+		2 - Device Firmware
+		3 - UEFI Driver
+
+What:		/sys/firmware/efi/esrt/entries/entry$N/fw_class
+Date:		February 2015
+Contact:	Peter Jones <pjones@redhat.com>
+Description:	This is the entry's guid, and will match the directory name.
+
+What:		/sys/firmware/efi/esrt/entries/entry$N/fw_version
+Date:		February 2015
+Contact:	Peter Jones <pjones@redhat.com>
+Description:	The version of the firmware currently installed.  This is a
+		32-bit unsigned integer.
+
+What:		/sys/firmware/efi/esrt/entries/entry$N/lowest_supported_fw_version
+Date:		February 2015
+Contact:	Peter Jones <pjones@redhat.com>
+Description:	The lowest version of the firmware that can be installed.
+
+What:		/sys/firmware/efi/esrt/entries/entry$N/capsule_flags
+Date:		February 2015
+Contact:	Peter Jones <pjones@redhat.com>
+Description:	Flags that must be passed to UpdateCapsule()
+
+What:		/sys/firmware/efi/esrt/entries/entry$N/last_attempt_version
+Date:		February 2015
+Contact:	Peter Jones <pjones@redhat.com>
+Description:	The last firmware version for which an update was attempted.
+
+What:		/sys/firmware/efi/esrt/entries/entry$N/last_attempt_status
+Date:		February 2015
+Contact:	Peter Jones <pjones@redhat.com>
+Description:	The result of the last firmware update attempt for the
+		firmware resource entry.
+		0 - Success
+		1 - Insufficient resources
+		2 - Incorrect version
+		3 - Invalid format
+		4 - Authentication error
+		5 - AC power event
+		6 - Battery power event
+
diff --git a/Documentation/RCU/arrayRCU.txt b/Documentation/RCU/arrayRCU.txt
index 453ebe6953eefe..f05a9afb2c39b6 100644
--- a/Documentation/RCU/arrayRCU.txt
+++ b/Documentation/RCU/arrayRCU.txt
@@ -10,7 +10,19 @@ also be used to protect arrays.  Three situations are as follows:
 
 3.  Resizeable Arrays
 
-Each of these situations are discussed below.
+Each of these three situations involves an RCU-protected pointer to an
+array that is separately indexed.  It might be tempting to consider use
+of RCU to instead protect the index into an array, however, this use
+case is -not- supported.  The problem with RCU-protected indexes into
+arrays is that compilers can play way too many optimization games with
+integers, which means that the rules governing handling of these indexes
+are far more trouble than they are worth.  If RCU-protected indexes into
+arrays prove to be particularly valuable (which they have not thus far),
+explicit cooperation from the compiler will be required to permit them
+to be safely used.
+
+That aside, each of the three RCU-protected pointer situations are
+described in the following sections.
 
 
 Situation 1: Hash Tables
@@ -36,9 +48,9 @@ Quick Quiz:  Why is it so important that updates be rare when
 Situation 3: Resizeable Arrays
 
 Use of RCU for resizeable arrays is demonstrated by the grow_ary()
-function used by the System V IPC code.  The array is used to map from
-semaphore, message-queue, and shared-memory IDs to the data structure
-that represents the corresponding IPC construct.  The grow_ary()
+function formerly used by the System V IPC code.  The array is used
+to map from semaphore, message-queue, and shared-memory IDs to the data
+structure that represents the corresponding IPC construct.  The grow_ary()
 function does not acquire any locks; instead its caller must hold the
 ids->sem semaphore.
 
diff --git a/Documentation/RCU/lockdep.txt b/Documentation/RCU/lockdep.txt
index cd83d2348fef8c..da51d306885077 100644
--- a/Documentation/RCU/lockdep.txt
+++ b/Documentation/RCU/lockdep.txt
@@ -47,11 +47,6 @@ checking of rcu_dereference() primitives:
 		Use explicit check expression "c" along with
 		srcu_read_lock_held()().  This is useful in code that
 		is invoked by both SRCU readers and updaters.
-	rcu_dereference_index_check(p, c):
-		Use explicit check expression "c", but the caller
-		must supply one of the rcu_read_lock_held() functions.
-		This is useful in code that uses RCU-protected arrays
-		that is invoked by both RCU readers and updaters.
 	rcu_dereference_raw(p):
 		Don't check.  (Use sparingly, if at all.)
 	rcu_dereference_protected(p, c):
@@ -64,11 +59,6 @@ checking of rcu_dereference() primitives:
 		but retain the compiler constraints that prevent duplicating
 		or coalescsing.  This is useful when when testing the
 		value of the pointer itself, for example, against NULL.
-	rcu_access_index(idx):
-		Return the value of the index and omit all barriers, but
-		retain the compiler constraints that prevent duplicating
-		or coalescsing.  This is useful when when testing the
-		value of the index itself, for example, against -1.
 
 The rcu_dereference_check() check expression can be any boolean
 expression, but would normally include a lockdep expression.  However,
diff --git a/Documentation/RCU/rcu_dereference.txt b/Documentation/RCU/rcu_dereference.txt
index ceb05da5a5acd3..1e6c0da994f544 100644
--- a/Documentation/RCU/rcu_dereference.txt
+++ b/Documentation/RCU/rcu_dereference.txt
@@ -25,17 +25,6 @@ o	You must use one of the rcu_dereference() family of primitives
 	for an example where the compiler can in fact deduce the exact
 	value of the pointer, and thus cause misordering.
 
-o	Do not use single-element RCU-protected arrays.  The compiler
-	is within its right to assume that the value of an index into
-	such an array must necessarily evaluate to zero.  The compiler
-	could then substitute the constant zero for the computation, so
-	that the array index no longer depended on the value returned
-	by rcu_dereference().  If the array index no longer depends
-	on rcu_dereference(), then both the compiler and the CPU
-	are within their rights to order the array access before the
-	rcu_dereference(), which can cause the array access to return
-	garbage.
-
 o	Avoid cancellation when using the "+" and "-" infix arithmetic
 	operators.  For example, for a given variable "x", avoid
 	"(x-x)".  There are similar arithmetic pitfalls from other
@@ -76,14 +65,15 @@ o	Do not use the results from the boolean "&&" and "||" when
 	dereferencing.	For example, the following (rather improbable)
 	code is buggy:
 
-		int a[2];
-		int index;
-		int force_zero_index = 1;
+		int *p;
+		int *q;
 
 		...
 
-		r1 = rcu_dereference(i1)
-		r2 = a[r1 && force_zero_index];  /* BUGGY!!! */
+		p = rcu_dereference(gp)
+		q = &global_q;
+		q += p != &oom_p1 && p != &oom_p2;
+		r1 = *q;  /* BUGGY!!! */
 
 	The reason this is buggy is that "&&" and "||" are often compiled
 	using branches.  While weak-memory machines such as ARM or PowerPC
@@ -94,14 +84,15 @@ o	Do not use the results from relational operators ("==", "!=",
 	">", ">=", "<", or "<=") when dereferencing.  For example,
 	the following (quite strange) code is buggy:
 
-		int a[2];
-		int index;
-		int flip_index = 0;
+		int *p;
+		int *q;
 
 		...
 
-		r1 = rcu_dereference(i1)
-		r2 = a[r1 != flip_index];  /* BUGGY!!! */
+		p = rcu_dereference(gp)
+		q = &global_q;
+		q += p > &oom_p;
+		r1 = *q;  /* BUGGY!!! */
 
 	As before, the reason this is buggy is that relational operators
 	are often compiled using branches.  And as before, although
@@ -193,6 +184,11 @@ o	Be very careful about comparing pointers obtained from
 		pointer.  Note that the volatile cast in rcu_dereference()
 		will normally prevent the compiler from knowing too much.
 
+		However, please note that if the compiler knows that the
+		pointer takes on only one of two values, a not-equal
+		comparison will provide exactly the information that the
+		compiler needs to deduce the value of the pointer.
+
 o	Disable any value-speculation optimizations that your compiler
 	might provide, especially if you are making use of feedback-based
 	optimizations that take data collected from prior runs.  Such
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 88dfce182f6609..5746b0c77f3e4c 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -256,7 +256,9 @@ rcu_dereference()
 	If you are going to be fetching multiple fields from the
 	RCU-protected structure, using the local variable is of
 	course preferred.  Repeated rcu_dereference() calls look
-	ugly and incur unnecessary overhead on Alpha CPUs.
+	ugly, do not guarantee that the same pointer will be returned
+	if an update happened while in the critical section, and incur
+	unnecessary overhead on Alpha CPUs.
 
 	Note that the value returned by rcu_dereference() is valid
 	only within the enclosing RCU read-side critical section.
@@ -879,9 +881,7 @@ SRCU:	Initialization/cleanup
 
 All:  lockdep-checked RCU-protected pointer access
 
-	rcu_access_index
 	rcu_access_pointer
-	rcu_dereference_index_check
 	rcu_dereference_raw
 	rcu_lockdep_assert
 	rcu_sleep_check
diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt
index 0aad6deb2d9638..12b1b25b4da971 100644
--- a/Documentation/cputopology.txt
+++ b/Documentation/cputopology.txt
@@ -1,6 +1,6 @@
 
 Export CPU topology info via sysfs. Items (attributes) are similar
-to /proc/cpuinfo.
+to /proc/cpuinfo output of some architectures:
 
 1) /sys/devices/system/cpu/cpuX/topology/physical_package_id:
 
@@ -23,20 +23,35 @@ to /proc/cpuinfo.
 4) /sys/devices/system/cpu/cpuX/topology/thread_siblings:
 
 	internal kernel map of cpuX's hardware threads within the same
-	core as cpuX
+	core as cpuX.
 
-5) /sys/devices/system/cpu/cpuX/topology/core_siblings:
+5) /sys/devices/system/cpu/cpuX/topology/thread_siblings_list:
+
+	human-readable list of cpuX's hardware threads within the same
+	core as cpuX.
+
+6) /sys/devices/system/cpu/cpuX/topology/core_siblings:
 
 	internal kernel map of cpuX's hardware threads within the same
 	physical_package_id.
 
-6) /sys/devices/system/cpu/cpuX/topology/book_siblings:
+7) /sys/devices/system/cpu/cpuX/topology/core_siblings_list:
+
+	human-readable list of cpuX's hardware threads within the same
+	physical_package_id.
+
+8) /sys/devices/system/cpu/cpuX/topology/book_siblings:
 
 	internal kernel map of cpuX's hardware threads within the same
 	book_id.
 
+9) /sys/devices/system/cpu/cpuX/topology/book_siblings_list:
+
+	human-readable list of cpuX's hardware threads within the same
+	book_id.
+
 To implement it in an architecture-neutral way, a new source file,
-drivers/base/topology.c, is to export the 4 or 6 attributes. The two book
+drivers/base/topology.c, is to export the 6 or 9 attributes. The three book
 related sysfs files will only be created if CONFIG_SCHED_BOOK is selected.
 
 For an architecture to support this feature, it must define some of
@@ -44,20 +59,22 @@ these macros in include/asm-XXX/topology.h:
 #define topology_physical_package_id(cpu)
 #define topology_core_id(cpu)
 #define topology_book_id(cpu)
-#define topology_thread_cpumask(cpu)
+#define topology_sibling_cpumask(cpu)
 #define topology_core_cpumask(cpu)
 #define topology_book_cpumask(cpu)
 
-The type of **_id is int.
-The type of siblings is (const) struct cpumask *.
+The type of **_id macros is int.
+The type of **_cpumask macros is (const) struct cpumask *. The latter
+correspond with appropriate **_siblings sysfs attributes (except for
+topology_sibling_cpumask() which corresponds with thread_siblings).
 
 To be consistent on all architectures, include/linux/topology.h
 provides default definitions for any of the above macros that are
 not defined by include/asm-XXX/topology.h:
 1) physical_package_id: -1
 2) core_id: 0
-3) thread_siblings: just the given CPU
-4) core_siblings: just the given CPU
+3) sibling_cpumask: just the given CPU
+4) core_cpumask: just the given CPU
 
 For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
 default definitions for topology_book_id() and topology_book_cpumask().
diff --git a/Documentation/devicetree/bindings/arm/armv7m_systick.txt b/Documentation/devicetree/bindings/arm/armv7m_systick.txt
new file mode 100644
index 00000000000000..7cf4a24601eb2e
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/armv7m_systick.txt
@@ -0,0 +1,26 @@
+* ARMv7M System Timer
+
+ARMv7-M includes a system timer, known as SysTick. Current driver only
+implements the clocksource feature.
+
+Required properties:
+- compatible	  : Should be "arm,armv7m-systick"
+- reg		  : The address range of the timer
+
+Required clocking property, have to be one of:
+- clocks	  : The input clock of the timer
+- clock-frequency : The rate in HZ in input of the ARM SysTick
+
+Examples:
+
+systick: timer@e000e010 {
+	compatible = "arm,armv7m-systick";
+	reg = <0xe000e010 0x10>;
+	clocks = <&clk_systick>;
+};
+
+systick: timer@e000e010 {
+	compatible = "arm,armv7m-systick";
+	reg = <0xe000e010 0x10>;
+	clock-frequency = <90000000>;
+};
diff --git a/Documentation/devicetree/bindings/timer/nxp,lpc3220-timer.txt b/Documentation/devicetree/bindings/timer/nxp,lpc3220-timer.txt
new file mode 100644
index 00000000000000..51b05a0e70d191
--- /dev/null
+++ b/Documentation/devicetree/bindings/timer/nxp,lpc3220-timer.txt
@@ -0,0 +1,26 @@
+* NXP LPC3220 timer
+
+The NXP LPC3220 timer is used on a wide range of NXP SoCs. This
+includes LPC32xx, LPC178x, LPC18xx and LPC43xx parts.
+
+Required properties:
+- compatible:
+	Should be "nxp,lpc3220-timer".
+- reg:
+	Address and length of the register set.
+- interrupts:
+	Reference to the timer interrupt
+- clocks:
+	Should contain a reference to timer clock.
+- clock-names:
+	Should contain "timerclk".
+
+Example:
+
+timer1: timer@40085000 {
+	compatible = "nxp,lpc3220-timer";
+	reg = <0x40085000 0x1000>;
+	interrupts = <13>;
+	clocks = <&ccu1 CLK_CPU_TIMER1>;
+	clock-names = "timerclk";
+};
diff --git a/Documentation/devicetree/bindings/timer/st,stm32-timer.txt b/Documentation/devicetree/bindings/timer/st,stm32-timer.txt
new file mode 100644
index 00000000000000..8ef28e70d6e84e
--- /dev/null
+++ b/Documentation/devicetree/bindings/timer/st,stm32-timer.txt
@@ -0,0 +1,22 @@
+. STMicroelectronics STM32 timer
+
+The STM32 MCUs family has several general-purpose 16 and 32 bits timers.
+
+Required properties:
+- compatible : Should be "st,stm32-timer"
+- reg : Address and length of the register set
+- clocks : Reference on the timer input clock
+- interrupts : Reference to the timer interrupt
+
+Optional properties:
+- resets: Reference to a reset controller asserting the timer
+
+Example:
+
+timer5: timer@40000c00 {
+	compatible = "st,stm32-timer";
+	reg = <0x40000c00 0x400>;
+	interrupts = <50>;
+	resets = <&rrc 259>;
+	clocks = <&clk_pmtr1>;
+};
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 963fcd38eba94d..136e6223a6c124 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -746,6 +746,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 	cpuidle.off=1	[CPU_IDLE]
 			disable the cpuidle sub-system
 
+	cpu_init_udelay=N
+			[X86] Delay for N microsec between assert and de-assert
+			of APIC INIT to start processors.  This delay occurs
+			on every CPU online, such as boot, and resume from suspend.
+			Default: 10000
+
 	cpcihp_generic=	[HW,PCI] Generic port I/O CompactPCI driver
 			Format:
 			<first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>]
@@ -2994,11 +3000,34 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			Set maximum number of finished RCU callbacks to
 			process in one batch.
 
+	rcutree.dump_tree=	[KNL]
+			Dump the structure of the rcu_node combining tree
+			out at early boot.  This is used for diagnostic
+			purposes, to verify correct tree setup.
+
+	rcutree.gp_cleanup_delay=	[KNL]
+			Set the number of jiffies to delay each step of
+			RCU grace-period cleanup.  This only has effect
+			when CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP is set.
+
 	rcutree.gp_init_delay=	[KNL]
 			Set the number of jiffies to delay each step of
 			RCU grace-period initialization.  This only has
-			effect when CONFIG_RCU_TORTURE_TEST_SLOW_INIT is
-			set.
+			effect when CONFIG_RCU_TORTURE_TEST_SLOW_INIT
+			is set.
+
+	rcutree.gp_preinit_delay=	[KNL]
+			Set the number of jiffies to delay each step of
+			RCU grace-period pre-initialization, that is,
+			the propagation of recent CPU-hotplug changes up
+			the rcu_node combining tree.  This only has effect
+			when CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT is set.
+
+	rcutree.rcu_fanout_exact= [KNL]
+			Disable autobalancing of the rcu_node combining
+			tree.  This is used by rcutorture, and might
+			possibly be useful for architectures having high
+			cache-to-cache transfer latencies.
 
 	rcutree.rcu_fanout_leaf= [KNL]
 			Increase the number of CPUs assigned to each
@@ -3103,7 +3132,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			test, hence the "fake".
 
 	rcutorture.nreaders= [KNL]
-			Set number of RCU readers.
+			Set number of RCU readers.  The value -1 selects
+			N-1, where N is the number of CPUs.  A value
+			"n" less than -1 selects N-n-2, where N is again
+			the number of CPUs.  For example, -2 selects N
+			(the number of CPUs), -3 selects N+1, and so on.
 
 	rcutorture.object_debug= [KNL]
 			Enable debug-object double-call_rcu() testing.
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index f95746189b5ded..13feb697271f0a 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -617,16 +617,16 @@ case what's actually required is:
 However, stores are not speculated.  This means that ordering -is- provided
 for load-store control dependencies, as in the following example:
 
-	q = ACCESS_ONCE(a);
+	q = READ_ONCE_CTRL(a);
 	if (q) {
 		ACCESS_ONCE(b) = p;
 	}
 
-Control dependencies pair normally with other types of barriers.
-That said, please note that ACCESS_ONCE() is not optional!  Without the
-ACCESS_ONCE(), might combine the load from 'a' with other loads from
-'a', and the store to 'b' with other stores to 'b', with possible highly
-counterintuitive effects on ordering.
+Control dependencies pair normally with other types of barriers.  That
+said, please note that READ_ONCE_CTRL() is not optional!  Without the
+READ_ONCE_CTRL(), the compiler might combine the load from 'a' with
+other loads from 'a', and the store to 'b' with other stores to 'b',
+with possible highly counterintuitive effects on ordering.
 
 Worse yet, if the compiler is able to prove (say) that the value of
 variable 'a' is always non-zero, it would be well within its rights
@@ -636,12 +636,15 @@ as follows:
 	q = a;
 	b = p;  /* BUG: Compiler and CPU can both reorder!!! */
 
-So don't leave out the ACCESS_ONCE().
+Finally, the READ_ONCE_CTRL() includes an smp_read_barrier_depends()
+that DEC Alpha needs in order to respect control depedencies.
+
+So don't leave out the READ_ONCE_CTRL().
 
 It is tempting to try to enforce ordering on identical stores on both
 branches of the "if" statement as follows:
 
-	q = ACCESS_ONCE(a);
+	q = READ_ONCE_CTRL(a);
 	if (q) {
 		barrier();
 		ACCESS_ONCE(b) = p;
@@ -655,7 +658,7 @@ branches of the "if" statement as follows:
 Unfortunately, current compilers will transform this as follows at high
 optimization levels:
 
-	q = ACCESS_ONCE(a);
+	q = READ_ONCE_CTRL(a);
 	barrier();
 	ACCESS_ONCE(b) = p;  /* BUG: No ordering vs. load from a!!! */
 	if (q) {
@@ -685,7 +688,7 @@ memory barriers, for example, smp_store_release():
 In contrast, without explicit memory barriers, two-legged-if control
 ordering is guaranteed only when the stores differ, for example:
 
-	q = ACCESS_ONCE(a);
+	q = READ_ONCE_CTRL(a);
 	if (q) {
 		ACCESS_ONCE(b) = p;
 		do_something();
@@ -694,14 +697,14 @@ ordering is guaranteed only when the stores differ, for example:
 		do_something_else();
 	}
 
-The initial ACCESS_ONCE() is still required to prevent the compiler from
-proving the value of 'a'.
+The initial READ_ONCE_CTRL() is still required to prevent the compiler
+from proving the value of 'a'.
 
 In addition, you need to be careful what you do with the local variable 'q',
 otherwise the compiler might be able to guess the value and again remove
 the needed conditional.  For example:
 
-	q = ACCESS_ONCE(a);
+	q = READ_ONCE_CTRL(a);
 	if (q % MAX) {
 		ACCESS_ONCE(b) = p;
 		do_something();
@@ -714,7 +717,7 @@ If MAX is defined to be 1, then the compiler knows that (q % MAX) is
 equal to zero, in which case the compiler is within its rights to
 transform the above code into the following:
 
-	q = ACCESS_ONCE(a);
+	q = READ_ONCE_CTRL(a);
 	ACCESS_ONCE(b) = p;
 	do_something_else();
 
@@ -725,7 +728,7 @@ is gone, and the barrier won't bring it back.  Therefore, if you are
 relying on this ordering, you should make sure that MAX is greater than
 one, perhaps as follows:
 
-	q = ACCESS_ONCE(a);
+	q = READ_ONCE_CTRL(a);
 	BUILD_BUG_ON(MAX <= 1); /* Order load from a with store to b. */
 	if (q % MAX) {
 		ACCESS_ONCE(b) = p;
@@ -742,14 +745,15 @@ of the 'if' statement.
 You must also be careful not to rely too much on boolean short-circuit
 evaluation.  Consider this example:
 
-	q = ACCESS_ONCE(a);
+	q = READ_ONCE_CTRL(a);
 	if (a || 1 > 0)
 		ACCESS_ONCE(b) = 1;
 
-Because the second condition is always true, the compiler can transform
-this example as following, defeating control dependency:
+Because the first condition cannot fault and the second condition is
+always true, the compiler can transform this example as following,
+defeating control dependency:
 
-	q = ACCESS_ONCE(a);
+	q = READ_ONCE_CTRL(a);
 	ACCESS_ONCE(b) = 1;
 
 This example underscores the need to ensure that the compiler cannot
@@ -762,8 +766,8 @@ demonstrated by two related examples, with the initial values of
 x and y both being zero:
 
 	CPU 0                     CPU 1
-	=====================     =====================
-	r1 = ACCESS_ONCE(x);      r2 = ACCESS_ONCE(y);
+	=======================   =======================
+	r1 = READ_ONCE_CTRL(x);   r2 = READ_ONCE_CTRL(y);
 	if (r1 > 0)               if (r2 > 0)
 	  ACCESS_ONCE(y) = 1;       ACCESS_ONCE(x) = 1;
 
@@ -783,7 +787,8 @@ But because control dependencies do -not- provide transitivity, the above
 assertion can fail after the combined three-CPU example completes.  If you
 need the three-CPU example to provide ordering, you will need smp_mb()
 between the loads and stores in the CPU 0 and CPU 1 code fragments,
-that is, just before or just after the "if" statements.
+that is, just before or just after the "if" statements.  Furthermore,
+the original two-CPU example is very fragile and should be avoided.
 
 These two examples are the LB and WWC litmus tests from this paper:
 http://www.cl.cam.ac.uk/users/pes20/ppc-supplemental/test6.pdf and this
@@ -791,6 +796,12 @@ site: https://www.cl.cam.ac.uk/~pes20/ppcmem/index.html.
 
 In summary:
 
+  (*) Control dependencies must be headed by READ_ONCE_CTRL().
+      Or, as a much less preferable alternative, interpose
+      be headed by READ_ONCE() or an ACCESS_ONCE() read and must
+      have smp_read_barrier_depends() between this read and the
+      control-dependent write.
+
   (*) Control dependencies can order prior loads against later stores.
       However, they do -not- guarantee any other sort of ordering:
       Not prior loads against later loads, nor prior stores against
@@ -1662,7 +1673,7 @@ CPU from reordering them.
 
 There are some more advanced barrier functions:
 
- (*) set_mb(var, value)
+ (*) smp_store_mb(var, value)
 
      This assigns the value to the variable and then inserts a full memory
      barrier after it, depending on the function.  It isn't guaranteed to
@@ -1784,10 +1795,9 @@ for each construct.  These operations all imply certain barriers:
 
      Memory operations issued before the ACQUIRE may be completed after
      the ACQUIRE operation has completed.  An smp_mb__before_spinlock(),
-     combined with a following ACQUIRE, orders prior loads against
-     subsequent loads and stores and also orders prior stores against
-     subsequent stores.  Note that this is weaker than smp_mb()!  The
-     smp_mb__before_spinlock() primitive is free on many architectures.
+     combined with a following ACQUIRE, orders prior stores against
+     subsequent loads and stores. Note that this is weaker than smp_mb()!
+     The smp_mb__before_spinlock() primitive is free on many architectures.
 
  (2) RELEASE operation implication:
 
@@ -1975,7 +1985,7 @@ after it has altered the task state:
 	CPU 1
 	===============================
 	set_current_state();
-	  set_mb();
+	  smp_store_mb();
 	    STORE current->state
 	    <general barrier>
 	LOAD event_indicated
@@ -2016,7 +2026,7 @@ between the STORE to indicate the event and the STORE to set TASK_RUNNING:
 	CPU 1				CPU 2
 	===============================	===============================
 	set_current_state();		STORE event_indicated
-	  set_mb();			wake_up();
+	  smp_store_mb();		wake_up();
 	    STORE current->state	  <write barrier>
 	    <general barrier>		  STORE current->state
 	LOAD event_indicated
diff --git a/Documentation/preempt-locking.txt b/Documentation/preempt-locking.txt
index 57883ca2498bb5..e89ce6624af2fa 100644
--- a/Documentation/preempt-locking.txt
+++ b/Documentation/preempt-locking.txt
@@ -48,7 +48,7 @@ preemption must be disabled around such regions.
 
 Note, some FPU functions are already explicitly preempt safe.  For example,
 kernel_fpu_begin and kernel_fpu_end will disable and enable preemption.
-However, math_state_restore must be called with preemption disabled.
+However, fpu__restore() must be called with preemption disabled.
 
 
 RULE #3: Lock acquire and release must be performed by same task
diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt
index 21461a0441c129..e114513a2731dc 100644
--- a/Documentation/scheduler/sched-deadline.txt
+++ b/Documentation/scheduler/sched-deadline.txt
@@ -8,6 +8,10 @@ CONTENTS
  1. Overview
  2. Scheduling algorithm
  3. Scheduling Real-Time Tasks
+   3.1 Definitions
+   3.2 Schedulability Analysis for Uniprocessor Systems
+   3.3 Schedulability Analysis for Multiprocessor Systems
+   3.4 Relationship with SCHED_DEADLINE Parameters
  4. Bandwidth management
    4.1 System-wide settings
    4.2 Task interface
@@ -43,7 +47,7 @@ CONTENTS
  "deadline", to schedule tasks. A SCHED_DEADLINE task should receive
  "runtime" microseconds of execution time every "period" microseconds, and
  these "runtime" microseconds are available within "deadline" microseconds
- from the beginning of the period.  In order to implement this behaviour,
+ from the beginning of the period.  In order to implement this behavior,
  every time the task wakes up, the scheduler computes a "scheduling deadline"
  consistent with the guarantee (using the CBS[2,3] algorithm). Tasks are then
  scheduled using EDF[1] on these scheduling deadlines (the task with the
@@ -52,7 +56,7 @@ CONTENTS
  "admission control" strategy (see Section "4. Bandwidth management") is used
  (clearly, if the system is overloaded this guarantee cannot be respected).
 
- Summing up, the CBS[2,3] algorithms assigns scheduling deadlines to tasks so
+ Summing up, the CBS[2,3] algorithm assigns scheduling deadlines to tasks so
  that each task runs for at most its runtime every period, avoiding any
  interference between different tasks (bandwidth isolation), while the EDF[1]
  algorithm selects the task with the earliest scheduling deadline as the one
@@ -63,7 +67,7 @@ CONTENTS
  In more details, the CBS algorithm assigns scheduling deadlines to
  tasks in the following way:
 
-  - Each SCHED_DEADLINE task is characterised by the "runtime",
+  - Each SCHED_DEADLINE task is characterized by the "runtime",
     "deadline", and "period" parameters;
 
   - The state of the task is described by a "scheduling deadline", and
@@ -78,7 +82,7 @@ CONTENTS
 
     then, if the scheduling deadline is smaller than the current time, or
     this condition is verified, the scheduling deadline and the
-    remaining runtime are re-initialised as
+    remaining runtime are re-initialized as
 
          scheduling deadline = current time + deadline
          remaining runtime = runtime
@@ -126,31 +130,37 @@ CONTENTS
  suited for periodic or sporadic real-time tasks that need guarantees on their
  timing behavior, e.g., multimedia, streaming, control applications, etc.
 
+3.1 Definitions
+------------------------
+
  A typical real-time task is composed of a repetition of computation phases
  (task instances, or jobs) which are activated on a periodic or sporadic
  fashion.
- Each job J_j (where J_j is the j^th job of the task) is characterised by an
+ Each job J_j (where J_j is the j^th job of the task) is characterized by an
  arrival time r_j (the time when the job starts), an amount of computation
  time c_j needed to finish the job, and a job absolute deadline d_j, which
  is the time within which the job should be finished. The maximum execution
- time max_j{c_j} is called "Worst Case Execution Time" (WCET) for the task.
+ time max{c_j} is called "Worst Case Execution Time" (WCET) for the task.
  A real-time task can be periodic with period P if r_{j+1} = r_j + P, or
  sporadic with minimum inter-arrival time P is r_{j+1} >= r_j + P. Finally,
  d_j = r_j + D, where D is the task's relative deadline.
- The utilisation of a real-time task is defined as the ratio between its
+ Summing up, a real-time task can be described as
+	Task = (WCET, D, P)
+
+ The utilization of a real-time task is defined as the ratio between its
  WCET and its period (or minimum inter-arrival time), and represents
  the fraction of CPU time needed to execute the task.
 
- If the total utilisation sum_i(WCET_i/P_i) is larger than M (with M equal
+ If the total utilization U=sum(WCET_i/P_i) is larger than M (with M equal
  to the number of CPUs), then the scheduler is unable to respect all the
  deadlines.
- Note that total utilisation is defined as the sum of the utilisations
+ Note that total utilization is defined as the sum of the utilizations
  WCET_i/P_i over all the real-time tasks in the system. When considering
  multiple real-time tasks, the parameters of the i-th task are indicated
  with the "_i" suffix.
- Moreover, if the total utilisation is larger than M, then we risk starving
+ Moreover, if the total utilization is larger than M, then we risk starving
  non- real-time tasks by real-time tasks.
- If, instead, the total utilisation is smaller than M, then non real-time
+ If, instead, the total utilization is smaller than M, then non real-time
  tasks will not be starved and the system might be able to respect all the
  deadlines.
  As a matter of fact, in this case it is possible to provide an upper bound
@@ -159,38 +169,119 @@ CONTENTS
  More precisely, it can be proven that using a global EDF scheduler the
  maximum tardiness of each task is smaller or equal than
 	((M − 1) · WCET_max − WCET_min)/(M − (M − 2) · U_max) + WCET_max
- where WCET_max = max_i{WCET_i} is the maximum WCET, WCET_min=min_i{WCET_i}
- is the minimum WCET, and U_max = max_i{WCET_i/P_i} is the maximum utilisation.
+ where WCET_max = max{WCET_i} is the maximum WCET, WCET_min=min{WCET_i}
+ is the minimum WCET, and U_max = max{WCET_i/P_i} is the maximum
+ utilization[12].
+
+3.2 Schedulability Analysis for Uniprocessor Systems
+------------------------
 
  If M=1 (uniprocessor system), or in case of partitioned scheduling (each
  real-time task is statically assigned to one and only one CPU), it is
  possible to formally check if all the deadlines are respected.
  If D_i = P_i for all tasks, then EDF is able to respect all the deadlines
- of all the tasks executing on a CPU if and only if the total utilisation
+ of all the tasks executing on a CPU if and only if the total utilization
  of the tasks running on such a CPU is smaller or equal than 1.
  If D_i != P_i for some task, then it is possible to define the density of
- a task as C_i/min{D_i,T_i}, and EDF is able to respect all the deadlines
- of all the tasks running on a CPU if the sum sum_i C_i/min{D_i,T_i} of the
- densities of the tasks running on such a CPU is smaller or equal than 1
- (notice that this condition is only sufficient, and not necessary).
+ a task as WCET_i/min{D_i,P_i}, and EDF is able to respect all the deadlines
+ of all the tasks running on a CPU if the sum of the densities of the tasks
+ running on such a CPU is smaller or equal than 1:
+	sum(WCET_i / min{D_i, P_i}) <= 1
+ It is important to notice that this condition is only sufficient, and not
+ necessary: there are task sets that are schedulable, but do not respect the
+ condition. For example, consider the task set {Task_1,Task_2} composed by
+ Task_1=(50ms,50ms,100ms) and Task_2=(10ms,100ms,100ms).
+ EDF is clearly able to schedule the two tasks without missing any deadline
+ (Task_1 is scheduled as soon as it is released, and finishes just in time
+ to respect its deadline; Task_2 is scheduled immediately after Task_1, hence
+ its response time cannot be larger than 50ms + 10ms = 60ms) even if
+	50 / min{50,100} + 10 / min{100, 100} = 50 / 50 + 10 / 100 = 1.1
+ Of course it is possible to test the exact schedulability of tasks with
+ D_i != P_i (checking a condition that is both sufficient and necessary),
+ but this cannot be done by comparing the total utilization or density with
+ a constant. Instead, the so called "processor demand" approach can be used,
+ computing the total amount of CPU time h(t) needed by all the tasks to
+ respect all of their deadlines in a time interval of size t, and comparing
+ such a time with the interval size t. If h(t) is smaller than t (that is,
+ the amount of time needed by the tasks in a time interval of size t is
+ smaller than the size of the interval) for all the possible values of t, then
+ EDF is able to schedule the tasks respecting all of their deadlines. Since
+ performing this check for all possible values of t is impossible, it has been
+ proven[4,5,6] that it is sufficient to perform the test for values of t
+ between 0 and a maximum value L. The cited papers contain all of the
+ mathematical details and explain how to compute h(t) and L.
+ In any case, this kind of analysis is too complex as well as too
+ time-consuming to be performed on-line. Hence, as explained in Section
+ 4 Linux uses an admission test based on the tasks' utilizations.
+
+3.3 Schedulability Analysis for Multiprocessor Systems
+------------------------
 
  On multiprocessor systems with global EDF scheduling (non partitioned
  systems), a sufficient test for schedulability can not be based on the
- utilisations (it can be shown that task sets with utilisations slightly
- larger than 1 can miss deadlines regardless of the number of CPUs M).
- However, as previously stated, enforcing that the total utilisation is smaller
- than M is enough to guarantee that non real-time tasks are not starved and
- that the tardiness of real-time tasks has an upper bound.
+ utilizations or densities: it can be shown that even if D_i = P_i task
+ sets with utilizations slightly larger than 1 can miss deadlines regardless
+ of the number of CPUs.
+
+ Consider a set {Task_1,...Task_{M+1}} of M+1 tasks on a system with M
+ CPUs, with the first task Task_1=(P,P,P) having period, relative deadline
+ and WCET equal to P. The remaining M tasks Task_i=(e,P-1,P-1) have an
+ arbitrarily small worst case execution time (indicated as "e" here) and a
+ period smaller than the one of the first task. Hence, if all the tasks
+ activate at the same time t, global EDF schedules these M tasks first
+ (because their absolute deadlines are equal to t + P - 1, hence they are
+ smaller than the absolute deadline of Task_1, which is t + P). As a
+ result, Task_1 can be scheduled only at time t + e, and will finish at
+ time t + e + P, after its absolute deadline. The total utilization of the
+ task set is U = M · e / (P - 1) + P / P = M · e / (P - 1) + 1, and for small
+ values of e this can become very close to 1. This is known as "Dhall's
+ effect"[7]. Note: the example in the original paper by Dhall has been
+ slightly simplified here (for example, Dhall more correctly computed
+ lim_{e->0}U).
+
+ More complex schedulability tests for global EDF have been developed in
+ real-time literature[8,9], but they are not based on a simple comparison
+ between total utilization (or density) and a fixed constant. If all tasks
+ have D_i = P_i, a sufficient schedulability condition can be expressed in
+ a simple way:
+	sum(WCET_i / P_i) <= M - (M - 1) · U_max
+ where U_max = max{WCET_i / P_i}[10]. Notice that for U_max = 1,
+ M - (M - 1) · U_max becomes M - M + 1 = 1 and this schedulability condition
+ just confirms the Dhall's effect. A more complete survey of the literature
+ about schedulability tests for multi-processor real-time scheduling can be
+ found in [11].
+
+ As seen, enforcing that the total utilization is smaller than M does not
+ guarantee that global EDF schedules the tasks without missing any deadline
+ (in other words, global EDF is not an optimal scheduling algorithm). However,
+ a total utilization smaller than M is enough to guarantee that non real-time
+ tasks are not starved and that the tardiness of real-time tasks has an upper
+ bound[12] (as previously noted). Different bounds on the maximum tardiness
+ experienced by real-time tasks have been developed in various papers[13,14],
+ but the theoretical result that is important for SCHED_DEADLINE is that if
+ the total utilization is smaller or equal than M then the response times of
+ the tasks are limited.
+
+3.4 Relationship with SCHED_DEADLINE Parameters
+------------------------
 
- SCHED_DEADLINE can be used to schedule real-time tasks guaranteeing that
- the jobs' deadlines of a task are respected. In order to do this, a task
- must be scheduled by setting:
+ Finally, it is important to understand the relationship between the
+ SCHED_DEADLINE scheduling parameters described in Section 2 (runtime,
+ deadline and period) and the real-time task parameters (WCET, D, P)
+ described in this section. Note that the tasks' temporal constraints are
+ represented by its absolute deadlines d_j = r_j + D described above, while
+ SCHED_DEADLINE schedules the tasks according to scheduling deadlines (see
+ Section 2).
+ If an admission test is used to guarantee that the scheduling deadlines
+ are respected, then SCHED_DEADLINE can be used to schedule real-time tasks
+ guaranteeing that all the jobs' deadlines of a task are respected.
+ In order to do this, a task must be scheduled by setting:
 
   - runtime >= WCET
   - deadline = D
   - period <= P
 
- IOW, if runtime >= WCET and if period is >= P, then the scheduling deadlines
+ IOW, if runtime >= WCET and if period is <= P, then the scheduling deadlines
  and the absolute deadlines (d_j) coincide, so a proper admission control
  allows to respect the jobs' absolute deadlines for this task (this is what is
  called "hard schedulability property" and is an extension of Lemma 1 of [2]).
@@ -206,6 +297,39 @@ CONTENTS
       Symposium, 1998. http://retis.sssup.it/~giorgio/paps/1998/rtss98-cbs.pdf
   3 - L. Abeni. Server Mechanisms for Multimedia Applications. ReTiS Lab
       Technical Report. http://disi.unitn.it/~abeni/tr-98-01.pdf
+  4 - J. Y. Leung and M.L. Merril. A Note on Preemptive Scheduling of
+      Periodic, Real-Time Tasks. Information Processing Letters, vol. 11,
+      no. 3, pp. 115-118, 1980.
+  5 - S. K. Baruah, A. K. Mok and L. E. Rosier. Preemptively Scheduling
+      Hard-Real-Time Sporadic Tasks on One Processor. Proceedings of the
+      11th IEEE Real-time Systems Symposium, 1990.
+  6 - S. K. Baruah, L. E. Rosier and R. R. Howell. Algorithms and Complexity
+      Concerning the Preemptive Scheduling of Periodic Real-Time tasks on
+      One Processor. Real-Time Systems Journal, vol. 4, no. 2, pp 301-324,
+      1990.
+  7 - S. J. Dhall and C. L. Liu. On a real-time scheduling problem. Operations
+      research, vol. 26, no. 1, pp 127-140, 1978.
+  8 - T. Baker. Multiprocessor EDF and Deadline Monotonic Schedulability
+      Analysis. Proceedings of the 24th IEEE Real-Time Systems Symposium, 2003.
+  9 - T. Baker. An Analysis of EDF Schedulability on a Multiprocessor.
+      IEEE Transactions on Parallel and Distributed Systems, vol. 16, no. 8,
+      pp 760-768, 2005.
+  10 - J. Goossens, S. Funk and S. Baruah, Priority-Driven Scheduling of
+       Periodic Task Systems on Multiprocessors. Real-Time Systems Journal,
+       vol. 25, no. 2–3, pp. 187–205, 2003.
+  11 - R. Davis and A. Burns. A Survey of Hard Real-Time Scheduling for
+       Multiprocessor Systems. ACM Computing Surveys, vol. 43, no. 4, 2011.
+       http://www-users.cs.york.ac.uk/~robdavis/papers/MPSurveyv5.0.pdf
+  12 - U. C. Devi and J. H. Anderson. Tardiness Bounds under Global EDF
+       Scheduling on a Multiprocessor. Real-Time Systems Journal, vol. 32,
+       no. 2, pp 133-189, 2008.
+  13 - P. Valente and G. Lipari. An Upper Bound to the Lateness of Soft
+       Real-Time Tasks Scheduled by EDF on Multiprocessors. Proceedings of
+       the 26th IEEE Real-Time Systems Symposium, 2005.
+  14 - J. Erickson, U. Devi and S. Baruah. Improved tardiness bounds for
+       Global EDF. Proceedings of the 22nd Euromicro Conference on
+       Real-Time Systems, 2010.
+
 
 4. Bandwidth management
 =======================
@@ -218,10 +342,10 @@ CONTENTS
  no guarantee can be given on the actual scheduling of the -deadline tasks.
 
  As already stated in Section 3, a necessary condition to be respected to
- correctly schedule a set of real-time tasks is that the total utilisation
+ correctly schedule a set of real-time tasks is that the total utilization
  is smaller than M. When talking about -deadline tasks, this requires that
  the sum of the ratio between runtime and period for all tasks is smaller
- than M. Notice that the ratio runtime/period is equivalent to the utilisation
+ than M. Notice that the ratio runtime/period is equivalent to the utilization
  of a "traditional" real-time task, and is also often referred to as
  "bandwidth".
  The interface used to control the CPU bandwidth that can be allocated
@@ -251,7 +375,7 @@ CONTENTS
  The system wide settings are configured under the /proc virtual file system.
 
  For now the -rt knobs are used for -deadline admission control and the
- -deadline runtime is accounted against the -rt runtime. We realise that this
+ -deadline runtime is accounted against the -rt runtime. We realize that this
  isn't entirely desirable; however, it is better to have a small interface for
  now, and be able to change it easily later. The ideal situation (see 5.) is to
  run -rt tasks from a -deadline server; in which case the -rt bandwidth is a
diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt
index 88b85899d30953..9da6f351224962 100644
--- a/Documentation/x86/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -406,7 +406,7 @@ Protocol:	2.00+
 	- If 0, the protected-mode code is loaded at 0x10000.
 	- If 1, the protected-mode code is loaded at 0x100000.
 
-  Bit 1 (kernel internal): ALSR_FLAG
+  Bit 1 (kernel internal): KASLR_FLAG
 	- Used internally by the compressed kernel to communicate
 	  KASLR status to kernel proper.
 	  If 1, KASLR enabled.
@@ -1124,7 +1124,6 @@ The boot loader *must* fill out the following fields in bp,
 
     o hdr.code32_start
     o hdr.cmd_line_ptr
-    o hdr.cmdline_size
     o hdr.ramdisk_image (if applicable)
     o hdr.ramdisk_size  (if applicable)
 
diff --git a/Documentation/x86/x86_64/kernel-stacks b/Documentation/x86/kernel-stacks
index e3c8a49d1a2f5b..0f3a6c20194326 100644
--- a/Documentation/x86/x86_64/kernel-stacks
+++ b/Documentation/x86/kernel-stacks
@@ -1,3 +1,6 @@
+Kernel stacks on x86-64 bit
+---------------------------
+
 Most of the text from Keith Owens, hacked by AK
 
 x86_64 page size (PAGE_SIZE) is 4K.
@@ -56,13 +59,6 @@ If that assumption is ever broken then the stacks will become corrupt.
 
 The currently assigned IST stacks are :-
 
-* STACKFAULT_STACK.  EXCEPTION_STKSZ (PAGE_SIZE).
-
-  Used for interrupt 12 - Stack Fault Exception (#SS).
-
-  This allows the CPU to recover from invalid stack segments. Rarely
-  happens.
-
 * DOUBLEFAULT_STACK.  EXCEPTION_STKSZ (PAGE_SIZE).
 
   Used for interrupt 8 - Double Fault Exception (#DF).
@@ -99,3 +95,47 @@ The currently assigned IST stacks are :-
   assumptions about the previous state of the kernel stack.
 
 For more details see the Intel IA32 or AMD AMD64 architecture manuals.
+
+
+Printing backtraces on x86
+--------------------------
+
+The question about the '?' preceding function names in an x86 stacktrace
+keeps popping up, here's an indepth explanation. It helps if the reader
+stares at print_context_stack() and the whole machinery in and around
+arch/x86/kernel/dumpstack.c.
+
+Adapted from Ingo's mail, Message-ID: <20150521101614.GA10889@gmail.com>:
+
+We always scan the full kernel stack for return addresses stored on
+the kernel stack(s) [*], from stack top to stack bottom, and print out
+anything that 'looks like' a kernel text address.
+
+If it fits into the frame pointer chain, we print it without a question
+mark, knowing that it's part of the real backtrace.
+
+If the address does not fit into our expected frame pointer chain we
+still print it, but we print a '?'. It can mean two things:
+
+ - either the address is not part of the call chain: it's just stale
+   values on the kernel stack, from earlier function calls. This is
+   the common case.
+
+ - or it is part of the call chain, but the frame pointer was not set
+   up properly within the function, so we don't recognize it.
+
+This way we will always print out the real call chain (plus a few more
+entries), regardless of whether the frame pointer was set up correctly
+or not - but in most cases we'll get the call chain right as well. The
+entries printed are strictly in stack order, so you can deduce more
+information from that as well.
+
+The most important property of this method is that we _never_ lose
+information: we always strive to print _all_ addresses on the stack(s)
+that look like kernel text addresses, so if debug information is wrong,
+we still print out the real call chain as well - just with more question
+marks than ideal.
+
+[*] For things like IRQ and IST stacks, we also scan those stacks, in
+    the right order, and try to cross from one stack into another
+    reconstructing the call chain. This works most of the time.
diff --git a/Documentation/x86/mtrr.txt b/Documentation/x86/mtrr.txt
index cc071dc333c213..860bc3adc22344 100644
--- a/Documentation/x86/mtrr.txt
+++ b/Documentation/x86/mtrr.txt
@@ -1,7 +1,19 @@
 MTRR (Memory Type Range Register) control
-3 Jun 1999
-Richard Gooch
-<rgooch@atnf.csiro.au>
+
+Richard Gooch <rgooch@atnf.csiro.au> - 3 Jun 1999
+Luis R. Rodriguez <mcgrof@do-not-panic.com> - April 9, 2015
+
+===============================================================================
+Phasing out MTRR use
+
+MTRR use is replaced on modern x86 hardware with PAT. Over time the only type
+of effective MTRR that is expected to be supported will be for write-combining.
+As MTRR use is phased out device drivers should use arch_phys_wc_add() to make
+MTRR effective on non-PAT systems while a no-op on PAT enabled systems.
+
+For details refer to Documentation/x86/pat.txt.
+
+===============================================================================
 
   On Intel P6 family processors (Pentium Pro, Pentium II and later)
   the Memory Type Range Registers (MTRRs) may be used to control
diff --git a/Documentation/x86/pat.txt b/Documentation/x86/pat.txt
index cf08c9fff3cdaa..521bd8adc3b8ed 100644
--- a/Documentation/x86/pat.txt
+++ b/Documentation/x86/pat.txt
@@ -34,6 +34,8 @@ ioremap                |    --    |    UC-     |       UC-        |
                        |          |            |                  |
 ioremap_cache          |    --    |    WB      |       WB         |
                        |          |            |                  |
+ioremap_uc             |    --    |    UC      |       UC         |
+                       |          |            |                  |
 ioremap_nocache        |    --    |    UC-     |       UC-        |
                        |          |            |                  |
 ioremap_wc             |    --    |    --      |       WC         |
@@ -102,7 +104,38 @@ wants to export a RAM region, it has to do set_memory_uc() or set_memory_wc()
 as step 0 above and also track the usage of those pages and use set_memory_wb()
 before the page is freed to free pool.
 
-
+MTRR effects on PAT / non-PAT systems
+-------------------------------------
+
+The following table provides the effects of using write-combining MTRRs when
+using ioremap*() calls on x86 for both non-PAT and PAT systems. Ideally
+mtrr_add() usage will be phased out in favor of arch_phys_wc_add() which will
+be a no-op on PAT enabled systems. The region over which a arch_phys_wc_add()
+is made, should already have been ioremapped with WC attributes or PAT entries,
+this can be done by using ioremap_wc() / set_memory_wc().  Devices which
+combine areas of IO memory desired to remain uncacheable with areas where
+write-combining is desirable should consider use of ioremap_uc() followed by
+set_memory_wc() to white-list effective write-combined areas.  Such use is
+nevertheless discouraged as the effective memory type is considered
+implementation defined, yet this strategy can be used as last resort on devices
+with size-constrained regions where otherwise MTRR write-combining would
+otherwise not be effective.
+
+----------------------------------------------------------------------
+MTRR Non-PAT   PAT    Linux ioremap value        Effective memory type
+----------------------------------------------------------------------
+                                                  Non-PAT |  PAT
+     PAT
+     |PCD
+     ||PWT
+     |||
+WC   000      WB      _PAGE_CACHE_MODE_WB            WC   |   WC
+WC   001      WC      _PAGE_CACHE_MODE_WC            WC*  |   WC
+WC   010      UC-     _PAGE_CACHE_MODE_UC_MINUS      WC*  |   UC
+WC   011      UC      _PAGE_CACHE_MODE_UC            UC   |   UC
+----------------------------------------------------------------------
+
+(*) denotes implementation defined and is discouraged
 
 Notes:
 
diff --git a/Kbuild b/Kbuild
index 6f0d82a9245d89..df99a5f53beb88 100644
--- a/Kbuild
+++ b/Kbuild
@@ -2,8 +2,9 @@
 # Kbuild for top-level directory of the kernel
 # This file takes care of the following:
 # 1) Generate bounds.h
-# 2) Generate asm-offsets.h (may need bounds.h)
-# 3) Check for missing system calls
+# 2) Generate timeconst.h
+# 3) Generate asm-offsets.h (may need bounds.h and timeconst.h)
+# 4) Check for missing system calls
 
 # Default sed regexp - multiline due to syntax constraints
 define sed-y
@@ -47,7 +48,26 @@ $(obj)/$(bounds-file): kernel/bounds.s FORCE
 	$(call filechk,offsets,__LINUX_BOUNDS_H__)
 
 #####
-# 2) Generate asm-offsets.h
+# 2) Generate timeconst.h
+
+timeconst-file := include/generated/timeconst.h
+
+#always  += $(timeconst-file)
+targets += $(timeconst-file)
+
+quiet_cmd_gentimeconst = GEN     $@
+define cmd_gentimeconst
+	(echo $(CONFIG_HZ) | bc -q $< ) > $@
+endef
+define filechk_gentimeconst
+	(echo $(CONFIG_HZ) | bc -q $< )
+endef
+
+$(obj)/$(timeconst-file): kernel/time/timeconst.bc FORCE
+	$(call filechk,gentimeconst)
+
+#####
+# 3) Generate asm-offsets.h
 #
 
 offsets-file := include/generated/asm-offsets.h
@@ -57,7 +77,7 @@ targets += arch/$(SRCARCH)/kernel/asm-offsets.s
 
 # We use internal kbuild rules to avoid the "is up to date" message from make
 arch/$(SRCARCH)/kernel/asm-offsets.s: arch/$(SRCARCH)/kernel/asm-offsets.c \
-                                      $(obj)/$(bounds-file) FORCE
+                                      $(obj)/$(timeconst-file) $(obj)/$(bounds-file) FORCE
 	$(Q)mkdir -p $(dir $@)
 	$(call if_changed_dep,cc_s_c)
 
@@ -65,7 +85,7 @@ $(obj)/$(offsets-file): arch/$(SRCARCH)/kernel/asm-offsets.s FORCE
 	$(call filechk,offsets,__ASM_OFFSETS_H__)
 
 #####
-# 3) Check for missing system calls
+# 4) Check for missing system calls
 #
 
 always += missing-syscalls
@@ -77,5 +97,5 @@ quiet_cmd_syscalls = CALL    $<
 missing-syscalls: scripts/checksyscalls.sh $(offsets-file) FORCE
 	$(call cmd,syscalls)
 
-# Keep these two files during make clean
-no-clean-files := $(bounds-file) $(offsets-file)
+# Keep these three files during make clean
+no-clean-files := $(bounds-file) $(offsets-file) $(timeconst-file)
diff --git a/MAINTAINERS b/MAINTAINERS
index 0ebb10ea50fd71..fb76df0f81b8e4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7758,7 +7758,6 @@ F:	kernel/delayacct.c
 
 PERFORMANCE EVENTS SUBSYSTEM
 M:	Peter Zijlstra <a.p.zijlstra@chello.nl>
-M:	Paul Mackerras <paulus@samba.org>
 M:	Ingo Molnar <mingo@redhat.com>
 M:	Arnaldo Carvalho de Melo <acme@kernel.org>
 L:	linux-kernel@vger.kernel.org
diff --git a/Makefile b/Makefile
index 3c5a318ffb3b67..baa957d5de29d5 100644
--- a/Makefile
+++ b/Makefile
@@ -215,7 +215,6 @@ VPATH		:= $(srctree)$(if $(KBUILD_EXTMOD),:$(KBUILD_EXTMOD))
 
 export srctree objtree VPATH
 
-
 # SUBARCH tells the usermode build what the underlying arch is.  That is set
 # first, and if a usermode build is happening, the "ARCH=um" on the command
 # line overrides the setting of ARCH below.  If a native build is happening,
@@ -1512,11 +1511,11 @@ image_name:
 # Clear a bunch of variables before executing the submake
 tools/: FORCE
 	$(Q)mkdir -p $(objtree)/tools
-	$(Q)$(MAKE) LDFLAGS= MAKEFLAGS="$(filter --j% -j,$(MAKEFLAGS))" O=$(objtree) subdir=tools -C $(src)/tools/
+	$(Q)$(MAKE) LDFLAGS= MAKEFLAGS="$(filter --j% -j,$(MAKEFLAGS))" O=$(O) subdir=tools -C $(src)/tools/
 
 tools/%: FORCE
 	$(Q)mkdir -p $(objtree)/tools
-	$(Q)$(MAKE) LDFLAGS= MAKEFLAGS="$(filter --j% -j,$(MAKEFLAGS))" O=$(objtree) subdir=tools -C $(src)/tools/ $*
+	$(Q)$(MAKE) LDFLAGS= MAKEFLAGS="$(filter --j% -j,$(MAKEFLAGS))" O=$(O) subdir=tools -C $(src)/tools/ $*
 
 # Single targets
 # ---------------------------------------------------------------------------
diff --git a/arch/alpha/include/asm/cmpxchg.h b/arch/alpha/include/asm/cmpxchg.h
index 429e8cd0d78e5a..e5117766529e87 100644
--- a/arch/alpha/include/asm/cmpxchg.h
+++ b/arch/alpha/include/asm/cmpxchg.h
@@ -66,6 +66,4 @@
 #undef __ASM__MB
 #undef ____cmpxchg
 
-#define __HAVE_ARCH_CMPXCHG 1
-
 #endif /* _ALPHA_CMPXCHG_H */
diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
index 9d0ac091a52a7d..4a905bd667e2ef 100644
--- a/arch/alpha/mm/fault.c
+++ b/arch/alpha/mm/fault.c
@@ -23,8 +23,7 @@
 #include <linux/smp.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
-
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 extern void die_if_kernel(char *,struct pt_regs *,long, unsigned long *);
 
@@ -107,7 +106,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
 
 	/* If we're in an interrupt context, or have no user context,
 	   we must not take the fault.  */
-	if (!mm || in_atomic())
+	if (!mm || faulthandler_disabled())
 		goto no_context;
 
 #ifdef CONFIG_ALPHA_LARGE_VMALLOC
diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h
index 4dc64ddebecebe..05b5aaf5b0f91e 100644
--- a/arch/arc/include/asm/futex.h
+++ b/arch/arc/include/asm/futex.h
@@ -53,7 +53,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
 
-	pagefault_disable();	/* implies preempt_disable() */
+	pagefault_disable();
 
 	switch (op) {
 	case FUTEX_OP_SET:
@@ -75,7 +75,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 		ret = -ENOSYS;
 	}
 
-	pagefault_enable();	/* subsumes preempt_enable() */
+	pagefault_enable();
 
 	if (!ret) {
 		switch (cmp) {
@@ -104,7 +104,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	return ret;
 }
 
-/* Compare-xchg with preemption disabled.
+/* Compare-xchg with pagefaults disabled.
  *  Notes:
  *      -Best-Effort: Exchg happens only if compare succeeds.
  *          If compare fails, returns; leaving retry/looping to upper layers
@@ -121,7 +121,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval,
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
 
-	pagefault_disable();	/* implies preempt_disable() */
+	pagefault_disable();
 
 	/* TBD : can use llock/scond */
 	__asm__ __volatile__(
@@ -142,7 +142,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval,
 	: "r"(oldval), "r"(newval), "r"(uaddr), "ir"(-EFAULT)
 	: "cc", "memory");
 
-	pagefault_enable();	/* subsumes preempt_enable() */
+	pagefault_enable();
 
 	*uval = val;
 	return val;
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index 6a2e006cbcce1f..d948e4e9d89c4e 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -86,7 +86,7 @@ void do_page_fault(unsigned long address, struct pt_regs *regs)
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if (user_mode(regs))
diff --git a/arch/arm/include/asm/barrier.h b/arch/arm/include/asm/barrier.h
index d2f81e6b8c1cc5..6c2327e1c7323d 100644
--- a/arch/arm/include/asm/barrier.h
+++ b/arch/arm/include/asm/barrier.h
@@ -81,7 +81,7 @@ do {									\
 #define read_barrier_depends()		do { } while(0)
 #define smp_read_barrier_depends()	do { } while(0)
 
-#define set_mb(var, value)	do { var = value; smp_mb(); } while (0)
+#define smp_store_mb(var, value)	do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 
 #define smp_mb__before_atomic()	smp_mb()
 #define smp_mb__after_atomic()	smp_mb()
diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h
index 4e78065a16aa3c..5eed82809d82b7 100644
--- a/arch/arm/include/asm/futex.h
+++ b/arch/arm/include/asm/futex.h
@@ -93,6 +93,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
+	preempt_disable();
 	__asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n"
 	"1:	" TUSER(ldr) "	%1, [%4]\n"
 	"	teq	%1, %2\n"
@@ -104,6 +105,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 	: "cc", "memory");
 
 	*uval = val;
+	preempt_enable();
+
 	return ret;
 }
 
@@ -124,7 +127,10 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
-	pagefault_disable();	/* implies preempt_disable() */
+#ifndef CONFIG_SMP
+	preempt_disable();
+#endif
+	pagefault_disable();
 
 	switch (op) {
 	case FUTEX_OP_SET:
@@ -146,7 +152,10 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 		ret = -ENOSYS;
 	}
 
-	pagefault_enable();	/* subsumes preempt_enable() */
+	pagefault_enable();
+#ifndef CONFIG_SMP
+	preempt_enable();
+#endif
 
 	if (!ret) {
 		switch (cmp) {
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 2fe85fff5ccacd..370f7a732900ae 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -18,7 +18,7 @@ extern struct cputopo_arm cpu_topology[NR_CPUS];
 #define topology_physical_package_id(cpu)	(cpu_topology[cpu].socket_id)
 #define topology_core_id(cpu)		(cpu_topology[cpu].core_id)
 #define topology_core_cpumask(cpu)	(&cpu_topology[cpu].core_sibling)
-#define topology_thread_cpumask(cpu)	(&cpu_topology[cpu].thread_sibling)
+#define topology_sibling_cpumask(cpu)	(&cpu_topology[cpu].thread_sibling)
 
 void init_cpu_topology(void);
 void store_cpu_topology(unsigned int cpuid);
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 6333d9c178757f..0d629b8f973fc2 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -276,7 +276,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if (user_mode(regs))
diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
index b98895d9fe57cc..ee8dfa79398978 100644
--- a/arch/arm/mm/highmem.c
+++ b/arch/arm/mm/highmem.c
@@ -59,6 +59,7 @@ void *kmap_atomic(struct page *page)
 	void *kmap;
 	int type;
 
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -121,6 +122,7 @@ void __kunmap_atomic(void *kvaddr)
 		kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
 	}
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
 
@@ -130,6 +132,7 @@ void *kmap_atomic_pfn(unsigned long pfn)
 	int idx, type;
 	struct page *page = pfn_to_page(pfn);
 
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index 71f19c4dc0dee8..0fa47c4275cb56 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -114,7 +114,7 @@ do {									\
 #define read_barrier_depends()		do { } while(0)
 #define smp_read_barrier_depends()	do { } while(0)
 
-#define set_mb(var, value)	do { var = value; smp_mb(); } while (0)
+#define smp_store_mb(var, value)	do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 #define nop()		asm volatile("nop");
 
 #define smp_mb__before_atomic()	smp_mb()
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index 5f750dc96e0fd6..74069b3bd919c7 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -58,7 +58,7 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
-	pagefault_disable();	/* implies preempt_disable() */
+	pagefault_disable();
 
 	switch (op) {
 	case FUTEX_OP_SET:
@@ -85,7 +85,7 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 		ret = -ENOSYS;
 	}
 
-	pagefault_enable();	/* subsumes preempt_enable() */
+	pagefault_enable();
 
 	if (!ret) {
 		switch (cmp) {
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index 7ebcd31ce51cae..225ec3524fbfc6 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -18,7 +18,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
 #define topology_physical_package_id(cpu)	(cpu_topology[cpu].cluster_id)
 #define topology_core_id(cpu)		(cpu_topology[cpu].core_id)
 #define topology_core_cpumask(cpu)	(&cpu_topology[cpu].core_sibling)
-#define topology_thread_cpumask(cpu)	(&cpu_topology[cpu].thread_sibling)
+#define topology_sibling_cpumask(cpu)	(&cpu_topology[cpu].thread_sibling)
 
 void init_cpu_topology(void);
 void store_cpu_topology(unsigned int cpuid);
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 96da13167d4a5c..0948d327d01365 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -211,7 +211,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	 * If we're in an interrupt or have no user context, we must not take
 	 * the fault.
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if (user_mode(regs))
diff --git a/arch/avr32/include/asm/cmpxchg.h b/arch/avr32/include/asm/cmpxchg.h
index 962a6aeab78782..366bbeaeb405d6 100644
--- a/arch/avr32/include/asm/cmpxchg.h
+++ b/arch/avr32/include/asm/cmpxchg.h
@@ -70,8 +70,6 @@ extern unsigned long __cmpxchg_u64_unsupported_on_32bit_kernels(
    if something tries to do an invalid cmpxchg().  */
 extern void __cmpxchg_called_with_bad_pointer(void);
 
-#define __HAVE_ARCH_CMPXCHG 1
-
 static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
 				      unsigned long new, int size)
 {
diff --git a/arch/avr32/include/asm/uaccess.h b/arch/avr32/include/asm/uaccess.h
index a46f7cf3e1eab2..68cf638faf4867 100644
--- a/arch/avr32/include/asm/uaccess.h
+++ b/arch/avr32/include/asm/uaccess.h
@@ -97,7 +97,8 @@ static inline __kernel_size_t __copy_from_user(void *to,
  * @x:   Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
@@ -116,7 +117,8 @@ static inline __kernel_size_t __copy_from_user(void *to,
  * @x:   Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
@@ -136,7 +138,8 @@ static inline __kernel_size_t __copy_from_user(void *to,
  * @x:   Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
@@ -158,7 +161,8 @@ static inline __kernel_size_t __copy_from_user(void *to,
  * @x:   Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c
index d223a8b57c1eaa..c03533937a9f0a 100644
--- a/arch/avr32/mm/fault.c
+++ b/arch/avr32/mm/fault.c
@@ -14,11 +14,11 @@
 #include <linux/pagemap.h>
 #include <linux/kdebug.h>
 #include <linux/kprobes.h>
+#include <linux/uaccess.h>
 
 #include <asm/mmu_context.h>
 #include <asm/sysreg.h>
 #include <asm/tlb.h>
-#include <asm/uaccess.h>
 
 #ifdef CONFIG_KPROBES
 static inline int notify_page_fault(struct pt_regs *regs, int trap)
@@ -81,7 +81,7 @@ asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs)
 	 * If we're in an interrupt or have no user context, we must
 	 * not take the fault...
 	 */
-	if (in_atomic() || !mm || regs->sr & SYSREG_BIT(GM))
+	if (faulthandler_disabled() || !mm || regs->sr & SYSREG_BIT(GM))
 		goto no_context;
 
 	local_irq_enable();
diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c
index 83f12f2ed9e31b..3066d40a6db144 100644
--- a/arch/cris/mm/fault.c
+++ b/arch/cris/mm/fault.c
@@ -8,7 +8,7 @@
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/wait.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <arch/system.h>
 
 extern int find_fixup_code(struct pt_regs *);
@@ -109,11 +109,11 @@ do_page_fault(unsigned long address, struct pt_regs *regs,
 	info.si_code = SEGV_MAPERR;
 
 	/*
-	 * If we're in an interrupt or "atomic" operation or have no
+	 * If we're in an interrupt, have pagefaults disabled or have no
 	 * user context, we must not take the fault.
 	 */
 
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if (user_mode(regs))
diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c
index ec4917ddf67872..61d99767fe1691 100644
--- a/arch/frv/mm/fault.c
+++ b/arch/frv/mm/fault.c
@@ -19,9 +19,9 @@
 #include <linux/kernel.h>
 #include <linux/ptrace.h>
 #include <linux/hardirq.h>
+#include <linux/uaccess.h>
 
 #include <asm/pgtable.h>
-#include <asm/uaccess.h>
 #include <asm/gdb-stub.h>
 
 /*****************************************************************************/
@@ -78,7 +78,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if (user_mode(__frame))
diff --git a/arch/frv/mm/highmem.c b/arch/frv/mm/highmem.c
index bed9a9bd3c10c8..785344bbdc07c3 100644
--- a/arch/frv/mm/highmem.c
+++ b/arch/frv/mm/highmem.c
@@ -42,6 +42,7 @@ void *kmap_atomic(struct page *page)
 	unsigned long paddr;
 	int type;
 
+	preempt_disable();
 	pagefault_disable();
 	type = kmap_atomic_idx_push();
 	paddr = page_to_phys(page);
@@ -85,5 +86,6 @@ void __kunmap_atomic(void *kvaddr)
 	}
 	kmap_atomic_idx_pop();
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/hexagon/include/asm/cmpxchg.h b/arch/hexagon/include/asm/cmpxchg.h
index 9e7802911a57f2..a6e34e2acbbaf6 100644
--- a/arch/hexagon/include/asm/cmpxchg.h
+++ b/arch/hexagon/include/asm/cmpxchg.h
@@ -64,7 +64,6 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
  *  looks just like atomic_cmpxchg on our arch currently with a bunch of
  *  variable casting.
  */
-#define __HAVE_ARCH_CMPXCHG 1
 
 #define cmpxchg(ptr, old, new)					\
 ({								\
diff --git a/arch/hexagon/include/asm/uaccess.h b/arch/hexagon/include/asm/uaccess.h
index e4127e4d6a5bbd..f000a382bc7f62 100644
--- a/arch/hexagon/include/asm/uaccess.h
+++ b/arch/hexagon/include/asm/uaccess.h
@@ -36,7 +36,8 @@
  * @addr: User space pointer to start of block to check
  * @size: Size of block to check
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Checks if a pointer to a block of memory in user space is valid.
  *
diff --git a/arch/ia64/include/asm/barrier.h b/arch/ia64/include/asm/barrier.h
index f6769eb2bbf9b5..843ba435e43bc2 100644
--- a/arch/ia64/include/asm/barrier.h
+++ b/arch/ia64/include/asm/barrier.h
@@ -77,12 +77,7 @@ do {									\
 	___p1;								\
 })
 
-/*
- * XXX check on this ---I suspect what Linus really wants here is
- * acquire vs release semantics but we can't discuss this stuff with
- * Linus just yet.  Grrr...
- */
-#define set_mb(var, value)	do { (var) = (value); mb(); } while (0)
+#define smp_store_mb(var, value)	do { WRITE_ONCE(var, value); mb(); } while (0)
 
 /*
  * The group barrier in front of the rsm & ssm are necessary to ensure
diff --git a/arch/ia64/include/asm/irq_remapping.h b/arch/ia64/include/asm/irq_remapping.h
index e3b3556e2e1bb8..a8687b1d890691 100644
--- a/arch/ia64/include/asm/irq_remapping.h
+++ b/arch/ia64/include/asm/irq_remapping.h
@@ -1,6 +1,4 @@
 #ifndef __IA64_INTR_REMAPPING_H
 #define __IA64_INTR_REMAPPING_H
 #define irq_remapping_enabled 0
-#define dmar_alloc_hwirq	create_irq
-#define dmar_free_hwirq		destroy_irq
 #endif
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 6437ca21f61b49..3ad8f698836346 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -53,7 +53,7 @@ void build_cpu_to_node_map(void);
 #define topology_physical_package_id(cpu)	(cpu_data(cpu)->socket_id)
 #define topology_core_id(cpu)			(cpu_data(cpu)->core_id)
 #define topology_core_cpumask(cpu)		(&cpu_core_map[cpu])
-#define topology_thread_cpumask(cpu)		(&per_cpu(cpu_sibling_map, cpu))
+#define topology_sibling_cpumask(cpu)		(&per_cpu(cpu_sibling_map, cpu))
 #endif
 
 extern void arch_fix_phys_package_id(int num, u32 slot);
diff --git a/arch/ia64/include/uapi/asm/cmpxchg.h b/arch/ia64/include/uapi/asm/cmpxchg.h
index f35109b1d9076c..a0e3620f8f133f 100644
--- a/arch/ia64/include/uapi/asm/cmpxchg.h
+++ b/arch/ia64/include/uapi/asm/cmpxchg.h
@@ -61,8 +61,6 @@ extern void ia64_xchg_called_with_bad_pointer(void);
  * indicated by comparing RETURN with OLD.
  */
 
-#define __HAVE_ARCH_CMPXCHG 1
-
 /*
  * This function doesn't exist, so you'll get a linker error
  * if something tries to do an invalid cmpxchg().
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
index 9dd7464f8c1742..d70bf15c690a53 100644
--- a/arch/ia64/kernel/msi_ia64.c
+++ b/arch/ia64/kernel/msi_ia64.c
@@ -165,7 +165,7 @@ static struct irq_chip dmar_msi_type = {
 	.irq_retrigger = ia64_msi_retrigger_irq,
 };
 
-static int
+static void
 msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
 {
 	struct irq_cfg *cfg = irq_cfg + irq;
@@ -186,21 +186,29 @@ msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
 		MSI_DATA_LEVEL_ASSERT |
 		MSI_DATA_DELIVERY_FIXED |
 		MSI_DATA_VECTOR(cfg->vector);
-	return 0;
 }
 
-int arch_setup_dmar_msi(unsigned int irq)
+int dmar_alloc_hwirq(int id, int node, void *arg)
 {
-	int ret;
+	int irq;
 	struct msi_msg msg;
 
-	ret = msi_compose_msg(NULL, irq, &msg);
-	if (ret < 0)
-		return ret;
-	dmar_msi_write(irq, &msg);
-	irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
-				      "edge");
-	return 0;
+	irq = create_irq();
+	if (irq > 0) {
+		irq_set_handler_data(irq, arg);
+		irq_set_chip_and_handler_name(irq, &dmar_msi_type,
+					      handle_edge_irq, "edge");
+		msi_compose_msg(NULL, irq, &msg);
+		dmar_msi_write(irq, &msg);
+	}
+
+	return irq;
+}
+
+void dmar_free_hwirq(int irq)
+{
+	irq_set_handler_data(irq, NULL);
+	destroy_irq(irq);
 }
 #endif /* CONFIG_INTEL_IOMMU */
 
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index ba5ba7accd0d6b..70b40d1205a6b9 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -11,10 +11,10 @@
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
 #include <linux/prefetch.h>
+#include <linux/uaccess.h>
 
 #include <asm/pgtable.h>
 #include <asm/processor.h>
-#include <asm/uaccess.h>
 
 extern int die(char *, struct pt_regs *, long);
 
@@ -96,7 +96,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
 	/*
 	 * If we're in an interrupt or have no user context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 #ifdef CONFIG_VIRTUAL_MEM_MAP
diff --git a/arch/m32r/include/asm/cmpxchg.h b/arch/m32r/include/asm/cmpxchg.h
index de651db20b4393..14bf9b739dd2c8 100644
--- a/arch/m32r/include/asm/cmpxchg.h
+++ b/arch/m32r/include/asm/cmpxchg.h
@@ -107,8 +107,6 @@ __xchg_local(unsigned long x, volatile void *ptr, int size)
 	((__typeof__(*(ptr)))__xchg_local((unsigned long)(x), (ptr),	\
 			sizeof(*(ptr))))
 
-#define __HAVE_ARCH_CMPXCHG	1
-
 static inline unsigned long
 __cmpxchg_u32(volatile unsigned int *p, unsigned int old, unsigned int new)
 {
diff --git a/arch/m32r/include/asm/uaccess.h b/arch/m32r/include/asm/uaccess.h
index 71adff209405e1..cac7014daef3aa 100644
--- a/arch/m32r/include/asm/uaccess.h
+++ b/arch/m32r/include/asm/uaccess.h
@@ -91,7 +91,8 @@ static inline void set_fs(mm_segment_t s)
  * @addr: User space pointer to start of block to check
  * @size: Size of block to check
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Checks if a pointer to a block of memory in user space is valid.
  *
@@ -155,7 +156,8 @@ extern int fixup_exception(struct pt_regs *regs);
  * @x:   Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
@@ -175,7 +177,8 @@ extern int fixup_exception(struct pt_regs *regs);
  * @x:   Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
@@ -194,7 +197,8 @@ extern int fixup_exception(struct pt_regs *regs);
  * @x:   Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
@@ -274,7 +278,8 @@ do {									\
  * @x:   Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
@@ -568,7 +573,8 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon
  * @from: Source address, in kernel space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from kernel space to user space.  Caller must check
  * the specified block with access_ok() before calling this function.
@@ -588,7 +594,8 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon
  * @from: Source address, in kernel space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from kernel space to user space.
  *
@@ -606,7 +613,8 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon
  * @from: Source address, in user space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from user space to kernel space.  Caller must check
  * the specified block with access_ok() before calling this function.
@@ -626,7 +634,8 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon
  * @from: Source address, in user space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from user space to kernel space.
  *
@@ -677,7 +686,8 @@ unsigned long clear_user(void __user *mem, unsigned long len);
  * strlen_user: - Get the size of a string in user space.
  * @str: The string to measure.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Get the size of a NUL-terminated string in user space.
  *
diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c
index e3d4d4890104cc..8f9875b7933d55 100644
--- a/arch/m32r/mm/fault.c
+++ b/arch/m32r/mm/fault.c
@@ -24,9 +24,9 @@
 #include <linux/vt_kern.h>		/* For unblank_screen() */
 #include <linux/highmem.h>
 #include <linux/module.h>
+#include <linux/uaccess.h>
 
 #include <asm/m32r.h>
-#include <asm/uaccess.h>
 #include <asm/hardirq.h>
 #include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
@@ -111,10 +111,10 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	mm = tsk->mm;
 
 	/*
-	 * If we're in an interrupt or have no user context or are running in an
-	 * atomic region then we must not take the fault..
+	 * If we're in an interrupt or have no user context or have pagefaults
+	 * disabled then we must not take the fault.
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto bad_area_nosemaphore;
 
 	if (error_code & ACE_USERMODE)
diff --git a/arch/m68k/include/asm/cmpxchg.h b/arch/m68k/include/asm/cmpxchg.h
index bc755bc620ad09..83b1df80f0ac0e 100644
--- a/arch/m68k/include/asm/cmpxchg.h
+++ b/arch/m68k/include/asm/cmpxchg.h
@@ -90,7 +90,6 @@ extern unsigned long __invalid_cmpxchg_size(volatile void *,
  * indicated by comparing RETURN with OLD.
  */
 #ifdef CONFIG_RMW_INSNS
-#define __HAVE_ARCH_CMPXCHG	1
 
 static inline unsigned long __cmpxchg(volatile void *p, unsigned long old,
 				      unsigned long new, int size)
diff --git a/arch/m68k/include/asm/irqflags.h b/arch/m68k/include/asm/irqflags.h
index a823cd73dc09e3..b5941818346f65 100644
--- a/arch/m68k/include/asm/irqflags.h
+++ b/arch/m68k/include/asm/irqflags.h
@@ -2,9 +2,6 @@
 #define _M68K_IRQFLAGS_H
 
 #include <linux/types.h>
-#ifdef CONFIG_MMU
-#include <linux/preempt_mask.h>
-#endif
 #include <linux/preempt.h>
 #include <asm/thread_info.h>
 #include <asm/entry.h>
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index b2f04aee46ecc2..6a94cdd0c8308c 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -10,10 +10,10 @@
 #include <linux/ptrace.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
+#include <linux/uaccess.h>
 
 #include <asm/setup.h>
 #include <asm/traps.h>
-#include <asm/uaccess.h>
 #include <asm/pgalloc.h>
 
 extern void die_if_kernel(char *, struct pt_regs *, long);
@@ -81,7 +81,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if (user_mode(regs))
diff --git a/arch/metag/include/asm/barrier.h b/arch/metag/include/asm/barrier.h
index d703d8e26a656c..5a696e50793034 100644
--- a/arch/metag/include/asm/barrier.h
+++ b/arch/metag/include/asm/barrier.h
@@ -84,7 +84,7 @@ static inline void fence(void)
 #define read_barrier_depends()		do { } while (0)
 #define smp_read_barrier_depends()	do { } while (0)
 
-#define set_mb(var, value) do { var = value; smp_mb(); } while (0)
+#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 
 #define smp_store_release(p, v)						\
 do {									\
diff --git a/arch/metag/include/asm/cmpxchg.h b/arch/metag/include/asm/cmpxchg.h
index b1bc1be8540f58..be29e3e44321a1 100644
--- a/arch/metag/include/asm/cmpxchg.h
+++ b/arch/metag/include/asm/cmpxchg.h
@@ -51,8 +51,6 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
 	return old;
 }
 
-#define __HAVE_ARCH_CMPXCHG 1
-
 #define cmpxchg(ptr, o, n)						\
 	({								\
 		__typeof__(*(ptr)) _o_ = (o);				\
diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c
index 2de5dc695a87fa..f57edca63609bf 100644
--- a/arch/metag/mm/fault.c
+++ b/arch/metag/mm/fault.c
@@ -105,7 +105,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
 
 	mm = tsk->mm;
 
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if (user_mode(regs))
diff --git a/arch/metag/mm/highmem.c b/arch/metag/mm/highmem.c
index d71f621a2c0b92..807f1b1c4e6567 100644
--- a/arch/metag/mm/highmem.c
+++ b/arch/metag/mm/highmem.c
@@ -43,7 +43,7 @@ void *kmap_atomic(struct page *page)
 	unsigned long vaddr;
 	int type;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -82,6 +82,7 @@ void __kunmap_atomic(void *kvaddr)
 	}
 
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
 
@@ -95,6 +96,7 @@ void *kmap_atomic_pfn(unsigned long pfn)
 	unsigned long vaddr;
 	int type;
 
+	preempt_disable();
 	pagefault_disable();
 
 	type = kmap_atomic_idx_push();
diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h
index 62942fd1267286..331b0d35f89ce3 100644
--- a/arch/microblaze/include/asm/uaccess.h
+++ b/arch/microblaze/include/asm/uaccess.h
@@ -178,7 +178,8 @@ extern long __user_bad(void);
  * @x:   Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
@@ -290,7 +291,8 @@ extern long __user_bad(void);
  * @x:   Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c
index d46a5ebb7570e0..177dfc00364362 100644
--- a/arch/microblaze/mm/fault.c
+++ b/arch/microblaze/mm/fault.c
@@ -107,14 +107,14 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
 	if ((error_code & 0x13) == 0x13 || (error_code & 0x11) == 0x11)
 		is_write = 0;
 
-	if (unlikely(in_atomic() || !mm)) {
+	if (unlikely(faulthandler_disabled() || !mm)) {
 		if (kernel_mode(regs))
 			goto bad_area_nosemaphore;
 
-		/* in_atomic() in user mode is really bad,
+		/* faulthandler_disabled() in user mode is really bad,
 		   as is current->mm == NULL. */
-		pr_emerg("Page fault in user mode with in_atomic(), mm = %p\n",
-									mm);
+		pr_emerg("Page fault in user mode with faulthandler_disabled(), mm = %p\n",
+			 mm);
 		pr_emerg("r15 = %lx  MSR = %lx\n",
 		       regs->r15, regs->msr);
 		die("Weird page fault", regs, SIGSEGV);
diff --git a/arch/microblaze/mm/highmem.c b/arch/microblaze/mm/highmem.c
index 5a92576fad9271..2fcc5a52d84d1c 100644
--- a/arch/microblaze/mm/highmem.c
+++ b/arch/microblaze/mm/highmem.c
@@ -37,7 +37,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
 	unsigned long vaddr;
 	int idx, type;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -63,6 +63,7 @@ void __kunmap_atomic(void *kvaddr)
 
 	if (vaddr < __fix_to_virt(FIX_KMAP_END)) {
 		pagefault_enable();
+		preempt_enable();
 		return;
 	}
 
@@ -84,5 +85,6 @@ void __kunmap_atomic(void *kvaddr)
 #endif
 	kmap_atomic_idx_pop();
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/mips/include/asm/barrier.h b/arch/mips/include/asm/barrier.h
index 2b8bbbcb9be0e9..7ecba84656d495 100644
--- a/arch/mips/include/asm/barrier.h
+++ b/arch/mips/include/asm/barrier.h
@@ -112,8 +112,8 @@
 #define __WEAK_LLSC_MB		"		\n"
 #endif
 
-#define set_mb(var, value) \
-	do { var = value; smp_mb(); } while (0)
+#define smp_store_mb(var, value) \
+	do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 
 #define smp_llsc_mb()	__asm__ __volatile__(__WEAK_LLSC_MB : : :"memory")
 
diff --git a/arch/mips/include/asm/cmpxchg.h b/arch/mips/include/asm/cmpxchg.h
index 412f945f1f5ec2..b71ab4a5fd508e 100644
--- a/arch/mips/include/asm/cmpxchg.h
+++ b/arch/mips/include/asm/cmpxchg.h
@@ -138,8 +138,6 @@ static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int siz
 		__xchg((unsigned long)(x), (ptr), sizeof(*(ptr))));	\
 })
 
-#define __HAVE_ARCH_CMPXCHG 1
-
 #define __cmpxchg_asm(ld, st, m, old, new)				\
 ({									\
 	__typeof(*(m)) __ret;						\
diff --git a/arch/mips/include/asm/topology.h b/arch/mips/include/asm/topology.h
index 3e307ec2afbae2..7afda4150a59d9 100644
--- a/arch/mips/include/asm/topology.h
+++ b/arch/mips/include/asm/topology.h
@@ -15,7 +15,7 @@
 #define topology_physical_package_id(cpu)	(cpu_data[cpu].package)
 #define topology_core_id(cpu)			(cpu_data[cpu].core)
 #define topology_core_cpumask(cpu)		(&cpu_core_map[cpu])
-#define topology_thread_cpumask(cpu)		(&cpu_sibling_map[cpu])
+#define topology_sibling_cpumask(cpu)		(&cpu_sibling_map[cpu])
 #endif
 
 #endif /* __ASM_TOPOLOGY_H */
diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h
index bf8b32450ef6b4..9722357d285471 100644
--- a/arch/mips/include/asm/uaccess.h
+++ b/arch/mips/include/asm/uaccess.h
@@ -103,7 +103,8 @@ extern u64 __ua_limit;
  * @addr: User space pointer to start of block to check
  * @size: Size of block to check
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Checks if a pointer to a block of memory in user space is valid.
  *
@@ -138,7 +139,8 @@ extern u64 __ua_limit;
  * @x:	 Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
@@ -157,7 +159,8 @@ extern u64 __ua_limit;
  * @x:	 Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
@@ -177,7 +180,8 @@ extern u64 __ua_limit;
  * @x:	 Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
@@ -199,7 +203,8 @@ extern u64 __ua_limit;
  * @x:	 Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
@@ -498,7 +503,8 @@ extern void __put_user_unknown(void);
  * @x:	 Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
@@ -517,7 +523,8 @@ extern void __put_user_unknown(void);
  * @x:	 Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
@@ -537,7 +544,8 @@ extern void __put_user_unknown(void);
  * @x:	 Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
@@ -559,7 +567,8 @@ extern void __put_user_unknown(void);
  * @x:	 Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
@@ -815,7 +824,8 @@ extern size_t __copy_user(void *__to, const void *__from, size_t __n);
  * @from: Source address, in kernel space.
  * @n:	  Number of bytes to copy.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from kernel space to user space.  Caller must check
  * the specified block with access_ok() before calling this function.
@@ -888,7 +898,8 @@ extern size_t __copy_user_inatomic(void *__to, const void *__from, size_t __n);
  * @from: Source address, in kernel space.
  * @n:	  Number of bytes to copy.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from kernel space to user space.
  *
@@ -1075,7 +1086,8 @@ extern size_t __copy_in_user_eva(void *__to, const void *__from, size_t __n);
  * @from: Source address, in user space.
  * @n:	  Number of bytes to copy.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from user space to kernel space.  Caller must check
  * the specified block with access_ok() before calling this function.
@@ -1107,7 +1119,8 @@ extern size_t __copy_in_user_eva(void *__to, const void *__from, size_t __n);
  * @from: Source address, in user space.
  * @n:	  Number of bytes to copy.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from user space to kernel space.
  *
@@ -1329,7 +1342,8 @@ strncpy_from_user(char *__to, const char __user *__from, long __len)
  * strlen_user: - Get the size of a string in user space.
  * @str: The string to measure.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Get the size of a NUL-terminated string in user space.
  *
@@ -1398,7 +1412,8 @@ static inline long __strnlen_user(const char __user *s, long n)
  * strnlen_user: - Get the size of a string in user space.
  * @str: The string to measure.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Get the size of a NUL-terminated string in user space.
  *
diff --git a/arch/mips/kernel/signal-common.h b/arch/mips/kernel/signal-common.h
index 06805e09bcd357..0b85f827cd1836 100644
--- a/arch/mips/kernel/signal-common.h
+++ b/arch/mips/kernel/signal-common.h
@@ -28,12 +28,7 @@ extern void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
 extern int fpcsr_pending(unsigned int __user *fpcsr);
 
 /* Make sure we will not lose FPU ownership */
-#ifdef CONFIG_PREEMPT
-#define lock_fpu_owner()	preempt_disable()
-#define unlock_fpu_owner()	preempt_enable()
-#else
-#define lock_fpu_owner()	pagefault_disable()
-#define unlock_fpu_owner()	pagefault_enable()
-#endif
+#define lock_fpu_owner()	({ preempt_disable(); pagefault_disable(); })
+#define unlock_fpu_owner()	({ pagefault_enable(); preempt_enable(); })
 
 #endif	/* __SIGNAL_COMMON_H */
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index 7ff8637e530d79..36c0f26fac6b07 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -21,10 +21,10 @@
 #include <linux/module.h>
 #include <linux/kprobes.h>
 #include <linux/perf_event.h>
+#include <linux/uaccess.h>
 
 #include <asm/branch.h>
 #include <asm/mmu_context.h>
-#include <asm/uaccess.h>
 #include <asm/ptrace.h>
 #include <asm/highmem.h>		/* For VMALLOC_END */
 #include <linux/kdebug.h>
@@ -94,7 +94,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto bad_area_nosemaphore;
 
 	if (user_mode(regs))
diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c
index da815d295239ba..11661cbc11a819 100644
--- a/arch/mips/mm/highmem.c
+++ b/arch/mips/mm/highmem.c
@@ -47,7 +47,7 @@ void *kmap_atomic(struct page *page)
 	unsigned long vaddr;
 	int idx, type;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -72,6 +72,7 @@ void __kunmap_atomic(void *kvaddr)
 
 	if (vaddr < FIXADDR_START) { // FIXME
 		pagefault_enable();
+		preempt_enable();
 		return;
 	}
 
@@ -92,6 +93,7 @@ void __kunmap_atomic(void *kvaddr)
 #endif
 	kmap_atomic_idx_pop();
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
 
@@ -104,6 +106,7 @@ void *kmap_atomic_pfn(unsigned long pfn)
 	unsigned long vaddr;
 	int idx, type;
 
+	preempt_disable();
 	pagefault_disable();
 
 	type = kmap_atomic_idx_push();
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index faa5c9822eccf4..198a3147dd7d08 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -90,6 +90,7 @@ static void *__kmap_pgprot(struct page *page, unsigned long addr, pgprot_t prot)
 
 	BUG_ON(Page_dcache_dirty(page));
 
+	preempt_disable();
 	pagefault_disable();
 	idx = (addr >> PAGE_SHIFT) & (FIX_N_COLOURS - 1);
 	idx += in_interrupt() ? FIX_N_COLOURS : 0;
@@ -152,6 +153,7 @@ void kunmap_coherent(void)
 	write_c0_entryhi(old_ctx);
 	local_irq_restore(flags);
 	pagefault_enable();
+	preempt_enable();
 }
 
 void copy_user_highpage(struct page *to, struct page *from,
diff --git a/arch/mn10300/include/asm/highmem.h b/arch/mn10300/include/asm/highmem.h
index 2fbbe4d920aa2e..1ddea5afba0934 100644
--- a/arch/mn10300/include/asm/highmem.h
+++ b/arch/mn10300/include/asm/highmem.h
@@ -75,6 +75,7 @@ static inline void *kmap_atomic(struct page *page)
 	unsigned long vaddr;
 	int idx, type;
 
+	preempt_disable();
 	pagefault_disable();
 	if (page < highmem_start_page)
 		return page_address(page);
@@ -98,6 +99,7 @@ static inline void __kunmap_atomic(unsigned long vaddr)
 
 	if (vaddr < FIXADDR_START) { /* FIXME */
 		pagefault_enable();
+		preempt_enable();
 		return;
 	}
 
@@ -122,6 +124,7 @@ static inline void __kunmap_atomic(unsigned long vaddr)
 
 	kmap_atomic_idx_pop();
 	pagefault_enable();
+	preempt_enable();
 }
 #endif /* __KERNEL__ */
 
diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c
index 0c2cc5d39c8e37..4a1d181ed32f76 100644
--- a/arch/mn10300/mm/fault.c
+++ b/arch/mn10300/mm/fault.c
@@ -23,8 +23,8 @@
 #include <linux/interrupt.h>
 #include <linux/init.h>
 #include <linux/vt_kern.h>		/* For unblank_screen() */
+#include <linux/uaccess.h>
 
-#include <asm/uaccess.h>
 #include <asm/pgalloc.h>
 #include <asm/hardirq.h>
 #include <asm/cpu-regs.h>
@@ -168,7 +168,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long fault_code,
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR)
diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c
index 0c9b6afe69e909..b51878b0c6b873 100644
--- a/arch/nios2/mm/fault.c
+++ b/arch/nios2/mm/fault.c
@@ -77,7 +77,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause,
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto bad_area_nosemaphore;
 
 	if (user_mode(regs))
diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h
index de65f66ea64e75..ec2df4bab3022d 100644
--- a/arch/parisc/include/asm/cacheflush.h
+++ b/arch/parisc/include/asm/cacheflush.h
@@ -142,6 +142,7 @@ static inline void kunmap(struct page *page)
 
 static inline void *kmap_atomic(struct page *page)
 {
+	preempt_disable();
 	pagefault_disable();
 	return page_address(page);
 }
@@ -150,6 +151,7 @@ static inline void __kunmap_atomic(void *addr)
 {
 	flush_kernel_dcache_page_addr(addr);
 	pagefault_enable();
+	preempt_enable();
 }
 
 #define kmap_atomic_prot(page, prot)	kmap_atomic(page)
diff --git a/arch/parisc/include/asm/cmpxchg.h b/arch/parisc/include/asm/cmpxchg.h
index dbd13354ec414b..0a90b965cccbef 100644
--- a/arch/parisc/include/asm/cmpxchg.h
+++ b/arch/parisc/include/asm/cmpxchg.h
@@ -46,8 +46,6 @@ __xchg(unsigned long x, __volatile__ void *ptr, int size)
 #define xchg(ptr, x) \
 	((__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr), sizeof(*(ptr))))
 
-#define __HAVE_ARCH_CMPXCHG	1
-
 /* bug catcher for when unsupported size is used - won't link */
 extern void __cmpxchg_called_with_bad_pointer(void);
 
diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c
index 47ee620d15d278..6548fd1d2e62de 100644
--- a/arch/parisc/kernel/traps.c
+++ b/arch/parisc/kernel/traps.c
@@ -26,9 +26,9 @@
 #include <linux/console.h>
 #include <linux/bug.h>
 #include <linux/ratelimit.h>
+#include <linux/uaccess.h>
 
 #include <asm/assembly.h>
-#include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <asm/traps.h>
@@ -800,7 +800,7 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
 	     * unless pagefault_disable() was called before.
 	     */
 
-	    if (fault_space == 0 && !in_atomic())
+	    if (fault_space == 0 && !faulthandler_disabled())
 	    {
 		pdc_chassis_send_status(PDC_CHASSIS_DIRECT_PANIC);
 		parisc_terminate("Kernel Fault", regs, code, fault_address);
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
index e5120e653240c4..15503adddf4f59 100644
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -15,8 +15,8 @@
 #include <linux/sched.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
+#include <linux/uaccess.h>
 
-#include <asm/uaccess.h>
 #include <asm/traps.h>
 
 /* Various important other fields */
@@ -207,7 +207,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code,
 	int fault;
 	unsigned int flags;
 
-	if (in_atomic())
+	if (pagefault_disabled())
 		goto no_context;
 
 	tsk = current;
diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h
index a3bf5be111ff1d..51ccc7232042e9 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -34,7 +34,7 @@
 #define rmb()  __asm__ __volatile__ ("sync" : : : "memory")
 #define wmb()  __asm__ __volatile__ ("sync" : : : "memory")
 
-#define set_mb(var, value)	do { var = value; mb(); } while (0)
+#define smp_store_mb(var, value)	do { WRITE_ONCE(var, value); mb(); } while (0)
 
 #ifdef __SUBARCH_HAS_LWSYNC
 #    define SMPWMB      LWSYNC
@@ -89,5 +89,6 @@ do {									\
 
 #define smp_mb__before_atomic()     smp_mb()
 #define smp_mb__after_atomic()      smp_mb()
+#define smp_mb__before_spinlock()   smp_mb()
 
 #endif /* _ASM_POWERPC_BARRIER_H */
diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index d463c68fe7f05f..ad6263cffb0fd0 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -144,7 +144,6 @@ __xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
  * Compare and exchange - if *p == old, set it to new,
  * and return the old value of *p.
  */
-#define __HAVE_ARCH_CMPXCHG	1
 
 static __always_inline unsigned long
 __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new)
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 5f1048eaa5b604..8b3b46b7b0f279 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -87,7 +87,7 @@ static inline int prrn_is_enabled(void)
 #include <asm/smp.h>
 
 #define topology_physical_package_id(cpu)	(cpu_to_chip_id(cpu))
-#define topology_thread_cpumask(cpu)	(per_cpu(cpu_sibling_map, cpu))
+#define topology_sibling_cpumask(cpu)	(per_cpu(cpu_sibling_map, cpu))
 #define topology_core_cpumask(cpu)	(per_cpu(cpu_core_map, cpu))
 #define topology_core_id(cpu)		(cpu_to_core_id(cpu))
 #endif
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
index 3cf529ceec5bd6..ac93a3bd27300f 100644
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -27,11 +27,11 @@ int enter_vmx_usercopy(void)
 	if (in_interrupt())
 		return 0;
 
-	/* This acts as preempt_disable() as well and will make
-	 * enable_kernel_altivec(). We need to disable page faults
-	 * as they can call schedule and thus make us lose the VMX
-	 * context. So on page faults, we just fail which will cause
-	 * a fallback to the normal non-vmx copy.
+	preempt_disable();
+	/*
+	 * We need to disable page faults as they can call schedule and
+	 * thus make us lose the VMX context. So on page faults, we just
+	 * fail which will cause a fallback to the normal non-vmx copy.
 	 */
 	pagefault_disable();
 
@@ -47,6 +47,7 @@ int enter_vmx_usercopy(void)
 int exit_vmx_usercopy(void)
 {
 	pagefault_enable();
+	preempt_enable();
 	return 0;
 }
 
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index b396868d2aa7c4..6d535973b200dd 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -33,13 +33,13 @@
 #include <linux/ratelimit.h>
 #include <linux/context_tracking.h>
 #include <linux/hugetlb.h>
+#include <linux/uaccess.h>
 
 #include <asm/firmware.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
 #include <asm/mmu_context.h>
-#include <asm/uaccess.h>
 #include <asm/tlbflush.h>
 #include <asm/siginfo.h>
 #include <asm/debug.h>
@@ -272,15 +272,16 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 	if (!arch_irq_disabled_regs(regs))
 		local_irq_enable();
 
-	if (in_atomic() || mm == NULL) {
+	if (faulthandler_disabled() || mm == NULL) {
 		if (!user_mode(regs)) {
 			rc = SIGSEGV;
 			goto bail;
 		}
-		/* in_atomic() in user mode is really bad,
+		/* faulthandler_disabled() in user mode is really bad,
 		   as is current->mm == NULL. */
 		printk(KERN_EMERG "Page fault in user mode with "
-		       "in_atomic() = %d mm = %p\n", in_atomic(), mm);
+		       "faulthandler_disabled() = %d mm = %p\n",
+		       faulthandler_disabled(), mm);
 		printk(KERN_EMERG "NIP = %lx  MSR = %lx\n",
 		       regs->nip, regs->msr);
 		die("Weird page fault", regs, SIGSEGV);
diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c
index e7450bdbe83a93..e292c8a609523b 100644
--- a/arch/powerpc/mm/highmem.c
+++ b/arch/powerpc/mm/highmem.c
@@ -34,7 +34,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
 	unsigned long vaddr;
 	int idx, type;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -59,6 +59,7 @@ void __kunmap_atomic(void *kvaddr)
 
 	if (vaddr < __fix_to_virt(FIX_KMAP_END)) {
 		pagefault_enable();
+		preempt_enable();
 		return;
 	}
 
@@ -82,5 +83,6 @@ void __kunmap_atomic(void *kvaddr)
 
 	kmap_atomic_idx_pop();
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index cbd3d069897f61..723a099f6be31a 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -217,7 +217,7 @@ static DEFINE_RAW_SPINLOCK(tlbivax_lock);
 static int mm_is_core_local(struct mm_struct *mm)
 {
 	return cpumask_subset(mm_cpumask(mm),
-			      topology_thread_cpumask(smp_processor_id()));
+			      topology_sibling_cpumask(smp_processor_id()));
 }
 
 struct tlb_flush_param {
diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h
index 8d724718ec21c8..e6f8615a11eb5c 100644
--- a/arch/s390/include/asm/barrier.h
+++ b/arch/s390/include/asm/barrier.h
@@ -36,7 +36,7 @@
 #define smp_mb__before_atomic()		smp_mb()
 #define smp_mb__after_atomic()		smp_mb()
 
-#define set_mb(var, value)		do { var = value; mb(); } while (0)
+#define smp_store_mb(var, value)		do { WRITE_ONCE(var, value); mb(); } while (0)
 
 #define smp_store_release(p, v)						\
 do {									\
diff --git a/arch/s390/include/asm/cmpxchg.h b/arch/s390/include/asm/cmpxchg.h
index 4eadec466b8ca3..411464f4c97a57 100644
--- a/arch/s390/include/asm/cmpxchg.h
+++ b/arch/s390/include/asm/cmpxchg.h
@@ -32,8 +32,6 @@
 	__old;								\
 })
 
-#define __HAVE_ARCH_CMPXCHG
-
 #define __cmpxchg_double_op(p1, p2, o1, o2, n1, n2, insn)		\
 ({									\
 	register __typeof__(*(p1)) __old1 asm("2") = (o1);		\
diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h
index 98eb2a5792234d..dcb6312a0b9180 100644
--- a/arch/s390/include/asm/timex.h
+++ b/arch/s390/include/asm/timex.h
@@ -10,6 +10,7 @@
 #define _ASM_S390_TIMEX_H
 
 #include <asm/lowcore.h>
+#include <linux/time64.h>
 
 /* The value of the TOD clock for 1.1.1970. */
 #define TOD_UNIX_EPOCH 0x7d91048bca000000ULL
@@ -108,10 +109,10 @@ int get_sync_clock(unsigned long long *clock);
 void init_cpu_timer(void);
 unsigned long long monotonic_clock(void);
 
-void tod_to_timeval(__u64, struct timespec *);
+void tod_to_timeval(__u64 todval, struct timespec64 *xt);
 
 static inline
-void stck_to_timespec(unsigned long long stck, struct timespec *ts)
+void stck_to_timespec64(unsigned long long stck, struct timespec64 *ts)
 {
 	tod_to_timeval(stck - TOD_UNIX_EPOCH, ts);
 }
diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h
index b1453a2ae1ca58..4990f6c6628858 100644
--- a/arch/s390/include/asm/topology.h
+++ b/arch/s390/include/asm/topology.h
@@ -22,7 +22,8 @@ DECLARE_PER_CPU(struct cpu_topology_s390, cpu_topology);
 
 #define topology_physical_package_id(cpu) (per_cpu(cpu_topology, cpu).socket_id)
 #define topology_thread_id(cpu)		  (per_cpu(cpu_topology, cpu).thread_id)
-#define topology_thread_cpumask(cpu)	  (&per_cpu(cpu_topology, cpu).thread_mask)
+#define topology_sibling_cpumask(cpu) \
+		(&per_cpu(cpu_topology, cpu).thread_mask)
 #define topology_core_id(cpu)		  (per_cpu(cpu_topology, cpu).core_id)
 #define topology_core_cpumask(cpu)	  (&per_cpu(cpu_topology, cpu).core_mask)
 #define topology_book_id(cpu)		  (per_cpu(cpu_topology, cpu).book_id)
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index d64a7a62164f85..9dd4cc47ddc792 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -98,7 +98,8 @@ static inline unsigned long extable_fixup(const struct exception_table_entry *x)
  * @from: Source address, in user space.
  * @n:	  Number of bytes to copy.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from user space to kernel space.  Caller must check
  * the specified block with access_ok() before calling this function.
@@ -118,7 +119,8 @@ unsigned long __must_check __copy_from_user(void *to, const void __user *from,
  * @from: Source address, in kernel space.
  * @n:	  Number of bytes to copy.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from kernel space to user space.  Caller must check
  * the specified block with access_ok() before calling this function.
@@ -264,7 +266,8 @@ int __get_user_bad(void) __attribute__((noreturn));
  * @from: Source address, in kernel space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from kernel space to user space.
  *
@@ -290,7 +293,8 @@ __compiletime_warning("copy_from_user() buffer size is not provably correct")
  * @from: Source address, in user space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from user space to kernel space.
  *
@@ -348,7 +352,8 @@ static inline unsigned long strnlen_user(const char __user *src, unsigned long n
  * strlen_user: - Get the size of a string in user space.
  * @str: The string to measure.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Get the size of a NUL-terminated string in user space.
  *
diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c
index c1f21aca76e7fe..6fca0e46464e01 100644
--- a/arch/s390/kernel/debug.c
+++ b/arch/s390/kernel/debug.c
@@ -1457,23 +1457,24 @@ int
 debug_dflt_header_fn(debug_info_t * id, struct debug_view *view,
 			 int area, debug_entry_t * entry, char *out_buf)
 {
-	struct timespec time_spec;
+	struct timespec64 time_spec;
 	char *except_str;
 	unsigned long caller;
 	int rc = 0;
 	unsigned int level;
 
 	level = entry->id.fields.level;
-	stck_to_timespec(entry->id.stck, &time_spec);
+	stck_to_timespec64(entry->id.stck, &time_spec);
 
 	if (entry->id.fields.exception)
 		except_str = "*";
 	else
 		except_str = "-";
 	caller = ((unsigned long) entry->caller) & PSW_ADDR_INSN;
-	rc += sprintf(out_buf, "%02i %011lu:%06lu %1u %1s %02i %p  ",
-		      area, time_spec.tv_sec, time_spec.tv_nsec / 1000, level,
-		      except_str, entry->id.fields.cpuid, (void *) caller);
+	rc += sprintf(out_buf, "%02i %011lld:%06lu %1u %1s %02i %p  ",
+		      area, (long long)time_spec.tv_sec,
+		      time_spec.tv_nsec / 1000, level, except_str,
+		      entry->id.fields.cpuid, (void *)caller);
 	return rc;
 }
 EXPORT_SYMBOL(debug_dflt_header_fn);
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 170ddd2018b316..9e733d965e0888 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -76,7 +76,7 @@ unsigned long long monotonic_clock(void)
 }
 EXPORT_SYMBOL(monotonic_clock);
 
-void tod_to_timeval(__u64 todval, struct timespec *xt)
+void tod_to_timeval(__u64 todval, struct timespec64 *xt)
 {
 	unsigned long long sec;
 
@@ -181,12 +181,12 @@ static void timing_alert_interrupt(struct ext_code ext_code,
 static void etr_reset(void);
 static void stp_reset(void);
 
-void read_persistent_clock(struct timespec *ts)
+void read_persistent_clock64(struct timespec64 *ts)
 {
 	tod_to_timeval(get_tod_clock() - TOD_UNIX_EPOCH, ts);
 }
 
-void read_boot_clock(struct timespec *ts)
+void read_boot_clock64(struct timespec64 *ts)
 {
 	tod_to_timeval(sched_clock_base_cc - TOD_UNIX_EPOCH, ts);
 }
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 76515bcea2f18f..4c8f5d7f9c23d7 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -399,7 +399,7 @@ static inline int do_exception(struct pt_regs *regs, int access)
 	 * user context.
 	 */
 	fault = VM_FAULT_BADCONTEXT;
-	if (unlikely(!user_space_fault(regs) || in_atomic() || !mm))
+	if (unlikely(!user_space_fault(regs) || faulthandler_disabled() || !mm))
 		goto out;
 
 	address = trans_exc_code & __FAIL_ADDR_MASK;
diff --git a/arch/score/include/asm/cmpxchg.h b/arch/score/include/asm/cmpxchg.h
index f384839c3ee53e..cc3f6420b71c1d 100644
--- a/arch/score/include/asm/cmpxchg.h
+++ b/arch/score/include/asm/cmpxchg.h
@@ -42,8 +42,6 @@ static inline unsigned long __cmpxchg(volatile unsigned long *m,
 					(unsigned long)(o),	\
 					(unsigned long)(n)))
 
-#define __HAVE_ARCH_CMPXCHG	1
-
 #include <asm-generic/cmpxchg-local.h>
 
 #endif /* _ASM_SCORE_CMPXCHG_H */
diff --git a/arch/score/include/asm/uaccess.h b/arch/score/include/asm/uaccess.h
index ab66ddde777b5a..20a3591225ccea 100644
--- a/arch/score/include/asm/uaccess.h
+++ b/arch/score/include/asm/uaccess.h
@@ -36,7 +36,8 @@
  * @addr: User space pointer to start of block to check
  * @size: Size of block to check
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Checks if a pointer to a block of memory in user space is valid.
  *
@@ -61,7 +62,8 @@
  * @x:   Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
@@ -79,7 +81,8 @@
  * @x:   Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
@@ -98,7 +101,8 @@
  * @x:   Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
@@ -119,7 +123,8 @@
  * @x:   Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c
index 6860beb2a280d0..37a6c2e0e96926 100644
--- a/arch/score/mm/fault.c
+++ b/arch/score/mm/fault.c
@@ -34,6 +34,7 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/ptrace.h>
+#include <linux/uaccess.h>
 
 /*
  * This routine handles page faults.  It determines the address,
@@ -73,7 +74,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write,
 	* If we're in an interrupt or have no user
 	* context, we must not take the fault..
 	*/
-	if (in_atomic() || !mm)
+	if (pagefault_disabled() || !mm)
 		goto bad_area_nosemaphore;
 
 	if (user_mode(regs))
diff --git a/arch/sh/include/asm/barrier.h b/arch/sh/include/asm/barrier.h
index 43715308b06874..bf91037db4e01f 100644
--- a/arch/sh/include/asm/barrier.h
+++ b/arch/sh/include/asm/barrier.h
@@ -32,7 +32,7 @@
 #define ctrl_barrier()	__asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop")
 #endif
 
-#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
+#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0)
 
 #include <asm-generic/barrier.h>
 
diff --git a/arch/sh/include/asm/cmpxchg.h b/arch/sh/include/asm/cmpxchg.h
index f6bd1406b897fc..85c97b188d7164 100644
--- a/arch/sh/include/asm/cmpxchg.h
+++ b/arch/sh/include/asm/cmpxchg.h
@@ -46,8 +46,6 @@ extern void __xchg_called_with_bad_pointer(void);
  * if something tries to do an invalid cmpxchg(). */
 extern void __cmpxchg_called_with_bad_pointer(void);
 
-#define __HAVE_ARCH_CMPXCHG 1
-
 static inline unsigned long __cmpxchg(volatile void * ptr, unsigned long old,
 		unsigned long new, int size)
 {
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index a58fec9b55e016..79d8276377d1e2 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -17,6 +17,7 @@
 #include <linux/kprobes.h>
 #include <linux/perf_event.h>
 #include <linux/kdebug.h>
+#include <linux/uaccess.h>
 #include <asm/io_trapped.h>
 #include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
@@ -438,9 +439,9 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 
 	/*
 	 * If we're in an interrupt, have no user context or are running
-	 * in an atomic region then we must not take the fault:
+	 * with pagefaults disabled then we must not take the fault:
 	 */
-	if (unlikely(in_atomic() || !mm)) {
+	if (unlikely(faulthandler_disabled() || !mm)) {
 		bad_area_nosemaphore(regs, error_code, address);
 		return;
 	}
diff --git a/arch/sparc/include/asm/barrier_64.h b/arch/sparc/include/asm/barrier_64.h
index 76648941fea71b..809941e33e1217 100644
--- a/arch/sparc/include/asm/barrier_64.h
+++ b/arch/sparc/include/asm/barrier_64.h
@@ -40,8 +40,8 @@ do {	__asm__ __volatile__("ba,pt	%%xcc, 1f\n\t" \
 #define dma_rmb()	rmb()
 #define dma_wmb()	wmb()
 
-#define set_mb(__var, __value) \
-	do { __var = __value; membar_safe("#StoreLoad"); } while(0)
+#define smp_store_mb(__var, __value) \
+	do { WRITE_ONCE(__var, __value); membar_safe("#StoreLoad"); } while(0)
 
 #ifdef CONFIG_SMP
 #define smp_mb()	mb()
diff --git a/arch/sparc/include/asm/cmpxchg_32.h b/arch/sparc/include/asm/cmpxchg_32.h
index d38b52dca21627..83ffb83c5397f3 100644
--- a/arch/sparc/include/asm/cmpxchg_32.h
+++ b/arch/sparc/include/asm/cmpxchg_32.h
@@ -34,7 +34,6 @@ static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr, int
  *
  * Cribbed from <asm-parisc/atomic.h>
  */
-#define __HAVE_ARCH_CMPXCHG	1
 
 /* bug catcher for when unsupported size is used - won't link */
 void __cmpxchg_called_with_bad_pointer(void);
diff --git a/arch/sparc/include/asm/cmpxchg_64.h b/arch/sparc/include/asm/cmpxchg_64.h
index 0e1ed6cfbf68fa..faa2f61058c271 100644
--- a/arch/sparc/include/asm/cmpxchg_64.h
+++ b/arch/sparc/include/asm/cmpxchg_64.h
@@ -65,8 +65,6 @@ static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr,
 
 #include <asm-generic/cmpxchg-local.h>
 
-#define __HAVE_ARCH_CMPXCHG 1
-
 static inline unsigned long
 __cmpxchg_u32(volatile int *m, int old, int new)
 {
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index d1761df5cca6fe..01d17046225a8a 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -41,7 +41,7 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
 #define topology_physical_package_id(cpu)	(cpu_data(cpu).proc_id)
 #define topology_core_id(cpu)			(cpu_data(cpu).core_id)
 #define topology_core_cpumask(cpu)		(&cpu_core_sib_map[cpu])
-#define topology_thread_cpumask(cpu)		(&per_cpu(cpu_sibling_map, cpu))
+#define topology_sibling_cpumask(cpu)		(&per_cpu(cpu_sibling_map, cpu))
 #endif /* CONFIG_SMP */
 
 extern cpumask_t cpu_core_map[NR_CPUS];
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c
index 70d817154fe8bf..c399e7b3b03525 100644
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -21,6 +21,7 @@
 #include <linux/perf_event.h>
 #include <linux/interrupt.h>
 #include <linux/kdebug.h>
+#include <linux/uaccess.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -29,7 +30,6 @@
 #include <asm/setup.h>
 #include <asm/smp.h>
 #include <asm/traps.h>
-#include <asm/uaccess.h>
 
 #include "mm_32.h"
 
@@ -196,7 +196,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write,
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (pagefault_disabled() || !mm)
 		goto no_context;
 
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 4798232494294a..e9268ea1a68de5 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -22,12 +22,12 @@
 #include <linux/kdebug.h>
 #include <linux/percpu.h>
 #include <linux/context_tracking.h>
+#include <linux/uaccess.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/openprom.h>
 #include <asm/oplib.h>
-#include <asm/uaccess.h>
 #include <asm/asi.h>
 #include <asm/lsu.h>
 #include <asm/sections.h>
@@ -330,7 +330,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto intr_or_no_mm;
 
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c
index 449f864f0cefdb..a454ec5ff07af7 100644
--- a/arch/sparc/mm/highmem.c
+++ b/arch/sparc/mm/highmem.c
@@ -53,7 +53,7 @@ void *kmap_atomic(struct page *page)
 	unsigned long vaddr;
 	long idx, type;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -91,6 +91,7 @@ void __kunmap_atomic(void *kvaddr)
 
 	if (vaddr < FIXADDR_START) { // FIXME
 		pagefault_enable();
+		preempt_enable();
 		return;
 	}
 
@@ -126,5 +127,6 @@ void __kunmap_atomic(void *kvaddr)
 
 	kmap_atomic_idx_pop();
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 559cb744112ccd..c5d08b89a96c81 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2738,7 +2738,7 @@ void hugetlb_setup(struct pt_regs *regs)
 	struct mm_struct *mm = current->mm;
 	struct tsb_config *tp;
 
-	if (in_atomic() || !mm) {
+	if (faulthandler_disabled() || !mm) {
 		const struct exception_table_entry *entry;
 
 		entry = search_exception_tables(regs->tpc);
diff --git a/arch/tile/include/asm/atomic_64.h b/arch/tile/include/asm/atomic_64.h
index 7b11c5fadd4220..0496970cef8296 100644
--- a/arch/tile/include/asm/atomic_64.h
+++ b/arch/tile/include/asm/atomic_64.h
@@ -105,9 +105,6 @@ static inline long atomic64_add_unless(atomic64_t *v, long a, long u)
 
 #define atomic64_inc_not_zero(v)	atomic64_add_unless((v), 1, 0)
 
-/* Define this to indicate that cmpxchg is an efficient operation. */
-#define __HAVE_ARCH_CMPXCHG
-
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _ASM_TILE_ATOMIC_64_H */
diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h
index 938311844233b8..76b0d0ebb24433 100644
--- a/arch/tile/include/asm/topology.h
+++ b/arch/tile/include/asm/topology.h
@@ -55,7 +55,7 @@ static inline const struct cpumask *cpumask_of_node(int node)
 #define topology_physical_package_id(cpu)       ((void)(cpu), 0)
 #define topology_core_id(cpu)                   (cpu)
 #define topology_core_cpumask(cpu)              ((void)(cpu), cpu_online_mask)
-#define topology_thread_cpumask(cpu)            cpumask_of(cpu)
+#define topology_sibling_cpumask(cpu)           cpumask_of(cpu)
 #endif
 
 #endif /* _ASM_TILE_TOPOLOGY_H */
diff --git a/arch/tile/include/asm/uaccess.h b/arch/tile/include/asm/uaccess.h
index d598c11a82d9b8..0a9c4265763bd1 100644
--- a/arch/tile/include/asm/uaccess.h
+++ b/arch/tile/include/asm/uaccess.h
@@ -85,7 +85,8 @@ int __range_ok(unsigned long addr, unsigned long size);
  * @addr: User space pointer to start of block to check
  * @size: Size of block to check
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Checks if a pointer to a block of memory in user space is valid.
  *
@@ -199,7 +200,8 @@ extern int __get_user_bad(void)
  * @x:   Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
@@ -281,7 +283,8 @@ extern int __put_user_bad(void)
  * @x:   Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
@@ -337,7 +340,8 @@ extern int __put_user_bad(void)
  * @from: Source address, in kernel space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from kernel space to user space.  Caller must check
  * the specified block with access_ok() before calling this function.
@@ -373,7 +377,8 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
  * @from: Source address, in user space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from user space to kernel space.  Caller must check
  * the specified block with access_ok() before calling this function.
@@ -444,7 +449,8 @@ static inline unsigned long __must_check copy_from_user(void *to,
  * @from: Source address, in user space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from user space to user space.  Caller must check
  * the specified blocks with access_ok() before calling this function.
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index 4787ee8f0705c7..13eac59bf16a5e 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -354,9 +354,9 @@ static int handle_page_fault(struct pt_regs *regs,
 
 	/*
 	 * If we're in an interrupt, have no user context or are running in an
-	 * atomic region then we must not take the fault.
+	 * region with pagefaults disabled then we must not take the fault.
 	 */
-	if (in_atomic() || !mm) {
+	if (pagefault_disabled() || !mm) {
 		vma = NULL;  /* happy compiler */
 		goto bad_area_nosemaphore;
 	}
diff --git a/arch/tile/mm/highmem.c b/arch/tile/mm/highmem.c
index 6aa2f26254471e..fcd545014e79dc 100644
--- a/arch/tile/mm/highmem.c
+++ b/arch/tile/mm/highmem.c
@@ -201,7 +201,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
 	int idx, type;
 	pte_t *pte;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 
 	/* Avoid icache flushes by disallowing atomic executable mappings. */
@@ -259,6 +259,7 @@ void __kunmap_atomic(void *kvaddr)
 	}
 
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
 
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index 8e4daf44e9805e..47ff9b7f3e5d39 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -7,6 +7,7 @@
 #include <linux/sched.h>
 #include <linux/hardirq.h>
 #include <linux/module.h>
+#include <linux/uaccess.h>
 #include <asm/current.h>
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -35,10 +36,10 @@ int handle_page_fault(unsigned long address, unsigned long ip,
 	*code_out = SEGV_MAPERR;
 
 	/*
-	 * If the fault was during atomic operation, don't take the fault, just
+	 * If the fault was with pagefaults disabled, don't take the fault, just
 	 * fail.
 	 */
-	if (in_atomic())
+	if (faulthandler_disabled())
 		goto out_nosemaphore;
 
 	if (is_user)
diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c
index 0dc922dba9154d..afccef5529ccb8 100644
--- a/arch/unicore32/mm/fault.c
+++ b/arch/unicore32/mm/fault.c
@@ -218,7 +218,7 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if (user_mode(regs))
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 93b592cea001c2..76be12d596a164 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -9,140 +9,141 @@ config 64BIT
 config X86_32
 	def_bool y
 	depends on !64BIT
-	select CLKSRC_I8253
-	select HAVE_UID16
 
 config X86_64
 	def_bool y
 	depends on 64BIT
-	select X86_DEV_DMA_OPS
-	select ARCH_USE_CMPXCHG_LOCKREF
-	select HAVE_LIVEPATCH
 
 ### Arch settings
 config X86
 	def_bool y
-	select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
-	select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
+	select ACPI_LEGACY_TABLES_LOOKUP	if ACPI
+	select ACPI_SYSTEM_POWER_STATES_SUPPORT	if ACPI
+	select ANON_INODES
+	select ARCH_CLOCKSOURCE_DATA
+	select ARCH_DISCARD_MEMBLOCK
+	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
 	select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
+	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_FAST_MULTIPLIER
 	select ARCH_HAS_GCOV_PROFILE_ALL
+	select ARCH_HAS_SG_CHAIN
+	select ARCH_HAVE_NMI_SAFE_CMPXCHG
+	select ARCH_MIGHT_HAVE_ACPI_PDC		if ACPI
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_MIGHT_HAVE_PC_SERIO
-	select HAVE_AOUT if X86_32
-	select HAVE_UNSTABLE_SCHED_CLOCK
-	select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
-	select ARCH_SUPPORTS_INT128 if X86_64
-	select HAVE_IDE
-	select HAVE_OPROFILE
-	select HAVE_PCSPKR_PLATFORM
-	select HAVE_PERF_EVENTS
-	select HAVE_IOREMAP_PROT
-	select HAVE_KPROBES
-	select HAVE_MEMBLOCK
-	select HAVE_MEMBLOCK_NODE_MAP
-	select ARCH_DISCARD_MEMBLOCK
-	select ARCH_WANT_OPTIONAL_GPIOLIB
+	select ARCH_SUPPORTS_ATOMIC_RMW
+	select ARCH_SUPPORTS_INT128		if X86_64
+	select ARCH_SUPPORTS_NUMA_BALANCING	if X86_64
+	select ARCH_USE_BUILTIN_BSWAP
+	select ARCH_USE_CMPXCHG_LOCKREF		if X86_64
+	select ARCH_USE_QUEUED_RWLOCKS
+	select ARCH_USE_QUEUED_SPINLOCKS
 	select ARCH_WANT_FRAME_POINTERS
-	select HAVE_DMA_ATTRS
-	select HAVE_DMA_CONTIGUOUS
-	select HAVE_KRETPROBES
+	select ARCH_WANT_IPC_PARSE_VERSION	if X86_32
+	select ARCH_WANT_OPTIONAL_GPIOLIB
+	select BUILDTIME_EXTABLE_SORT
+	select CLKEVT_I8253
+	select CLKSRC_I8253			if X86_32
+	select CLOCKSOURCE_VALIDATE_LAST_CYCLE
+	select CLOCKSOURCE_WATCHDOG
+	select CLONE_BACKWARDS			if X86_32
+	select COMPAT_OLD_SIGACTION		if IA32_EMULATION
+	select DCACHE_WORD_ACCESS
+	select GENERIC_CLOCKEVENTS
+	select GENERIC_CLOCKEVENTS_BROADCAST	if X86_64 || (X86_32 && X86_LOCAL_APIC)
+	select GENERIC_CLOCKEVENTS_MIN_ADJUST
+	select GENERIC_CMOS_UPDATE
+	select GENERIC_CPU_AUTOPROBE
 	select GENERIC_EARLY_IOREMAP
-	select HAVE_OPTPROBES
-	select HAVE_KPROBES_ON_FTRACE
-	select HAVE_FTRACE_MCOUNT_RECORD
-	select HAVE_FENTRY if X86_64
+	select GENERIC_FIND_FIRST_BIT
+	select GENERIC_IOMAP
+	select GENERIC_IRQ_PROBE
+	select GENERIC_IRQ_SHOW
+	select GENERIC_PENDING_IRQ		if SMP
+	select GENERIC_SMP_IDLE_THREAD
+	select GENERIC_STRNCPY_FROM_USER
+	select GENERIC_STRNLEN_USER
+	select GENERIC_TIME_VSYSCALL
+	select HAVE_ACPI_APEI			if ACPI
+	select HAVE_ACPI_APEI_NMI		if ACPI
+	select HAVE_ALIGNED_STRUCT_PAGE		if SLUB
+	select HAVE_AOUT			if X86_32
+	select HAVE_ARCH_AUDITSYSCALL
+	select HAVE_ARCH_HUGE_VMAP		if X86_64 || X86_PAE
+	select HAVE_ARCH_JUMP_LABEL
+	select HAVE_ARCH_KASAN			if X86_64 && SPARSEMEM_VMEMMAP
+	select HAVE_ARCH_KGDB
+	select HAVE_ARCH_KMEMCHECK
+	select HAVE_ARCH_SECCOMP_FILTER
+	select HAVE_ARCH_SOFT_DIRTY		if X86_64
+	select HAVE_ARCH_TRACEHOOK
+	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+	select HAVE_BPF_JIT			if X86_64
+	select HAVE_CC_STACKPROTECTOR
+	select HAVE_CMPXCHG_DOUBLE
+	select HAVE_CMPXCHG_LOCAL
+	select HAVE_CONTEXT_TRACKING		if X86_64
 	select HAVE_C_RECORDMCOUNT
+	select HAVE_DEBUG_KMEMLEAK
+	select HAVE_DEBUG_STACKOVERFLOW
+	select HAVE_DMA_API_DEBUG
+	select HAVE_DMA_ATTRS
+	select HAVE_DMA_CONTIGUOUS
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS
-	select HAVE_FUNCTION_TRACER
-	select HAVE_FUNCTION_GRAPH_TRACER
-	select HAVE_FUNCTION_GRAPH_FP_TEST
-	select HAVE_SYSCALL_TRACEPOINTS
-	select SYSCTL_EXCEPTION_TRACE
-	select HAVE_KVM
-	select HAVE_ARCH_KGDB
-	select HAVE_ARCH_TRACEHOOK
-	select HAVE_GENERIC_DMA_COHERENT if X86_32
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
-	select USER_STACKTRACE_SUPPORT
-	select HAVE_REGS_AND_STACK_ACCESS_API
-	select HAVE_DMA_API_DEBUG
-	select HAVE_KERNEL_GZIP
+	select HAVE_FENTRY			if X86_64
+	select HAVE_FTRACE_MCOUNT_RECORD
+	select HAVE_FUNCTION_GRAPH_FP_TEST
+	select HAVE_FUNCTION_GRAPH_TRACER
+	select HAVE_FUNCTION_TRACER
+	select HAVE_GENERIC_DMA_COHERENT	if X86_32
+	select HAVE_HW_BREAKPOINT
+	select HAVE_IDE
+	select HAVE_IOREMAP_PROT
+	select HAVE_IRQ_EXIT_ON_IRQ_STACK	if X86_64
+	select HAVE_IRQ_TIME_ACCOUNTING
 	select HAVE_KERNEL_BZIP2
+	select HAVE_KERNEL_GZIP
+	select HAVE_KERNEL_LZ4
 	select HAVE_KERNEL_LZMA
-	select HAVE_KERNEL_XZ
 	select HAVE_KERNEL_LZO
-	select HAVE_KERNEL_LZ4
-	select HAVE_HW_BREAKPOINT
+	select HAVE_KERNEL_XZ
+	select HAVE_KPROBES
+	select HAVE_KPROBES_ON_FTRACE
+	select HAVE_KRETPROBES
+	select HAVE_KVM
+	select HAVE_LIVEPATCH			if X86_64
+	select HAVE_MEMBLOCK
+	select HAVE_MEMBLOCK_NODE_MAP
 	select HAVE_MIXED_BREAKPOINTS_REGS
-	select PERF_EVENTS
+	select HAVE_OPROFILE
+	select HAVE_OPTPROBES
+	select HAVE_PCSPKR_PLATFORM
+	select HAVE_PERF_EVENTS
 	select HAVE_PERF_EVENTS_NMI
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
-	select HAVE_DEBUG_KMEMLEAK
-	select ANON_INODES
-	select HAVE_ALIGNED_STRUCT_PAGE if SLUB
-	select HAVE_CMPXCHG_LOCAL
-	select HAVE_CMPXCHG_DOUBLE
-	select HAVE_ARCH_KMEMCHECK
-	select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
+	select HAVE_REGS_AND_STACK_ACCESS_API
+	select HAVE_SYSCALL_TRACEPOINTS
+	select HAVE_UID16			if X86_32
+	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_USER_RETURN_NOTIFIER
-	select ARCH_HAS_ELF_RANDOMIZE
-	select HAVE_ARCH_JUMP_LABEL
-	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
-	select SPARSE_IRQ
-	select GENERIC_FIND_FIRST_BIT
-	select GENERIC_IRQ_PROBE
-	select GENERIC_PENDING_IRQ if SMP
-	select GENERIC_IRQ_SHOW
-	select GENERIC_CLOCKEVENTS_MIN_ADJUST
 	select IRQ_FORCED_THREADING
-	select HAVE_BPF_JIT if X86_64
-	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
-	select HAVE_ARCH_HUGE_VMAP if X86_64 || (X86_32 && X86_PAE)
-	select ARCH_HAS_SG_CHAIN
-	select CLKEVT_I8253
-	select ARCH_HAVE_NMI_SAFE_CMPXCHG
-	select GENERIC_IOMAP
-	select DCACHE_WORD_ACCESS
-	select GENERIC_SMP_IDLE_THREAD
-	select ARCH_WANT_IPC_PARSE_VERSION if X86_32
-	select HAVE_ARCH_SECCOMP_FILTER
-	select BUILDTIME_EXTABLE_SORT
-	select GENERIC_CMOS_UPDATE
-	select HAVE_ARCH_SOFT_DIRTY if X86_64
-	select CLOCKSOURCE_WATCHDOG
-	select GENERIC_CLOCKEVENTS
-	select ARCH_CLOCKSOURCE_DATA
-	select CLOCKSOURCE_VALIDATE_LAST_CYCLE
-	select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
-	select GENERIC_TIME_VSYSCALL
-	select GENERIC_STRNCPY_FROM_USER
-	select GENERIC_STRNLEN_USER
-	select HAVE_CONTEXT_TRACKING if X86_64
-	select HAVE_IRQ_TIME_ACCOUNTING
-	select VIRT_TO_BUS
-	select MODULES_USE_ELF_REL if X86_32
-	select MODULES_USE_ELF_RELA if X86_64
-	select CLONE_BACKWARDS if X86_32
-	select ARCH_USE_BUILTIN_BSWAP
-	select ARCH_USE_QUEUE_RWLOCK
-	select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION
-	select OLD_SIGACTION if X86_32
-	select COMPAT_OLD_SIGACTION if IA32_EMULATION
+	select MODULES_USE_ELF_RELA		if X86_64
+	select MODULES_USE_ELF_REL		if X86_32
+	select OLD_SIGACTION			if X86_32
+	select OLD_SIGSUSPEND3			if X86_32 || IA32_EMULATION
+	select PERF_EVENTS
 	select RTC_LIB
-	select HAVE_DEBUG_STACKOVERFLOW
-	select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64
-	select HAVE_CC_STACKPROTECTOR
-	select GENERIC_CPU_AUTOPROBE
-	select HAVE_ARCH_AUDITSYSCALL
-	select ARCH_SUPPORTS_ATOMIC_RMW
-	select HAVE_ACPI_APEI if ACPI
-	select HAVE_ACPI_APEI_NMI if ACPI
-	select ACPI_LEGACY_TABLES_LOOKUP if ACPI
-	select X86_FEATURE_NAMES if PROC_FS
+	select SPARSE_IRQ
 	select SRCU
+	select SYSCTL_EXCEPTION_TRACE
+	select USER_STACKTRACE_SUPPORT
+	select VIRT_TO_BUS
+	select X86_DEV_DMA_OPS			if X86_64
+	select X86_FEATURE_NAMES		if PROC_FS
 
 config INSTRUCTION_DECODER
 	def_bool y
@@ -341,7 +342,7 @@ config X86_FEATURE_NAMES
 
 config X86_X2APIC
 	bool "Support x2apic"
-	depends on X86_LOCAL_APIC && X86_64 && IRQ_REMAP
+	depends on X86_LOCAL_APIC && X86_64 && (IRQ_REMAP || HYPERVISOR_GUEST)
 	---help---
 	  This enables x2apic support on CPUs that have this feature.
 
@@ -441,6 +442,7 @@ config X86_UV
 	depends on X86_EXTENDED_PLATFORM
 	depends on NUMA
 	depends on X86_X2APIC
+	depends on PCI
 	---help---
 	  This option is needed in order to support SGI Ultraviolet systems.
 	  If you don't have one of these, you should say N here.
@@ -466,7 +468,6 @@ config X86_INTEL_CE
 	select X86_REBOOTFIXUPS
 	select OF
 	select OF_EARLY_FLATTREE
-	select IRQ_DOMAIN
 	---help---
 	  Select for the Intel CE media processor (CE4100) SOC.
 	  This option compiles in support for the CE4100 SOC for settop
@@ -666,7 +667,7 @@ config PARAVIRT_DEBUG
 config PARAVIRT_SPINLOCKS
 	bool "Paravirtualization layer for spinlocks"
 	depends on PARAVIRT && SMP
-	select UNINLINE_SPIN_UNLOCK
+	select UNINLINE_SPIN_UNLOCK if !QUEUED_SPINLOCKS
 	---help---
 	  Paravirtualized spinlocks allow a pvops backend to replace the
 	  spinlock implementation with something virtualization-friendly
@@ -851,11 +852,12 @@ config NR_CPUS
 	default "1" if !SMP
 	default "8192" if MAXSMP
 	default "32" if SMP && X86_BIGSMP
-	default "8" if SMP
+	default "8" if SMP && X86_32
+	default "64" if SMP
 	---help---
 	  This allows you to specify the maximum number of CPUs which this
 	  kernel will support.  If CPUMASK_OFFSTACK is enabled, the maximum
-	  supported value is 4096, otherwise the maximum value is 512.  The
+	  supported value is 8192, otherwise the maximum value is 512.  The
 	  minimum value which makes sense is 2.
 
 	  This is purely to save memory - each supported CPU adds
@@ -914,12 +916,12 @@ config X86_UP_IOAPIC
 config X86_LOCAL_APIC
 	def_bool y
 	depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI
-	select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
+	select IRQ_DOMAIN_HIERARCHY
+	select PCI_MSI_IRQ_DOMAIN if PCI_MSI
 
 config X86_IO_APIC
 	def_bool y
 	depends on X86_LOCAL_APIC || X86_UP_IOAPIC
-	select IRQ_DOMAIN
 
 config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
 	bool "Reroute for broken boot IRQs"
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 72484a645f056d..a15893d17c5598 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -332,4 +332,27 @@ config X86_DEBUG_STATIC_CPU_HAS
 
 	  If unsure, say N.
 
+config X86_DEBUG_FPU
+	bool "Debug the x86 FPU code"
+	depends on DEBUG_KERNEL
+	default y
+	---help---
+	  If this option is enabled then there will be extra sanity
+	  checks and (boot time) debug printouts added to the kernel.
+	  This debugging adds some small amount of runtime overhead
+	  to the kernel.
+
+	  If unsure, say N.
+
+config PUNIT_ATOM_DEBUG
+	tristate "ATOM Punit debug driver"
+	select DEBUG_FS
+	select IOSF_MBI
+	---help---
+	  This is a debug driver, which gets the power states
+	  of all Punit North Complex devices. The power states of
+	  each device is exposed as part of the debugfs interface.
+	  The current power state can be read from
+	  /sys/kernel/debug/punit_atom/dev_power_state
+
 endmenu
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 2fda005bb33443..43e8328a23e4cf 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -77,6 +77,12 @@ else
         KBUILD_AFLAGS += -m64
         KBUILD_CFLAGS += -m64
 
+        # Align jump targets to 1 byte, not the default 16 bytes:
+        KBUILD_CFLAGS += -falign-jumps=1
+
+        # Pack loops tightly as well:
+        KBUILD_CFLAGS += -falign-loops=1
+
         # Don't autogenerate traditional x87 instructions
         KBUILD_CFLAGS += $(call cc-option,-mno-80387)
         KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387)
@@ -84,6 +90,9 @@ else
 	# Use -mpreferred-stack-boundary=3 if supported.
 	KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3)
 
+	# Use -mskip-rax-setup if supported.
+	KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup)
+
         # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
         cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
         cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
@@ -140,12 +149,6 @@ endif
 sp-$(CONFIG_X86_32) := esp
 sp-$(CONFIG_X86_64) := rsp
 
-# do binutils support CFI?
-cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1)
-# is .cfi_signal_frame supported too?
-cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
-cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
-
 # does binutils support specific instructions?
 asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
 asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1)
@@ -153,8 +156,8 @@ asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1)
 avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
 avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
 
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr)
+KBUILD_AFLAGS += $(asinstr) $(avx_instr) $(avx2_instr)
+KBUILD_CFLAGS += $(asinstr) $(avx_instr) $(avx2_instr)
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
 
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 89dd0d78013aaf..805d25ca5f1db1 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -2,15 +2,14 @@
 #define BOOT_COMPRESSED_MISC_H
 
 /*
- * we have to be careful, because no indirections are allowed here, and
- * paravirt_ops is a kind of one. As it will only run in baremetal anyway,
- * we just keep it from happening
+ * Special hack: we have to be careful, because no indirections are allowed here,
+ * and paravirt_ops is a kind of one. As it will only run in baremetal anyway,
+ * we just keep it from happening. (This list needs to be extended when new
+ * paravirt and debugging variants are added.)
  */
 #undef CONFIG_PARAVIRT
+#undef CONFIG_PARAVIRT_SPINLOCKS
 #undef CONFIG_KASAN
-#ifdef CONFIG_X86_32
-#define _ASM_X86_DESC_H 1
-#endif
 
 #include <linux/linkage.h>
 #include <linux/screen_info.h>
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index ebcb981d241cbd..2bfc8a7c88c11e 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -32,7 +32,7 @@
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
 #include <asm/cpu_device_id.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 #include <asm/crypto/aes.h>
 #include <crypto/ablk_helper.h>
 #include <crypto/scatterwalk.h>
diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
index baf0ac21ace566..4c65c70e628bb2 100644
--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -19,8 +19,7 @@
 #include <crypto/ctr.h>
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
 #include <asm/crypto/camellia.h>
 #include <asm/crypto/glue_helper.h>
 
@@ -561,16 +560,15 @@ static struct crypto_alg cmll_algs[10] = { {
 
 static int __init camellia_aesni_init(void)
 {
-	u64 xcr0;
+	const char *feature_name;
 
 	if (!cpu_has_avx2 || !cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) {
 		pr_info("AVX2 or AES-NI instructions are not detected.\n");
 		return -ENODEV;
 	}
 
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		pr_info("AVX2 detected but unusable.\n");
+	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
 
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index 78818a1e73e3f6..80a0e4389c9ad3 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -19,8 +19,7 @@
 #include <crypto/ctr.h>
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
 #include <asm/crypto/camellia.h>
 #include <asm/crypto/glue_helper.h>
 
@@ -553,16 +552,10 @@ static struct crypto_alg cmll_algs[10] = { {
 
 static int __init camellia_aesni_init(void)
 {
-	u64 xcr0;
+	const char *feature_name;
 
-	if (!cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) {
-		pr_info("AVX or AES-NI instructions are not detected.\n");
-		return -ENODEV;
-	}
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		pr_info("AVX detected but unusable.\n");
+	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
 
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index 236c80974457b9..be00aa48b2b5e3 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -31,8 +31,7 @@
 #include <crypto/cast5.h>
 #include <crypto/cryptd.h>
 #include <crypto/ctr.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
 #include <asm/crypto/glue_helper.h>
 
 #define CAST5_PARALLEL_BLOCKS 16
@@ -468,16 +467,10 @@ static struct crypto_alg cast5_algs[6] = { {
 
 static int __init cast5_init(void)
 {
-	u64 xcr0;
+	const char *feature_name;
 
-	if (!cpu_has_avx || !cpu_has_osxsave) {
-		pr_info("AVX instructions are not detected.\n");
-		return -ENODEV;
-	}
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		pr_info("AVX detected but unusable.\n");
+	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
 
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index f448810ca4ac1d..5dbba722422175 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -36,8 +36,7 @@
 #include <crypto/ctr.h>
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
 #include <asm/crypto/glue_helper.h>
 
 #define CAST6_PARALLEL_BLOCKS 8
@@ -590,16 +589,10 @@ static struct crypto_alg cast6_algs[10] = { {
 
 static int __init cast6_init(void)
 {
-	u64 xcr0;
+	const char *feature_name;
 
-	if (!cpu_has_avx || !cpu_has_osxsave) {
-		pr_info("AVX instructions are not detected.\n");
-		return -ENODEV;
-	}
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		pr_info("AVX detected but unusable.\n");
+	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
 
diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c
index 1937fc1d876338..07d2c6c86a5483 100644
--- a/arch/x86/crypto/crc32-pclmul_glue.c
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -35,7 +35,7 @@
 
 #include <asm/cpufeature.h>
 #include <asm/cpu_device_id.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 
 #define CHKSUM_BLOCK_SIZE	1
 #define CHKSUM_DIGEST_SIZE	4
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index 28640c3d6af7f6..81a595d75cf595 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -32,8 +32,7 @@
 
 #include <asm/cpufeature.h>
 #include <asm/cpu_device_id.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
 
 #define CHKSUM_BLOCK_SIZE	1
 #define CHKSUM_DIGEST_SIZE	4
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c
index b6c67bf30fdf67..a3fcfc97a311d5 100644
--- a/arch/x86/crypto/crct10dif-pclmul_glue.c
+++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
@@ -29,7 +29,7 @@
 #include <linux/init.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 #include <asm/cpufeature.h>
 #include <asm/cpu_device_id.h>
 
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
index f368ba261739fa..5a2f30f9f52dca 100644
--- a/arch/x86/crypto/fpu.c
+++ b/arch/x86/crypto/fpu.c
@@ -18,7 +18,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/crypto.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 
 struct crypto_fpu_ctx {
 	struct crypto_blkcipher *child;
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 2079baf06bdd3b..64d7cf1b50e112 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -19,7 +19,7 @@
 #include <crypto/cryptd.h>
 #include <crypto/gf128mul.h>
 #include <crypto/internal/hash.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 #include <asm/cpu_device_id.h>
 
 #define GHASH_BLOCK_SIZE	16
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
index 2f63dc89e7a9ed..7d838dc4d888f3 100644
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -20,8 +20,7 @@
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
 #include <crypto/serpent.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
 #include <asm/crypto/serpent-avx.h>
 #include <asm/crypto/glue_helper.h>
 
@@ -537,16 +536,14 @@ static struct crypto_alg srp_algs[10] = { {
 
 static int __init init(void)
 {
-	u64 xcr0;
+	const char *feature_name;
 
 	if (!cpu_has_avx2 || !cpu_has_osxsave) {
 		pr_info("AVX2 instructions are not detected.\n");
 		return -ENODEV;
 	}
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		pr_info("AVX detected but unusable.\n");
+	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
 
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index c8d478af84563e..da7dafc9b16d5c 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -36,8 +36,7 @@
 #include <crypto/ctr.h>
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
 #include <asm/crypto/serpent-avx.h>
 #include <asm/crypto/glue_helper.h>
 
@@ -596,16 +595,10 @@ static struct crypto_alg serpent_algs[10] = { {
 
 static int __init serpent_init(void)
 {
-	u64 xcr0;
+	const char *feature_name;
 
-	if (!cpu_has_avx || !cpu_has_osxsave) {
-		printk(KERN_INFO "AVX instructions are not detected.\n");
-		return -ENODEV;
-	}
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		printk(KERN_INFO "AVX detected but unusable.\n");
+	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
 
diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c
index 0cb5149518ce51..a841e9765bd614 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb.c
+++ b/arch/x86/crypto/sha-mb/sha1_mb.c
@@ -65,11 +65,8 @@
 #include <crypto/mcryptd.h>
 #include <crypto/crypto_wq.h>
 #include <asm/byteorder.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
 #include <linux/hardirq.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/api.h>
 #include "sha_mb_ctx.h"
 
 #define FLUSH_INTERVAL 1000 /* in usec */
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 33d1b9dc14cc75..7c48e8b20848e5 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -29,9 +29,7 @@
 #include <linux/types.h>
 #include <crypto/sha.h>
 #include <crypto/sha1_base.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
 
 
 asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,
@@ -123,15 +121,9 @@ static struct shash_alg alg = {
 #ifdef CONFIG_AS_AVX
 static bool __init avx_usable(void)
 {
-	u64 xcr0;
-
-	if (!cpu_has_avx || !cpu_has_osxsave)
-		return false;
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		pr_info("AVX detected but unusable.\n");
-
+	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) {
+		if (cpu_has_avx)
+			pr_info("AVX detected but unusable.\n");
 		return false;
 	}
 
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index ccc338881ee818..f8097fc0d1d1b5 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -37,9 +37,7 @@
 #include <linux/types.h>
 #include <crypto/sha.h>
 #include <crypto/sha256_base.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
 #include <linux/string.h>
 
 asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data,
@@ -132,15 +130,9 @@ static struct shash_alg algs[] = { {
 #ifdef CONFIG_AS_AVX
 static bool __init avx_usable(void)
 {
-	u64 xcr0;
-
-	if (!cpu_has_avx || !cpu_has_osxsave)
-		return false;
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		pr_info("AVX detected but unusable.\n");
-
+	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) {
+		if (cpu_has_avx)
+			pr_info("AVX detected but unusable.\n");
 		return false;
 	}
 
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index d9fa4c1e063ff6..2edad7b8187015 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -35,9 +35,7 @@
 #include <linux/types.h>
 #include <crypto/sha.h>
 #include <crypto/sha512_base.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
 
 #include <linux/string.h>
 
@@ -131,15 +129,9 @@ static struct shash_alg algs[] = { {
 #ifdef CONFIG_AS_AVX
 static bool __init avx_usable(void)
 {
-	u64 xcr0;
-
-	if (!cpu_has_avx || !cpu_has_osxsave)
-		return false;
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		pr_info("AVX detected but unusable.\n");
-
+	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) {
+		if (cpu_has_avx)
+			pr_info("AVX detected but unusable.\n");
 		return false;
 	}
 
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index b5e2d565185170..c2bd0ce718eee5 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -36,9 +36,7 @@
 #include <crypto/ctr.h>
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
 #include <asm/crypto/twofish.h>
 #include <asm/crypto/glue_helper.h>
 #include <crypto/scatterwalk.h>
@@ -558,16 +556,10 @@ static struct crypto_alg twofish_algs[10] = { {
 
 static int __init twofish_init(void)
 {
-	u64 xcr0;
+	const char *feature_name;
 
-	if (!cpu_has_avx || !cpu_has_osxsave) {
-		printk(KERN_INFO "AVX instructions are not detected.\n");
-		return -ENODEV;
-	}
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		printk(KERN_INFO "AVX detected but unusable.\n");
+	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
 
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index c81d35e6c7f1d9..ae3a29ae875b55 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -21,8 +21,8 @@
 #include <linux/binfmts.h>
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
 #include <asm/ptrace.h>
 #include <asm/ia32_unistd.h>
 #include <asm/user32.h>
@@ -198,7 +198,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
 		buf = compat_ptr(tmp);
 	} get_user_catch(err);
 
-	err |= restore_xstate_sig(buf, 1);
+	err |= fpu__restore_sig(buf, 1);
 
 	force_iret();
 
@@ -308,6 +308,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
 				 size_t frame_size,
 				 void __user **fpstate)
 {
+	struct fpu *fpu = &current->thread.fpu;
 	unsigned long sp;
 
 	/* Default to using normal stack */
@@ -322,12 +323,12 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
 		 ksig->ka.sa.sa_restorer)
 		sp = (unsigned long) ksig->ka.sa.sa_restorer;
 
-	if (used_math()) {
+	if (fpu->fpstate_active) {
 		unsigned long fx_aligned, math_size;
 
-		sp = alloc_mathframe(sp, 1, &fx_aligned, &math_size);
+		sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size);
 		*fpstate = (struct _fpstate_ia32 __user *) sp;
-		if (save_xstate_sig(*fpstate, (void __user *)fx_aligned,
+		if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned,
 				    math_size) < 0)
 			return (void __user *) -1L;
 	}
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 72bf2680f819ad..2be23c734db5c2 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -4,7 +4,6 @@
  * Copyright 2000-2002 Andi Kleen, SuSE Labs.
  */		 
 
-#include <asm/dwarf2.h>
 #include <asm/calling.h>
 #include <asm/asm-offsets.h>
 #include <asm/current.h>
@@ -60,29 +59,12 @@
 	movl %eax,%eax			/* zero extension */
 	.endm
 	
-	.macro CFI_STARTPROC32 simple
-	CFI_STARTPROC	\simple
-	CFI_UNDEFINED	r8
-	CFI_UNDEFINED	r9
-	CFI_UNDEFINED	r10
-	CFI_UNDEFINED	r11
-	CFI_UNDEFINED	r12
-	CFI_UNDEFINED	r13
-	CFI_UNDEFINED	r14
-	CFI_UNDEFINED	r15
-	.endm
 
 #ifdef CONFIG_PARAVIRT
 ENTRY(native_usergs_sysret32)
 	swapgs
 	sysretl
 ENDPROC(native_usergs_sysret32)
-
-ENTRY(native_irq_enable_sysexit)
-	swapgs
-	sti
-	sysexit
-ENDPROC(native_irq_enable_sysexit)
 #endif
 
 /*
@@ -108,18 +90,13 @@ ENDPROC(native_irq_enable_sysexit)
  * with the int 0x80 path.
  */
 ENTRY(ia32_sysenter_target)
-	CFI_STARTPROC32	simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,0
-	CFI_REGISTER	rsp,rbp
-
 	/*
 	 * Interrupts are off on entry.
 	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
 	 * it is too small to ever cause noticeable irq latency.
 	 */
 	SWAPGS_UNSAFE_STACK
-	movq	PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
+	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	ENABLE_INTERRUPTS(CLBR_NONE)
 
 	/* Zero-extending 32-bit regs, do not remove */
@@ -127,25 +104,21 @@ ENTRY(ia32_sysenter_target)
 	movl	%eax, %eax
 
 	movl	ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d
-	CFI_REGISTER rip,r10
 
 	/* Construct struct pt_regs on stack */
-	pushq_cfi	$__USER32_DS		/* pt_regs->ss */
-	pushq_cfi	%rbp			/* pt_regs->sp */
-	CFI_REL_OFFSET	rsp,0
-	pushfq_cfi				/* pt_regs->flags */
-	pushq_cfi	$__USER32_CS		/* pt_regs->cs */
-	pushq_cfi	%r10 /* pt_regs->ip = thread_info->sysenter_return */
-	CFI_REL_OFFSET	rip,0
-	pushq_cfi_reg	rax			/* pt_regs->orig_ax */
-	pushq_cfi_reg	rdi			/* pt_regs->di */
-	pushq_cfi_reg	rsi			/* pt_regs->si */
-	pushq_cfi_reg	rdx			/* pt_regs->dx */
-	pushq_cfi_reg	rcx			/* pt_regs->cx */
-	pushq_cfi_reg	rax			/* pt_regs->ax */
+	pushq	$__USER32_DS		/* pt_regs->ss */
+	pushq	%rbp			/* pt_regs->sp */
+	pushfq				/* pt_regs->flags */
+	pushq	$__USER32_CS		/* pt_regs->cs */
+	pushq	%r10 /* pt_regs->ip = thread_info->sysenter_return */
+	pushq	%rax			/* pt_regs->orig_ax */
+	pushq	%rdi			/* pt_regs->di */
+	pushq	%rsi			/* pt_regs->si */
+	pushq	%rdx			/* pt_regs->dx */
+	pushq	%rcx			/* pt_regs->cx */
+	pushq	$-ENOSYS		/* pt_regs->ax */
 	cld
 	sub	$(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
-	CFI_ADJUST_CFA_OFFSET 10*8
 
 	/*
 	 * no need to do an access_ok check here because rbp has been
@@ -167,10 +140,8 @@ sysenter_flags_fixed:
 
 	orl     $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
 	testl   $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	CFI_REMEMBER_STATE
 	jnz  sysenter_tracesys
-	cmpq	$(IA32_NR_syscalls-1),%rax
-	ja	ia32_badsys
+
 sysenter_do_call:
 	/* 32bit syscall -> 64bit C ABI argument conversion */
 	movl	%edi,%r8d	/* arg5 */
@@ -179,8 +150,11 @@ sysenter_do_call:
 	movl	%ebx,%edi	/* arg1 */
 	movl	%edx,%edx	/* arg3 (zero extension) */
 sysenter_dispatch:
+	cmpq	$(IA32_NR_syscalls-1),%rax
+	ja	1f
 	call	*ia32_sys_call_table(,%rax,8)
 	movq	%rax,RAX(%rsp)
+1:
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
@@ -198,14 +172,12 @@ sysexit_from_sys_call:
 	 */
 	andl    $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
 	movl	RIP(%rsp),%ecx		/* User %eip */
-	CFI_REGISTER rip,rcx
 	RESTORE_RSI_RDI
 	xorl	%edx,%edx		/* avoid info leaks */
 	xorq	%r8,%r8
 	xorq	%r9,%r9
 	xorq	%r10,%r10
 	movl	EFLAGS(%rsp),%r11d	/* User eflags */
-	/*CFI_RESTORE rflags*/
 	TRACE_IRQS_ON
 
 	/*
@@ -236,8 +208,6 @@ sysexit_from_sys_call:
 	 */
 	USERGS_SYSRET32
 
-	CFI_RESTORE_STATE
-
 #ifdef CONFIG_AUDITSYSCALL
 	.macro auditsys_entry_common
 	movl %esi,%r8d			/* 5th arg: 4th syscall arg */
@@ -247,9 +217,7 @@ sysexit_from_sys_call:
 	movl %ebx,%esi			/* 2nd arg: 1st syscall arg */
 	movl %eax,%edi			/* 1st arg: syscall number */
 	call __audit_syscall_entry
-	movl RAX(%rsp),%eax	/* reload syscall number */
-	cmpq $(IA32_NR_syscalls-1),%rax
-	ja ia32_badsys
+	movl ORIG_RAX(%rsp),%eax	/* reload syscall number */
 	movl %ebx,%edi			/* reload 1st syscall arg */
 	movl RCX(%rsp),%esi	/* reload 2nd syscall arg */
 	movl RDX(%rsp),%edx	/* reload 3rd syscall arg */
@@ -289,8 +257,8 @@ sysexit_audit:
 #endif
 
 sysenter_fix_flags:
-	pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED)
-	popfq_cfi
+	pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED)
+	popfq
 	jmp sysenter_flags_fixed
 
 sysenter_tracesys:
@@ -300,15 +268,11 @@ sysenter_tracesys:
 #endif
 	SAVE_EXTRA_REGS
 	CLEAR_RREGS
-	movq	$-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
 	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
 	call	syscall_trace_enter
 	LOAD_ARGS32  /* reload args from stack in case ptrace changed it */
 	RESTORE_EXTRA_REGS
-	cmpq	$(IA32_NR_syscalls-1),%rax
-	ja	int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
 	jmp	sysenter_do_call
-	CFI_ENDPROC
 ENDPROC(ia32_sysenter_target)
 
 /*
@@ -342,12 +306,6 @@ ENDPROC(ia32_sysenter_target)
  * with the int 0x80 path.
  */
 ENTRY(ia32_cstar_target)
-	CFI_STARTPROC32	simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,0
-	CFI_REGISTER	rip,rcx
-	/*CFI_REGISTER	rflags,r11*/
-
 	/*
 	 * Interrupts are off on entry.
 	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
@@ -355,30 +313,26 @@ ENTRY(ia32_cstar_target)
 	 */
 	SWAPGS_UNSAFE_STACK
 	movl	%esp,%r8d
-	CFI_REGISTER	rsp,r8
-	movq	PER_CPU_VAR(kernel_stack),%rsp
+	movq	PER_CPU_VAR(cpu_current_top_of_stack),%rsp
 	ENABLE_INTERRUPTS(CLBR_NONE)
 
 	/* Zero-extending 32-bit regs, do not remove */
 	movl	%eax,%eax
 
 	/* Construct struct pt_regs on stack */
-	pushq_cfi	$__USER32_DS		/* pt_regs->ss */
-	pushq_cfi	%r8			/* pt_regs->sp */
-	CFI_REL_OFFSET rsp,0
-	pushq_cfi	%r11			/* pt_regs->flags */
-	pushq_cfi	$__USER32_CS		/* pt_regs->cs */
-	pushq_cfi	%rcx			/* pt_regs->ip */
-	CFI_REL_OFFSET rip,0
-	pushq_cfi_reg	rax			/* pt_regs->orig_ax */
-	pushq_cfi_reg	rdi			/* pt_regs->di */
-	pushq_cfi_reg	rsi			/* pt_regs->si */
-	pushq_cfi_reg	rdx			/* pt_regs->dx */
-	pushq_cfi_reg	rbp			/* pt_regs->cx */
+	pushq	$__USER32_DS		/* pt_regs->ss */
+	pushq	%r8			/* pt_regs->sp */
+	pushq	%r11			/* pt_regs->flags */
+	pushq	$__USER32_CS		/* pt_regs->cs */
+	pushq	%rcx			/* pt_regs->ip */
+	pushq	%rax			/* pt_regs->orig_ax */
+	pushq	%rdi			/* pt_regs->di */
+	pushq	%rsi			/* pt_regs->si */
+	pushq	%rdx			/* pt_regs->dx */
+	pushq	%rbp			/* pt_regs->cx */
 	movl	%ebp,%ecx
-	pushq_cfi_reg	rax			/* pt_regs->ax */
+	pushq	$-ENOSYS		/* pt_regs->ax */
 	sub	$(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
-	CFI_ADJUST_CFA_OFFSET 10*8
 
 	/*
 	 * no need to do an access_ok check here because r8 has been
@@ -390,10 +344,8 @@ ENTRY(ia32_cstar_target)
 	ASM_CLAC
 	orl     $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
 	testl   $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	CFI_REMEMBER_STATE
 	jnz   cstar_tracesys
-	cmpq $IA32_NR_syscalls-1,%rax
-	ja  ia32_badsys
+
 cstar_do_call:
 	/* 32bit syscall -> 64bit C ABI argument conversion */
 	movl	%edi,%r8d	/* arg5 */
@@ -402,8 +354,11 @@ cstar_do_call:
 	movl	%ebx,%edi	/* arg1 */
 	movl	%edx,%edx	/* arg3 (zero extension) */
 cstar_dispatch:
+	cmpq	$(IA32_NR_syscalls-1),%rax
+	ja	1f
 	call *ia32_sys_call_table(,%rax,8)
 	movq %rax,RAX(%rsp)
+1:
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
@@ -412,15 +367,12 @@ sysretl_from_sys_call:
 	andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
 	RESTORE_RSI_RDI_RDX
 	movl RIP(%rsp),%ecx
-	CFI_REGISTER rip,rcx
 	movl EFLAGS(%rsp),%r11d
-	/*CFI_REGISTER rflags,r11*/
 	xorq	%r10,%r10
 	xorq	%r9,%r9
 	xorq	%r8,%r8
 	TRACE_IRQS_ON
 	movl RSP(%rsp),%esp
-	CFI_RESTORE rsp
 	/*
 	 * 64bit->32bit SYSRET restores eip from ecx,
 	 * eflags from r11 (but RF and VM bits are forced to 0),
@@ -439,7 +391,6 @@ sysretl_from_sys_call:
 
 #ifdef CONFIG_AUDITSYSCALL
 cstar_auditsys:
-	CFI_RESTORE_STATE
 	movl %r9d,R9(%rsp)	/* register to be clobbered by call */
 	auditsys_entry_common
 	movl R9(%rsp),%r9d	/* reload 6th syscall arg */
@@ -457,14 +408,11 @@ cstar_tracesys:
 	xchgl %r9d,%ebp
 	SAVE_EXTRA_REGS
 	CLEAR_RREGS r9
-	movq $-ENOSYS,RAX(%rsp)	/* ptrace can change this for a bad syscall */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
 	call syscall_trace_enter
 	LOAD_ARGS32 1	/* reload args from stack in case ptrace changed it */
 	RESTORE_EXTRA_REGS
 	xchgl %ebp,%r9d
-	cmpq $(IA32_NR_syscalls-1),%rax
-	ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
 	jmp cstar_do_call
 END(ia32_cstar_target)
 				
@@ -472,7 +420,6 @@ ia32_badarg:
 	ASM_CLAC
 	movq $-EFAULT,%rax
 	jmp ia32_sysret
-	CFI_ENDPROC
 
 /*
  * Emulated IA32 system calls via int 0x80.
@@ -496,15 +443,6 @@ ia32_badarg:
  */
 
 ENTRY(ia32_syscall)
-	CFI_STARTPROC32	simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,5*8
-	/*CFI_REL_OFFSET	ss,4*8 */
-	CFI_REL_OFFSET	rsp,3*8
-	/*CFI_REL_OFFSET	rflags,2*8 */
-	/*CFI_REL_OFFSET	cs,1*8 */
-	CFI_REL_OFFSET	rip,0*8
-
 	/*
 	 * Interrupts are off on entry.
 	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
@@ -518,21 +456,18 @@ ENTRY(ia32_syscall)
 	movl	%eax,%eax
 
 	/* Construct struct pt_regs on stack (iret frame is already on stack) */
-	pushq_cfi_reg	rax			/* pt_regs->orig_ax */
-	pushq_cfi_reg	rdi			/* pt_regs->di */
-	pushq_cfi_reg	rsi			/* pt_regs->si */
-	pushq_cfi_reg	rdx			/* pt_regs->dx */
-	pushq_cfi_reg	rcx			/* pt_regs->cx */
-	pushq_cfi_reg	rax			/* pt_regs->ax */
+	pushq	%rax			/* pt_regs->orig_ax */
+	pushq	%rdi			/* pt_regs->di */
+	pushq	%rsi			/* pt_regs->si */
+	pushq	%rdx			/* pt_regs->dx */
+	pushq	%rcx			/* pt_regs->cx */
+	pushq	$-ENOSYS		/* pt_regs->ax */
 	cld
 	sub	$(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
-	CFI_ADJUST_CFA_OFFSET 10*8
 
 	orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
 	testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz ia32_tracesys
-	cmpq $(IA32_NR_syscalls-1),%rax
-	ja ia32_badsys
 ia32_do_call:
 	/* 32bit syscall -> 64bit C ABI argument conversion */
 	movl %edi,%r8d	/* arg5 */
@@ -540,9 +475,12 @@ ia32_do_call:
 	xchg %ecx,%esi	/* rsi:arg2, rcx:arg4 */
 	movl %ebx,%edi	/* arg1 */
 	movl %edx,%edx	/* arg3 (zero extension) */
+	cmpq	$(IA32_NR_syscalls-1),%rax
+	ja	1f
 	call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
 ia32_sysret:
 	movq %rax,RAX(%rsp)
+1:
 ia32_ret_from_sys_call:
 	CLEAR_RREGS
 	jmp int_ret_from_sys_call
@@ -550,23 +488,13 @@ ia32_ret_from_sys_call:
 ia32_tracesys:
 	SAVE_EXTRA_REGS
 	CLEAR_RREGS
-	movq $-ENOSYS,RAX(%rsp)	/* ptrace can change this for a bad syscall */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
 	call syscall_trace_enter
 	LOAD_ARGS32	/* reload args from stack in case ptrace changed it */
 	RESTORE_EXTRA_REGS
-	cmpq $(IA32_NR_syscalls-1),%rax
-	ja  int_ret_from_sys_call	/* ia32_tracesys has set RAX(%rsp) */
 	jmp ia32_do_call
 END(ia32_syscall)
 
-ia32_badsys:
-	movq $0,ORIG_RAX(%rsp)
-	movq $-ENOSYS,%rax
-	jmp ia32_sysret
-
-	CFI_ENDPROC
-	
 	.macro PTREGSCALL label, func
 	ALIGN
 GLOBAL(\label)
@@ -574,8 +502,6 @@ GLOBAL(\label)
 	jmp  ia32_ptregs_common	
 	.endm
 
-	CFI_STARTPROC32
-
 	PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
 	PTREGSCALL stub32_sigreturn, sys32_sigreturn
 	PTREGSCALL stub32_fork, sys_fork
@@ -589,23 +515,8 @@ GLOBAL(stub32_clone)
 
 	ALIGN
 ia32_ptregs_common:
-	CFI_ENDPROC
-	CFI_STARTPROC32	simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,SIZEOF_PTREGS
-	CFI_REL_OFFSET	rax,RAX
-	CFI_REL_OFFSET	rcx,RCX
-	CFI_REL_OFFSET	rdx,RDX
-	CFI_REL_OFFSET	rsi,RSI
-	CFI_REL_OFFSET	rdi,RDI
-	CFI_REL_OFFSET	rip,RIP
-/*	CFI_REL_OFFSET	cs,CS*/
-/*	CFI_REL_OFFSET	rflags,EFLAGS*/
-	CFI_REL_OFFSET	rsp,RSP
-/*	CFI_REL_OFFSET	ss,SS*/
 	SAVE_EXTRA_REGS 8
 	call *%rax
 	RESTORE_EXTRA_REGS 8
 	ret
-	CFI_ENDPROC
 END(ia32_ptregs_common)
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index bdf02eeee76519..e7636bac7372d4 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -18,6 +18,12 @@
 	.endm
 #endif
 
+/*
+ * Issue one struct alt_instr descriptor entry (need to put it into
+ * the section .altinstructions, see below). This entry contains
+ * enough information for the alternatives patching code to patch an
+ * instruction. See apply_alternatives().
+ */
 .macro altinstruction_entry orig alt feature orig_len alt_len pad_len
 	.long \orig - .
 	.long \alt - .
@@ -27,6 +33,12 @@
 	.byte \pad_len
 .endm
 
+/*
+ * Define an alternative between two instructions. If @feature is
+ * present, early code in apply_alternatives() replaces @oldinstr with
+ * @newinstr. ".skip" directive takes care of proper instruction padding
+ * in case @newinstr is longer than @oldinstr.
+ */
 .macro ALTERNATIVE oldinstr, newinstr, feature
 140:
 	\oldinstr
@@ -55,6 +67,12 @@
  */
 #define alt_max_short(a, b)	((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
 
+
+/*
+ * Same as ALTERNATIVE macro above but for two alternatives. If CPU
+ * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has
+ * @feature2, it replaces @oldinstr with @feature2.
+ */
 .macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
 140:
 	\oldinstr
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index ba32af062f61d6..7bfc85bbb8ffc0 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -52,6 +52,12 @@ struct alt_instr {
 	u8  padlen;		/* length of build-time padding */
 } __packed;
 
+/*
+ * Debug flag that can be tested to see whether alternative
+ * instructions were patched in already:
+ */
+extern int alternatives_patched;
+
 extern void alternative_instructions(void);
 extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
 
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index aaac3b2fb746d3..1a5da2e63aeeeb 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -98,11 +98,22 @@ static inline u16 amd_get_node_id(struct pci_dev *pdev)
 	return 0;
 }
 
+static inline bool amd_gart_present(void)
+{
+	/* GART present only on Fam15h, upto model 0fh */
+	if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
+	    (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10))
+		return true;
+
+	return false;
+}
+
 #else
 
 #define amd_nb_num(x)		0
 #define amd_nb_has_feature(x)	false
 #define node_to_amd_nb(x)	NULL
+#define amd_gart_present(x)	false
 
 #endif
 
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 976b86a325e55c..c8393634ca0c50 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -644,6 +644,12 @@ static inline void entering_ack_irq(void)
 	entering_irq();
 }
 
+static inline void ipi_entering_ack_irq(void)
+{
+	ack_APIC_irq();
+	irq_enter();
+}
+
 static inline void exiting_irq(void)
 {
 	irq_exit();
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 7730c1c5c83aa7..189679aba70353 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -63,6 +63,31 @@
 	_ASM_ALIGN ;						\
 	_ASM_PTR (entry);					\
 	.popsection
+
+.macro ALIGN_DESTINATION
+	/* check for bad alignment of destination */
+	movl %edi,%ecx
+	andl $7,%ecx
+	jz 102f				/* already aligned */
+	subl $8,%ecx
+	negl %ecx
+	subl %ecx,%edx
+100:	movb (%rsi),%al
+101:	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 100b
+102:
+	.section .fixup,"ax"
+103:	addl %ecx,%edx			/* ecx is zerorest also */
+	jmp copy_user_handle_tail
+	.previous
+
+	_ASM_EXTABLE(100b,103b)
+	_ASM_EXTABLE(101b,103b)
+	.endm
+
 #else
 # define _ASM_EXTABLE(from,to)					\
 	" .pushsection \"__ex_table\",\"a\"\n"			\
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 5e5cd123fdfbc2..e9168955c42f4e 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -22,7 +22,7 @@
  *
  * Atomically reads the value of @v.
  */
-static inline int atomic_read(const atomic_t *v)
+static __always_inline int atomic_read(const atomic_t *v)
 {
 	return ACCESS_ONCE((v)->counter);
 }
@@ -34,7 +34,7 @@ static inline int atomic_read(const atomic_t *v)
  *
  * Atomically sets the value of @v to @i.
  */
-static inline void atomic_set(atomic_t *v, int i)
+static __always_inline void atomic_set(atomic_t *v, int i)
 {
 	v->counter = i;
 }
@@ -46,7 +46,7 @@ static inline void atomic_set(atomic_t *v, int i)
  *
  * Atomically adds @i to @v.
  */
-static inline void atomic_add(int i, atomic_t *v)
+static __always_inline void atomic_add(int i, atomic_t *v)
 {
 	asm volatile(LOCK_PREFIX "addl %1,%0"
 		     : "+m" (v->counter)
@@ -60,7 +60,7 @@ static inline void atomic_add(int i, atomic_t *v)
  *
  * Atomically subtracts @i from @v.
  */
-static inline void atomic_sub(int i, atomic_t *v)
+static __always_inline void atomic_sub(int i, atomic_t *v)
 {
 	asm volatile(LOCK_PREFIX "subl %1,%0"
 		     : "+m" (v->counter)
@@ -76,7 +76,7 @@ static inline void atomic_sub(int i, atomic_t *v)
  * true if the result is zero, or false for all
  * other cases.
  */
-static inline int atomic_sub_and_test(int i, atomic_t *v)
+static __always_inline int atomic_sub_and_test(int i, atomic_t *v)
 {
 	GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", "e");
 }
@@ -87,7 +87,7 @@ static inline int atomic_sub_and_test(int i, atomic_t *v)
  *
  * Atomically increments @v by 1.
  */
-static inline void atomic_inc(atomic_t *v)
+static __always_inline void atomic_inc(atomic_t *v)
 {
 	asm volatile(LOCK_PREFIX "incl %0"
 		     : "+m" (v->counter));
@@ -99,7 +99,7 @@ static inline void atomic_inc(atomic_t *v)
  *
  * Atomically decrements @v by 1.
  */
-static inline void atomic_dec(atomic_t *v)
+static __always_inline void atomic_dec(atomic_t *v)
 {
 	asm volatile(LOCK_PREFIX "decl %0"
 		     : "+m" (v->counter));
@@ -113,7 +113,7 @@ static inline void atomic_dec(atomic_t *v)
  * returns true if the result is 0, or false for all other
  * cases.
  */
-static inline int atomic_dec_and_test(atomic_t *v)
+static __always_inline int atomic_dec_and_test(atomic_t *v)
 {
 	GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e");
 }
@@ -126,7 +126,7 @@ static inline int atomic_dec_and_test(atomic_t *v)
  * and returns true if the result is zero, or false for all
  * other cases.
  */
-static inline int atomic_inc_and_test(atomic_t *v)
+static __always_inline int atomic_inc_and_test(atomic_t *v)
 {
 	GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", "e");
 }
@@ -140,7 +140,7 @@ static inline int atomic_inc_and_test(atomic_t *v)
  * if the result is negative, or false when
  * result is greater than or equal to zero.
  */
-static inline int atomic_add_negative(int i, atomic_t *v)
+static __always_inline int atomic_add_negative(int i, atomic_t *v)
 {
 	GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", "s");
 }
@@ -152,7 +152,7 @@ static inline int atomic_add_negative(int i, atomic_t *v)
  *
  * Atomically adds @i to @v and returns @i + @v
  */
-static inline int atomic_add_return(int i, atomic_t *v)
+static __always_inline int atomic_add_return(int i, atomic_t *v)
 {
 	return i + xadd(&v->counter, i);
 }
@@ -164,7 +164,7 @@ static inline int atomic_add_return(int i, atomic_t *v)
  *
  * Atomically subtracts @i from @v and returns @v - @i
  */
-static inline int atomic_sub_return(int i, atomic_t *v)
+static __always_inline int atomic_sub_return(int i, atomic_t *v)
 {
 	return atomic_add_return(-i, v);
 }
@@ -172,7 +172,7 @@ static inline int atomic_sub_return(int i, atomic_t *v)
 #define atomic_inc_return(v)  (atomic_add_return(1, v))
 #define atomic_dec_return(v)  (atomic_sub_return(1, v))
 
-static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
+static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 {
 	return cmpxchg(&v->counter, old, new);
 }
@@ -191,7 +191,7 @@ static inline int atomic_xchg(atomic_t *v, int new)
  * Atomically adds @a to @v, so long as @v was not already @u.
  * Returns the old value of @v.
  */
-static inline int __atomic_add_unless(atomic_t *v, int a, int u)
+static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
 {
 	int c, old;
 	c = atomic_read(v);
@@ -213,7 +213,7 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
  * Atomically adds 1 to @v
  * Returns the new value of @u
  */
-static inline short int atomic_inc_short(short int *v)
+static __always_inline short int atomic_inc_short(short int *v)
 {
 	asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v));
 	return *v;
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index f8d273e18516de..b965f9e03f2a04 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -40,7 +40,7 @@ static inline void atomic64_set(atomic64_t *v, long i)
  *
  * Atomically adds @i to @v.
  */
-static inline void atomic64_add(long i, atomic64_t *v)
+static __always_inline void atomic64_add(long i, atomic64_t *v)
 {
 	asm volatile(LOCK_PREFIX "addq %1,%0"
 		     : "=m" (v->counter)
@@ -81,7 +81,7 @@ static inline int atomic64_sub_and_test(long i, atomic64_t *v)
  *
  * Atomically increments @v by 1.
  */
-static inline void atomic64_inc(atomic64_t *v)
+static __always_inline void atomic64_inc(atomic64_t *v)
 {
 	asm volatile(LOCK_PREFIX "incq %0"
 		     : "=m" (v->counter)
@@ -94,7 +94,7 @@ static inline void atomic64_inc(atomic64_t *v)
  *
  * Atomically decrements @v by 1.
  */
-static inline void atomic64_dec(atomic64_t *v)
+static __always_inline void atomic64_dec(atomic64_t *v)
 {
 	asm volatile(LOCK_PREFIX "decq %0"
 		     : "=m" (v->counter)
@@ -148,7 +148,7 @@ static inline int atomic64_add_negative(long i, atomic64_t *v)
  *
  * Atomically adds @i to @v and returns @i + @v
  */
-static inline long atomic64_add_return(long i, atomic64_t *v)
+static __always_inline long atomic64_add_return(long i, atomic64_t *v)
 {
 	return i + xadd(&v->counter, i);
 }
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 959e45b81fe291..e51a8f803f55e3 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -35,12 +35,12 @@
 #define smp_mb()	mb()
 #define smp_rmb()	dma_rmb()
 #define smp_wmb()	barrier()
-#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
+#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0)
 #else /* !SMP */
 #define smp_mb()	barrier()
 #define smp_rmb()	barrier()
 #define smp_wmb()	barrier()
-#define set_mb(var, value) do { var = value; barrier(); } while (0)
+#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0)
 #endif /* SMP */
 
 #define read_barrier_depends()		do { } while (0)
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 1c8b50edb2db3c..f4e6308c4200bc 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -46,8 +46,6 @@ For 32-bit we have the following conventions - kernel is built with
 
 */
 
-#include <asm/dwarf2.h>
-
 #ifdef CONFIG_X86_64
 
 /*
@@ -91,28 +89,27 @@ For 32-bit we have the following conventions - kernel is built with
 #define SIZEOF_PTREGS	21*8
 
 	.macro ALLOC_PT_GPREGS_ON_STACK addskip=0
-	subq	$15*8+\addskip, %rsp
-	CFI_ADJUST_CFA_OFFSET 15*8+\addskip
+	addq	$-(15*8+\addskip), %rsp
 	.endm
 
 	.macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1
 	.if \r11
-	movq_cfi r11, 6*8+\offset
+	movq %r11, 6*8+\offset(%rsp)
 	.endif
 	.if \r8910
-	movq_cfi r10, 7*8+\offset
-	movq_cfi r9,  8*8+\offset
-	movq_cfi r8,  9*8+\offset
+	movq %r10, 7*8+\offset(%rsp)
+	movq %r9,  8*8+\offset(%rsp)
+	movq %r8,  9*8+\offset(%rsp)
 	.endif
 	.if \rax
-	movq_cfi rax, 10*8+\offset
+	movq %rax, 10*8+\offset(%rsp)
 	.endif
 	.if \rcx
-	movq_cfi rcx, 11*8+\offset
+	movq %rcx, 11*8+\offset(%rsp)
 	.endif
-	movq_cfi rdx, 12*8+\offset
-	movq_cfi rsi, 13*8+\offset
-	movq_cfi rdi, 14*8+\offset
+	movq %rdx, 12*8+\offset(%rsp)
+	movq %rsi, 13*8+\offset(%rsp)
+	movq %rdi, 14*8+\offset(%rsp)
 	.endm
 	.macro SAVE_C_REGS offset=0
 	SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
@@ -131,24 +128,24 @@ For 32-bit we have the following conventions - kernel is built with
 	.endm
 
 	.macro SAVE_EXTRA_REGS offset=0
-	movq_cfi r15, 0*8+\offset
-	movq_cfi r14, 1*8+\offset
-	movq_cfi r13, 2*8+\offset
-	movq_cfi r12, 3*8+\offset
-	movq_cfi rbp, 4*8+\offset
-	movq_cfi rbx, 5*8+\offset
+	movq %r15, 0*8+\offset(%rsp)
+	movq %r14, 1*8+\offset(%rsp)
+	movq %r13, 2*8+\offset(%rsp)
+	movq %r12, 3*8+\offset(%rsp)
+	movq %rbp, 4*8+\offset(%rsp)
+	movq %rbx, 5*8+\offset(%rsp)
 	.endm
 	.macro SAVE_EXTRA_REGS_RBP offset=0
-	movq_cfi rbp, 4*8+\offset
+	movq %rbp, 4*8+\offset(%rsp)
 	.endm
 
 	.macro RESTORE_EXTRA_REGS offset=0
-	movq_cfi_restore 0*8+\offset, r15
-	movq_cfi_restore 1*8+\offset, r14
-	movq_cfi_restore 2*8+\offset, r13
-	movq_cfi_restore 3*8+\offset, r12
-	movq_cfi_restore 4*8+\offset, rbp
-	movq_cfi_restore 5*8+\offset, rbx
+	movq 0*8+\offset(%rsp), %r15
+	movq 1*8+\offset(%rsp), %r14
+	movq 2*8+\offset(%rsp), %r13
+	movq 3*8+\offset(%rsp), %r12
+	movq 4*8+\offset(%rsp), %rbp
+	movq 5*8+\offset(%rsp), %rbx
 	.endm
 
 	.macro ZERO_EXTRA_REGS
@@ -162,24 +159,24 @@ For 32-bit we have the following conventions - kernel is built with
 
 	.macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
 	.if \rstor_r11
-	movq_cfi_restore 6*8, r11
+	movq 6*8(%rsp), %r11
 	.endif
 	.if \rstor_r8910
-	movq_cfi_restore 7*8, r10
-	movq_cfi_restore 8*8, r9
-	movq_cfi_restore 9*8, r8
+	movq 7*8(%rsp), %r10
+	movq 8*8(%rsp), %r9
+	movq 9*8(%rsp), %r8
 	.endif
 	.if \rstor_rax
-	movq_cfi_restore 10*8, rax
+	movq 10*8(%rsp), %rax
 	.endif
 	.if \rstor_rcx
-	movq_cfi_restore 11*8, rcx
+	movq 11*8(%rsp), %rcx
 	.endif
 	.if \rstor_rdx
-	movq_cfi_restore 12*8, rdx
+	movq 12*8(%rsp), %rdx
 	.endif
-	movq_cfi_restore 13*8, rsi
-	movq_cfi_restore 14*8, rdi
+	movq 13*8(%rsp), %rsi
+	movq 14*8(%rsp), %rdi
 	.endm
 	.macro RESTORE_C_REGS
 	RESTORE_C_REGS_HELPER 1,1,1,1,1
@@ -204,8 +201,7 @@ For 32-bit we have the following conventions - kernel is built with
 	.endm
 
 	.macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
-	addq $15*8+\addskip, %rsp
-	CFI_ADJUST_CFA_OFFSET -(15*8+\addskip)
+	subq $-(15*8+\addskip), %rsp
 	.endm
 
 	.macro icebp
@@ -224,23 +220,23 @@ For 32-bit we have the following conventions - kernel is built with
  */
 
 	.macro SAVE_ALL
-	pushl_cfi_reg eax
-	pushl_cfi_reg ebp
-	pushl_cfi_reg edi
-	pushl_cfi_reg esi
-	pushl_cfi_reg edx
-	pushl_cfi_reg ecx
-	pushl_cfi_reg ebx
+	pushl %eax
+	pushl %ebp
+	pushl %edi
+	pushl %esi
+	pushl %edx
+	pushl %ecx
+	pushl %ebx
 	.endm
 
 	.macro RESTORE_ALL
-	popl_cfi_reg ebx
-	popl_cfi_reg ecx
-	popl_cfi_reg edx
-	popl_cfi_reg esi
-	popl_cfi_reg edi
-	popl_cfi_reg ebp
-	popl_cfi_reg eax
+	popl %ebx
+	popl %ecx
+	popl %edx
+	popl %esi
+	popl %edi
+	popl %ebp
+	popl %eax
 	.endm
 
 #endif /* CONFIG_X86_64 */
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index 99c105d78b7e12..ad19841eddfe14 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -4,8 +4,6 @@
 #include <linux/compiler.h>
 #include <asm/alternative.h> /* Provides LOCK_PREFIX */
 
-#define __HAVE_ARCH_CMPXCHG 1
-
 /*
  * Non-existant functions to indicate usage errors at link time
  * (or compile-time if the compiler implements __compiletime_error().
diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h
index 1eef55596e82ca..03bb1065c33528 100644
--- a/arch/x86/include/asm/crypto/glue_helper.h
+++ b/arch/x86/include/asm/crypto/glue_helper.h
@@ -7,7 +7,7 @@
 
 #include <linux/kernel.h>
 #include <linux/crypto.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 #include <crypto/b128ops.h>
 
 typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src);
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 808dae63eeea6f..1f5b7287d1ad8d 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -127,50 +127,14 @@ static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp)
 
 #define dma_alloc_coherent(d,s,h,f)	dma_alloc_attrs(d,s,h,f,NULL)
 
-static inline void *
+void *
 dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
-		gfp_t gfp, struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-	void *memory;
-
-	gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
-
-	if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
-		return memory;
-
-	if (!dev)
-		dev = &x86_dma_fallback_dev;
-
-	if (!is_device_dma_capable(dev))
-		return NULL;
-
-	if (!ops->alloc)
-		return NULL;
-
-	memory = ops->alloc(dev, size, dma_handle,
-			    dma_alloc_coherent_gfp_flags(dev, gfp), attrs);
-	debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
-
-	return memory;
-}
+		gfp_t gfp, struct dma_attrs *attrs);
 
 #define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
 
-static inline void dma_free_attrs(struct device *dev, size_t size,
-				  void *vaddr, dma_addr_t bus,
-				  struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-
-	WARN_ON(irqs_disabled());       /* for portability */
-
-	if (dma_release_from_coherent(dev, get_order(size), vaddr))
-		return;
-
-	debug_dma_free_coherent(dev, size, vaddr, bus);
-	if (ops->free)
-		ops->free(dev, size, vaddr, bus, attrs);
-}
+void dma_free_attrs(struct device *dev, size_t size,
+		    void *vaddr, dma_addr_t bus,
+		    struct dma_attrs *attrs);
 
 #endif
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
deleted file mode 100644
index de1cdaf4d74346..00000000000000
--- a/arch/x86/include/asm/dwarf2.h
+++ /dev/null
@@ -1,170 +0,0 @@
-#ifndef _ASM_X86_DWARF2_H
-#define _ASM_X86_DWARF2_H
-
-#ifndef __ASSEMBLY__
-#warning "asm/dwarf2.h should be only included in pure assembly files"
-#endif
-
-/*
- * Macros for dwarf2 CFI unwind table entries.
- * See "as.info" for details on these pseudo ops. Unfortunately
- * they are only supported in very new binutils, so define them
- * away for older version.
- */
-
-#ifdef CONFIG_AS_CFI
-
-#define CFI_STARTPROC		.cfi_startproc
-#define CFI_ENDPROC		.cfi_endproc
-#define CFI_DEF_CFA		.cfi_def_cfa
-#define CFI_DEF_CFA_REGISTER	.cfi_def_cfa_register
-#define CFI_DEF_CFA_OFFSET	.cfi_def_cfa_offset
-#define CFI_ADJUST_CFA_OFFSET	.cfi_adjust_cfa_offset
-#define CFI_OFFSET		.cfi_offset
-#define CFI_REL_OFFSET		.cfi_rel_offset
-#define CFI_REGISTER		.cfi_register
-#define CFI_RESTORE		.cfi_restore
-#define CFI_REMEMBER_STATE	.cfi_remember_state
-#define CFI_RESTORE_STATE	.cfi_restore_state
-#define CFI_UNDEFINED		.cfi_undefined
-#define CFI_ESCAPE		.cfi_escape
-
-#ifdef CONFIG_AS_CFI_SIGNAL_FRAME
-#define CFI_SIGNAL_FRAME	.cfi_signal_frame
-#else
-#define CFI_SIGNAL_FRAME
-#endif
-
-#if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__)
-	/*
-	 * Emit CFI data in .debug_frame sections, not .eh_frame sections.
-	 * The latter we currently just discard since we don't do DWARF
-	 * unwinding at runtime.  So only the offline DWARF information is
-	 * useful to anyone.  Note we should not use this directive if this
-	 * file is used in the vDSO assembly, or if vmlinux.lds.S gets
-	 * changed so it doesn't discard .eh_frame.
-	 */
-	.cfi_sections .debug_frame
-#endif
-
-#else
-
-/*
- * Due to the structure of pre-exisiting code, don't use assembler line
- * comment character # to ignore the arguments. Instead, use a dummy macro.
- */
-.macro cfi_ignore a=0, b=0, c=0, d=0
-.endm
-
-#define CFI_STARTPROC		cfi_ignore
-#define CFI_ENDPROC		cfi_ignore
-#define CFI_DEF_CFA		cfi_ignore
-#define CFI_DEF_CFA_REGISTER	cfi_ignore
-#define CFI_DEF_CFA_OFFSET	cfi_ignore
-#define CFI_ADJUST_CFA_OFFSET	cfi_ignore
-#define CFI_OFFSET		cfi_ignore
-#define CFI_REL_OFFSET		cfi_ignore
-#define CFI_REGISTER		cfi_ignore
-#define CFI_RESTORE		cfi_ignore
-#define CFI_REMEMBER_STATE	cfi_ignore
-#define CFI_RESTORE_STATE	cfi_ignore
-#define CFI_UNDEFINED		cfi_ignore
-#define CFI_ESCAPE		cfi_ignore
-#define CFI_SIGNAL_FRAME	cfi_ignore
-
-#endif
-
-/*
- * An attempt to make CFI annotations more or less
- * correct and shorter. It is implied that you know
- * what you're doing if you use them.
- */
-#ifdef __ASSEMBLY__
-#ifdef CONFIG_X86_64
-	.macro pushq_cfi reg
-	pushq \reg
-	CFI_ADJUST_CFA_OFFSET 8
-	.endm
-
-	.macro pushq_cfi_reg reg
-	pushq %\reg
-	CFI_ADJUST_CFA_OFFSET 8
-	CFI_REL_OFFSET \reg, 0
-	.endm
-
-	.macro popq_cfi reg
-	popq \reg
-	CFI_ADJUST_CFA_OFFSET -8
-	.endm
-
-	.macro popq_cfi_reg reg
-	popq %\reg
-	CFI_ADJUST_CFA_OFFSET -8
-	CFI_RESTORE \reg
-	.endm
-
-	.macro pushfq_cfi
-	pushfq
-	CFI_ADJUST_CFA_OFFSET 8
-	.endm
-
-	.macro popfq_cfi
-	popfq
-	CFI_ADJUST_CFA_OFFSET -8
-	.endm
-
-	.macro movq_cfi reg offset=0
-	movq %\reg, \offset(%rsp)
-	CFI_REL_OFFSET \reg, \offset
-	.endm
-
-	.macro movq_cfi_restore offset reg
-	movq \offset(%rsp), %\reg
-	CFI_RESTORE \reg
-	.endm
-#else /*!CONFIG_X86_64*/
-	.macro pushl_cfi reg
-	pushl \reg
-	CFI_ADJUST_CFA_OFFSET 4
-	.endm
-
-	.macro pushl_cfi_reg reg
-	pushl %\reg
-	CFI_ADJUST_CFA_OFFSET 4
-	CFI_REL_OFFSET \reg, 0
-	.endm
-
-	.macro popl_cfi reg
-	popl \reg
-	CFI_ADJUST_CFA_OFFSET -4
-	.endm
-
-	.macro popl_cfi_reg reg
-	popl %\reg
-	CFI_ADJUST_CFA_OFFSET -4
-	CFI_RESTORE \reg
-	.endm
-
-	.macro pushfl_cfi
-	pushfl
-	CFI_ADJUST_CFA_OFFSET 4
-	.endm
-
-	.macro popfl_cfi
-	popfl
-	CFI_ADJUST_CFA_OFFSET -4
-	.endm
-
-	.macro movl_cfi reg offset=0
-	movl %\reg, \offset(%esp)
-	CFI_REL_OFFSET \reg, \offset
-	.endm
-
-	.macro movl_cfi_restore offset reg
-	movl \offset(%esp), %\reg
-	CFI_RESTORE \reg
-	.endm
-#endif /*!CONFIG_X86_64*/
-#endif /*__ASSEMBLY__*/
-
-#endif /* _ASM_X86_DWARF2_H */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 3738b138b843d4..155162ea0e0029 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -1,7 +1,7 @@
 #ifndef _ASM_X86_EFI_H
 #define _ASM_X86_EFI_H
 
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 #include <asm/pgtable.h>
 
 /*
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index dc5fa661465f9a..df002992d8fd3d 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -23,6 +23,8 @@ BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
 #ifdef CONFIG_HAVE_KVM
 BUILD_INTERRUPT3(kvm_posted_intr_ipi, POSTED_INTR_VECTOR,
 		 smp_kvm_posted_intr_ipi)
+BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR,
+		 smp_kvm_posted_intr_wakeup_ipi)
 #endif
 
 /*
@@ -50,4 +52,7 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
 BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
 #endif
 
+#ifdef CONFIG_X86_MCE_AMD
+BUILD_INTERRUPT(deferred_error_interrupt, DEFERRED_ERROR_VECTOR)
+#endif
 #endif
diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
deleted file mode 100644
index da5e96756570be..00000000000000
--- a/arch/x86/include/asm/fpu-internal.h
+++ /dev/null
@@ -1,626 +0,0 @@
-/*
- * Copyright (C) 1994 Linus Torvalds
- *
- * Pentium III FXSR, SSE support
- * General FPU state handling cleanups
- *	Gareth Hughes <gareth@valinux.com>, May 2000
- * x86-64 work by Andi Kleen 2002
- */
-
-#ifndef _FPU_INTERNAL_H
-#define _FPU_INTERNAL_H
-
-#include <linux/kernel_stat.h>
-#include <linux/regset.h>
-#include <linux/compat.h>
-#include <linux/slab.h>
-#include <asm/asm.h>
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-#include <asm/sigcontext.h>
-#include <asm/user.h>
-#include <asm/uaccess.h>
-#include <asm/xsave.h>
-#include <asm/smap.h>
-
-#ifdef CONFIG_X86_64
-# include <asm/sigcontext32.h>
-# include <asm/user32.h>
-struct ksignal;
-int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
-			compat_sigset_t *set, struct pt_regs *regs);
-int ia32_setup_frame(int sig, struct ksignal *ksig,
-		     compat_sigset_t *set, struct pt_regs *regs);
-#else
-# define user_i387_ia32_struct	user_i387_struct
-# define user32_fxsr_struct	user_fxsr_struct
-# define ia32_setup_frame	__setup_frame
-# define ia32_setup_rt_frame	__setup_rt_frame
-#endif
-
-extern unsigned int mxcsr_feature_mask;
-extern void fpu_init(void);
-extern void eager_fpu_init(void);
-
-DECLARE_PER_CPU(struct task_struct *, fpu_owner_task);
-
-extern void convert_from_fxsr(struct user_i387_ia32_struct *env,
-			      struct task_struct *tsk);
-extern void convert_to_fxsr(struct task_struct *tsk,
-			    const struct user_i387_ia32_struct *env);
-
-extern user_regset_active_fn fpregs_active, xfpregs_active;
-extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get,
-				xstateregs_get;
-extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set,
-				 xstateregs_set;
-
-/*
- * xstateregs_active == fpregs_active. Please refer to the comment
- * at the definition of fpregs_active.
- */
-#define xstateregs_active	fpregs_active
-
-#ifdef CONFIG_MATH_EMULATION
-extern void finit_soft_fpu(struct i387_soft_struct *soft);
-#else
-static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
-#endif
-
-/*
- * Must be run with preemption disabled: this clears the fpu_owner_task,
- * on this CPU.
- *
- * This will disable any lazy FPU state restore of the current FPU state,
- * but if the current thread owns the FPU, it will still be saved by.
- */
-static inline void __cpu_disable_lazy_restore(unsigned int cpu)
-{
-	per_cpu(fpu_owner_task, cpu) = NULL;
-}
-
-/*
- * Used to indicate that the FPU state in memory is newer than the FPU
- * state in registers, and the FPU state should be reloaded next time the
- * task is run. Only safe on the current task, or non-running tasks.
- */
-static inline void task_disable_lazy_fpu_restore(struct task_struct *tsk)
-{
-	tsk->thread.fpu.last_cpu = ~0;
-}
-
-static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu)
-{
-	return new == this_cpu_read_stable(fpu_owner_task) &&
-		cpu == new->thread.fpu.last_cpu;
-}
-
-static inline int is_ia32_compat_frame(void)
-{
-	return config_enabled(CONFIG_IA32_EMULATION) &&
-	       test_thread_flag(TIF_IA32);
-}
-
-static inline int is_ia32_frame(void)
-{
-	return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame();
-}
-
-static inline int is_x32_frame(void)
-{
-	return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32);
-}
-
-#define X87_FSW_ES (1 << 7)	/* Exception Summary */
-
-static __always_inline __pure bool use_eager_fpu(void)
-{
-	return static_cpu_has_safe(X86_FEATURE_EAGER_FPU);
-}
-
-static __always_inline __pure bool use_xsaveopt(void)
-{
-	return static_cpu_has_safe(X86_FEATURE_XSAVEOPT);
-}
-
-static __always_inline __pure bool use_xsave(void)
-{
-	return static_cpu_has_safe(X86_FEATURE_XSAVE);
-}
-
-static __always_inline __pure bool use_fxsr(void)
-{
-	return static_cpu_has_safe(X86_FEATURE_FXSR);
-}
-
-static inline void fx_finit(struct i387_fxsave_struct *fx)
-{
-	fx->cwd = 0x37f;
-	fx->mxcsr = MXCSR_DEFAULT;
-}
-
-extern void __sanitize_i387_state(struct task_struct *);
-
-static inline void sanitize_i387_state(struct task_struct *tsk)
-{
-	if (!use_xsaveopt())
-		return;
-	__sanitize_i387_state(tsk);
-}
-
-#define user_insn(insn, output, input...)				\
-({									\
-	int err;							\
-	asm volatile(ASM_STAC "\n"					\
-		     "1:" #insn "\n\t"					\
-		     "2: " ASM_CLAC "\n"				\
-		     ".section .fixup,\"ax\"\n"				\
-		     "3:  movl $-1,%[err]\n"				\
-		     "    jmp  2b\n"					\
-		     ".previous\n"					\
-		     _ASM_EXTABLE(1b, 3b)				\
-		     : [err] "=r" (err), output				\
-		     : "0"(0), input);					\
-	err;								\
-})
-
-#define check_insn(insn, output, input...)				\
-({									\
-	int err;							\
-	asm volatile("1:" #insn "\n\t"					\
-		     "2:\n"						\
-		     ".section .fixup,\"ax\"\n"				\
-		     "3:  movl $-1,%[err]\n"				\
-		     "    jmp  2b\n"					\
-		     ".previous\n"					\
-		     _ASM_EXTABLE(1b, 3b)				\
-		     : [err] "=r" (err), output				\
-		     : "0"(0), input);					\
-	err;								\
-})
-
-static inline int fsave_user(struct i387_fsave_struct __user *fx)
-{
-	return user_insn(fnsave %[fx]; fwait,  [fx] "=m" (*fx), "m" (*fx));
-}
-
-static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
-{
-	if (config_enabled(CONFIG_X86_32))
-		return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
-	else if (config_enabled(CONFIG_AS_FXSAVEQ))
-		return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx));
-
-	/* See comment in fpu_fxsave() below. */
-	return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx));
-}
-
-static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
-{
-	if (config_enabled(CONFIG_X86_32))
-		return check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
-	else if (config_enabled(CONFIG_AS_FXSAVEQ))
-		return check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
-
-	/* See comment in fpu_fxsave() below. */
-	return check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx),
-			  "m" (*fx));
-}
-
-static inline int fxrstor_user(struct i387_fxsave_struct __user *fx)
-{
-	if (config_enabled(CONFIG_X86_32))
-		return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
-	else if (config_enabled(CONFIG_AS_FXSAVEQ))
-		return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
-
-	/* See comment in fpu_fxsave() below. */
-	return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx),
-			  "m" (*fx));
-}
-
-static inline int frstor_checking(struct i387_fsave_struct *fx)
-{
-	return check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
-}
-
-static inline int frstor_user(struct i387_fsave_struct __user *fx)
-{
-	return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
-}
-
-static inline void fpu_fxsave(struct fpu *fpu)
-{
-	if (config_enabled(CONFIG_X86_32))
-		asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state->fxsave));
-	else if (config_enabled(CONFIG_AS_FXSAVEQ))
-		asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state->fxsave));
-	else {
-		/* Using "rex64; fxsave %0" is broken because, if the memory
-		 * operand uses any extended registers for addressing, a second
-		 * REX prefix will be generated (to the assembler, rex64
-		 * followed by semicolon is a separate instruction), and hence
-		 * the 64-bitness is lost.
-		 *
-		 * Using "fxsaveq %0" would be the ideal choice, but is only
-		 * supported starting with gas 2.16.
-		 *
-		 * Using, as a workaround, the properly prefixed form below
-		 * isn't accepted by any binutils version so far released,
-		 * complaining that the same type of prefix is used twice if
-		 * an extended register is needed for addressing (fix submitted
-		 * to mainline 2005-11-21).
-		 *
-		 *  asm volatile("rex64/fxsave %0" : "=m" (fpu->state->fxsave));
-		 *
-		 * This, however, we can work around by forcing the compiler to
-		 * select an addressing mode that doesn't require extended
-		 * registers.
-		 */
-		asm volatile( "rex64/fxsave (%[fx])"
-			     : "=m" (fpu->state->fxsave)
-			     : [fx] "R" (&fpu->state->fxsave));
-	}
-}
-
-/*
- * These must be called with preempt disabled. Returns
- * 'true' if the FPU state is still intact.
- */
-static inline int fpu_save_init(struct fpu *fpu)
-{
-	if (use_xsave()) {
-		fpu_xsave(fpu);
-
-		/*
-		 * xsave header may indicate the init state of the FP.
-		 */
-		if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP))
-			return 1;
-	} else if (use_fxsr()) {
-		fpu_fxsave(fpu);
-	} else {
-		asm volatile("fnsave %[fx]; fwait"
-			     : [fx] "=m" (fpu->state->fsave));
-		return 0;
-	}
-
-	/*
-	 * If exceptions are pending, we need to clear them so
-	 * that we don't randomly get exceptions later.
-	 *
-	 * FIXME! Is this perhaps only true for the old-style
-	 * irq13 case? Maybe we could leave the x87 state
-	 * intact otherwise?
-	 */
-	if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) {
-		asm volatile("fnclex");
-		return 0;
-	}
-	return 1;
-}
-
-static inline int __save_init_fpu(struct task_struct *tsk)
-{
-	return fpu_save_init(&tsk->thread.fpu);
-}
-
-static inline int fpu_restore_checking(struct fpu *fpu)
-{
-	if (use_xsave())
-		return fpu_xrstor_checking(&fpu->state->xsave);
-	else if (use_fxsr())
-		return fxrstor_checking(&fpu->state->fxsave);
-	else
-		return frstor_checking(&fpu->state->fsave);
-}
-
-static inline int restore_fpu_checking(struct task_struct *tsk)
-{
-	/*
-	 * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
-	 * pending. Clear the x87 state here by setting it to fixed values.
-	 * "m" is a random variable that should be in L1.
-	 */
-	if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) {
-		asm volatile(
-			"fnclex\n\t"
-			"emms\n\t"
-			"fildl %P[addr]"	/* set F?P to defined value */
-			: : [addr] "m" (tsk->thread.fpu.has_fpu));
-	}
-
-	return fpu_restore_checking(&tsk->thread.fpu);
-}
-
-/*
- * Software FPU state helpers. Careful: these need to
- * be preemption protection *and* they need to be
- * properly paired with the CR0.TS changes!
- */
-static inline int __thread_has_fpu(struct task_struct *tsk)
-{
-	return tsk->thread.fpu.has_fpu;
-}
-
-/* Must be paired with an 'stts' after! */
-static inline void __thread_clear_has_fpu(struct task_struct *tsk)
-{
-	tsk->thread.fpu.has_fpu = 0;
-	this_cpu_write(fpu_owner_task, NULL);
-}
-
-/* Must be paired with a 'clts' before! */
-static inline void __thread_set_has_fpu(struct task_struct *tsk)
-{
-	tsk->thread.fpu.has_fpu = 1;
-	this_cpu_write(fpu_owner_task, tsk);
-}
-
-/*
- * Encapsulate the CR0.TS handling together with the
- * software flag.
- *
- * These generally need preemption protection to work,
- * do try to avoid using these on their own.
- */
-static inline void __thread_fpu_end(struct task_struct *tsk)
-{
-	__thread_clear_has_fpu(tsk);
-	if (!use_eager_fpu())
-		stts();
-}
-
-static inline void __thread_fpu_begin(struct task_struct *tsk)
-{
-	if (!use_eager_fpu())
-		clts();
-	__thread_set_has_fpu(tsk);
-}
-
-static inline void drop_fpu(struct task_struct *tsk)
-{
-	/*
-	 * Forget coprocessor state..
-	 */
-	preempt_disable();
-	tsk->thread.fpu_counter = 0;
-
-	if (__thread_has_fpu(tsk)) {
-		/* Ignore delayed exceptions from user space */
-		asm volatile("1: fwait\n"
-			     "2:\n"
-			     _ASM_EXTABLE(1b, 2b));
-		__thread_fpu_end(tsk);
-	}
-
-	clear_stopped_child_used_math(tsk);
-	preempt_enable();
-}
-
-static inline void restore_init_xstate(void)
-{
-	if (use_xsave())
-		xrstor_state(init_xstate_buf, -1);
-	else
-		fxrstor_checking(&init_xstate_buf->i387);
-}
-
-/*
- * Reset the FPU state in the eager case and drop it in the lazy case (later use
- * will reinit it).
- */
-static inline void fpu_reset_state(struct task_struct *tsk)
-{
-	if (!use_eager_fpu())
-		drop_fpu(tsk);
-	else
-		restore_init_xstate();
-}
-
-/*
- * FPU state switching for scheduling.
- *
- * This is a two-stage process:
- *
- *  - switch_fpu_prepare() saves the old state and
- *    sets the new state of the CR0.TS bit. This is
- *    done within the context of the old process.
- *
- *  - switch_fpu_finish() restores the new state as
- *    necessary.
- */
-typedef struct { int preload; } fpu_switch_t;
-
-static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu)
-{
-	fpu_switch_t fpu;
-
-	/*
-	 * If the task has used the math, pre-load the FPU on xsave processors
-	 * or if the past 5 consecutive context-switches used math.
-	 */
-	fpu.preload = tsk_used_math(new) &&
-		      (use_eager_fpu() || new->thread.fpu_counter > 5);
-
-	if (__thread_has_fpu(old)) {
-		if (!__save_init_fpu(old))
-			task_disable_lazy_fpu_restore(old);
-		else
-			old->thread.fpu.last_cpu = cpu;
-
-		/* But leave fpu_owner_task! */
-		old->thread.fpu.has_fpu = 0;
-
-		/* Don't change CR0.TS if we just switch! */
-		if (fpu.preload) {
-			new->thread.fpu_counter++;
-			__thread_set_has_fpu(new);
-			prefetch(new->thread.fpu.state);
-		} else if (!use_eager_fpu())
-			stts();
-	} else {
-		old->thread.fpu_counter = 0;
-		task_disable_lazy_fpu_restore(old);
-		if (fpu.preload) {
-			new->thread.fpu_counter++;
-			if (fpu_lazy_restore(new, cpu))
-				fpu.preload = 0;
-			else
-				prefetch(new->thread.fpu.state);
-			__thread_fpu_begin(new);
-		}
-	}
-	return fpu;
-}
-
-/*
- * By the time this gets called, we've already cleared CR0.TS and
- * given the process the FPU if we are going to preload the FPU
- * state - all we need to do is to conditionally restore the register
- * state itself.
- */
-static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu)
-{
-	if (fpu.preload) {
-		if (unlikely(restore_fpu_checking(new)))
-			fpu_reset_state(new);
-	}
-}
-
-/*
- * Signal frame handlers...
- */
-extern int save_xstate_sig(void __user *buf, void __user *fx, int size);
-extern int __restore_xstate_sig(void __user *buf, void __user *fx, int size);
-
-static inline int xstate_sigframe_size(void)
-{
-	return use_xsave() ? xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size;
-}
-
-static inline int restore_xstate_sig(void __user *buf, int ia32_frame)
-{
-	void __user *buf_fx = buf;
-	int size = xstate_sigframe_size();
-
-	if (ia32_frame && use_fxsr()) {
-		buf_fx = buf + sizeof(struct i387_fsave_struct);
-		size += sizeof(struct i387_fsave_struct);
-	}
-
-	return __restore_xstate_sig(buf, buf_fx, size);
-}
-
-/*
- * Needs to be preemption-safe.
- *
- * NOTE! user_fpu_begin() must be used only immediately before restoring
- * the save state. It does not do any saving/restoring on its own. In
- * lazy FPU mode, it is just an optimization to avoid a #NM exception,
- * the task can lose the FPU right after preempt_enable().
- */
-static inline void user_fpu_begin(void)
-{
-	preempt_disable();
-	if (!user_has_fpu())
-		__thread_fpu_begin(current);
-	preempt_enable();
-}
-
-static inline void __save_fpu(struct task_struct *tsk)
-{
-	if (use_xsave()) {
-		if (unlikely(system_state == SYSTEM_BOOTING))
-			xsave_state_booting(&tsk->thread.fpu.state->xsave, -1);
-		else
-			xsave_state(&tsk->thread.fpu.state->xsave, -1);
-	} else
-		fpu_fxsave(&tsk->thread.fpu);
-}
-
-/*
- * i387 state interaction
- */
-static inline unsigned short get_fpu_cwd(struct task_struct *tsk)
-{
-	if (cpu_has_fxsr) {
-		return tsk->thread.fpu.state->fxsave.cwd;
-	} else {
-		return (unsigned short)tsk->thread.fpu.state->fsave.cwd;
-	}
-}
-
-static inline unsigned short get_fpu_swd(struct task_struct *tsk)
-{
-	if (cpu_has_fxsr) {
-		return tsk->thread.fpu.state->fxsave.swd;
-	} else {
-		return (unsigned short)tsk->thread.fpu.state->fsave.swd;
-	}
-}
-
-static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
-{
-	if (cpu_has_xmm) {
-		return tsk->thread.fpu.state->fxsave.mxcsr;
-	} else {
-		return MXCSR_DEFAULT;
-	}
-}
-
-static bool fpu_allocated(struct fpu *fpu)
-{
-	return fpu->state != NULL;
-}
-
-static inline int fpu_alloc(struct fpu *fpu)
-{
-	if (fpu_allocated(fpu))
-		return 0;
-	fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL);
-	if (!fpu->state)
-		return -ENOMEM;
-	WARN_ON((unsigned long)fpu->state & 15);
-	return 0;
-}
-
-static inline void fpu_free(struct fpu *fpu)
-{
-	if (fpu->state) {
-		kmem_cache_free(task_xstate_cachep, fpu->state);
-		fpu->state = NULL;
-	}
-}
-
-static inline void fpu_copy(struct task_struct *dst, struct task_struct *src)
-{
-	if (use_eager_fpu()) {
-		memset(&dst->thread.fpu.state->xsave, 0, xstate_size);
-		__save_fpu(dst);
-	} else {
-		struct fpu *dfpu = &dst->thread.fpu;
-		struct fpu *sfpu = &src->thread.fpu;
-
-		unlazy_fpu(src);
-		memcpy(dfpu->state, sfpu->state, xstate_size);
-	}
-}
-
-static inline unsigned long
-alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx,
-		unsigned long *size)
-{
-	unsigned long frame_size = xstate_sigframe_size();
-
-	*buf_fx = sp = round_down(sp - frame_size, 64);
-	if (ia32_frame && use_fxsr()) {
-		frame_size += sizeof(struct i387_fsave_struct);
-		sp -= sizeof(struct i387_fsave_struct);
-	}
-
-	*size = frame_size;
-	return sp;
-}
-
-#endif
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
new file mode 100644
index 00000000000000..1429a7c736db81
--- /dev/null
+++ b/arch/x86/include/asm/fpu/api.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * General FPU state handling cleanups
+ *	Gareth Hughes <gareth@valinux.com>, May 2000
+ * x86-64 work by Andi Kleen 2002
+ */
+
+#ifndef _ASM_X86_FPU_API_H
+#define _ASM_X86_FPU_API_H
+
+/*
+ * Careful: __kernel_fpu_begin/end() must be called with preempt disabled
+ * and they don't touch the preempt state on their own.
+ * If you enable preemption after __kernel_fpu_begin(), preempt notifier
+ * should call the __kernel_fpu_end() to prevent the kernel/user FPU
+ * state from getting corrupted. KVM for example uses this model.
+ *
+ * All other cases use kernel_fpu_begin/end() which disable preemption
+ * during kernel FPU usage.
+ */
+extern void __kernel_fpu_begin(void);
+extern void __kernel_fpu_end(void);
+extern void kernel_fpu_begin(void);
+extern void kernel_fpu_end(void);
+extern bool irq_fpu_usable(void);
+
+/*
+ * Some instructions like VIA's padlock instructions generate a spurious
+ * DNA fault but don't modify SSE registers. And these instructions
+ * get used from interrupt context as well. To prevent these kernel instructions
+ * in interrupt context interacting wrongly with other user/kernel fpu usage, we
+ * should use them only in the context of irq_ts_save/restore()
+ */
+extern int  irq_ts_save(void);
+extern void irq_ts_restore(int TS_state);
+
+/*
+ * Query the presence of one or more xfeatures. Works on any legacy CPU as well.
+ *
+ * If 'feature_name' is set then put a human-readable description of
+ * the feature there as well - this can be used to print error (or success)
+ * messages.
+ */
+extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name);
+
+#endif /* _ASM_X86_FPU_API_H */
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
new file mode 100644
index 00000000000000..3c3550c3a4a3f0
--- /dev/null
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -0,0 +1,694 @@
+/*
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * General FPU state handling cleanups
+ *	Gareth Hughes <gareth@valinux.com>, May 2000
+ * x86-64 work by Andi Kleen 2002
+ */
+
+#ifndef _ASM_X86_FPU_INTERNAL_H
+#define _ASM_X86_FPU_INTERNAL_H
+
+#include <linux/compat.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include <asm/user.h>
+#include <asm/fpu/api.h>
+#include <asm/fpu/xstate.h>
+
+/*
+ * High level FPU state handling functions:
+ */
+extern void fpu__activate_curr(struct fpu *fpu);
+extern void fpu__activate_fpstate_read(struct fpu *fpu);
+extern void fpu__activate_fpstate_write(struct fpu *fpu);
+extern void fpu__save(struct fpu *fpu);
+extern void fpu__restore(struct fpu *fpu);
+extern int  fpu__restore_sig(void __user *buf, int ia32_frame);
+extern void fpu__drop(struct fpu *fpu);
+extern int  fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu);
+extern void fpu__clear(struct fpu *fpu);
+extern int  fpu__exception_code(struct fpu *fpu, int trap_nr);
+extern int  dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate);
+
+/*
+ * Boot time FPU initialization functions:
+ */
+extern void fpu__init_cpu(void);
+extern void fpu__init_system_xstate(void);
+extern void fpu__init_cpu_xstate(void);
+extern void fpu__init_system(struct cpuinfo_x86 *c);
+extern void fpu__init_check_bugs(void);
+extern void fpu__resume_cpu(void);
+
+/*
+ * Debugging facility:
+ */
+#ifdef CONFIG_X86_DEBUG_FPU
+# define WARN_ON_FPU(x) WARN_ON_ONCE(x)
+#else
+# define WARN_ON_FPU(x) ({ (void)(x); 0; })
+#endif
+
+/*
+ * FPU related CPU feature flag helper routines:
+ */
+static __always_inline __pure bool use_eager_fpu(void)
+{
+	return static_cpu_has_safe(X86_FEATURE_EAGER_FPU);
+}
+
+static __always_inline __pure bool use_xsaveopt(void)
+{
+	return static_cpu_has_safe(X86_FEATURE_XSAVEOPT);
+}
+
+static __always_inline __pure bool use_xsave(void)
+{
+	return static_cpu_has_safe(X86_FEATURE_XSAVE);
+}
+
+static __always_inline __pure bool use_fxsr(void)
+{
+	return static_cpu_has_safe(X86_FEATURE_FXSR);
+}
+
+/*
+ * fpstate handling functions:
+ */
+
+extern union fpregs_state init_fpstate;
+
+extern void fpstate_init(union fpregs_state *state);
+#ifdef CONFIG_MATH_EMULATION
+extern void fpstate_init_soft(struct swregs_state *soft);
+#else
+static inline void fpstate_init_soft(struct swregs_state *soft) {}
+#endif
+static inline void fpstate_init_fxstate(struct fxregs_state *fx)
+{
+	fx->cwd = 0x37f;
+	fx->mxcsr = MXCSR_DEFAULT;
+}
+extern void fpstate_sanitize_xstate(struct fpu *fpu);
+
+#define user_insn(insn, output, input...)				\
+({									\
+	int err;							\
+	asm volatile(ASM_STAC "\n"					\
+		     "1:" #insn "\n\t"					\
+		     "2: " ASM_CLAC "\n"				\
+		     ".section .fixup,\"ax\"\n"				\
+		     "3:  movl $-1,%[err]\n"				\
+		     "    jmp  2b\n"					\
+		     ".previous\n"					\
+		     _ASM_EXTABLE(1b, 3b)				\
+		     : [err] "=r" (err), output				\
+		     : "0"(0), input);					\
+	err;								\
+})
+
+#define check_insn(insn, output, input...)				\
+({									\
+	int err;							\
+	asm volatile("1:" #insn "\n\t"					\
+		     "2:\n"						\
+		     ".section .fixup,\"ax\"\n"				\
+		     "3:  movl $-1,%[err]\n"				\
+		     "    jmp  2b\n"					\
+		     ".previous\n"					\
+		     _ASM_EXTABLE(1b, 3b)				\
+		     : [err] "=r" (err), output				\
+		     : "0"(0), input);					\
+	err;								\
+})
+
+static inline int copy_fregs_to_user(struct fregs_state __user *fx)
+{
+	return user_insn(fnsave %[fx]; fwait,  [fx] "=m" (*fx), "m" (*fx));
+}
+
+static inline int copy_fxregs_to_user(struct fxregs_state __user *fx)
+{
+	if (config_enabled(CONFIG_X86_32))
+		return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
+	else if (config_enabled(CONFIG_AS_FXSAVEQ))
+		return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx));
+
+	/* See comment in copy_fxregs_to_kernel() below. */
+	return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx));
+}
+
+static inline void copy_kernel_to_fxregs(struct fxregs_state *fx)
+{
+	int err;
+
+	if (config_enabled(CONFIG_X86_32)) {
+		err = check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+	} else {
+		if (config_enabled(CONFIG_AS_FXSAVEQ)) {
+			err = check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+		} else {
+			/* See comment in copy_fxregs_to_kernel() below. */
+			err = check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx));
+		}
+	}
+	/* Copying from a kernel buffer to FPU registers should never fail: */
+	WARN_ON_FPU(err);
+}
+
+static inline int copy_user_to_fxregs(struct fxregs_state __user *fx)
+{
+	if (config_enabled(CONFIG_X86_32))
+		return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+	else if (config_enabled(CONFIG_AS_FXSAVEQ))
+		return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+
+	/* See comment in copy_fxregs_to_kernel() below. */
+	return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx),
+			  "m" (*fx));
+}
+
+static inline void copy_kernel_to_fregs(struct fregs_state *fx)
+{
+	int err = check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+
+	WARN_ON_FPU(err);
+}
+
+static inline int copy_user_to_fregs(struct fregs_state __user *fx)
+{
+	return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline void copy_fxregs_to_kernel(struct fpu *fpu)
+{
+	if (config_enabled(CONFIG_X86_32))
+		asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave));
+	else if (config_enabled(CONFIG_AS_FXSAVEQ))
+		asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave));
+	else {
+		/* Using "rex64; fxsave %0" is broken because, if the memory
+		 * operand uses any extended registers for addressing, a second
+		 * REX prefix will be generated (to the assembler, rex64
+		 * followed by semicolon is a separate instruction), and hence
+		 * the 64-bitness is lost.
+		 *
+		 * Using "fxsaveq %0" would be the ideal choice, but is only
+		 * supported starting with gas 2.16.
+		 *
+		 * Using, as a workaround, the properly prefixed form below
+		 * isn't accepted by any binutils version so far released,
+		 * complaining that the same type of prefix is used twice if
+		 * an extended register is needed for addressing (fix submitted
+		 * to mainline 2005-11-21).
+		 *
+		 *  asm volatile("rex64/fxsave %0" : "=m" (fpu->state.fxsave));
+		 *
+		 * This, however, we can work around by forcing the compiler to
+		 * select an addressing mode that doesn't require extended
+		 * registers.
+		 */
+		asm volatile( "rex64/fxsave (%[fx])"
+			     : "=m" (fpu->state.fxsave)
+			     : [fx] "R" (&fpu->state.fxsave));
+	}
+}
+
+/* These macros all use (%edi)/(%rdi) as the single memory argument. */
+#define XSAVE		".byte " REX_PREFIX "0x0f,0xae,0x27"
+#define XSAVEOPT	".byte " REX_PREFIX "0x0f,0xae,0x37"
+#define XSAVES		".byte " REX_PREFIX "0x0f,0xc7,0x2f"
+#define XRSTOR		".byte " REX_PREFIX "0x0f,0xae,0x2f"
+#define XRSTORS		".byte " REX_PREFIX "0x0f,0xc7,0x1f"
+
+/* xstate instruction fault handler: */
+#define xstate_fault(__err)		\
+					\
+	".section .fixup,\"ax\"\n"	\
+					\
+	"3:  movl $-2,%[_err]\n"	\
+	"    jmp  2b\n"			\
+					\
+	".previous\n"			\
+					\
+	_ASM_EXTABLE(1b, 3b)		\
+	: [_err] "=r" (__err)
+
+/*
+ * This function is called only during boot time when x86 caps are not set
+ * up and alternative can not be used yet.
+ */
+static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate)
+{
+	u64 mask = -1;
+	u32 lmask = mask;
+	u32 hmask = mask >> 32;
+	int err = 0;
+
+	WARN_ON(system_state != SYSTEM_BOOTING);
+
+	if (boot_cpu_has(X86_FEATURE_XSAVES))
+		asm volatile("1:"XSAVES"\n\t"
+			"2:\n\t"
+			     xstate_fault(err)
+			: "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err)
+			: "memory");
+	else
+		asm volatile("1:"XSAVE"\n\t"
+			"2:\n\t"
+			     xstate_fault(err)
+			: "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err)
+			: "memory");
+
+	/* We should never fault when copying to a kernel buffer: */
+	WARN_ON_FPU(err);
+}
+
+/*
+ * This function is called only during boot time when x86 caps are not set
+ * up and alternative can not be used yet.
+ */
+static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate)
+{
+	u64 mask = -1;
+	u32 lmask = mask;
+	u32 hmask = mask >> 32;
+	int err = 0;
+
+	WARN_ON(system_state != SYSTEM_BOOTING);
+
+	if (boot_cpu_has(X86_FEATURE_XSAVES))
+		asm volatile("1:"XRSTORS"\n\t"
+			"2:\n\t"
+			     xstate_fault(err)
+			: "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err)
+			: "memory");
+	else
+		asm volatile("1:"XRSTOR"\n\t"
+			"2:\n\t"
+			     xstate_fault(err)
+			: "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err)
+			: "memory");
+
+	/* We should never fault when copying from a kernel buffer: */
+	WARN_ON_FPU(err);
+}
+
+/*
+ * Save processor xstate to xsave area.
+ */
+static inline void copy_xregs_to_kernel(struct xregs_state *xstate)
+{
+	u64 mask = -1;
+	u32 lmask = mask;
+	u32 hmask = mask >> 32;
+	int err = 0;
+
+	WARN_ON(!alternatives_patched);
+
+	/*
+	 * If xsaves is enabled, xsaves replaces xsaveopt because
+	 * it supports compact format and supervisor states in addition to
+	 * modified optimization in xsaveopt.
+	 *
+	 * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave
+	 * because xsaveopt supports modified optimization which is not
+	 * supported by xsave.
+	 *
+	 * If none of xsaves and xsaveopt is enabled, use xsave.
+	 */
+	alternative_input_2(
+		"1:"XSAVE,
+		XSAVEOPT,
+		X86_FEATURE_XSAVEOPT,
+		XSAVES,
+		X86_FEATURE_XSAVES,
+		[xstate] "D" (xstate), "a" (lmask), "d" (hmask) :
+		"memory");
+	asm volatile("2:\n\t"
+		     xstate_fault(err)
+		     : "0" (err)
+		     : "memory");
+
+	/* We should never fault when copying to a kernel buffer: */
+	WARN_ON_FPU(err);
+}
+
+/*
+ * Restore processor xstate from xsave area.
+ */
+static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask)
+{
+	u32 lmask = mask;
+	u32 hmask = mask >> 32;
+	int err = 0;
+
+	/*
+	 * Use xrstors to restore context if it is enabled. xrstors supports
+	 * compacted format of xsave area which is not supported by xrstor.
+	 */
+	alternative_input(
+		"1: " XRSTOR,
+		XRSTORS,
+		X86_FEATURE_XSAVES,
+		"D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask)
+		: "memory");
+
+	asm volatile("2:\n"
+		     xstate_fault(err)
+		     : "0" (err)
+		     : "memory");
+
+	/* We should never fault when copying from a kernel buffer: */
+	WARN_ON_FPU(err);
+}
+
+/*
+ * Save xstate to user space xsave area.
+ *
+ * We don't use modified optimization because xrstor/xrstors might track
+ * a different application.
+ *
+ * We don't use compacted format xsave area for
+ * backward compatibility for old applications which don't understand
+ * compacted format of xsave area.
+ */
+static inline int copy_xregs_to_user(struct xregs_state __user *buf)
+{
+	int err;
+
+	/*
+	 * Clear the xsave header first, so that reserved fields are
+	 * initialized to zero.
+	 */
+	err = __clear_user(&buf->header, sizeof(buf->header));
+	if (unlikely(err))
+		return -EFAULT;
+
+	__asm__ __volatile__(ASM_STAC "\n"
+			     "1:"XSAVE"\n"
+			     "2: " ASM_CLAC "\n"
+			     xstate_fault(err)
+			     : "D" (buf), "a" (-1), "d" (-1), "0" (err)
+			     : "memory");
+	return err;
+}
+
+/*
+ * Restore xstate from user space xsave area.
+ */
+static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask)
+{
+	struct xregs_state *xstate = ((__force struct xregs_state *)buf);
+	u32 lmask = mask;
+	u32 hmask = mask >> 32;
+	int err = 0;
+
+	__asm__ __volatile__(ASM_STAC "\n"
+			     "1:"XRSTOR"\n"
+			     "2: " ASM_CLAC "\n"
+			     xstate_fault(err)
+			     : "D" (xstate), "a" (lmask), "d" (hmask), "0" (err)
+			     : "memory");	/* memory required? */
+	return err;
+}
+
+/*
+ * These must be called with preempt disabled. Returns
+ * 'true' if the FPU state is still intact and we can
+ * keep registers active.
+ *
+ * The legacy FNSAVE instruction cleared all FPU state
+ * unconditionally, so registers are essentially destroyed.
+ * Modern FPU state can be kept in registers, if there are
+ * no pending FP exceptions.
+ */
+static inline int copy_fpregs_to_fpstate(struct fpu *fpu)
+{
+	if (likely(use_xsave())) {
+		copy_xregs_to_kernel(&fpu->state.xsave);
+		return 1;
+	}
+
+	if (likely(use_fxsr())) {
+		copy_fxregs_to_kernel(fpu);
+		return 1;
+	}
+
+	/*
+	 * Legacy FPU register saving, FNSAVE always clears FPU registers,
+	 * so we have to mark them inactive:
+	 */
+	asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave));
+
+	return 0;
+}
+
+static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate)
+{
+	if (use_xsave()) {
+		copy_kernel_to_xregs(&fpstate->xsave, -1);
+	} else {
+		if (use_fxsr())
+			copy_kernel_to_fxregs(&fpstate->fxsave);
+		else
+			copy_kernel_to_fregs(&fpstate->fsave);
+	}
+}
+
+static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate)
+{
+	/*
+	 * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
+	 * pending. Clear the x87 state here by setting it to fixed values.
+	 * "m" is a random variable that should be in L1.
+	 */
+	if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) {
+		asm volatile(
+			"fnclex\n\t"
+			"emms\n\t"
+			"fildl %P[addr]"	/* set F?P to defined value */
+			: : [addr] "m" (fpstate));
+	}
+
+	__copy_kernel_to_fpregs(fpstate);
+}
+
+extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
+
+/*
+ * FPU context switch related helper methods:
+ */
+
+DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
+
+/*
+ * Must be run with preemption disabled: this clears the fpu_fpregs_owner_ctx,
+ * on this CPU.
+ *
+ * This will disable any lazy FPU state restore of the current FPU state,
+ * but if the current thread owns the FPU, it will still be saved by.
+ */
+static inline void __cpu_disable_lazy_restore(unsigned int cpu)
+{
+	per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
+}
+
+static inline int fpu_want_lazy_restore(struct fpu *fpu, unsigned int cpu)
+{
+	return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
+}
+
+
+/*
+ * Wrap lazy FPU TS handling in a 'hw fpregs activation/deactivation'
+ * idiom, which is then paired with the sw-flag (fpregs_active) later on:
+ */
+
+static inline void __fpregs_activate_hw(void)
+{
+	if (!use_eager_fpu())
+		clts();
+}
+
+static inline void __fpregs_deactivate_hw(void)
+{
+	if (!use_eager_fpu())
+		stts();
+}
+
+/* Must be paired with an 'stts' (fpregs_deactivate_hw()) after! */
+static inline void __fpregs_deactivate(struct fpu *fpu)
+{
+	WARN_ON_FPU(!fpu->fpregs_active);
+
+	fpu->fpregs_active = 0;
+	this_cpu_write(fpu_fpregs_owner_ctx, NULL);
+}
+
+/* Must be paired with a 'clts' (fpregs_activate_hw()) before! */
+static inline void __fpregs_activate(struct fpu *fpu)
+{
+	WARN_ON_FPU(fpu->fpregs_active);
+
+	fpu->fpregs_active = 1;
+	this_cpu_write(fpu_fpregs_owner_ctx, fpu);
+}
+
+/*
+ * The question "does this thread have fpu access?"
+ * is slightly racy, since preemption could come in
+ * and revoke it immediately after the test.
+ *
+ * However, even in that very unlikely scenario,
+ * we can just assume we have FPU access - typically
+ * to save the FP state - we'll just take a #NM
+ * fault and get the FPU access back.
+ */
+static inline int fpregs_active(void)
+{
+	return current->thread.fpu.fpregs_active;
+}
+
+/*
+ * Encapsulate the CR0.TS handling together with the
+ * software flag.
+ *
+ * These generally need preemption protection to work,
+ * do try to avoid using these on their own.
+ */
+static inline void fpregs_activate(struct fpu *fpu)
+{
+	__fpregs_activate_hw();
+	__fpregs_activate(fpu);
+}
+
+static inline void fpregs_deactivate(struct fpu *fpu)
+{
+	__fpregs_deactivate(fpu);
+	__fpregs_deactivate_hw();
+}
+
+/*
+ * FPU state switching for scheduling.
+ *
+ * This is a two-stage process:
+ *
+ *  - switch_fpu_prepare() saves the old state and
+ *    sets the new state of the CR0.TS bit. This is
+ *    done within the context of the old process.
+ *
+ *  - switch_fpu_finish() restores the new state as
+ *    necessary.
+ */
+typedef struct { int preload; } fpu_switch_t;
+
+static inline fpu_switch_t
+switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu)
+{
+	fpu_switch_t fpu;
+
+	/*
+	 * If the task has used the math, pre-load the FPU on xsave processors
+	 * or if the past 5 consecutive context-switches used math.
+	 */
+	fpu.preload = new_fpu->fpstate_active &&
+		      (use_eager_fpu() || new_fpu->counter > 5);
+
+	if (old_fpu->fpregs_active) {
+		if (!copy_fpregs_to_fpstate(old_fpu))
+			old_fpu->last_cpu = -1;
+		else
+			old_fpu->last_cpu = cpu;
+
+		/* But leave fpu_fpregs_owner_ctx! */
+		old_fpu->fpregs_active = 0;
+
+		/* Don't change CR0.TS if we just switch! */
+		if (fpu.preload) {
+			new_fpu->counter++;
+			__fpregs_activate(new_fpu);
+			prefetch(&new_fpu->state);
+		} else {
+			__fpregs_deactivate_hw();
+		}
+	} else {
+		old_fpu->counter = 0;
+		old_fpu->last_cpu = -1;
+		if (fpu.preload) {
+			new_fpu->counter++;
+			if (fpu_want_lazy_restore(new_fpu, cpu))
+				fpu.preload = 0;
+			else
+				prefetch(&new_fpu->state);
+			fpregs_activate(new_fpu);
+		}
+	}
+	return fpu;
+}
+
+/*
+ * Misc helper functions:
+ */
+
+/*
+ * By the time this gets called, we've already cleared CR0.TS and
+ * given the process the FPU if we are going to preload the FPU
+ * state - all we need to do is to conditionally restore the register
+ * state itself.
+ */
+static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switch)
+{
+	if (fpu_switch.preload)
+		copy_kernel_to_fpregs(&new_fpu->state);
+}
+
+/*
+ * Needs to be preemption-safe.
+ *
+ * NOTE! user_fpu_begin() must be used only immediately before restoring
+ * the save state. It does not do any saving/restoring on its own. In
+ * lazy FPU mode, it is just an optimization to avoid a #NM exception,
+ * the task can lose the FPU right after preempt_enable().
+ */
+static inline void user_fpu_begin(void)
+{
+	struct fpu *fpu = &current->thread.fpu;
+
+	preempt_disable();
+	if (!fpregs_active())
+		fpregs_activate(fpu);
+	preempt_enable();
+}
+
+/*
+ * MXCSR and XCR definitions:
+ */
+
+extern unsigned int mxcsr_feature_mask;
+
+#define XCR_XFEATURE_ENABLED_MASK	0x00000000
+
+static inline u64 xgetbv(u32 index)
+{
+	u32 eax, edx;
+
+	asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */
+		     : "=a" (eax), "=d" (edx)
+		     : "c" (index));
+	return eax + ((u64)edx << 32);
+}
+
+static inline void xsetbv(u32 index, u64 value)
+{
+	u32 eax = value;
+	u32 edx = value >> 32;
+
+	asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */
+		     : : "a" (eax), "d" (edx), "c" (index));
+}
+
+#endif /* _ASM_X86_FPU_INTERNAL_H */
diff --git a/arch/x86/include/asm/fpu/regset.h b/arch/x86/include/asm/fpu/regset.h
new file mode 100644
index 00000000000000..39d3107ac6c7a4
--- /dev/null
+++ b/arch/x86/include/asm/fpu/regset.h
@@ -0,0 +1,21 @@
+/*
+ * FPU regset handling methods:
+ */
+#ifndef _ASM_X86_FPU_REGSET_H
+#define _ASM_X86_FPU_REGSET_H
+
+#include <linux/regset.h>
+
+extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active;
+extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get,
+				xstateregs_get;
+extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set,
+				 xstateregs_set;
+
+/*
+ * xstateregs_active == regset_fpregs_active. Please refer to the comment
+ * at the definition of regset_fpregs_active.
+ */
+#define xstateregs_active	regset_fpregs_active
+
+#endif /* _ASM_X86_FPU_REGSET_H */
diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h
new file mode 100644
index 00000000000000..7358e9d61f1e70
--- /dev/null
+++ b/arch/x86/include/asm/fpu/signal.h
@@ -0,0 +1,33 @@
+/*
+ * x86 FPU signal frame handling methods:
+ */
+#ifndef _ASM_X86_FPU_SIGNAL_H
+#define _ASM_X86_FPU_SIGNAL_H
+
+#ifdef CONFIG_X86_64
+# include <asm/sigcontext32.h>
+# include <asm/user32.h>
+struct ksignal;
+int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
+			compat_sigset_t *set, struct pt_regs *regs);
+int ia32_setup_frame(int sig, struct ksignal *ksig,
+		     compat_sigset_t *set, struct pt_regs *regs);
+#else
+# define user_i387_ia32_struct	user_i387_struct
+# define user32_fxsr_struct	user_fxsr_struct
+# define ia32_setup_frame	__setup_frame
+# define ia32_setup_rt_frame	__setup_rt_frame
+#endif
+
+extern void convert_from_fxsr(struct user_i387_ia32_struct *env,
+			      struct task_struct *tsk);
+extern void convert_to_fxsr(struct task_struct *tsk,
+			    const struct user_i387_ia32_struct *env);
+
+unsigned long
+fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
+		     unsigned long *buf_fx, unsigned long *size);
+
+extern void fpu__init_prepare_fx_sw_frame(void);
+
+#endif /* _ASM_X86_FPU_SIGNAL_H */
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
new file mode 100644
index 00000000000000..0637826292deae
--- /dev/null
+++ b/arch/x86/include/asm/fpu/types.h
@@ -0,0 +1,293 @@
+/*
+ * FPU data structures:
+ */
+#ifndef _ASM_X86_FPU_H
+#define _ASM_X86_FPU_H
+
+/*
+ * The legacy x87 FPU state format, as saved by FSAVE and
+ * restored by the FRSTOR instructions:
+ */
+struct fregs_state {
+	u32			cwd;	/* FPU Control Word		*/
+	u32			swd;	/* FPU Status Word		*/
+	u32			twd;	/* FPU Tag Word			*/
+	u32			fip;	/* FPU IP Offset		*/
+	u32			fcs;	/* FPU IP Selector		*/
+	u32			foo;	/* FPU Operand Pointer Offset	*/
+	u32			fos;	/* FPU Operand Pointer Selector	*/
+
+	/* 8*10 bytes for each FP-reg = 80 bytes:			*/
+	u32			st_space[20];
+
+	/* Software status information [not touched by FSAVE]:		*/
+	u32			status;
+};
+
+/*
+ * The legacy fx SSE/MMX FPU state format, as saved by FXSAVE and
+ * restored by the FXRSTOR instructions. It's similar to the FSAVE
+ * format, but differs in some areas, plus has extensions at
+ * the end for the XMM registers.
+ */
+struct fxregs_state {
+	u16			cwd; /* Control Word			*/
+	u16			swd; /* Status Word			*/
+	u16			twd; /* Tag Word			*/
+	u16			fop; /* Last Instruction Opcode		*/
+	union {
+		struct {
+			u64	rip; /* Instruction Pointer		*/
+			u64	rdp; /* Data Pointer			*/
+		};
+		struct {
+			u32	fip; /* FPU IP Offset			*/
+			u32	fcs; /* FPU IP Selector			*/
+			u32	foo; /* FPU Operand Offset		*/
+			u32	fos; /* FPU Operand Selector		*/
+		};
+	};
+	u32			mxcsr;		/* MXCSR Register State */
+	u32			mxcsr_mask;	/* MXCSR Mask		*/
+
+	/* 8*16 bytes for each FP-reg = 128 bytes:			*/
+	u32			st_space[32];
+
+	/* 16*16 bytes for each XMM-reg = 256 bytes:			*/
+	u32			xmm_space[64];
+
+	u32			padding[12];
+
+	union {
+		u32		padding1[12];
+		u32		sw_reserved[12];
+	};
+
+} __attribute__((aligned(16)));
+
+/* Default value for fxregs_state.mxcsr: */
+#define MXCSR_DEFAULT		0x1f80
+
+/*
+ * Software based FPU emulation state. This is arbitrary really,
+ * it matches the x87 format to make it easier to understand:
+ */
+struct swregs_state {
+	u32			cwd;
+	u32			swd;
+	u32			twd;
+	u32			fip;
+	u32			fcs;
+	u32			foo;
+	u32			fos;
+	/* 8*10 bytes for each FP-reg = 80 bytes: */
+	u32			st_space[20];
+	u8			ftop;
+	u8			changed;
+	u8			lookahead;
+	u8			no_update;
+	u8			rm;
+	u8			alimit;
+	struct math_emu_info	*info;
+	u32			entry_eip;
+};
+
+/*
+ * List of XSAVE features Linux knows about:
+ */
+enum xfeature_bit {
+	XSTATE_BIT_FP,
+	XSTATE_BIT_SSE,
+	XSTATE_BIT_YMM,
+	XSTATE_BIT_BNDREGS,
+	XSTATE_BIT_BNDCSR,
+	XSTATE_BIT_OPMASK,
+	XSTATE_BIT_ZMM_Hi256,
+	XSTATE_BIT_Hi16_ZMM,
+
+	XFEATURES_NR_MAX,
+};
+
+#define XSTATE_FP		(1 << XSTATE_BIT_FP)
+#define XSTATE_SSE		(1 << XSTATE_BIT_SSE)
+#define XSTATE_YMM		(1 << XSTATE_BIT_YMM)
+#define XSTATE_BNDREGS		(1 << XSTATE_BIT_BNDREGS)
+#define XSTATE_BNDCSR		(1 << XSTATE_BIT_BNDCSR)
+#define XSTATE_OPMASK		(1 << XSTATE_BIT_OPMASK)
+#define XSTATE_ZMM_Hi256	(1 << XSTATE_BIT_ZMM_Hi256)
+#define XSTATE_Hi16_ZMM		(1 << XSTATE_BIT_Hi16_ZMM)
+
+#define XSTATE_FPSSE		(XSTATE_FP | XSTATE_SSE)
+#define XSTATE_AVX512		(XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
+
+/*
+ * There are 16x 256-bit AVX registers named YMM0-YMM15.
+ * The low 128 bits are aliased to the 16 SSE registers (XMM0-XMM15)
+ * and are stored in 'struct fxregs_state::xmm_space[]'.
+ *
+ * The high 128 bits are stored here:
+ *    16x 128 bits == 256 bytes.
+ */
+struct ymmh_struct {
+	u8				ymmh_space[256];
+};
+
+/* We don't support LWP yet: */
+struct lwp_struct {
+	u8				reserved[128];
+};
+
+/* Intel MPX support: */
+struct bndreg {
+	u64				lower_bound;
+	u64				upper_bound;
+} __packed;
+
+struct bndcsr {
+	u64				bndcfgu;
+	u64				bndstatus;
+} __packed;
+
+struct mpx_struct {
+	struct bndreg			bndreg[4];
+	struct bndcsr			bndcsr;
+};
+
+struct xstate_header {
+	u64				xfeatures;
+	u64				xcomp_bv;
+	u64				reserved[6];
+} __attribute__((packed));
+
+/* New processor state extensions should be added here: */
+#define XSTATE_RESERVE			(sizeof(struct ymmh_struct) + \
+					 sizeof(struct lwp_struct)  + \
+					 sizeof(struct mpx_struct)  )
+/*
+ * This is our most modern FPU state format, as saved by the XSAVE
+ * and restored by the XRSTOR instructions.
+ *
+ * It consists of a legacy fxregs portion, an xstate header and
+ * subsequent fixed size areas as defined by the xstate header.
+ * Not all CPUs support all the extensions.
+ */
+struct xregs_state {
+	struct fxregs_state		i387;
+	struct xstate_header		header;
+	u8				__reserved[XSTATE_RESERVE];
+} __attribute__ ((packed, aligned (64)));
+
+/*
+ * This is a union of all the possible FPU state formats
+ * put together, so that we can pick the right one runtime.
+ *
+ * The size of the structure is determined by the largest
+ * member - which is the xsave area:
+ */
+union fpregs_state {
+	struct fregs_state		fsave;
+	struct fxregs_state		fxsave;
+	struct swregs_state		soft;
+	struct xregs_state		xsave;
+};
+
+/*
+ * Highest level per task FPU state data structure that
+ * contains the FPU register state plus various FPU
+ * state fields:
+ */
+struct fpu {
+	/*
+	 * @state:
+	 *
+	 * In-memory copy of all FPU registers that we save/restore
+	 * over context switches. If the task is using the FPU then
+	 * the registers in the FPU are more recent than this state
+	 * copy. If the task context-switches away then they get
+	 * saved here and represent the FPU state.
+	 *
+	 * After context switches there may be a (short) time period
+	 * during which the in-FPU hardware registers are unchanged
+	 * and still perfectly match this state, if the tasks
+	 * scheduled afterwards are not using the FPU.
+	 *
+	 * This is the 'lazy restore' window of optimization, which
+	 * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
+	 *
+	 * We detect whether a subsequent task uses the FPU via setting
+	 * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
+	 *
+	 * During this window, if the task gets scheduled again, we
+	 * might be able to skip having to do a restore from this
+	 * memory buffer to the hardware registers - at the cost of
+	 * incurring the overhead of #NM fault traps.
+	 *
+	 * Note that on modern CPUs that support the XSAVEOPT (or other
+	 * optimized XSAVE instructions), we don't use #NM traps anymore,
+	 * as the hardware can track whether FPU registers need saving
+	 * or not. On such CPUs we activate the non-lazy ('eagerfpu')
+	 * logic, which unconditionally saves/restores all FPU state
+	 * across context switches. (if FPU state exists.)
+	 */
+	union fpregs_state		state;
+
+	/*
+	 * @last_cpu:
+	 *
+	 * Records the last CPU on which this context was loaded into
+	 * FPU registers. (In the lazy-restore case we might be
+	 * able to reuse FPU registers across multiple context switches
+	 * this way, if no intermediate task used the FPU.)
+	 *
+	 * A value of -1 is used to indicate that the FPU state in context
+	 * memory is newer than the FPU state in registers, and that the
+	 * FPU state should be reloaded next time the task is run.
+	 */
+	unsigned int			last_cpu;
+
+	/*
+	 * @fpstate_active:
+	 *
+	 * This flag indicates whether this context is active: if the task
+	 * is not running then we can restore from this context, if the task
+	 * is running then we should save into this context.
+	 */
+	unsigned char			fpstate_active;
+
+	/*
+	 * @fpregs_active:
+	 *
+	 * This flag determines whether a given context is actively
+	 * loaded into the FPU's registers and that those registers
+	 * represent the task's current FPU state.
+	 *
+	 * Note the interaction with fpstate_active:
+	 *
+	 *   # task does not use the FPU:
+	 *   fpstate_active == 0
+	 *
+	 *   # task uses the FPU and regs are active:
+	 *   fpstate_active == 1 && fpregs_active == 1
+	 *
+	 *   # the regs are inactive but still match fpstate:
+	 *   fpstate_active == 1 && fpregs_active == 0 && fpregs_owner == fpu
+	 *
+	 * The third state is what we use for the lazy restore optimization
+	 * on lazy-switching CPUs.
+	 */
+	unsigned char			fpregs_active;
+
+	/*
+	 * @counter:
+	 *
+	 * This counter contains the number of consecutive context switches
+	 * during which the FPU stays used. If this is over a threshold, the
+	 * lazy FPU restore logic becomes eager, to save the trap overhead.
+	 * This is an unsigned char so that after 256 iterations the counter
+	 * wraps and the context switch behavior turns lazy again; this is to
+	 * deal with bursty apps that only use the FPU for a short time:
+	 */
+	unsigned char			counter;
+};
+
+#endif /* _ASM_X86_FPU_H */
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
new file mode 100644
index 00000000000000..339894669117f7
--- /dev/null
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -0,0 +1,45 @@
+#ifndef __ASM_X86_XSAVE_H
+#define __ASM_X86_XSAVE_H
+
+#include <linux/types.h>
+#include <asm/processor.h>
+#include <linux/uaccess.h>
+
+/* Bit 63 of XCR0 is reserved for future expansion */
+#define XSTATE_EXTEND_MASK	(~(XSTATE_FPSSE | (1ULL << 63)))
+
+#define XSTATE_CPUID		0x0000000d
+
+#define FXSAVE_SIZE	512
+
+#define XSAVE_HDR_SIZE	    64
+#define XSAVE_HDR_OFFSET    FXSAVE_SIZE
+
+#define XSAVE_YMM_SIZE	    256
+#define XSAVE_YMM_OFFSET    (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
+
+/* Supported features which support lazy state saving */
+#define XSTATE_LAZY	(XSTATE_FP | XSTATE_SSE | XSTATE_YMM		      \
+			| XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
+
+/* Supported features which require eager state saving */
+#define XSTATE_EAGER	(XSTATE_BNDREGS | XSTATE_BNDCSR)
+
+/* All currently supported features */
+#define XCNTXT_MASK	(XSTATE_LAZY | XSTATE_EAGER)
+
+#ifdef CONFIG_X86_64
+#define REX_PREFIX	"0x48, "
+#else
+#define REX_PREFIX
+#endif
+
+extern unsigned int xstate_size;
+extern u64 xfeatures_mask;
+extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
+
+extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
+
+void *get_xsave_addr(struct xregs_state *xsave, int xstate);
+
+#endif
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
index 3b629f47eb65b4..793179cf8e21aa 100644
--- a/arch/x86/include/asm/frame.h
+++ b/arch/x86/include/asm/frame.h
@@ -1,20 +1,17 @@
 #ifdef __ASSEMBLY__
 
 #include <asm/asm.h>
-#include <asm/dwarf2.h>
 
 /* The annotation hides the frame from the unwinder and makes it look
    like a ordinary ebp save/restore. This avoids some special cases for
    frame pointer later */
 #ifdef CONFIG_FRAME_POINTER
 	.macro FRAME
-	__ASM_SIZE(push,_cfi)	%__ASM_REG(bp)
-	CFI_REL_OFFSET		__ASM_REG(bp), 0
+	__ASM_SIZE(push,)	%__ASM_REG(bp)
 	__ASM_SIZE(mov)		%__ASM_REG(sp), %__ASM_REG(bp)
 	.endm
 	.macro ENDFRAME
-	__ASM_SIZE(pop,_cfi)	%__ASM_REG(bp)
-	CFI_RESTORE		__ASM_REG(bp)
+	__ASM_SIZE(pop,)	%__ASM_REG(bp)
 	.endm
 #else
 	.macro FRAME
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 0f5fb6b6567e9c..7178043b0e1dd6 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -14,6 +14,7 @@ typedef struct {
 #endif
 #ifdef CONFIG_HAVE_KVM
 	unsigned int kvm_posted_intr_ipis;
+	unsigned int kvm_posted_intr_wakeup_ipis;
 #endif
 	unsigned int x86_platform_ipis;	/* arch dependent */
 	unsigned int apic_perf_irqs;
@@ -33,6 +34,9 @@ typedef struct {
 #ifdef CONFIG_X86_MCE_THRESHOLD
 	unsigned int irq_threshold_count;
 #endif
+#ifdef CONFIG_X86_MCE_AMD
+	unsigned int irq_deferred_error_count;
+#endif
 #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN)
 	unsigned int irq_hv_callback_count;
 #endif
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 36f7125945e3e2..5fa9fb0f880990 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -74,20 +74,16 @@ extern unsigned int hpet_readl(unsigned int a);
 extern void force_hpet_resume(void);
 
 struct irq_data;
+struct hpet_dev;
+struct irq_domain;
+
 extern void hpet_msi_unmask(struct irq_data *data);
 extern void hpet_msi_mask(struct irq_data *data);
-struct hpet_dev;
 extern void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg);
 extern void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg);
-
-#ifdef CONFIG_PCI_MSI
-extern int default_setup_hpet_msi(unsigned int irq, unsigned int id);
-#else
-static inline int default_setup_hpet_msi(unsigned int irq, unsigned int id)
-{
-	return -EINVAL;
-}
-#endif
+extern struct irq_domain *hpet_create_irq_domain(int hpet_id);
+extern int hpet_assign_irq(struct irq_domain *domain,
+			   struct hpet_dev *dev, int dev_num);
 
 #ifdef CONFIG_HPET_EMULATE_RTC
 
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index e9571ddabc4feb..6615032e19c80b 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,6 +29,7 @@
 extern asmlinkage void apic_timer_interrupt(void);
 extern asmlinkage void x86_platform_ipi(void);
 extern asmlinkage void kvm_posted_intr_ipi(void);
+extern asmlinkage void kvm_posted_intr_wakeup_ipi(void);
 extern asmlinkage void error_interrupt(void);
 extern asmlinkage void irq_work_interrupt(void);
 
@@ -36,43 +37,10 @@ extern asmlinkage void spurious_interrupt(void);
 extern asmlinkage void thermal_interrupt(void);
 extern asmlinkage void reschedule_interrupt(void);
 
-extern asmlinkage void invalidate_interrupt(void);
-extern asmlinkage void invalidate_interrupt0(void);
-extern asmlinkage void invalidate_interrupt1(void);
-extern asmlinkage void invalidate_interrupt2(void);
-extern asmlinkage void invalidate_interrupt3(void);
-extern asmlinkage void invalidate_interrupt4(void);
-extern asmlinkage void invalidate_interrupt5(void);
-extern asmlinkage void invalidate_interrupt6(void);
-extern asmlinkage void invalidate_interrupt7(void);
-extern asmlinkage void invalidate_interrupt8(void);
-extern asmlinkage void invalidate_interrupt9(void);
-extern asmlinkage void invalidate_interrupt10(void);
-extern asmlinkage void invalidate_interrupt11(void);
-extern asmlinkage void invalidate_interrupt12(void);
-extern asmlinkage void invalidate_interrupt13(void);
-extern asmlinkage void invalidate_interrupt14(void);
-extern asmlinkage void invalidate_interrupt15(void);
-extern asmlinkage void invalidate_interrupt16(void);
-extern asmlinkage void invalidate_interrupt17(void);
-extern asmlinkage void invalidate_interrupt18(void);
-extern asmlinkage void invalidate_interrupt19(void);
-extern asmlinkage void invalidate_interrupt20(void);
-extern asmlinkage void invalidate_interrupt21(void);
-extern asmlinkage void invalidate_interrupt22(void);
-extern asmlinkage void invalidate_interrupt23(void);
-extern asmlinkage void invalidate_interrupt24(void);
-extern asmlinkage void invalidate_interrupt25(void);
-extern asmlinkage void invalidate_interrupt26(void);
-extern asmlinkage void invalidate_interrupt27(void);
-extern asmlinkage void invalidate_interrupt28(void);
-extern asmlinkage void invalidate_interrupt29(void);
-extern asmlinkage void invalidate_interrupt30(void);
-extern asmlinkage void invalidate_interrupt31(void);
-
 extern asmlinkage void irq_move_cleanup_interrupt(void);
 extern asmlinkage void reboot_interrupt(void);
 extern asmlinkage void threshold_interrupt(void);
+extern asmlinkage void deferred_error_interrupt(void);
 
 extern asmlinkage void call_function_interrupt(void);
 extern asmlinkage void call_function_single_interrupt(void);
@@ -87,60 +55,93 @@ extern void trace_spurious_interrupt(void);
 extern void trace_thermal_interrupt(void);
 extern void trace_reschedule_interrupt(void);
 extern void trace_threshold_interrupt(void);
+extern void trace_deferred_error_interrupt(void);
 extern void trace_call_function_interrupt(void);
 extern void trace_call_function_single_interrupt(void);
 #define trace_irq_move_cleanup_interrupt  irq_move_cleanup_interrupt
 #define trace_reboot_interrupt  reboot_interrupt
 #define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi
+#define trace_kvm_posted_intr_wakeup_ipi kvm_posted_intr_wakeup_ipi
 #endif /* CONFIG_TRACING */
 
-#ifdef CONFIG_IRQ_REMAP
-/* Intel specific interrupt remapping information */
-struct irq_2_iommu {
-	struct intel_iommu *iommu;
-	u16 irte_index;
-	u16 sub_handle;
-	u8  irte_mask;
-};
-
-/* AMD specific interrupt remapping information */
-struct irq_2_irte {
-	u16 devid; /* Device ID for IRTE table */
-	u16 index; /* Index into IRTE table*/
-};
-#endif	/* CONFIG_IRQ_REMAP */
-
 #ifdef	CONFIG_X86_LOCAL_APIC
 struct irq_data;
+struct pci_dev;
+struct msi_desc;
+
+enum irq_alloc_type {
+	X86_IRQ_ALLOC_TYPE_IOAPIC = 1,
+	X86_IRQ_ALLOC_TYPE_HPET,
+	X86_IRQ_ALLOC_TYPE_MSI,
+	X86_IRQ_ALLOC_TYPE_MSIX,
+	X86_IRQ_ALLOC_TYPE_DMAR,
+	X86_IRQ_ALLOC_TYPE_UV,
+};
 
-struct irq_cfg {
-	cpumask_var_t		domain;
-	cpumask_var_t		old_domain;
-	u8			vector;
-	u8			move_in_progress : 1;
-#ifdef CONFIG_IRQ_REMAP
-	u8			remapped : 1;
+struct irq_alloc_info {
+	enum irq_alloc_type	type;
+	u32			flags;
+	const struct cpumask	*mask;	/* CPU mask for vector allocation */
 	union {
-		struct irq_2_iommu irq_2_iommu;
-		struct irq_2_irte  irq_2_irte;
-	};
+		int		unused;
+#ifdef	CONFIG_HPET_TIMER
+		struct {
+			int		hpet_id;
+			int		hpet_index;
+			void		*hpet_data;
+		};
 #endif
-	union {
-#ifdef CONFIG_X86_IO_APIC
+#ifdef	CONFIG_PCI_MSI
 		struct {
-			struct list_head	irq_2_pin;
+			struct pci_dev	*msi_dev;
+			irq_hw_number_t	msi_hwirq;
+		};
+#endif
+#ifdef	CONFIG_X86_IO_APIC
+		struct {
+			int		ioapic_id;
+			int		ioapic_pin;
+			int		ioapic_node;
+			u32		ioapic_trigger : 1;
+			u32		ioapic_polarity : 1;
+			u32		ioapic_valid : 1;
+			struct IO_APIC_route_entry *ioapic_entry;
+		};
+#endif
+#ifdef	CONFIG_DMAR_TABLE
+		struct {
+			int		dmar_id;
+			void		*dmar_data;
+		};
+#endif
+#ifdef	CONFIG_HT_IRQ
+		struct {
+			int		ht_pos;
+			int		ht_idx;
+			struct pci_dev	*ht_dev;
+			void		*ht_update;
+		};
+#endif
+#ifdef	CONFIG_X86_UV
+		struct {
+			int		uv_limit;
+			int		uv_blade;
+			unsigned long	uv_offset;
+			char		*uv_name;
 		};
 #endif
 	};
 };
 
+struct irq_cfg {
+	unsigned int		dest_apicid;
+	u8			vector;
+};
+
 extern struct irq_cfg *irq_cfg(unsigned int irq);
 extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data);
-extern struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node);
 extern void lock_vector_lock(void);
 extern void unlock_vector_lock(void);
-extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *);
-extern void clear_irq_vector(int irq, struct irq_cfg *cfg);
 extern void setup_vector_irq(int cpu);
 #ifdef CONFIG_SMP
 extern void send_cleanup_vector(struct irq_cfg *);
@@ -150,10 +151,7 @@ static inline void send_cleanup_vector(struct irq_cfg *c) { }
 static inline void irq_complete_move(struct irq_cfg *c) { }
 #endif
 
-extern int apic_retrigger_irq(struct irq_data *data);
 extern void apic_ack_edge(struct irq_data *data);
-extern int apic_set_affinity(struct irq_data *data, const struct cpumask *mask,
-			     unsigned int *dest_id);
 #else	/*  CONFIG_X86_LOCAL_APIC */
 static inline void lock_vector_lock(void) {}
 static inline void unlock_vector_lock(void) {}
@@ -163,8 +161,7 @@ static inline void unlock_vector_lock(void) {}
 extern atomic_t irq_err_count;
 extern atomic_t irq_mis_count;
 
-/* EISA */
-extern void eisa_set_level_irq(unsigned int irq);
+extern void elcr_set_level_irq(unsigned int irq);
 
 /* SMP */
 extern __visible void smp_apic_timer_interrupt(struct pt_regs *);
@@ -178,7 +175,6 @@ extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
 extern __visible void smp_reschedule_interrupt(struct pt_regs *);
 extern __visible void smp_call_function_interrupt(struct pt_regs *);
 extern __visible void smp_call_function_single_interrupt(struct pt_regs *);
-extern __visible void smp_invalidate_interrupt(struct pt_regs *);
 #endif
 
 extern char irq_entries_start[];
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
deleted file mode 100644
index 6eb6fcb83f6362..00000000000000
--- a/arch/x86/include/asm/i387.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (C) 1994 Linus Torvalds
- *
- * Pentium III FXSR, SSE support
- * General FPU state handling cleanups
- *	Gareth Hughes <gareth@valinux.com>, May 2000
- * x86-64 work by Andi Kleen 2002
- */
-
-#ifndef _ASM_X86_I387_H
-#define _ASM_X86_I387_H
-
-#ifndef __ASSEMBLY__
-
-#include <linux/sched.h>
-#include <linux/hardirq.h>
-
-struct pt_regs;
-struct user_i387_struct;
-
-extern int init_fpu(struct task_struct *child);
-extern void fpu_finit(struct fpu *fpu);
-extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
-extern void math_state_restore(void);
-
-extern bool irq_fpu_usable(void);
-
-/*
- * Careful: __kernel_fpu_begin/end() must be called with preempt disabled
- * and they don't touch the preempt state on their own.
- * If you enable preemption after __kernel_fpu_begin(), preempt notifier
- * should call the __kernel_fpu_end() to prevent the kernel/user FPU
- * state from getting corrupted. KVM for example uses this model.
- *
- * All other cases use kernel_fpu_begin/end() which disable preemption
- * during kernel FPU usage.
- */
-extern void __kernel_fpu_begin(void);
-extern void __kernel_fpu_end(void);
-
-static inline void kernel_fpu_begin(void)
-{
-	preempt_disable();
-	WARN_ON_ONCE(!irq_fpu_usable());
-	__kernel_fpu_begin();
-}
-
-static inline void kernel_fpu_end(void)
-{
-	__kernel_fpu_end();
-	preempt_enable();
-}
-
-/* Must be called with preempt disabled */
-extern void kernel_fpu_disable(void);
-extern void kernel_fpu_enable(void);
-
-/*
- * Some instructions like VIA's padlock instructions generate a spurious
- * DNA fault but don't modify SSE registers. And these instructions
- * get used from interrupt context as well. To prevent these kernel instructions
- * in interrupt context interacting wrongly with other user/kernel fpu usage, we
- * should use them only in the context of irq_ts_save/restore()
- */
-static inline int irq_ts_save(void)
-{
-	/*
-	 * If in process context and not atomic, we can take a spurious DNA fault.
-	 * Otherwise, doing clts() in process context requires disabling preemption
-	 * or some heavy lifting like kernel_fpu_begin()
-	 */
-	if (!in_atomic())
-		return 0;
-
-	if (read_cr0() & X86_CR0_TS) {
-		clts();
-		return 1;
-	}
-
-	return 0;
-}
-
-static inline void irq_ts_restore(int TS_state)
-{
-	if (TS_state)
-		stts();
-}
-
-/*
- * The question "does this thread have fpu access?"
- * is slightly racy, since preemption could come in
- * and revoke it immediately after the test.
- *
- * However, even in that very unlikely scenario,
- * we can just assume we have FPU access - typically
- * to save the FP state - we'll just take a #NM
- * fault and get the FPU access back.
- */
-static inline int user_has_fpu(void)
-{
-	return current->thread.fpu.has_fpu;
-}
-
-extern void unlazy_fpu(struct task_struct *tsk);
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_X86_I387_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 34a5b93704d3ec..a2b97404922d89 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -177,6 +177,7 @@ static inline unsigned int isa_virt_to_bus(volatile void *address)
  * look at pci_iomap().
  */
 extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
 extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
 				unsigned long prot_val);
@@ -338,6 +339,9 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
 #define IO_SPACE_LIMIT 0xffff
 
 #ifdef CONFIG_MTRR
+extern int __must_check arch_phys_wc_index(int handle);
+#define arch_phys_wc_index arch_phys_wc_index
+
 extern int __must_check arch_phys_wc_add(unsigned long base,
 					 unsigned long size);
 extern void arch_phys_wc_del(int handle);
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 2f91685fe1cdb5..6cbf2cfb3f8a02 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -95,9 +95,22 @@ struct IR_IO_APIC_route_entry {
 		index		: 15;
 } __attribute__ ((packed));
 
-#define IOAPIC_AUTO     -1
-#define IOAPIC_EDGE     0
-#define IOAPIC_LEVEL    1
+struct irq_alloc_info;
+struct ioapic_domain_cfg;
+
+#define IOAPIC_AUTO			-1
+#define IOAPIC_EDGE			0
+#define IOAPIC_LEVEL			1
+
+#define IOAPIC_MASKED			1
+#define IOAPIC_UNMASKED			0
+
+#define IOAPIC_POL_HIGH			0
+#define IOAPIC_POL_LOW			1
+
+#define IOAPIC_DEST_MODE_PHYSICAL	0
+#define IOAPIC_DEST_MODE_LOGICAL	1
+
 #define	IOAPIC_MAP_ALLOC		0x1
 #define	IOAPIC_MAP_CHECK		0x2
 
@@ -110,9 +123,6 @@ extern int nr_ioapics;
 
 extern int mpc_ioapic_id(int ioapic);
 extern unsigned int mpc_ioapic_addr(int ioapic);
-extern struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic);
-
-#define MP_MAX_IOAPIC_PIN 127
 
 /* # of MP IRQ source entries */
 extern int mp_irq_entries;
@@ -120,9 +130,6 @@ extern int mp_irq_entries;
 /* MP IRQ source entries */
 extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
 
-/* Older SiS APIC requires we rewrite the index register */
-extern int sis_apic_bug;
-
 /* 1 if "noapic" boot option passed */
 extern int skip_ioapic_setup;
 
@@ -132,6 +139,8 @@ extern int noioapicquirk;
 /* -1 if "noapic" boot option passed */
 extern int noioapicreroute;
 
+extern u32 gsi_top;
+
 extern unsigned long io_apic_irqs;
 
 #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1 << (x)) & io_apic_irqs))
@@ -147,13 +156,6 @@ struct irq_cfg;
 extern void ioapic_insert_resources(void);
 extern int arch_early_ioapic_init(void);
 
-extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *,
-				     unsigned int, int,
-				     struct io_apic_irq_attr *);
-extern void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg);
-
-extern void native_eoi_ioapic_pin(int apic, int pin, int vector);
-
 extern int save_ioapic_entries(void);
 extern void mask_ioapic_entries(void);
 extern int restore_ioapic_entries(void);
@@ -161,82 +163,32 @@ extern int restore_ioapic_entries(void);
 extern void setup_ioapic_ids_from_mpc(void);
 extern void setup_ioapic_ids_from_mpc_nocheck(void);
 
-struct io_apic_irq_attr {
-	int ioapic;
-	int ioapic_pin;
-	int trigger;
-	int polarity;
-};
-
-enum ioapic_domain_type {
-	IOAPIC_DOMAIN_INVALID,
-	IOAPIC_DOMAIN_LEGACY,
-	IOAPIC_DOMAIN_STRICT,
-	IOAPIC_DOMAIN_DYNAMIC,
-};
-
-struct device_node;
-struct irq_domain;
-struct irq_domain_ops;
-
-struct ioapic_domain_cfg {
-	enum ioapic_domain_type		type;
-	const struct irq_domain_ops	*ops;
-	struct device_node		*dev;
-};
-
-struct mp_ioapic_gsi{
-	u32 gsi_base;
-	u32 gsi_end;
-};
-extern u32 gsi_top;
-
 extern int mp_find_ioapic(u32 gsi);
 extern int mp_find_ioapic_pin(int ioapic, u32 gsi);
-extern u32 mp_pin_to_gsi(int ioapic, int pin);
-extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags);
+extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags,
+			     struct irq_alloc_info *info);
 extern void mp_unmap_irq(int irq);
 extern int mp_register_ioapic(int id, u32 address, u32 gsi_base,
 			      struct ioapic_domain_cfg *cfg);
 extern int mp_unregister_ioapic(u32 gsi_base);
 extern int mp_ioapic_registered(u32 gsi_base);
-extern int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq,
-			    irq_hw_number_t hwirq);
-extern void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq);
-extern int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node);
-extern void __init pre_init_apic_IRQ0(void);
+
+extern void ioapic_set_alloc_attr(struct irq_alloc_info *info,
+				  int node, int trigger, int polarity);
 
 extern void mp_save_irq(struct mpc_intsrc *m);
 
 extern void disable_ioapic_support(void);
 
-extern void __init native_io_apic_init_mappings(void);
+extern void __init io_apic_init_mappings(void);
 extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg);
-extern void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int val);
-extern void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val);
 extern void native_disable_io_apic(void);
-extern void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries);
-extern void intel_ir_io_apic_print_entries(unsigned int apic, unsigned int nr_entries);
-extern int native_ioapic_set_affinity(struct irq_data *,
-				      const struct cpumask *,
-				      bool);
 
 static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
 {
 	return x86_io_apic_ops.read(apic, reg);
 }
 
-static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
-{
-	x86_io_apic_ops.write(apic, reg, value);
-}
-static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
-{
-	x86_io_apic_ops.modify(apic, reg, value);
-}
-
-extern void io_apic_eoi(unsigned int apic, unsigned int vector);
-
 extern void setup_IO_APIC(void);
 extern void enable_IO_APIC(void);
 extern void disable_IO_APIC(void);
@@ -253,8 +205,12 @@ static inline int arch_early_ioapic_init(void) { return 0; }
 static inline void print_IO_APICs(void) {}
 #define gsi_top (NR_IRQS_LEGACY)
 static inline int mp_find_ioapic(u32 gsi) { return 0; }
-static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return UINT_MAX; }
-static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) { return gsi; }
+static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags,
+				    struct irq_alloc_info *info)
+{
+	return gsi;
+}
+
 static inline void mp_unmap_irq(int irq) { }
 
 static inline int save_ioapic_entries(void)
@@ -268,17 +224,11 @@ static inline int restore_ioapic_entries(void)
 	return -ENOMEM;
 }
 
-static inline void mp_save_irq(struct mpc_intsrc *m) { };
+static inline void mp_save_irq(struct mpc_intsrc *m) { }
 static inline void disable_ioapic_support(void) { }
-#define native_io_apic_init_mappings	NULL
+static inline void io_apic_init_mappings(void) { }
 #define native_io_apic_read		NULL
-#define native_io_apic_write		NULL
-#define native_io_apic_modify		NULL
 #define native_disable_io_apic		NULL
-#define native_io_apic_print_entries	NULL
-#define native_ioapic_set_affinity	NULL
-#define native_setup_ioapic_entry	NULL
-#define native_eoi_ioapic_pin		NULL
 
 static inline void setup_IO_APIC(void) { }
 static inline void enable_IO_APIC(void) { }
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index a80cbb88ea911e..8008d06581c7f4 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -30,6 +30,10 @@ extern void fixup_irqs(void);
 extern void irq_force_complete_move(int);
 #endif
 
+#ifdef CONFIG_HAVE_KVM
+extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void));
+#endif
+
 extern void (*x86_platform_ipi_callback)(void);
 extern void native_init_IRQ(void);
 extern bool handle_irq(unsigned irq, struct pt_regs *regs);
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 6224d316c405c4..78974fbc33b474 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -22,14 +22,12 @@
 #ifndef __X86_IRQ_REMAPPING_H
 #define __X86_IRQ_REMAPPING_H
 
+#include <asm/irqdomain.h>
+#include <asm/hw_irq.h>
 #include <asm/io_apic.h>
 
-struct IO_APIC_route_entry;
-struct io_apic_irq_attr;
-struct irq_chip;
 struct msi_msg;
-struct pci_dev;
-struct irq_cfg;
+struct irq_alloc_info;
 
 #ifdef CONFIG_IRQ_REMAP
 
@@ -39,22 +37,21 @@ extern int irq_remapping_enable(void);
 extern void irq_remapping_disable(void);
 extern int irq_remapping_reenable(int);
 extern int irq_remap_enable_fault_handling(void);
-extern int setup_ioapic_remapped_entry(int irq,
-				       struct IO_APIC_route_entry *entry,
-				       unsigned int destination,
-				       int vector,
-				       struct io_apic_irq_attr *attr);
-extern void free_remapped_irq(int irq);
-extern void compose_remapped_msi_msg(struct pci_dev *pdev,
-				     unsigned int irq, unsigned int dest,
-				     struct msi_msg *msg, u8 hpet_id);
-extern int setup_hpet_msi_remapped(unsigned int irq, unsigned int id);
 extern void panic_if_irq_remap(const char *msg);
-extern bool setup_remapped_irq(int irq,
-			       struct irq_cfg *cfg,
-			       struct irq_chip *chip);
 
-void irq_remap_modify_chip_defaults(struct irq_chip *chip);
+extern struct irq_domain *
+irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info);
+extern struct irq_domain *
+irq_remapping_get_irq_domain(struct irq_alloc_info *info);
+
+/* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */
+extern struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent);
+
+/* Get parent irqdomain for interrupt remapping irqdomain */
+static inline struct irq_domain *arch_get_ir_parent_domain(void)
+{
+	return x86_vector_domain;
+}
 
 #else  /* CONFIG_IRQ_REMAP */
 
@@ -64,42 +61,22 @@ static inline int irq_remapping_enable(void) { return -ENODEV; }
 static inline void irq_remapping_disable(void) { }
 static inline int irq_remapping_reenable(int eim) { return -ENODEV; }
 static inline int irq_remap_enable_fault_handling(void) { return -ENODEV; }
-static inline int setup_ioapic_remapped_entry(int irq,
-					      struct IO_APIC_route_entry *entry,
-					      unsigned int destination,
-					      int vector,
-					      struct io_apic_irq_attr *attr)
-{
-	return -ENODEV;
-}
-static inline void free_remapped_irq(int irq) { }
-static inline void compose_remapped_msi_msg(struct pci_dev *pdev,
-					    unsigned int irq, unsigned int dest,
-					    struct msi_msg *msg, u8 hpet_id)
-{
-}
-static inline int setup_hpet_msi_remapped(unsigned int irq, unsigned int id)
-{
-	return -ENODEV;
-}
 
 static inline void panic_if_irq_remap(const char *msg)
 {
 }
 
-static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip)
+static inline struct irq_domain *
+irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info)
 {
+	return NULL;
 }
 
-static inline bool setup_remapped_irq(int irq,
-				      struct irq_cfg *cfg,
-				      struct irq_chip *chip)
+static inline struct irq_domain *
+irq_remapping_get_irq_domain(struct irq_alloc_info *info)
 {
-	return false;
+	return NULL;
 }
-#endif /* CONFIG_IRQ_REMAP */
-
-#define dmar_alloc_hwirq()	irq_alloc_hwirq(-1)
-#define dmar_free_hwirq		irq_free_hwirq
 
+#endif /* CONFIG_IRQ_REMAP */
 #endif /* __X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 666c89ec4bd729..4c2d2eb2060a0b 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -47,31 +47,12 @@
 #define IRQ_MOVE_CLEANUP_VECTOR		FIRST_EXTERNAL_VECTOR
 
 #define IA32_SYSCALL_VECTOR		0x80
-#ifdef CONFIG_X86_32
-# define SYSCALL_VECTOR			0x80
-#endif
 
 /*
  * Vectors 0x30-0x3f are used for ISA interrupts.
  *   round up to the next 16-vector boundary
  */
-#define IRQ0_VECTOR			((FIRST_EXTERNAL_VECTOR + 16) & ~15)
-
-#define IRQ1_VECTOR			(IRQ0_VECTOR +  1)
-#define IRQ2_VECTOR			(IRQ0_VECTOR +  2)
-#define IRQ3_VECTOR			(IRQ0_VECTOR +  3)
-#define IRQ4_VECTOR			(IRQ0_VECTOR +  4)
-#define IRQ5_VECTOR			(IRQ0_VECTOR +  5)
-#define IRQ6_VECTOR			(IRQ0_VECTOR +  6)
-#define IRQ7_VECTOR			(IRQ0_VECTOR +  7)
-#define IRQ8_VECTOR			(IRQ0_VECTOR +  8)
-#define IRQ9_VECTOR			(IRQ0_VECTOR +  9)
-#define IRQ10_VECTOR			(IRQ0_VECTOR + 10)
-#define IRQ11_VECTOR			(IRQ0_VECTOR + 11)
-#define IRQ12_VECTOR			(IRQ0_VECTOR + 12)
-#define IRQ13_VECTOR			(IRQ0_VECTOR + 13)
-#define IRQ14_VECTOR			(IRQ0_VECTOR + 14)
-#define IRQ15_VECTOR			(IRQ0_VECTOR + 15)
+#define ISA_IRQ_VECTOR(irq)		(((FIRST_EXTERNAL_VECTOR + 16) & ~15) + irq)
 
 /*
  * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
@@ -102,21 +83,23 @@
  */
 #define X86_PLATFORM_IPI_VECTOR		0xf7
 
-/* Vector for KVM to deliver posted interrupt IPI */
-#ifdef CONFIG_HAVE_KVM
-#define POSTED_INTR_VECTOR		0xf2
-#endif
-
+#define POSTED_INTR_WAKEUP_VECTOR	0xf1
 /*
  * IRQ work vector:
  */
 #define IRQ_WORK_VECTOR			0xf6
 
 #define UV_BAU_MESSAGE			0xf5
+#define DEFERRED_ERROR_VECTOR		0xf4
 
 /* Vector on which hypervisor callbacks will be delivered */
 #define HYPERVISOR_CALLBACK_VECTOR	0xf3
 
+/* Vector for KVM to deliver posted interrupt IPI */
+#ifdef CONFIG_HAVE_KVM
+#define POSTED_INTR_VECTOR		0xf2
+#endif
+
 /*
  * Local APIC timer IRQ vector is on a different priority level,
  * to work around the 'lost local interrupt if more than 2 IRQ
@@ -155,18 +138,22 @@ static inline int invalid_vm86_irq(int irq)
  * static arrays.
  */
 
-#define NR_IRQS_LEGACY			  16
+#define NR_IRQS_LEGACY			16
 
-#define IO_APIC_VECTOR_LIMIT		( 32 * MAX_IO_APICS )
+#define CPU_VECTOR_LIMIT		(64 * NR_CPUS)
+#define IO_APIC_VECTOR_LIMIT		(32 * MAX_IO_APICS)
 
-#ifdef CONFIG_X86_IO_APIC
-# define CPU_VECTOR_LIMIT		(64 * NR_CPUS)
-# define NR_IRQS					\
+#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_PCI_MSI)
+#define NR_IRQS						\
 	(CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ?	\
 		(NR_VECTORS + CPU_VECTOR_LIMIT)  :	\
 		(NR_VECTORS + IO_APIC_VECTOR_LIMIT))
-#else /* !CONFIG_X86_IO_APIC: */
-# define NR_IRQS			NR_IRQS_LEGACY
+#elif defined(CONFIG_X86_IO_APIC)
+#define	NR_IRQS				(NR_VECTORS + IO_APIC_VECTOR_LIMIT)
+#elif defined(CONFIG_PCI_MSI)
+#define NR_IRQS				(NR_VECTORS + CPU_VECTOR_LIMIT)
+#else
+#define NR_IRQS				NR_IRQS_LEGACY
 #endif
 
 #endif /* _ASM_X86_IRQ_VECTORS_H */
diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h
new file mode 100644
index 00000000000000..d26075b52885a8
--- /dev/null
+++ b/arch/x86/include/asm/irqdomain.h
@@ -0,0 +1,63 @@
+#ifndef _ASM_IRQDOMAIN_H
+#define _ASM_IRQDOMAIN_H
+
+#include <linux/irqdomain.h>
+#include <asm/hw_irq.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+enum {
+	/* Allocate contiguous CPU vectors */
+	X86_IRQ_ALLOC_CONTIGUOUS_VECTORS		= 0x1,
+};
+
+extern struct irq_domain *x86_vector_domain;
+
+extern void init_irq_alloc_info(struct irq_alloc_info *info,
+				const struct cpumask *mask);
+extern void copy_irq_alloc_info(struct irq_alloc_info *dst,
+				struct irq_alloc_info *src);
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_X86_IO_APIC
+struct device_node;
+struct irq_data;
+
+enum ioapic_domain_type {
+	IOAPIC_DOMAIN_INVALID,
+	IOAPIC_DOMAIN_LEGACY,
+	IOAPIC_DOMAIN_STRICT,
+	IOAPIC_DOMAIN_DYNAMIC,
+};
+
+struct ioapic_domain_cfg {
+	enum ioapic_domain_type		type;
+	const struct irq_domain_ops	*ops;
+	struct device_node		*dev;
+};
+
+extern const struct irq_domain_ops mp_ioapic_irqdomain_ops;
+
+extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+			      unsigned int nr_irqs, void *arg);
+extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
+			      unsigned int nr_irqs);
+extern void mp_irqdomain_activate(struct irq_domain *domain,
+				  struct irq_data *irq_data);
+extern void mp_irqdomain_deactivate(struct irq_domain *domain,
+				    struct irq_data *irq_data);
+extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain);
+#endif /* CONFIG_X86_IO_APIC */
+
+#ifdef CONFIG_PCI_MSI
+extern void arch_init_msi_domain(struct irq_domain *domain);
+#else
+static inline void arch_init_msi_domain(struct irq_domain *domain) { }
+#endif
+
+#ifdef CONFIG_HT_IRQ
+extern void arch_init_htirq_domain(struct irq_domain *domain);
+#else
+static inline void arch_init_htirq_domain(struct irq_domain *domain) { }
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f4a555beef1908..f8c0ec3a4a979f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1002,8 +1002,6 @@ void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu);
 
-int fx_init(struct kvm_vcpu *vcpu);
-
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		       const u8 *new, int bytes);
 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 1f5a86d518db37..6a3034a0a07217 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -117,8 +117,19 @@ struct mca_config {
 };
 
 struct mce_vendor_flags {
-	__u64		overflow_recov	: 1, /* cpuid_ebx(80000007) */
-			__reserved_0	: 63;
+			/*
+			 * overflow recovery cpuid bit indicates that overflow
+			 * conditions are not fatal
+			 */
+	__u64		overflow_recov	: 1,
+
+			/*
+			 * SUCCOR stands for S/W UnCorrectable error COntainment
+			 * and Recovery. It indicates support for data poisoning
+			 * in HW and deferred error interrupts.
+			 */
+			succor		: 1,
+			__reserved_0	: 62;
 };
 extern struct mce_vendor_flags mce_flags;
 
@@ -223,6 +234,9 @@ void do_machine_check(struct pt_regs *, long);
 extern void (*mce_threshold_vector)(void);
 extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
 
+/* Deferred error interrupt handler */
+extern void (*deferred_error_int_vector)(void);
+
 /*
  * Thermal handler
  */
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 2fb20d6f7e23b0..9e6278c7140eac 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_X86_MICROCODE_H
 #define _ASM_X86_MICROCODE_H
 
+#include <linux/earlycpio.h>
+
 #define native_rdmsr(msr, val1, val2)			\
 do {							\
 	u64 __val = native_read_msr((msr));		\
@@ -152,6 +154,7 @@ extern void __init load_ucode_bsp(void);
 extern void load_ucode_ap(void);
 extern int __init save_microcode_in_initrd(void);
 void reload_early_microcode(void);
+extern bool get_builtin_firmware(struct cpio_data *cd, const char *name);
 #else
 static inline void __init load_ucode_bsp(void) {}
 static inline void load_ucode_ap(void) {}
@@ -160,6 +163,9 @@ static inline int __init save_microcode_in_initrd(void)
 	return 0;
 }
 static inline void reload_early_microcode(void) {}
+static inline bool get_builtin_firmware(struct cpio_data *cd, const char *name)
+{
+	return false;
+}
 #endif
-
 #endif /* _ASM_X86_MICROCODE_H */
diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h
index af935397e053e4..b8438543f340a7 100644
--- a/arch/x86/include/asm/microcode_amd.h
+++ b/arch/x86/include/asm/microcode_amd.h
@@ -65,12 +65,12 @@ extern enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, s
 extern u8 amd_ucode_patch[PATCH_MAX_SIZE];
 
 #ifdef CONFIG_MICROCODE_AMD_EARLY
-extern void __init load_ucode_amd_bsp(void);
+extern void __init load_ucode_amd_bsp(int family);
 extern void load_ucode_amd_ap(void);
 extern int __init save_microcode_in_initrd_amd(void);
 void reload_ucode_amd(void);
 #else
-static inline void __init load_ucode_amd_bsp(void) {}
+static inline void __init load_ucode_amd_bsp(int family) {}
 static inline void load_ucode_amd_ap(void) {}
 static inline int __init save_microcode_in_initrd_amd(void) { return -EINVAL; }
 void reload_ucode_amd(void) {}
diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h
index 2b9209c46ca939..7991c606125d01 100644
--- a/arch/x86/include/asm/microcode_intel.h
+++ b/arch/x86/include/asm/microcode_intel.h
@@ -51,20 +51,11 @@ struct extended_sigtable {
 	(((struct microcode_intel *)mc)->hdr.datasize ? \
 	 ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
 
-#define sigmatch(s1, s2, p1, p2) \
-	(((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
-
 #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
 
-extern int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc);
+extern int has_newer_microcode(void *mc, unsigned int csig, int cpf, int rev);
 extern int microcode_sanity_check(void *mc, int print_err);
-extern int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc);
-
-static inline int
-revision_is_newer(struct microcode_header_intel *mc_header, int rev)
-{
-	return (mc_header->rev <= rev) ? 0 : 1;
-}
+extern int find_matching_signature(void *mc, unsigned int csig, int cpf);
 
 #ifdef CONFIG_MICROCODE_INTEL_EARLY
 extern void __init load_ucode_intel_bsp(void);
diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h
index a952a13d59a71b..f3c1b71d4faeeb 100644
--- a/arch/x86/include/asm/mpx.h
+++ b/arch/x86/include/asm/mpx.h
@@ -60,8 +60,8 @@
 
 #ifdef CONFIG_X86_INTEL_MPX
 siginfo_t *mpx_generate_siginfo(struct pt_regs *regs,
-				struct xsave_struct *xsave_buf);
-int mpx_handle_bd_fault(struct xsave_struct *xsave_buf);
+				struct xregs_state *xsave_buf);
+int mpx_handle_bd_fault(struct xregs_state *xsave_buf);
 static inline int kernel_managing_mpx_tables(struct mm_struct *mm)
 {
 	return (mm->bd_addr != MPX_INVALID_BOUNDS_DIR);
@@ -78,11 +78,11 @@ void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
 		      unsigned long start, unsigned long end);
 #else
 static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs,
-					      struct xsave_struct *xsave_buf)
+					      struct xregs_state *xsave_buf)
 {
 	return NULL;
 }
-static inline int mpx_handle_bd_fault(struct xsave_struct *xsave_buf)
+static inline int mpx_handle_bd_fault(struct xregs_state *xsave_buf)
 {
 	return -EINVAL;
 }
diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h
new file mode 100644
index 00000000000000..93724cc6217763
--- /dev/null
+++ b/arch/x86/include/asm/msi.h
@@ -0,0 +1,7 @@
+#ifndef _ASM_X86_MSI_H
+#define _ASM_X86_MSI_H
+#include <asm/hw_irq.h>
+
+typedef struct irq_alloc_info msi_alloc_info_t;
+
+#endif /* _ASM_X86_MSI_H */
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index f768f62984194a..b94f6f64e23d0c 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -31,7 +31,7 @@
  * arch_phys_wc_add and arch_phys_wc_del.
  */
 # ifdef CONFIG_MTRR
-extern u8 mtrr_type_lookup(u64 addr, u64 end);
+extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform);
 extern void mtrr_save_fixed_ranges(void *);
 extern void mtrr_save_state(void);
 extern int mtrr_add(unsigned long base, unsigned long size,
@@ -48,14 +48,13 @@ extern void mtrr_aps_init(void);
 extern void mtrr_bp_restore(void);
 extern int mtrr_trim_uncached_memory(unsigned long end_pfn);
 extern int amd_special_default_mtrr(void);
-extern int phys_wc_to_mtrr_index(int handle);
 #  else
-static inline u8 mtrr_type_lookup(u64 addr, u64 end)
+static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform)
 {
 	/*
 	 * Return no-MTRRs:
 	 */
-	return 0xff;
+	return MTRR_TYPE_INVALID;
 }
 #define mtrr_save_fixed_ranges(arg) do {} while (0)
 #define mtrr_save_state() do {} while (0)
@@ -84,10 +83,6 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn)
 static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
 {
 }
-static inline int phys_wc_to_mtrr_index(int handle)
-{
-	return -1;
-}
 
 #define mtrr_ap_init() do {} while (0)
 #define mtrr_bp_init() do {} while (0)
@@ -127,4 +122,8 @@ struct mtrr_gentry32 {
 				 _IOW(MTRR_IOCTL_BASE,  9, struct mtrr_sentry32)
 #endif /* CONFIG_COMPAT */
 
+/* Bit fields for enabled in struct mtrr_state_type */
+#define MTRR_STATE_MTRR_FIXED_ENABLED	0x01
+#define MTRR_STATE_MTRR_ENABLED		0x02
+
 #endif /* _ASM_X86_MTRR_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 8957810ad7d1e3..d143bfad45d70f 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -712,6 +712,31 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
 
 #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
 
+#ifdef CONFIG_QUEUED_SPINLOCKS
+
+static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock,
+							u32 val)
+{
+	PVOP_VCALL2(pv_lock_ops.queued_spin_lock_slowpath, lock, val);
+}
+
+static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock)
+{
+	PVOP_VCALLEE1(pv_lock_ops.queued_spin_unlock, lock);
+}
+
+static __always_inline void pv_wait(u8 *ptr, u8 val)
+{
+	PVOP_VCALL2(pv_lock_ops.wait, ptr, val);
+}
+
+static __always_inline void pv_kick(int cpu)
+{
+	PVOP_VCALL1(pv_lock_ops.kick, cpu);
+}
+
+#else /* !CONFIG_QUEUED_SPINLOCKS */
+
 static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock,
 							__ticket_t ticket)
 {
@@ -724,7 +749,9 @@ static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock,
 	PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket);
 }
 
-#endif
+#endif /* CONFIG_QUEUED_SPINLOCKS */
+
+#endif /* SMP && PARAVIRT_SPINLOCKS */
 
 #ifdef CONFIG_X86_32
 #define PV_SAVE_REGS "pushl %ecx; pushl %edx;"
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index f7b0b5c112f28c..a6b8f9fadb0685 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -160,13 +160,14 @@ struct pv_cpu_ops {
 	u64 (*read_pmc)(int counter);
 	unsigned long long (*read_tscp)(unsigned int *aux);
 
+#ifdef CONFIG_X86_32
 	/*
 	 * Atomically enable interrupts and return to userspace.  This
-	 * is only ever used to return to 32-bit processes; in a
-	 * 64-bit kernel, it's used for 32-on-64 compat processes, but
-	 * never native 64-bit processes.  (Jump, not call.)
+	 * is only used in 32-bit kernels.  64-bit kernels use
+	 * usergs_sysret32 instead.
 	 */
 	void (*irq_enable_sysexit)(void);
+#endif
 
 	/*
 	 * Switch to usermode gs and return to 64-bit usermode using
@@ -333,9 +334,19 @@ struct arch_spinlock;
 typedef u16 __ticket_t;
 #endif
 
+struct qspinlock;
+
 struct pv_lock_ops {
+#ifdef CONFIG_QUEUED_SPINLOCKS
+	void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val);
+	struct paravirt_callee_save queued_spin_unlock;
+
+	void (*wait)(u8 *ptr, u8 val);
+	void (*kick)(int cpu);
+#else /* !CONFIG_QUEUED_SPINLOCKS */
 	struct paravirt_callee_save lock_spinning;
 	void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket);
+#endif /* !CONFIG_QUEUED_SPINLOCKS */
 };
 
 /* This contains all the paravirt structures: we get a convenient
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index 91bc4ba95f919e..cdcff7f7f6941e 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -4,12 +4,7 @@
 #include <linux/types.h>
 #include <asm/pgtable_types.h>
 
-#ifdef CONFIG_X86_PAT
-extern int pat_enabled;
-#else
-static const int pat_enabled;
-#endif
-
+bool pat_enabled(void);
 extern void pat_init(void);
 void pat_init_cache_modes(void);
 
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 81da003df21b08..e894fade26fed2 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -96,15 +96,10 @@ extern void pci_iommu_alloc(void);
 #ifdef CONFIG_PCI_MSI
 /* implemented in arch/x86/kernel/apic/io_apic. */
 struct msi_desc;
-void native_compose_msi_msg(struct pci_dev *pdev, unsigned int irq,
-			    unsigned int dest, struct msi_msg *msg, u8 hpet_id);
 int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
 void native_teardown_msi_irq(unsigned int irq);
 void native_restore_msi_irqs(struct pci_dev *dev);
-int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
-		  unsigned int irq_base, unsigned int irq_offset);
 #else
-#define native_compose_msi_msg		NULL
 #define native_setup_msi_irqs		NULL
 #define native_teardown_msi_irq		NULL
 #endif
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 23ba6765b718c7..8e04f51d6bea49 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -21,6 +21,7 @@ struct mm_struct;
 #include <asm/desc_defs.h>
 #include <asm/nops.h>
 #include <asm/special_insns.h>
+#include <asm/fpu/types.h>
 
 #include <linux/personality.h>
 #include <linux/cpumask.h>
@@ -52,11 +53,16 @@ static inline void *current_text_addr(void)
 	return pc;
 }
 
+/*
+ * These alignment constraints are for performance in the vSMP case,
+ * but in the task_struct case we must also meet hardware imposed
+ * alignment requirements of the FPU state:
+ */
 #ifdef CONFIG_X86_VSMP
 # define ARCH_MIN_TASKALIGN		(1 << INTERNODE_CACHE_SHIFT)
 # define ARCH_MIN_MMSTRUCT_ALIGN	(1 << INTERNODE_CACHE_SHIFT)
 #else
-# define ARCH_MIN_TASKALIGN		16
+# define ARCH_MIN_TASKALIGN		__alignof__(union fpregs_state)
 # define ARCH_MIN_MMSTRUCT_ALIGN	0
 #endif
 
@@ -166,7 +172,6 @@ extern const struct seq_operations cpuinfo_op;
 #define cache_line_size()	(boot_cpu_data.x86_cache_alignment)
 
 extern void cpu_detect(struct cpuinfo_x86 *c);
-extern void fpu_detect(struct cpuinfo_x86 *c);
 
 extern void early_cpu_init(void);
 extern void identify_boot_cpu(void);
@@ -313,128 +318,6 @@ struct orig_ist {
 	unsigned long		ist[7];
 };
 
-#define	MXCSR_DEFAULT		0x1f80
-
-struct i387_fsave_struct {
-	u32			cwd;	/* FPU Control Word		*/
-	u32			swd;	/* FPU Status Word		*/
-	u32			twd;	/* FPU Tag Word			*/
-	u32			fip;	/* FPU IP Offset		*/
-	u32			fcs;	/* FPU IP Selector		*/
-	u32			foo;	/* FPU Operand Pointer Offset	*/
-	u32			fos;	/* FPU Operand Pointer Selector	*/
-
-	/* 8*10 bytes for each FP-reg = 80 bytes:			*/
-	u32			st_space[20];
-
-	/* Software status information [not touched by FSAVE ]:		*/
-	u32			status;
-};
-
-struct i387_fxsave_struct {
-	u16			cwd; /* Control Word			*/
-	u16			swd; /* Status Word			*/
-	u16			twd; /* Tag Word			*/
-	u16			fop; /* Last Instruction Opcode		*/
-	union {
-		struct {
-			u64	rip; /* Instruction Pointer		*/
-			u64	rdp; /* Data Pointer			*/
-		};
-		struct {
-			u32	fip; /* FPU IP Offset			*/
-			u32	fcs; /* FPU IP Selector			*/
-			u32	foo; /* FPU Operand Offset		*/
-			u32	fos; /* FPU Operand Selector		*/
-		};
-	};
-	u32			mxcsr;		/* MXCSR Register State */
-	u32			mxcsr_mask;	/* MXCSR Mask		*/
-
-	/* 8*16 bytes for each FP-reg = 128 bytes:			*/
-	u32			st_space[32];
-
-	/* 16*16 bytes for each XMM-reg = 256 bytes:			*/
-	u32			xmm_space[64];
-
-	u32			padding[12];
-
-	union {
-		u32		padding1[12];
-		u32		sw_reserved[12];
-	};
-
-} __attribute__((aligned(16)));
-
-struct i387_soft_struct {
-	u32			cwd;
-	u32			swd;
-	u32			twd;
-	u32			fip;
-	u32			fcs;
-	u32			foo;
-	u32			fos;
-	/* 8*10 bytes for each FP-reg = 80 bytes: */
-	u32			st_space[20];
-	u8			ftop;
-	u8			changed;
-	u8			lookahead;
-	u8			no_update;
-	u8			rm;
-	u8			alimit;
-	struct math_emu_info	*info;
-	u32			entry_eip;
-};
-
-struct ymmh_struct {
-	/* 16 * 16 bytes for each YMMH-reg = 256 bytes */
-	u32 ymmh_space[64];
-};
-
-/* We don't support LWP yet: */
-struct lwp_struct {
-	u8 reserved[128];
-};
-
-struct bndreg {
-	u64 lower_bound;
-	u64 upper_bound;
-} __packed;
-
-struct bndcsr {
-	u64 bndcfgu;
-	u64 bndstatus;
-} __packed;
-
-struct xsave_hdr_struct {
-	u64 xstate_bv;
-	u64 xcomp_bv;
-	u64 reserved[6];
-} __attribute__((packed));
-
-struct xsave_struct {
-	struct i387_fxsave_struct i387;
-	struct xsave_hdr_struct xsave_hdr;
-	struct ymmh_struct ymmh;
-	struct lwp_struct lwp;
-	struct bndreg bndreg[4];
-	struct bndcsr bndcsr;
-	/* new processor state extensions will go here */
-} __attribute__ ((packed, aligned (64)));
-
-union thread_xstate {
-	struct i387_fsave_struct	fsave;
-	struct i387_fxsave_struct	fxsave;
-	struct i387_soft_struct		soft;
-	struct xsave_struct		xsave;
-};
-
-struct fpu {
-	unsigned int last_cpu;
-	unsigned int has_fpu;
-	union thread_xstate *state;
-};
-
 #ifdef CONFIG_X86_64
 DECLARE_PER_CPU(struct orig_ist, orig_ist);
 
@@ -483,8 +366,6 @@ DECLARE_PER_CPU(struct irq_stack *, softirq_stack);
 #endif	/* X86_64 */
 
 extern unsigned int xstate_size;
-extern void free_thread_xstate(struct task_struct *);
-extern struct kmem_cache *task_xstate_cachep;
 
 struct perf_event;
 
@@ -508,6 +389,10 @@ struct thread_struct {
 	unsigned long		fs;
 #endif
 	unsigned long		gs;
+
+	/* Floating point and extended processor state */
+	struct fpu		fpu;
+
 	/* Save middle states of ptrace breakpoints */
 	struct perf_event	*ptrace_bps[HBP_NUM];
 	/* Debug status used for traps, single steps, etc... */
@@ -518,8 +403,6 @@ struct thread_struct {
 	unsigned long		cr2;
 	unsigned long		trap_nr;
 	unsigned long		error_code;
-	/* floating point and extended processor state */
-	struct fpu		fpu;
 #ifdef CONFIG_X86_32
 	/* Virtual 86 mode info */
 	struct vm86_struct __user *vm86_info;
@@ -535,15 +418,6 @@ struct thread_struct {
 	unsigned long		iopl;
 	/* Max allowed port in the bitmap, in bytes: */
 	unsigned		io_bitmap_max;
-	/*
-	 * fpu_counter contains the number of consecutive context switches
-	 * that the FPU is used. If this is over a threshold, the lazy fpu
-	 * saving becomes unlazy to save the trap. This is an unsigned char
-	 * so that after 256 times the counter wraps and the behavior turns
-	 * lazy again; this to deal with bursty apps that only use FPU for
-	 * a short time
-	 */
-	unsigned char fpu_counter;
 };
 
 /*
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 19507ffa5d28e9..5fabf1362942c6 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -107,7 +107,7 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
 static inline int user_mode(struct pt_regs *regs)
 {
 #ifdef CONFIG_X86_32
-	return (regs->cs & SEGMENT_RPL_MASK) == USER_RPL;
+	return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= USER_RPL;
 #else
 	return !!(regs->cs & 3);
 #endif
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
new file mode 100644
index 00000000000000..9d51fae1cba345
--- /dev/null
+++ b/arch/x86/include/asm/qspinlock.h
@@ -0,0 +1,57 @@
+#ifndef _ASM_X86_QSPINLOCK_H
+#define _ASM_X86_QSPINLOCK_H
+
+#include <asm/cpufeature.h>
+#include <asm-generic/qspinlock_types.h>
+#include <asm/paravirt.h>
+
+#define	queued_spin_unlock queued_spin_unlock
+/**
+ * queued_spin_unlock - release a queued spinlock
+ * @lock : Pointer to queued spinlock structure
+ *
+ * A smp_store_release() on the least-significant byte.
+ */
+static inline void native_queued_spin_unlock(struct qspinlock *lock)
+{
+	smp_store_release((u8 *)lock, 0);
+}
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_init_lock_hash(void);
+extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock);
+
+static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+	pv_queued_spin_lock_slowpath(lock, val);
+}
+
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+	pv_queued_spin_unlock(lock);
+}
+#else
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+	native_queued_spin_unlock(lock);
+}
+#endif
+
+#define virt_queued_spin_lock virt_queued_spin_lock
+
+static inline bool virt_queued_spin_lock(struct qspinlock *lock)
+{
+	if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
+		return false;
+
+	while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0)
+		cpu_relax();
+
+	return true;
+}
+
+#include <asm-generic/qspinlock.h>
+
+#endif /* _ASM_X86_QSPINLOCK_H */
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
new file mode 100644
index 00000000000000..b002e711ba88ea
--- /dev/null
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_QSPINLOCK_PARAVIRT_H
+#define __ASM_QSPINLOCK_PARAVIRT_H
+
+PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock);
+
+#endif
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 5a9856eb12bad7..7d5a1929d76b31 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -231,11 +231,21 @@
 #define TLS_SIZE			(GDT_ENTRY_TLS_ENTRIES* 8)
 
 #ifdef __KERNEL__
+
+/*
+ * early_idt_handler_array is an array of entry points referenced in the
+ * early IDT.  For simplicity, it's a real array with one entry point
+ * every nine bytes.  That leaves room for an optional 'push $0' if the
+ * vector has no error code (two bytes), a 'push $vector_number' (two
+ * bytes), and a jump to the common entry code (up to five bytes).
+ */
+#define EARLY_IDT_HANDLER_SIZE 9
+
 #ifndef __ASSEMBLY__
 
-extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5];
+extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE];
 #ifdef CONFIG_TRACING
-# define trace_early_idt_handlers early_idt_handlers
+# define trace_early_idt_handler_array early_idt_handler_array
 #endif
 
 /*
diff --git a/arch/x86/include/asm/simd.h b/arch/x86/include/asm/simd.h
index ee80b92f00962a..6c8a7ed13365ae 100644
--- a/arch/x86/include/asm/simd.h
+++ b/arch/x86/include/asm/simd.h
@@ -1,5 +1,5 @@
 
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 
 /*
  * may_use_simd - whether it is allowable at this time to issue SIMD
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 17a8dced12daef..222a6a3ca2b5eb 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -37,16 +37,6 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
 DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
 DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
 
-static inline struct cpumask *cpu_sibling_mask(int cpu)
-{
-	return per_cpu(cpu_sibling_map, cpu);
-}
-
-static inline struct cpumask *cpu_core_mask(int cpu)
-{
-	return per_cpu(cpu_core_map, cpu);
-}
-
 static inline struct cpumask *cpu_llc_shared_mask(int cpu)
 {
 	return per_cpu(cpu_llc_shared_map, cpu);
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index aeb4666e0c0a77..2270e41b32fd85 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -215,6 +215,44 @@ static inline void clwb(volatile void *__p)
 		: [pax] "a" (p));
 }
 
+/**
+ * pcommit_sfence() - persistent commit and fence
+ *
+ * The PCOMMIT instruction ensures that data that has been flushed from the
+ * processor's cache hierarchy with CLWB, CLFLUSHOPT or CLFLUSH is accepted to
+ * memory and is durable on the DIMM.  The primary use case for this is
+ * persistent memory.
+ *
+ * This function shows how to properly use CLWB/CLFLUSHOPT/CLFLUSH and PCOMMIT
+ * with appropriate fencing.
+ *
+ * Example:
+ * void flush_and_commit_buffer(void *vaddr, unsigned int size)
+ * {
+ *         unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
+ *         void *vend = vaddr + size;
+ *         void *p;
+ *
+ *         for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
+ *              p < vend; p += boot_cpu_data.x86_clflush_size)
+ *                 clwb(p);
+ *
+ *         // SFENCE to order CLWB/CLFLUSHOPT/CLFLUSH cache flushes
+ *         // MFENCE via mb() also works
+ *         wmb();
+ *
+ *         // PCOMMIT and the required SFENCE for ordering
+ *         pcommit_sfence();
+ * }
+ *
+ * After this function completes the data pointed to by 'vaddr' has been
+ * accepted to memory and will be durable if the 'vaddr' points to persistent
+ * memory.
+ *
+ * PCOMMIT must always be ordered by an MFENCE or SFENCE, so to help simplify
+ * things we include both the PCOMMIT and the required SFENCE in the
+ * alternatives generated by pcommit_sfence().
+ */
 static inline void pcommit_sfence(void)
 {
 	alternative(ASM_NOP7,
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 64b611782ef085..be0a05913b9105 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -42,6 +42,10 @@
 extern struct static_key paravirt_ticketlocks_enabled;
 static __always_inline bool static_key_false(struct static_key *key);
 
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm/qspinlock.h>
+#else
+
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 
 static inline void __ticket_enter_slowpath(arch_spinlock_t *lock)
@@ -196,6 +200,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
 		cpu_relax();
 	}
 }
+#endif /* CONFIG_QUEUED_SPINLOCKS */
 
 /*
  * Read-write spinlocks, allowing multiple readers
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index 5f9d7572d82b19..65c3e37f879ace 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -23,6 +23,9 @@ typedef u32 __ticketpair_t;
 
 #define TICKET_SHIFT	(sizeof(__ticket_t) * 8)
 
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm-generic/qspinlock_types.h>
+#else
 typedef struct arch_spinlock {
 	union {
 		__ticketpair_t head_tail;
@@ -33,6 +36,7 @@ typedef struct arch_spinlock {
 } arch_spinlock_t;
 
 #define __ARCH_SPIN_LOCK_UNLOCKED	{ { 0 } }
+#endif /* CONFIG_QUEUED_SPINLOCKS */
 
 #include <asm-generic/qrwlock_types.h>
 
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index 6a998598f17242..c2e00bb2a1365c 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -39,7 +39,9 @@
 #include <asm/processor.h>
 #include <asm/percpu.h>
 #include <asm/desc.h>
+
 #include <linux/random.h>
+#include <linux/sched.h>
 
 /*
  * 24 byte read-only segment initializer for stack canary.  Linker
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index 552d6c90a6d43d..d1793f06854d28 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -7,7 +7,7 @@
 #define _ASM_X86_SUSPEND_32_H
 
 #include <asm/desc.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 
 /* image of the saved processor state */
 struct saved_context {
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
index bc6232834babf3..7ebf0ebe4e687f 100644
--- a/arch/x86/include/asm/suspend_64.h
+++ b/arch/x86/include/asm/suspend_64.h
@@ -7,7 +7,7 @@
 #define _ASM_X86_SUSPEND_64_H
 
 #include <asm/desc.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 
 /*
  * Image of the saved processor state, used by the low level ACPI suspend to
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index b4bdec3e9523e0..225ee545e1a05e 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -177,8 +177,6 @@ struct thread_info {
  */
 #ifndef __ASSEMBLY__
 
-DECLARE_PER_CPU(unsigned long, kernel_stack);
-
 static inline struct thread_info *current_thread_info(void)
 {
 	return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE);
@@ -197,9 +195,13 @@ static inline unsigned long current_stack_pointer(void)
 
 #else /* !__ASSEMBLY__ */
 
+#ifdef CONFIG_X86_64
+# define cpu_current_top_of_stack (cpu_tss + TSS_sp0)
+#endif
+
 /* Load thread_info address into "reg" */
 #define GET_THREAD_INFO(reg) \
-	_ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \
+	_ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \
 	_ASM_SUB $(THREAD_SIZE),reg ;
 
 /*
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 0e8f04f2c26fda..5a77593fdaced1 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -124,7 +124,7 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu);
 
 #ifdef ENABLE_TOPO_DEFINES
 #define topology_core_cpumask(cpu)		(per_cpu(cpu_core_map, cpu))
-#define topology_thread_cpumask(cpu)		(per_cpu(cpu_sibling_map, cpu))
+#define topology_sibling_cpumask(cpu)		(per_cpu(cpu_sibling_map, cpu))
 #endif
 
 static inline void arch_fix_phys_package_id(int num, u32 slot)
diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h
index 4cab890007a726..38a09a13a9bcda 100644
--- a/arch/x86/include/asm/trace/irq_vectors.h
+++ b/arch/x86/include/asm/trace/irq_vectors.h
@@ -101,6 +101,12 @@ DEFINE_IRQ_VECTOR_EVENT(call_function_single);
 DEFINE_IRQ_VECTOR_EVENT(threshold_apic);
 
 /*
+ * deferred_error_apic - called when entering/exiting a deferred apic interrupt
+ * vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic);
+
+/*
  * thermal_apic - called when entering/exiting a thermal apic interrupt
  * vector handler
  */
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 4e49d7dff78e5f..c5380bea2a36aa 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -108,7 +108,8 @@ extern int panic_on_unrecovered_nmi;
 void math_emulate(struct math_emu_info *);
 #ifndef CONFIG_X86_32
 asmlinkage void smp_thermal_interrupt(void);
-asmlinkage void mce_threshold_interrupt(void);
+asmlinkage void smp_threshold_interrupt(void);
+asmlinkage void smp_deferred_error_interrupt(void);
 #endif
 
 extern enum ctx_state ist_enter(struct pt_regs *regs);
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index ace9dec050b17b..a8df874f3e8825 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -74,7 +74,8 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
  * @addr: User space pointer to start of block to check
  * @size: Size of block to check
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Checks if a pointer to a block of memory in user space is valid.
  *
@@ -145,7 +146,8 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
  * @x:   Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
@@ -240,7 +242,8 @@ extern void __put_user_8(void);
  * @x:   Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
@@ -455,7 +458,8 @@ struct __large_struct { unsigned long buf[100]; };
  * @x:   Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
@@ -479,7 +483,8 @@ struct __large_struct { unsigned long buf[100]; };
  * @x:   Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 3c03a5de64d30c..f5dcb5204dcd5b 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -59,6 +59,10 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
 			__put_user_size(*(u32 *)from, (u32 __user *)to,
 					4, ret, 4);
 			return ret;
+		case 8:
+			__put_user_size(*(u64 *)from, (u64 __user *)to,
+					8, ret, 8);
+			return ret;
 		}
 	}
 	return __copy_to_user_ll(to, from, n);
@@ -70,7 +74,8 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
  * @from: Source address, in kernel space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from kernel space to user space.  Caller must check
  * the specified block with access_ok() before calling this function.
@@ -117,7 +122,8 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
  * @from: Source address, in user space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from user space to kernel space.  Caller must check
  * the specified block with access_ok() before calling this function.
diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h
index ccab4af1646d44..59a54e869f1598 100644
--- a/arch/x86/include/asm/user.h
+++ b/arch/x86/include/asm/user.h
@@ -14,8 +14,8 @@ struct user_ymmh_regs {
 	__u32 ymmh_space[64];
 };
 
-struct user_xsave_hdr {
-	__u64 xstate_bv;
+struct user_xstate_header {
+	__u64 xfeatures;
 	__u64 reserved1[2];
 	__u64 reserved2[5];
 };
@@ -41,11 +41,11 @@ struct user_xsave_hdr {
  * particular process/thread.
  *
  * Also when the user modifies certain state FP/SSE/etc through the
- * ptrace interface, they must ensure that the xsave_hdr.xstate_bv
+ * ptrace interface, they must ensure that the header.xfeatures
  * bytes[512..519] of the memory layout are updated correspondingly.
  * i.e., for example when FP state is modified to a non-init state,
- * xsave_hdr.xstate_bv's bit 0 must be set to '1', when SSE is modified to
- * non-init state, xsave_hdr.xstate_bv's bit 1 must to be set to '1', etc.
+ * header.xfeatures's bit 0 must be set to '1', when SSE is modified to
+ * non-init state, header.xfeatures's bit 1 must to be set to '1', etc.
  */
 #define USER_XSTATE_FX_SW_WORDS 6
 #define USER_XSTATE_XCR0_WORD	0
@@ -55,7 +55,7 @@ struct user_xstateregs {
 		__u64 fpx_space[58];
 		__u64 xstate_fx_sw[USER_XSTATE_FX_SW_WORDS];
 	} i387;
-	struct user_xsave_hdr xsave_hdr;
+	struct user_xstate_header header;
 	struct user_ymmh_regs ymmh;
 	/* further processor state extensions go here */
 };
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index f58a9c7a3c8665..48d34d28f5a605 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -171,38 +171,17 @@ struct x86_platform_ops {
 };
 
 struct pci_dev;
-struct msi_msg;
 
 struct x86_msi_ops {
 	int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type);
-	void (*compose_msi_msg)(struct pci_dev *dev, unsigned int irq,
-				unsigned int dest, struct msi_msg *msg,
-			       u8 hpet_id);
 	void (*teardown_msi_irq)(unsigned int irq);
 	void (*teardown_msi_irqs)(struct pci_dev *dev);
 	void (*restore_msi_irqs)(struct pci_dev *dev);
-	int  (*setup_hpet_msi)(unsigned int irq, unsigned int id);
 };
 
-struct IO_APIC_route_entry;
-struct io_apic_irq_attr;
-struct irq_data;
-struct cpumask;
-
 struct x86_io_apic_ops {
-	void		(*init)   (void);
 	unsigned int	(*read)   (unsigned int apic, unsigned int reg);
-	void		(*write)  (unsigned int apic, unsigned int reg, unsigned int value);
-	void		(*modify) (unsigned int apic, unsigned int reg, unsigned int value);
 	void		(*disable)(void);
-	void		(*print_entries)(unsigned int apic, unsigned int nr_entries);
-	int		(*set_affinity)(struct irq_data *data,
-					const struct cpumask *mask,
-					bool force);
-	int		(*setup_entry)(int irq, struct IO_APIC_route_entry *entry,
-				       unsigned int destination, int vector,
-				       struct io_apic_irq_attr *attr);
-	void		(*eoi_ioapic_pin)(int apic, int pin, int vector);
 };
 
 extern struct x86_init_ops x86_init;
diff --git a/arch/x86/include/asm/xcr.h b/arch/x86/include/asm/xcr.h
deleted file mode 100644
index f2cba4e79a239f..00000000000000
--- a/arch/x86/include/asm/xcr.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* -*- linux-c -*- ------------------------------------------------------- *
- *
- *   Copyright 2008 rPath, Inc. - All Rights Reserved
- *
- *   This file is part of the Linux kernel, and is made available under
- *   the terms of the GNU General Public License version 2 or (at your
- *   option) any later version; incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
-
-/*
- * asm-x86/xcr.h
- *
- * Definitions for the eXtended Control Register instructions
- */
-
-#ifndef _ASM_X86_XCR_H
-#define _ASM_X86_XCR_H
-
-#define XCR_XFEATURE_ENABLED_MASK	0x00000000
-
-#ifdef __KERNEL__
-# ifndef __ASSEMBLY__
-
-#include <linux/types.h>
-
-static inline u64 xgetbv(u32 index)
-{
-	u32 eax, edx;
-
-	asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */
-		     : "=a" (eax), "=d" (edx)
-		     : "c" (index));
-	return eax + ((u64)edx << 32);
-}
-
-static inline void xsetbv(u32 index, u64 value)
-{
-	u32 eax = value;
-	u32 edx = value >> 32;
-
-	asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */
-		     : : "a" (eax), "d" (edx), "c" (index));
-}
-
-# endif /* __ASSEMBLY__ */
-#endif /* __KERNEL__ */
-
-#endif /* _ASM_X86_XCR_H */
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index d8829751b3f895..1f5c5161ead682 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -36,7 +36,7 @@
  * no advantages to be gotten from x86-64 here anyways.
  */
 
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 
 #ifdef CONFIG_X86_32
 /* reduce register pressure */
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index ce05722e3c68bc..5a08bc8bff3393 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -26,7 +26,7 @@
 #define XO3(x, y)	"       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
 #define XO4(x, y)	"       pxor   8*("#x")(%5), %%mm"#y"   ;\n"
 
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 
 static void
 xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
index 492b29802f571b..7c0a517ec7511a 100644
--- a/arch/x86/include/asm/xor_avx.h
+++ b/arch/x86/include/asm/xor_avx.h
@@ -18,7 +18,7 @@
 #ifdef CONFIG_AS_AVX
 
 #include <linux/compiler.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 
 #define BLOCK4(i) \
 		BLOCK(32 * i, 0) \
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
deleted file mode 100644
index c9a6d68b8d623b..00000000000000
--- a/arch/x86/include/asm/xsave.h
+++ /dev/null
@@ -1,257 +0,0 @@
-#ifndef __ASM_X86_XSAVE_H
-#define __ASM_X86_XSAVE_H
-
-#include <linux/types.h>
-#include <asm/processor.h>
-
-#define XSTATE_CPUID		0x0000000d
-
-#define XSTATE_FP		0x1
-#define XSTATE_SSE		0x2
-#define XSTATE_YMM		0x4
-#define XSTATE_BNDREGS		0x8
-#define XSTATE_BNDCSR		0x10
-#define XSTATE_OPMASK		0x20
-#define XSTATE_ZMM_Hi256	0x40
-#define XSTATE_Hi16_ZMM		0x80
-
-#define XSTATE_FPSSE	(XSTATE_FP | XSTATE_SSE)
-#define XSTATE_AVX512	(XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
-/* Bit 63 of XCR0 is reserved for future expansion */
-#define XSTATE_EXTEND_MASK	(~(XSTATE_FPSSE | (1ULL << 63)))
-
-#define FXSAVE_SIZE	512
-
-#define XSAVE_HDR_SIZE	    64
-#define XSAVE_HDR_OFFSET    FXSAVE_SIZE
-
-#define XSAVE_YMM_SIZE	    256
-#define XSAVE_YMM_OFFSET    (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
-
-/* Supported features which support lazy state saving */
-#define XSTATE_LAZY	(XSTATE_FP | XSTATE_SSE | XSTATE_YMM		      \
-			| XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
-
-/* Supported features which require eager state saving */
-#define XSTATE_EAGER	(XSTATE_BNDREGS | XSTATE_BNDCSR)
-
-/* All currently supported features */
-#define XCNTXT_MASK	(XSTATE_LAZY | XSTATE_EAGER)
-
-#ifdef CONFIG_X86_64
-#define REX_PREFIX	"0x48, "
-#else
-#define REX_PREFIX
-#endif
-
-extern unsigned int xstate_size;
-extern u64 pcntxt_mask;
-extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
-extern struct xsave_struct *init_xstate_buf;
-
-extern void xsave_init(void);
-extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
-extern int init_fpu(struct task_struct *child);
-
-/* These macros all use (%edi)/(%rdi) as the single memory argument. */
-#define XSAVE		".byte " REX_PREFIX "0x0f,0xae,0x27"
-#define XSAVEOPT	".byte " REX_PREFIX "0x0f,0xae,0x37"
-#define XSAVES		".byte " REX_PREFIX "0x0f,0xc7,0x2f"
-#define XRSTOR		".byte " REX_PREFIX "0x0f,0xae,0x2f"
-#define XRSTORS		".byte " REX_PREFIX "0x0f,0xc7,0x1f"
-
-#define xstate_fault	".section .fixup,\"ax\"\n"	\
-			"3:  movl $-1,%[err]\n"		\
-			"    jmp  2b\n"			\
-			".previous\n"			\
-			_ASM_EXTABLE(1b, 3b)		\
-			: [err] "=r" (err)
-
-/*
- * This function is called only during boot time when x86 caps are not set
- * up and alternative can not be used yet.
- */
-static inline int xsave_state_booting(struct xsave_struct *fx, u64 mask)
-{
-	u32 lmask = mask;
-	u32 hmask = mask >> 32;
-	int err = 0;
-
-	WARN_ON(system_state != SYSTEM_BOOTING);
-
-	if (boot_cpu_has(X86_FEATURE_XSAVES))
-		asm volatile("1:"XSAVES"\n\t"
-			"2:\n\t"
-			     xstate_fault
-			: "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
-			:   "memory");
-	else
-		asm volatile("1:"XSAVE"\n\t"
-			"2:\n\t"
-			     xstate_fault
-			: "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
-			:   "memory");
-	return err;
-}
-
-/*
- * This function is called only during boot time when x86 caps are not set
- * up and alternative can not be used yet.
- */
-static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask)
-{
-	u32 lmask = mask;
-	u32 hmask = mask >> 32;
-	int err = 0;
-
-	WARN_ON(system_state != SYSTEM_BOOTING);
-
-	if (boot_cpu_has(X86_FEATURE_XSAVES))
-		asm volatile("1:"XRSTORS"\n\t"
-			"2:\n\t"
-			     xstate_fault
-			: "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
-			:   "memory");
-	else
-		asm volatile("1:"XRSTOR"\n\t"
-			"2:\n\t"
-			     xstate_fault
-			: "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
-			:   "memory");
-	return err;
-}
-
-/*
- * Save processor xstate to xsave area.
- */
-static inline int xsave_state(struct xsave_struct *fx, u64 mask)
-{
-	u32 lmask = mask;
-	u32 hmask = mask >> 32;
-	int err = 0;
-
-	/*
-	 * If xsaves is enabled, xsaves replaces xsaveopt because
-	 * it supports compact format and supervisor states in addition to
-	 * modified optimization in xsaveopt.
-	 *
-	 * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave
-	 * because xsaveopt supports modified optimization which is not
-	 * supported by xsave.
-	 *
-	 * If none of xsaves and xsaveopt is enabled, use xsave.
-	 */
-	alternative_input_2(
-		"1:"XSAVE,
-		XSAVEOPT,
-		X86_FEATURE_XSAVEOPT,
-		XSAVES,
-		X86_FEATURE_XSAVES,
-		[fx] "D" (fx), "a" (lmask), "d" (hmask) :
-		"memory");
-	asm volatile("2:\n\t"
-		     xstate_fault
-		     : "0" (0)
-		     : "memory");
-
-	return err;
-}
-
-/*
- * Restore processor xstate from xsave area.
- */
-static inline int xrstor_state(struct xsave_struct *fx, u64 mask)
-{
-	int err = 0;
-	u32 lmask = mask;
-	u32 hmask = mask >> 32;
-
-	/*
-	 * Use xrstors to restore context if it is enabled. xrstors supports
-	 * compacted format of xsave area which is not supported by xrstor.
-	 */
-	alternative_input(
-		"1: " XRSTOR,
-		XRSTORS,
-		X86_FEATURE_XSAVES,
-		"D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
-		: "memory");
-
-	asm volatile("2:\n"
-		     xstate_fault
-		     : "0" (0)
-		     : "memory");
-
-	return err;
-}
-
-/*
- * Save xstate context for old process during context switch.
- */
-static inline void fpu_xsave(struct fpu *fpu)
-{
-	xsave_state(&fpu->state->xsave, -1);
-}
-
-/*
- * Restore xstate context for new process during context switch.
- */
-static inline int fpu_xrstor_checking(struct xsave_struct *fx)
-{
-	return xrstor_state(fx, -1);
-}
-
-/*
- * Save xstate to user space xsave area.
- *
- * We don't use modified optimization because xrstor/xrstors might track
- * a different application.
- *
- * We don't use compacted format xsave area for
- * backward compatibility for old applications which don't understand
- * compacted format of xsave area.
- */
-static inline int xsave_user(struct xsave_struct __user *buf)
-{
-	int err;
-
-	/*
-	 * Clear the xsave header first, so that reserved fields are
-	 * initialized to zero.
-	 */
-	err = __clear_user(&buf->xsave_hdr, sizeof(buf->xsave_hdr));
-	if (unlikely(err))
-		return -EFAULT;
-
-	__asm__ __volatile__(ASM_STAC "\n"
-			     "1:"XSAVE"\n"
-			     "2: " ASM_CLAC "\n"
-			     xstate_fault
-			     : "D" (buf), "a" (-1), "d" (-1), "0" (0)
-			     : "memory");
-	return err;
-}
-
-/*
- * Restore xstate from user space xsave area.
- */
-static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask)
-{
-	int err = 0;
-	struct xsave_struct *xstate = ((__force struct xsave_struct *)buf);
-	u32 lmask = mask;
-	u32 hmask = mask >> 32;
-
-	__asm__ __volatile__(ASM_STAC "\n"
-			     "1:"XRSTOR"\n"
-			     "2: " ASM_CLAC "\n"
-			     xstate_fault
-			     : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0)
-			     : "memory");	/* memory required? */
-	return err;
-}
-
-void *get_xsave_addr(struct xsave_struct *xsave, int xstate);
-void setup_xstate_comp(void);
-
-#endif
diff --git a/arch/x86/include/uapi/asm/mtrr.h b/arch/x86/include/uapi/asm/mtrr.h
index d0acb658c8f439..7528dcf5969165 100644
--- a/arch/x86/include/uapi/asm/mtrr.h
+++ b/arch/x86/include/uapi/asm/mtrr.h
@@ -103,7 +103,7 @@ struct mtrr_state_type {
 #define MTRRIOC_GET_PAGE_ENTRY   _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry)
 #define MTRRIOC_KILL_PAGE_ENTRY  _IOW(MTRR_IOCTL_BASE,  9, struct mtrr_sentry)
 
-/*  These are the region types  */
+/* MTRR memory types, which are defined in SDM */
 #define MTRR_TYPE_UNCACHABLE 0
 #define MTRR_TYPE_WRCOMB     1
 /*#define MTRR_TYPE_         2*/
@@ -113,5 +113,11 @@ struct mtrr_state_type {
 #define MTRR_TYPE_WRBACK     6
 #define MTRR_NUM_TYPES       7
 
+/*
+ * Invalid MTRR memory type.  mtrr_type_lookup() returns this value when
+ * MTRRs are disabled.  Note, this value is allocated from the reserved
+ * values (0x7-0xff) of the MTRR memory types.
+ */
+#define MTRR_TYPE_INVALID    0xff
 
 #endif /* _UAPI_ASM_X86_MTRR_H */
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h
index 16dc4e8a2cd348..0e8a973de9ee8a 100644
--- a/arch/x86/include/uapi/asm/sigcontext.h
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -25,7 +25,7 @@ struct _fpx_sw_bytes {
 	__u32 extended_size;	/* total size of the layout referred by
 				 * fpstate pointer in the sigcontext.
 				 */
-	__u64 xstate_bv;
+	__u64 xfeatures;
 				/* feature bit mask (including fp/sse/extended
 				 * state) that is present in the memory
 				 * layout.
@@ -209,8 +209,8 @@ struct sigcontext {
 
 #endif /* !__i386__ */
 
-struct _xsave_hdr {
-	__u64 xstate_bv;
+struct _header {
+	__u64 xfeatures;
 	__u64 reserved1[2];
 	__u64 reserved2[5];
 };
@@ -228,7 +228,7 @@ struct _ymmh_state {
  */
 struct _xstate {
 	struct _fpstate fpstate;
-	struct _xsave_hdr xstate_hdr;
+	struct _header xstate_hdr;
 	struct _ymmh_state ymmh;
 	/* new processor state extensions go here */
 };
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 9bcd0b56ca1775..febaf180621b12 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -44,7 +44,7 @@ obj-y			+= pci-iommu_table.o
 obj-y			+= resource.o
 
 obj-y				+= process.o
-obj-y				+= i387.o xsave.o
+obj-y				+= fpu/
 obj-y				+= ptrace.o
 obj-$(CONFIG_X86_32)		+= tls.o
 obj-$(CONFIG_IA32_EMULATION)	+= tls.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index dbe76a14c3c990..e49ee24da85e17 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -31,12 +31,12 @@
 #include <linux/module.h>
 #include <linux/dmi.h>
 #include <linux/irq.h>
-#include <linux/irqdomain.h>
 #include <linux/slab.h>
 #include <linux/bootmem.h>
 #include <linux/ioport.h>
 #include <linux/pci.h>
 
+#include <asm/irqdomain.h>
 #include <asm/pci_x86.h>
 #include <asm/pgtable.h>
 #include <asm/io_apic.h>
@@ -400,57 +400,13 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
 	return 0;
 }
 
-static int mp_register_gsi(struct device *dev, u32 gsi, int trigger,
-			   int polarity)
-{
-	int irq, node;
-
-	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
-		return gsi;
-
-	trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1;
-	polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1;
-	node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
-	if (mp_set_gsi_attr(gsi, trigger, polarity, node)) {
-		pr_warn("Failed to set pin attr for GSI%d\n", gsi);
-		return -1;
-	}
-
-	irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC);
-	if (irq < 0)
-		return irq;
-
-	/* Don't set up the ACPI SCI because it's already set up */
-	if (enable_update_mptable && acpi_gbl_FADT.sci_interrupt != gsi)
-		mp_config_acpi_gsi(dev, gsi, trigger, polarity);
-
-	return irq;
-}
-
-static void mp_unregister_gsi(u32 gsi)
-{
-	int irq;
-
-	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
-		return;
-
-	irq = mp_map_gsi_to_irq(gsi, 0);
-	if (irq > 0)
-		mp_unmap_irq(irq);
-}
-
-static struct irq_domain_ops acpi_irqdomain_ops = {
-	.map = mp_irqdomain_map,
-	.unmap = mp_irqdomain_unmap,
-};
-
 static int __init
 acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
 {
 	struct acpi_madt_io_apic *ioapic = NULL;
 	struct ioapic_domain_cfg cfg = {
 		.type = IOAPIC_DOMAIN_DYNAMIC,
-		.ops = &acpi_irqdomain_ops,
+		.ops = &mp_ioapic_irqdomain_ops,
 	};
 
 	ioapic = (struct acpi_madt_io_apic *)header;
@@ -652,7 +608,7 @@ static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
 	 * Make sure all (legacy) PCI IRQs are set as level-triggered.
 	 */
 	if (trigger == ACPI_LEVEL_SENSITIVE)
-		eisa_set_level_irq(gsi);
+		elcr_set_level_irq(gsi);
 #endif
 
 	return gsi;
@@ -663,10 +619,21 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
 				    int trigger, int polarity)
 {
 	int irq = gsi;
-
 #ifdef CONFIG_X86_IO_APIC
+	int node;
+	struct irq_alloc_info info;
+
+	node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
+	trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1;
+	polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1;
+	ioapic_set_alloc_attr(&info, node, trigger, polarity);
+
 	mutex_lock(&acpi_ioapic_lock);
-	irq = mp_register_gsi(dev, gsi, trigger, polarity);
+	irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info);
+	/* Don't set up the ACPI SCI because it's already set up */
+	if (irq >= 0 && enable_update_mptable &&
+	    acpi_gbl_FADT.sci_interrupt != gsi)
+		mp_config_acpi_gsi(dev, gsi, trigger, polarity);
 	mutex_unlock(&acpi_ioapic_lock);
 #endif
 
@@ -676,8 +643,12 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
 static void acpi_unregister_gsi_ioapic(u32 gsi)
 {
 #ifdef CONFIG_X86_IO_APIC
+	int irq;
+
 	mutex_lock(&acpi_ioapic_lock);
-	mp_unregister_gsi(gsi);
+	irq = mp_map_gsi_to_irq(gsi, 0, NULL);
+	if (irq > 0)
+		mp_unmap_irq(irq);
 	mutex_unlock(&acpi_ioapic_lock);
 #endif
 }
@@ -786,7 +757,7 @@ int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
 	u64 addr;
 	struct ioapic_domain_cfg cfg = {
 		.type = IOAPIC_DOMAIN_DYNAMIC,
-		.ops = &acpi_irqdomain_ops,
+		.ops = &mp_ioapic_irqdomain_ops,
 	};
 
 	ioapic_id = acpi_get_ioapic_id(handle, gsi_base, &addr);
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index ae693b51ed8ed5..8c35df4681041e 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -62,7 +62,7 @@ ENTRY(do_suspend_lowlevel)
 	pushfq
 	popq	pt_regs_flags(%rax)
 
-	movq	$resume_point, saved_rip(%rip)
+	movq	$.Lresume_point, saved_rip(%rip)
 
 	movq	%rsp, saved_rsp
 	movq	%rbp, saved_rbp
@@ -75,10 +75,10 @@ ENTRY(do_suspend_lowlevel)
 	xorl	%eax, %eax
 	call	x86_acpi_enter_sleep_state
 	/* in case something went wrong, restore the machine status and go on */
-	jmp	resume_point
+	jmp	.Lresume_point
 
 	.align 4
-resume_point:
+.Lresume_point:
 	/* We don't restore %rax, it must be 0 anyway */
 	movq	$saved_context, %rax
 	movq	saved_context_cr4(%rax), %rbx
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index aef65319316065..c42827eb86cf0c 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -21,6 +21,10 @@
 #include <asm/io.h>
 #include <asm/fixmap.h>
 
+int __read_mostly alternatives_patched;
+
+EXPORT_SYMBOL_GPL(alternatives_patched);
+
 #define MAX_PATCH_LEN (255-1)
 
 static int __initdata_or_module debug_alternative;
@@ -227,6 +231,15 @@ void __init arch_init_ideal_nops(void)
 #endif
 		}
 		break;
+
+	case X86_VENDOR_AMD:
+		if (boot_cpu_data.x86 > 0xf) {
+			ideal_nops = p6_nops;
+			return;
+		}
+
+		/* fall through */
+
 	default:
 #ifdef CONFIG_X86_64
 		ideal_nops = k8_nops;
@@ -627,6 +640,7 @@ void __init alternative_instructions(void)
 	apply_paravirt(__parainstructions, __parainstructions_end);
 
 	restart_nmi();
+	alternatives_patched = 1;
 }
 
 /**
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 5caed1dd7ccf89..29fa475ec51823 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -89,9 +89,7 @@ int amd_cache_northbridges(void)
 			next_northbridge(link, amd_nb_link_ids);
 	}
 
-	/* GART present only on Fam15h upto model 0fh */
-	if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
-	    (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10))
+	if (amd_gart_present())
 		amd_northbridges.flags |= AMD_NB_GART;
 
 	/*
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 6a7c23ff21d3de..ede92c3364d327 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -171,10 +171,6 @@ static int __init apbt_clockevent_register(void)
 
 static void apbt_setup_irq(struct apbt_dev *adev)
 {
-	/* timer0 irq has been setup early */
-	if (adev->irq == 0)
-		return;
-
 	irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
 	irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
 }
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 76164e173a24f7..6e85f713641dda 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -262,6 +262,9 @@ void __init early_gart_iommu_check(void)
 	u64 aper_base = 0, last_aper_base = 0;
 	int aper_enabled = 0, last_aper_enabled = 0, last_valid = 0;
 
+	if (!amd_gart_present())
+		return;
+
 	if (!early_pci_allowed())
 		return;
 
@@ -355,6 +358,9 @@ int __init gart_iommu_hole_init(void)
 	int fix, slot, valid_agp = 0;
 	int i, node;
 
+	if (!amd_gart_present())
+		return -ENODEV;
+
 	if (gart_iommu_aperture_disabled || !fix_aperture ||
 	    !early_pci_allowed())
 		return -ENODEV;
@@ -452,7 +458,7 @@ out:
 		   force_iommu ||
 		   valid_agp ||
 		   fallback_aper_force) {
-		pr_info("Your BIOS doesn't leave a aperture memory hole\n");
+		pr_info("Your BIOS doesn't leave an aperture memory hole\n");
 		pr_info("Please enable the IOMMU option in the BIOS setup\n");
 		pr_info("This costs you %dMB of RAM\n",
 			32 << fallback_aper_order);
diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c
index 816f36e979ad03..ae50d3454d7874 100644
--- a/arch/x86/kernel/apic/htirq.c
+++ b/arch/x86/kernel/apic/htirq.c
@@ -3,6 +3,8 @@
  *
  * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
  *	Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu <jiang.liu@linux.intel.com>
+ *	Add support of hierarchical irqdomain
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -14,78 +16,112 @@
 #include <linux/device.h>
 #include <linux/pci.h>
 #include <linux/htirq.h>
+#include <asm/irqdomain.h>
 #include <asm/hw_irq.h>
 #include <asm/apic.h>
 #include <asm/hypertransport.h>
 
+static struct irq_domain *htirq_domain;
+
 /*
  * Hypertransport interrupt support
  */
-static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
-{
-	struct ht_irq_msg msg;
-
-	fetch_ht_irq_msg(irq, &msg);
-
-	msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK);
-	msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
-
-	msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
-	msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
-
-	write_ht_irq_msg(irq, &msg);
-}
-
 static int
 ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
-	unsigned int dest;
+	struct irq_data *parent = data->parent_data;
 	int ret;
 
-	ret = apic_set_affinity(data, mask, &dest);
-	if (ret)
-		return ret;
-
-	target_ht_irq(data->irq, dest, cfg->vector);
-	return IRQ_SET_MASK_OK_NOCOPY;
+	ret = parent->chip->irq_set_affinity(parent, mask, force);
+	if (ret >= 0) {
+		struct ht_irq_msg msg;
+		struct irq_cfg *cfg = irqd_cfg(data);
+
+		fetch_ht_irq_msg(data->irq, &msg);
+		msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK |
+				    HT_IRQ_LOW_DEST_ID_MASK);
+		msg.address_lo |= HT_IRQ_LOW_VECTOR(cfg->vector) |
+				  HT_IRQ_LOW_DEST_ID(cfg->dest_apicid);
+		msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
+		msg.address_hi |= HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid);
+		write_ht_irq_msg(data->irq, &msg);
+	}
+
+	return ret;
 }
 
 static struct irq_chip ht_irq_chip = {
 	.name			= "PCI-HT",
 	.irq_mask		= mask_ht_irq,
 	.irq_unmask		= unmask_ht_irq,
-	.irq_ack		= apic_ack_edge,
+	.irq_ack		= irq_chip_ack_parent,
 	.irq_set_affinity	= ht_set_affinity,
-	.irq_retrigger		= apic_retrigger_irq,
+	.irq_retrigger		= irq_chip_retrigger_hierarchy,
 	.flags			= IRQCHIP_SKIP_SET_WAKE,
 };
 
-int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
+static int htirq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+			      unsigned int nr_irqs, void *arg)
 {
-	struct irq_cfg *cfg;
-	struct ht_irq_msg msg;
-	unsigned dest;
-	int err;
+	struct ht_irq_cfg *ht_cfg;
+	struct irq_alloc_info *info = arg;
+	struct pci_dev *dev;
+	irq_hw_number_t hwirq;
+	int ret;
 
-	if (disable_apic)
-		return -ENXIO;
+	if (nr_irqs > 1 || !info)
+		return -EINVAL;
 
-	cfg = irq_cfg(irq);
-	err = assign_irq_vector(irq, cfg, apic->target_cpus());
-	if (err)
-		return err;
+	dev = info->ht_dev;
+	hwirq = (info->ht_idx & 0xFF) |
+		PCI_DEVID(dev->bus->number, dev->devfn) << 8 |
+		(pci_domain_nr(dev->bus) & 0xFFFFFFFF) << 24;
+	if (irq_find_mapping(domain, hwirq) > 0)
+		return -EEXIST;
 
-	err = apic->cpu_mask_to_apicid_and(cfg->domain,
-					   apic->target_cpus(), &dest);
-	if (err)
-		return err;
+	ht_cfg = kmalloc(sizeof(*ht_cfg), GFP_KERNEL);
+	if (!ht_cfg)
+		return -ENOMEM;
 
-	msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
+	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
+	if (ret < 0) {
+		kfree(ht_cfg);
+		return ret;
+	}
+
+	/* Initialize msg to a value that will never match the first write. */
+	ht_cfg->msg.address_lo = 0xffffffff;
+	ht_cfg->msg.address_hi = 0xffffffff;
+	ht_cfg->dev = info->ht_dev;
+	ht_cfg->update = info->ht_update;
+	ht_cfg->pos = info->ht_pos;
+	ht_cfg->idx = 0x10 + (info->ht_idx * 2);
+	irq_domain_set_info(domain, virq, hwirq, &ht_irq_chip, ht_cfg,
+			    handle_edge_irq, ht_cfg, "edge");
+
+	return 0;
+}
+
+static void htirq_domain_free(struct irq_domain *domain, unsigned int virq,
+			      unsigned int nr_irqs)
+{
+	struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
+
+	BUG_ON(nr_irqs != 1);
+	kfree(irq_data->chip_data);
+	irq_domain_free_irqs_top(domain, virq, nr_irqs);
+}
 
+static void htirq_domain_activate(struct irq_domain *domain,
+				  struct irq_data *irq_data)
+{
+	struct ht_irq_msg msg;
+	struct irq_cfg *cfg = irqd_cfg(irq_data);
+
+	msg.address_hi = HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid);
 	msg.address_lo =
 		HT_IRQ_LOW_BASE |
-		HT_IRQ_LOW_DEST_ID(dest) |
+		HT_IRQ_LOW_DEST_ID(cfg->dest_apicid) |
 		HT_IRQ_LOW_VECTOR(cfg->vector) |
 		((apic->irq_dest_mode == 0) ?
 			HT_IRQ_LOW_DM_PHYSICAL :
@@ -95,13 +131,56 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 			HT_IRQ_LOW_MT_FIXED :
 			HT_IRQ_LOW_MT_ARBITRATED) |
 		HT_IRQ_LOW_IRQ_MASKED;
+	write_ht_irq_msg(irq_data->irq, &msg);
+}
 
-	write_ht_irq_msg(irq, &msg);
+static void htirq_domain_deactivate(struct irq_domain *domain,
+				    struct irq_data *irq_data)
+{
+	struct ht_irq_msg msg;
 
-	irq_set_chip_and_handler_name(irq, &ht_irq_chip,
-				      handle_edge_irq, "edge");
+	memset(&msg, 0, sizeof(msg));
+	write_ht_irq_msg(irq_data->irq, &msg);
+}
 
-	dev_dbg(&dev->dev, "irq %d for HT\n", irq);
+static const struct irq_domain_ops htirq_domain_ops = {
+	.alloc		= htirq_domain_alloc,
+	.free		= htirq_domain_free,
+	.activate	= htirq_domain_activate,
+	.deactivate	= htirq_domain_deactivate,
+};
 
-	return 0;
+void arch_init_htirq_domain(struct irq_domain *parent)
+{
+	if (disable_apic)
+		return;
+
+	htirq_domain = irq_domain_add_tree(NULL, &htirq_domain_ops, NULL);
+	if (!htirq_domain)
+		pr_warn("failed to initialize irqdomain for HTIRQ.\n");
+	else
+		htirq_domain->parent = parent;
+}
+
+int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev,
+		      ht_irq_update_t *update)
+{
+	struct irq_alloc_info info;
+
+	if (!htirq_domain)
+		return -ENOSYS;
+
+	init_irq_alloc_info(&info, NULL);
+	info.ht_idx = idx;
+	info.ht_pos = pos;
+	info.ht_dev = dev;
+	info.ht_update = update;
+
+	return irq_domain_alloc_irqs(htirq_domain, 1, dev_to_node(&dev->dev),
+				     &info);
+}
+
+void arch_teardown_ht_irq(unsigned int irq)
+{
+	irq_domain_free_irqs(irq, 1);
 }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index f4dc2462a1ac41..845dc0df200247 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -18,6 +18,16 @@
  *					and Rolf G. Tews
  *					for testing these extensively
  *	Paul Diefenbaugh	:	Added full ACPI support
+ *
+ * Historical information which is worth to be preserved:
+ *
+ * - SiS APIC rmw bug:
+ *
+ *	We used to have a workaround for a bug in SiS chips which
+ *	required to rewrite the index register for a read-modify-write
+ *	operation as the chip lost the index information which was
+ *	setup for the read already. We cache the data now, so that
+ *	workaround has been removed.
  */
 
 #include <linux/mm.h>
@@ -31,13 +41,13 @@
 #include <linux/acpi.h>
 #include <linux/module.h>
 #include <linux/syscore_ops.h>
-#include <linux/irqdomain.h>
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/jiffies.h>	/* time_after() */
 #include <linux/slab.h>
 #include <linux/bootmem.h>
 
+#include <asm/irqdomain.h>
 #include <asm/idle.h>
 #include <asm/io.h>
 #include <asm/smp.h>
@@ -63,27 +73,31 @@
 #define	for_each_ioapic_pin(idx, pin)	\
 	for_each_ioapic((idx))		\
 		for_each_pin((idx), (pin))
-
 #define for_each_irq_pin(entry, head) \
 	list_for_each_entry(entry, &head, list)
 
-/*
- *      Is the SiS APIC rmw bug present ?
- *      -1 = don't know, 0 = no, 1 = yes
- */
-int sis_apic_bug = -1;
-
 static DEFINE_RAW_SPINLOCK(ioapic_lock);
 static DEFINE_MUTEX(ioapic_mutex);
 static unsigned int ioapic_dynirq_base;
 static int ioapic_initialized;
 
-struct mp_pin_info {
+struct irq_pin_list {
+	struct list_head list;
+	int apic, pin;
+};
+
+struct mp_chip_data {
+	struct list_head irq_2_pin;
+	struct IO_APIC_route_entry entry;
 	int trigger;
 	int polarity;
-	int node;
-	int set;
 	u32 count;
+	bool isa_irq;
+};
+
+struct mp_ioapic_gsi {
+	u32 gsi_base;
+	u32 gsi_end;
 };
 
 static struct ioapic {
@@ -101,7 +115,6 @@ static struct ioapic {
 	struct mp_ioapic_gsi  gsi_config;
 	struct ioapic_domain_cfg irqdomain_cfg;
 	struct irq_domain *irqdomain;
-	struct mp_pin_info *pin_info;
 	struct resource *iomem_res;
 } ioapics[MAX_IO_APICS];
 
@@ -117,7 +130,7 @@ unsigned int mpc_ioapic_addr(int ioapic_idx)
 	return ioapics[ioapic_idx].mp_config.apicaddr;
 }
 
-struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx)
+static inline struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx)
 {
 	return &ioapics[ioapic_idx].gsi_config;
 }
@@ -129,11 +142,16 @@ static inline int mp_ioapic_pin_count(int ioapic)
 	return gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1;
 }
 
-u32 mp_pin_to_gsi(int ioapic, int pin)
+static inline u32 mp_pin_to_gsi(int ioapic, int pin)
 {
 	return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin;
 }
 
+static inline bool mp_is_legacy_irq(int irq)
+{
+	return irq >= 0 && irq < nr_legacy_irqs();
+}
+
 /*
  * Initialize all legacy IRQs and all pins on the first IOAPIC
  * if we have legacy interrupt controller. Kernel boot option "pirq="
@@ -144,12 +162,7 @@ static inline int mp_init_irq_at_boot(int ioapic, int irq)
 	if (!nr_legacy_irqs())
 		return 0;
 
-	return ioapic == 0 || (irq >= 0 && irq < nr_legacy_irqs());
-}
-
-static inline struct mp_pin_info *mp_pin_info(int ioapic_idx, int pin)
-{
-	return ioapics[ioapic_idx].pin_info + pin;
+	return ioapic == 0 || mp_is_legacy_irq(irq);
 }
 
 static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic)
@@ -216,16 +229,6 @@ void mp_save_irq(struct mpc_intsrc *m)
 		panic("Max # of irq sources exceeded!!\n");
 }
 
-struct irq_pin_list {
-	struct list_head list;
-	int apic, pin;
-};
-
-static struct irq_pin_list *alloc_irq_pin_list(int node)
-{
-	return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node);
-}
-
 static void alloc_ioapic_saved_registers(int idx)
 {
 	size_t size;
@@ -247,8 +250,7 @@ static void free_ioapic_saved_registers(int idx)
 
 int __init arch_early_ioapic_init(void)
 {
-	struct irq_cfg *cfg;
-	int i, node = cpu_to_node(0);
+	int i;
 
 	if (!nr_legacy_irqs())
 		io_apic_irqs = ~0UL;
@@ -256,16 +258,6 @@ int __init arch_early_ioapic_init(void)
 	for_each_ioapic(i)
 		alloc_ioapic_saved_registers(i);
 
-	/*
-	 * For legacy IRQ's, start with assigning irq0 to irq15 to
-	 * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's.
-	 */
-	for (i = 0; i < nr_legacy_irqs(); i++) {
-		cfg = alloc_irq_and_cfg_at(i, node);
-		cfg->vector = IRQ0_VECTOR + i;
-		cpumask_setall(cfg->domain);
-	}
-
 	return 0;
 }
 
@@ -283,7 +275,7 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
 		+ (mpc_ioapic_addr(idx) & ~PAGE_MASK);
 }
 
-void io_apic_eoi(unsigned int apic, unsigned int vector)
+static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
 {
 	struct io_apic __iomem *io_apic = io_apic_base(apic);
 	writel(vector, &io_apic->eoi);
@@ -296,7 +288,8 @@ unsigned int native_io_apic_read(unsigned int apic, unsigned int reg)
 	return readl(&io_apic->data);
 }
 
-void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+static void io_apic_write(unsigned int apic, unsigned int reg,
+			  unsigned int value)
 {
 	struct io_apic __iomem *io_apic = io_apic_base(apic);
 
@@ -304,21 +297,6 @@ void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int valu
 	writel(value, &io_apic->data);
 }
 
-/*
- * Re-write a value: to be used for read-modify-write
- * cycles where the read already set up the index register.
- *
- * Older SiS APIC requires we rewrite the index register
- */
-void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
-{
-	struct io_apic __iomem *io_apic = io_apic_base(apic);
-
-	if (sis_apic_bug)
-		writel(reg, &io_apic->index);
-	writel(value, &io_apic->data);
-}
-
 union entry_union {
 	struct { u32 w1, w2; };
 	struct IO_APIC_route_entry entry;
@@ -378,7 +356,7 @@ static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
 static void ioapic_mask_entry(int apic, int pin)
 {
 	unsigned long flags;
-	union entry_union eu = { .entry.mask = 1 };
+	union entry_union eu = { .entry.mask = IOAPIC_MASKED };
 
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
 	io_apic_write(apic, 0x10 + 2*pin, eu.w1);
@@ -391,16 +369,17 @@ static void ioapic_mask_entry(int apic, int pin)
  * shared ISA-space IRQs, so we have to support them. We are super
  * fast in the common case, and fast for shared ISA-space IRQs.
  */
-static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+static int __add_pin_to_irq_node(struct mp_chip_data *data,
+				 int node, int apic, int pin)
 {
 	struct irq_pin_list *entry;
 
 	/* don't allow duplicates */
-	for_each_irq_pin(entry, cfg->irq_2_pin)
+	for_each_irq_pin(entry, data->irq_2_pin)
 		if (entry->apic == apic && entry->pin == pin)
 			return 0;
 
-	entry = alloc_irq_pin_list(node);
+	entry = kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node);
 	if (!entry) {
 		pr_err("can not alloc irq_pin_list (%d,%d,%d)\n",
 		       node, apic, pin);
@@ -408,16 +387,16 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi
 	}
 	entry->apic = apic;
 	entry->pin = pin;
+	list_add_tail(&entry->list, &data->irq_2_pin);
 
-	list_add_tail(&entry->list, &cfg->irq_2_pin);
 	return 0;
 }
 
-static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin)
+static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin)
 {
 	struct irq_pin_list *tmp, *entry;
 
-	list_for_each_entry_safe(entry, tmp, &cfg->irq_2_pin, list)
+	list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list)
 		if (entry->apic == apic && entry->pin == pin) {
 			list_del(&entry->list);
 			kfree(entry);
@@ -425,22 +404,23 @@ static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin)
 		}
 }
 
-static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+static void add_pin_to_irq_node(struct mp_chip_data *data,
+				int node, int apic, int pin)
 {
-	if (__add_pin_to_irq_node(cfg, node, apic, pin))
+	if (__add_pin_to_irq_node(data, node, apic, pin))
 		panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
 }
 
 /*
  * Reroute an IRQ to a different pin.
  */
-static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
+static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node,
 					   int oldapic, int oldpin,
 					   int newapic, int newpin)
 {
 	struct irq_pin_list *entry;
 
-	for_each_irq_pin(entry, cfg->irq_2_pin) {
+	for_each_irq_pin(entry, data->irq_2_pin) {
 		if (entry->apic == oldapic && entry->pin == oldpin) {
 			entry->apic = newapic;
 			entry->pin = newpin;
@@ -450,32 +430,26 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
 	}
 
 	/* old apic/pin didn't exist, so just add new ones */
-	add_pin_to_irq_node(cfg, node, newapic, newpin);
-}
-
-static void __io_apic_modify_irq(struct irq_pin_list *entry,
-				 int mask_and, int mask_or,
-				 void (*final)(struct irq_pin_list *entry))
-{
-	unsigned int reg, pin;
-
-	pin = entry->pin;
-	reg = io_apic_read(entry->apic, 0x10 + pin * 2);
-	reg &= mask_and;
-	reg |= mask_or;
-	io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
-	if (final)
-		final(entry);
+	add_pin_to_irq_node(data, node, newapic, newpin);
 }
 
-static void io_apic_modify_irq(struct irq_cfg *cfg,
+static void io_apic_modify_irq(struct mp_chip_data *data,
 			       int mask_and, int mask_or,
 			       void (*final)(struct irq_pin_list *entry))
 {
+	union entry_union eu;
 	struct irq_pin_list *entry;
 
-	for_each_irq_pin(entry, cfg->irq_2_pin)
-		__io_apic_modify_irq(entry, mask_and, mask_or, final);
+	eu.entry = data->entry;
+	eu.w1 &= mask_and;
+	eu.w1 |= mask_or;
+	data->entry = eu.entry;
+
+	for_each_irq_pin(entry, data->irq_2_pin) {
+		io_apic_write(entry->apic, 0x10 + 2 * entry->pin, eu.w1);
+		if (final)
+			final(entry);
+	}
 }
 
 static void io_apic_sync(struct irq_pin_list *entry)
@@ -490,39 +464,31 @@ static void io_apic_sync(struct irq_pin_list *entry)
 	readl(&io_apic->data);
 }
 
-static void mask_ioapic(struct irq_cfg *cfg)
+static void mask_ioapic_irq(struct irq_data *irq_data)
 {
+	struct mp_chip_data *data = irq_data->chip_data;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+	io_apic_modify_irq(data, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-static void mask_ioapic_irq(struct irq_data *data)
+static void __unmask_ioapic(struct mp_chip_data *data)
 {
-	mask_ioapic(irqd_cfg(data));
+	io_apic_modify_irq(data, ~IO_APIC_REDIR_MASKED, 0, NULL);
 }
 
-static void __unmask_ioapic(struct irq_cfg *cfg)
-{
-	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
-}
-
-static void unmask_ioapic(struct irq_cfg *cfg)
+static void unmask_ioapic_irq(struct irq_data *irq_data)
 {
+	struct mp_chip_data *data = irq_data->chip_data;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	__unmask_ioapic(cfg);
+	__unmask_ioapic(data);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-static void unmask_ioapic_irq(struct irq_data *data)
-{
-	unmask_ioapic(irqd_cfg(data));
-}
-
 /*
  * IO-APIC versions below 0x20 don't support EOI register.
  * For the record, here is the information about various versions:
@@ -539,7 +505,7 @@ static void unmask_ioapic_irq(struct irq_data *data)
  * Otherwise, we simulate the EOI message manually by changing the trigger
  * mode to edge and then back to level, with RTE being masked during this.
  */
-void native_eoi_ioapic_pin(int apic, int pin, int vector)
+static void __eoi_ioapic_pin(int apic, int pin, int vector)
 {
 	if (mpc_ioapic_ver(apic) >= 0x20) {
 		io_apic_eoi(apic, vector);
@@ -551,7 +517,7 @@ void native_eoi_ioapic_pin(int apic, int pin, int vector)
 		/*
 		 * Mask the entry and change the trigger mode to edge.
 		 */
-		entry1.mask = 1;
+		entry1.mask = IOAPIC_MASKED;
 		entry1.trigger = IOAPIC_EDGE;
 
 		__ioapic_write_entry(apic, pin, entry1);
@@ -563,15 +529,14 @@ void native_eoi_ioapic_pin(int apic, int pin, int vector)
 	}
 }
 
-void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+void eoi_ioapic_pin(int vector, struct mp_chip_data *data)
 {
-	struct irq_pin_list *entry;
 	unsigned long flags;
+	struct irq_pin_list *entry;
 
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	for_each_irq_pin(entry, cfg->irq_2_pin)
-		x86_io_apic_ops.eoi_ioapic_pin(entry->apic, entry->pin,
-					       cfg->vector);
+	for_each_irq_pin(entry, data->irq_2_pin)
+		__eoi_ioapic_pin(entry->apic, entry->pin, vector);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
@@ -588,8 +553,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 	 * Make sure the entry is masked and re-read the contents to check
 	 * if it is a level triggered pin and if the remote-IRR is set.
 	 */
-	if (!entry.mask) {
-		entry.mask = 1;
+	if (entry.mask == IOAPIC_UNMASKED) {
+		entry.mask = IOAPIC_MASKED;
 		ioapic_write_entry(apic, pin, entry);
 		entry = ioapic_read_entry(apic, pin);
 	}
@@ -602,13 +567,12 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 		 * doesn't clear the remote-IRR if the trigger mode is not
 		 * set to level.
 		 */
-		if (!entry.trigger) {
+		if (entry.trigger == IOAPIC_EDGE) {
 			entry.trigger = IOAPIC_LEVEL;
 			ioapic_write_entry(apic, pin, entry);
 		}
-
 		raw_spin_lock_irqsave(&ioapic_lock, flags);
-		x86_io_apic_ops.eoi_ioapic_pin(apic, pin, entry.vector);
+		__eoi_ioapic_pin(apic, pin, entry.vector);
 		raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
 
@@ -706,8 +670,8 @@ void mask_ioapic_entries(void)
 			struct IO_APIC_route_entry entry;
 
 			entry = ioapics[apic].saved_registers[pin];
-			if (!entry.mask) {
-				entry.mask = 1;
+			if (entry.mask == IOAPIC_UNMASKED) {
+				entry.mask = IOAPIC_MASKED;
 				ioapic_write_entry(apic, pin, entry);
 			}
 		}
@@ -809,11 +773,11 @@ static int EISA_ELCR(unsigned int irq)
 
 #endif
 
-/* ISA interrupts are always polarity zero edge triggered,
+/* ISA interrupts are always active high edge triggered,
  * when listed as conforming in the MP table. */
 
-#define default_ISA_trigger(idx)	(0)
-#define default_ISA_polarity(idx)	(0)
+#define default_ISA_trigger(idx)	(IOAPIC_EDGE)
+#define default_ISA_polarity(idx)	(IOAPIC_POL_HIGH)
 
 /* EISA interrupts are always polarity zero and can be edge or level
  * trigger depending on the ELCR value.  If an interrupt is listed as
@@ -823,53 +787,55 @@ static int EISA_ELCR(unsigned int irq)
 #define default_EISA_trigger(idx)	(EISA_ELCR(mp_irqs[idx].srcbusirq))
 #define default_EISA_polarity(idx)	default_ISA_polarity(idx)
 
-/* PCI interrupts are always polarity one level triggered,
+/* PCI interrupts are always active low level triggered,
  * when listed as conforming in the MP table. */
 
-#define default_PCI_trigger(idx)	(1)
-#define default_PCI_polarity(idx)	(1)
+#define default_PCI_trigger(idx)	(IOAPIC_LEVEL)
+#define default_PCI_polarity(idx)	(IOAPIC_POL_LOW)
 
 static int irq_polarity(int idx)
 {
 	int bus = mp_irqs[idx].srcbus;
-	int polarity;
 
 	/*
 	 * Determine IRQ line polarity (high active or low active):
 	 */
-	switch (mp_irqs[idx].irqflag & 3)
-	{
-		case 0: /* conforms, ie. bus-type dependent polarity */
-			if (test_bit(bus, mp_bus_not_pci))
-				polarity = default_ISA_polarity(idx);
-			else
-				polarity = default_PCI_polarity(idx);
-			break;
-		case 1: /* high active */
-		{
-			polarity = 0;
-			break;
-		}
-		case 2: /* reserved */
-		{
-			pr_warn("broken BIOS!!\n");
-			polarity = 1;
-			break;
-		}
-		case 3: /* low active */
-		{
-			polarity = 1;
-			break;
-		}
-		default: /* invalid */
-		{
-			pr_warn("broken BIOS!!\n");
-			polarity = 1;
-			break;
-		}
+	switch (mp_irqs[idx].irqflag & 0x03) {
+	case 0:
+		/* conforms to spec, ie. bus-type dependent polarity */
+		if (test_bit(bus, mp_bus_not_pci))
+			return default_ISA_polarity(idx);
+		else
+			return default_PCI_polarity(idx);
+	case 1:
+		return IOAPIC_POL_HIGH;
+	case 2:
+		pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n");
+	case 3:
+	default: /* Pointless default required due to do gcc stupidity */
+		return IOAPIC_POL_LOW;
+	}
+}
+
+#ifdef CONFIG_EISA
+static int eisa_irq_trigger(int idx, int bus, int trigger)
+{
+	switch (mp_bus_id_to_type[bus]) {
+	case MP_BUS_PCI:
+	case MP_BUS_ISA:
+		return trigger;
+	case MP_BUS_EISA:
+		return default_EISA_trigger(idx);
 	}
-	return polarity;
+	pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus);
+	return IOAPIC_LEVEL;
 }
+#else
+static inline int eisa_irq_trigger(int idx, int bus, int trigger)
+{
+	return trigger;
+}
+#endif
 
 static int irq_trigger(int idx)
 {
@@ -879,153 +845,227 @@ static int irq_trigger(int idx)
 	/*
 	 * Determine IRQ trigger mode (edge or level sensitive):
 	 */
-	switch ((mp_irqs[idx].irqflag>>2) & 3)
-	{
-		case 0: /* conforms, ie. bus-type dependent */
-			if (test_bit(bus, mp_bus_not_pci))
-				trigger = default_ISA_trigger(idx);
-			else
-				trigger = default_PCI_trigger(idx);
-#ifdef CONFIG_EISA
-			switch (mp_bus_id_to_type[bus]) {
-				case MP_BUS_ISA: /* ISA pin */
-				{
-					/* set before the switch */
-					break;
-				}
-				case MP_BUS_EISA: /* EISA pin */
-				{
-					trigger = default_EISA_trigger(idx);
-					break;
-				}
-				case MP_BUS_PCI: /* PCI pin */
-				{
-					/* set before the switch */
-					break;
-				}
-				default:
-				{
-					pr_warn("broken BIOS!!\n");
-					trigger = 1;
-					break;
-				}
-			}
+	switch ((mp_irqs[idx].irqflag >> 2) & 0x03) {
+	case 0:
+		/* conforms to spec, ie. bus-type dependent trigger mode */
+		if (test_bit(bus, mp_bus_not_pci))
+			trigger = default_ISA_trigger(idx);
+		else
+			trigger = default_PCI_trigger(idx);
+		/* Take EISA into account */
+		return eisa_irq_trigger(idx, bus, trigger);
+	case 1:
+		return IOAPIC_EDGE;
+	case 2:
+		pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n");
+	case 3:
+	default: /* Pointless default required due to do gcc stupidity */
+		return IOAPIC_LEVEL;
+	}
+}
+
+void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node,
+			   int trigger, int polarity)
+{
+	init_irq_alloc_info(info, NULL);
+	info->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+	info->ioapic_node = node;
+	info->ioapic_trigger = trigger;
+	info->ioapic_polarity = polarity;
+	info->ioapic_valid = 1;
+}
+
+#ifndef CONFIG_ACPI
+int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity);
 #endif
-			break;
-		case 1: /* edge */
-		{
-			trigger = 0;
-			break;
-		}
-		case 2: /* reserved */
-		{
-			pr_warn("broken BIOS!!\n");
-			trigger = 1;
-			break;
-		}
-		case 3: /* level */
-		{
-			trigger = 1;
-			break;
-		}
-		default: /* invalid */
-		{
-			pr_warn("broken BIOS!!\n");
-			trigger = 0;
-			break;
+
+static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst,
+				   struct irq_alloc_info *src,
+				   u32 gsi, int ioapic_idx, int pin)
+{
+	int trigger, polarity;
+
+	copy_irq_alloc_info(dst, src);
+	dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+	dst->ioapic_id = mpc_ioapic_id(ioapic_idx);
+	dst->ioapic_pin = pin;
+	dst->ioapic_valid = 1;
+	if (src && src->ioapic_valid) {
+		dst->ioapic_node = src->ioapic_node;
+		dst->ioapic_trigger = src->ioapic_trigger;
+		dst->ioapic_polarity = src->ioapic_polarity;
+	} else {
+		dst->ioapic_node = NUMA_NO_NODE;
+		if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) {
+			dst->ioapic_trigger = trigger;
+			dst->ioapic_polarity = polarity;
+		} else {
+			/*
+			 * PCI interrupts are always active low level
+			 * triggered.
+			 */
+			dst->ioapic_trigger = IOAPIC_LEVEL;
+			dst->ioapic_polarity = IOAPIC_POL_LOW;
 		}
 	}
-	return trigger;
 }
 
-static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin)
+static int ioapic_alloc_attr_node(struct irq_alloc_info *info)
+{
+	return (info && info->ioapic_valid) ? info->ioapic_node : NUMA_NO_NODE;
+}
+
+static void mp_register_handler(unsigned int irq, unsigned long trigger)
+{
+	irq_flow_handler_t hdl;
+	bool fasteoi;
+
+	if (trigger) {
+		irq_set_status_flags(irq, IRQ_LEVEL);
+		fasteoi = true;
+	} else {
+		irq_clear_status_flags(irq, IRQ_LEVEL);
+		fasteoi = false;
+	}
+
+	hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
+	__irq_set_handler(irq, hdl, 0, fasteoi ? "fasteoi" : "edge");
+}
+
+static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info)
 {
+	struct mp_chip_data *data = irq_get_chip_data(irq);
+
+	/*
+	 * setup_IO_APIC_irqs() programs all legacy IRQs with default trigger
+	 * and polarity attirbutes. So allow the first user to reprogram the
+	 * pin with real trigger and polarity attributes.
+	 */
+	if (irq < nr_legacy_irqs() && data->count == 1) {
+		if (info->ioapic_trigger != data->trigger)
+			mp_register_handler(irq, data->trigger);
+		data->entry.trigger = data->trigger = info->ioapic_trigger;
+		data->entry.polarity = data->polarity = info->ioapic_polarity;
+	}
+
+	return data->trigger == info->ioapic_trigger &&
+	       data->polarity == info->ioapic_polarity;
+}
+
+static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
+				 struct irq_alloc_info *info)
+{
+	bool legacy = false;
 	int irq = -1;
-	int ioapic = (int)(long)domain->host_data;
 	int type = ioapics[ioapic].irqdomain_cfg.type;
 
 	switch (type) {
 	case IOAPIC_DOMAIN_LEGACY:
 		/*
-		 * Dynamically allocate IRQ number for non-ISA IRQs in the first 16
-		 * GSIs on some weird platforms.
+		 * Dynamically allocate IRQ number for non-ISA IRQs in the first
+		 * 16 GSIs on some weird platforms.
 		 */
-		if (gsi < nr_legacy_irqs())
-			irq = irq_create_mapping(domain, pin);
-		else if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0)
+		if (!ioapic_initialized || gsi >= nr_legacy_irqs())
 			irq = gsi;
+		legacy = mp_is_legacy_irq(irq);
 		break;
 	case IOAPIC_DOMAIN_STRICT:
-		if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0)
-			irq = gsi;
+		irq = gsi;
 		break;
 	case IOAPIC_DOMAIN_DYNAMIC:
-		irq = irq_create_mapping(domain, pin);
 		break;
 	default:
 		WARN(1, "ioapic: unknown irqdomain type %d\n", type);
-		break;
+		return -1;
+	}
+
+	return __irq_domain_alloc_irqs(domain, irq, 1,
+				       ioapic_alloc_attr_node(info),
+				       info, legacy);
+}
+
+/*
+ * Need special handling for ISA IRQs because there may be multiple IOAPIC pins
+ * sharing the same ISA IRQ number and irqdomain only supports 1:1 mapping
+ * between IOAPIC pin and IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are
+ * used for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H).
+ * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are available, and
+ * some BIOSes may use MP Interrupt Source records to override IRQ numbers for
+ * PIRQs instead of reprogramming the interrupt routing logic. Thus there may be
+ * multiple pins sharing the same legacy IRQ number when ACPI is disabled.
+ */
+static int alloc_isa_irq_from_domain(struct irq_domain *domain,
+				     int irq, int ioapic, int pin,
+				     struct irq_alloc_info *info)
+{
+	struct mp_chip_data *data;
+	struct irq_data *irq_data = irq_get_irq_data(irq);
+	int node = ioapic_alloc_attr_node(info);
+
+	/*
+	 * Legacy ISA IRQ has already been allocated, just add pin to
+	 * the pin list assoicated with this IRQ and program the IOAPIC
+	 * entry. The IOAPIC entry
+	 */
+	if (irq_data && irq_data->parent_data) {
+		if (!mp_check_pin_attr(irq, info))
+			return -EBUSY;
+		if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic,
+					  info->ioapic_pin))
+			return -ENOMEM;
+	} else {
+		irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true);
+		if (irq >= 0) {
+			irq_data = irq_domain_get_irq_data(domain, irq);
+			data = irq_data->chip_data;
+			data->isa_irq = true;
+		}
 	}
 
-	return irq > 0 ? irq : -1;
+	return irq;
 }
 
 static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
-			     unsigned int flags)
+			     unsigned int flags, struct irq_alloc_info *info)
 {
 	int irq;
+	bool legacy = false;
+	struct irq_alloc_info tmp;
+	struct mp_chip_data *data;
 	struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
-	struct mp_pin_info *info = mp_pin_info(ioapic, pin);
 
 	if (!domain)
-		return -1;
+		return -ENOSYS;
 
-	mutex_lock(&ioapic_mutex);
-
-	/*
-	 * Don't use irqdomain to manage ISA IRQs because there may be
-	 * multiple IOAPIC pins sharing the same ISA IRQ number and
-	 * irqdomain only supports 1:1 mapping between IOAPIC pin and
-	 * IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are used
-	 * for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H).
-	 * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are
-	 * available, and some BIOSes may use MP Interrupt Source records
-	 * to override IRQ numbers for PIRQs instead of reprogramming
-	 * the interrupt routing logic. Thus there may be multiple pins
-	 * sharing the same legacy IRQ number when ACPI is disabled.
-	 */
 	if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) {
 		irq = mp_irqs[idx].srcbusirq;
-		if (flags & IOAPIC_MAP_ALLOC) {
-			if (info->count == 0 &&
-			    mp_irqdomain_map(domain, irq, pin) != 0)
-				irq = -1;
+		legacy = mp_is_legacy_irq(irq);
+	}
 
-			/* special handling for timer IRQ0 */
+	mutex_lock(&ioapic_mutex);
+	if (!(flags & IOAPIC_MAP_ALLOC)) {
+		if (!legacy) {
+			irq = irq_find_mapping(domain, pin);
 			if (irq == 0)
-				info->count++;
+				irq = -ENOENT;
 		}
 	} else {
-		irq = irq_find_mapping(domain, pin);
-		if (irq <= 0 && (flags & IOAPIC_MAP_ALLOC))
-			irq = alloc_irq_from_domain(domain, gsi, pin);
-	}
-
-	if (flags & IOAPIC_MAP_ALLOC) {
-		/* special handling for legacy IRQs */
-		if (irq < nr_legacy_irqs() && info->count == 1 &&
-		    mp_irqdomain_map(domain, irq, pin) != 0)
-			irq = -1;
-
-		if (irq > 0)
-			info->count++;
-		else if (info->count == 0)
-			info->set = 0;
+		ioapic_copy_alloc_attr(&tmp, info, gsi, ioapic, pin);
+		if (legacy)
+			irq = alloc_isa_irq_from_domain(domain, irq,
+							ioapic, pin, &tmp);
+		else if ((irq = irq_find_mapping(domain, pin)) == 0)
+			irq = alloc_irq_from_domain(domain, ioapic, gsi, &tmp);
+		else if (!mp_check_pin_attr(irq, &tmp))
+			irq = -EBUSY;
+		if (irq >= 0) {
+			data = irq_get_chip_data(irq);
+			data->count++;
+		}
 	}
-
 	mutex_unlock(&ioapic_mutex);
 
-	return irq > 0 ? irq : -1;
+	return irq;
 }
 
 static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags)
@@ -1058,10 +1098,10 @@ static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags)
 	}
 #endif
 
-	return  mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags);
+	return  mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, NULL);
 }
 
-int mp_map_gsi_to_irq(u32 gsi, unsigned int flags)
+int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info)
 {
 	int ioapic, pin, idx;
 
@@ -1074,31 +1114,24 @@ int mp_map_gsi_to_irq(u32 gsi, unsigned int flags)
 	if ((flags & IOAPIC_MAP_CHECK) && idx < 0)
 		return -1;
 
-	return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags);
+	return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, info);
 }
 
 void mp_unmap_irq(int irq)
 {
-	struct irq_data *data = irq_get_irq_data(irq);
-	struct mp_pin_info *info;
-	int ioapic, pin;
+	struct irq_data *irq_data = irq_get_irq_data(irq);
+	struct mp_chip_data *data;
 
-	if (!data || !data->domain)
+	if (!irq_data || !irq_data->domain)
 		return;
 
-	ioapic = (int)(long)data->domain->host_data;
-	pin = (int)data->hwirq;
-	info = mp_pin_info(ioapic, pin);
+	data = irq_data->chip_data;
+	if (!data || data->isa_irq)
+		return;
 
 	mutex_lock(&ioapic_mutex);
-	if (--info->count == 0) {
-		info->set = 0;
-		if (irq < nr_legacy_irqs() &&
-		    ioapics[ioapic].irqdomain_cfg.type == IOAPIC_DOMAIN_LEGACY)
-			mp_irqdomain_unmap(data->domain, irq);
-		else
-			irq_dispose_mapping(irq);
-	}
+	if (--data->count == 0)
+		irq_domain_free_irqs(irq, 1);
 	mutex_unlock(&ioapic_mutex);
 }
 
@@ -1165,7 +1198,7 @@ out:
 }
 EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
 
-static struct irq_chip ioapic_chip;
+static struct irq_chip ioapic_chip, ioapic_ir_chip;
 
 #ifdef CONFIG_X86_32
 static inline int IO_APIC_irq_trigger(int irq)
@@ -1189,96 +1222,6 @@ static inline int IO_APIC_irq_trigger(int irq)
 }
 #endif
 
-static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
-				 unsigned long trigger)
-{
-	struct irq_chip *chip = &ioapic_chip;
-	irq_flow_handler_t hdl;
-	bool fasteoi;
-
-	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
-	    trigger == IOAPIC_LEVEL) {
-		irq_set_status_flags(irq, IRQ_LEVEL);
-		fasteoi = true;
-	} else {
-		irq_clear_status_flags(irq, IRQ_LEVEL);
-		fasteoi = false;
-	}
-
-	if (setup_remapped_irq(irq, cfg, chip))
-		fasteoi = trigger != 0;
-
-	hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
-	irq_set_chip_and_handler_name(irq, chip, hdl,
-				      fasteoi ? "fasteoi" : "edge");
-}
-
-int native_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
-			      unsigned int destination, int vector,
-			      struct io_apic_irq_attr *attr)
-{
-	memset(entry, 0, sizeof(*entry));
-
-	entry->delivery_mode = apic->irq_delivery_mode;
-	entry->dest_mode     = apic->irq_dest_mode;
-	entry->dest	     = destination;
-	entry->vector	     = vector;
-	entry->mask	     = 0;			/* enable IRQ */
-	entry->trigger	     = attr->trigger;
-	entry->polarity	     = attr->polarity;
-
-	/*
-	 * Mask level triggered irqs.
-	 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
-	 */
-	if (attr->trigger)
-		entry->mask = 1;
-
-	return 0;
-}
-
-static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
-				struct io_apic_irq_attr *attr)
-{
-	struct IO_APIC_route_entry entry;
-	unsigned int dest;
-
-	if (!IO_APIC_IRQ(irq))
-		return;
-
-	if (assign_irq_vector(irq, cfg, apic->target_cpus()))
-		return;
-
-	if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(),
-					 &dest)) {
-		pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n",
-			mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
-		clear_irq_vector(irq, cfg);
-
-		return;
-	}
-
-	apic_printk(APIC_VERBOSE,KERN_DEBUG
-		    "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
-		    "IRQ %d Mode:%i Active:%i Dest:%d)\n",
-		    attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin,
-		    cfg->vector, irq, attr->trigger, attr->polarity, dest);
-
-	if (x86_io_apic_ops.setup_entry(irq, &entry, dest, cfg->vector, attr)) {
-		pr_warn("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
-			mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
-		clear_irq_vector(irq, cfg);
-
-		return;
-	}
-
-	ioapic_register_intr(irq, cfg, attr->trigger);
-	if (irq < nr_legacy_irqs())
-		legacy_pic->mask(irq);
-
-	ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry);
-}
-
 static void __init setup_IO_APIC_irqs(void)
 {
 	unsigned int ioapic, pin;
@@ -1298,106 +1241,41 @@ static void __init setup_IO_APIC_irqs(void)
 	}
 }
 
-/*
- * Set up the timer pin, possibly with the 8259A-master behind.
- */
-static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
-					unsigned int pin, int vector)
-{
-	struct IO_APIC_route_entry entry;
-	unsigned int dest;
-
-	memset(&entry, 0, sizeof(entry));
-
-	/*
-	 * We use logical delivery to get the timer IRQ
-	 * to the first CPU.
-	 */
-	if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(),
-						  apic->target_cpus(), &dest)))
-		dest = BAD_APICID;
-
-	entry.dest_mode = apic->irq_dest_mode;
-	entry.mask = 0;			/* don't mask IRQ for edge */
-	entry.dest = dest;
-	entry.delivery_mode = apic->irq_delivery_mode;
-	entry.polarity = 0;
-	entry.trigger = 0;
-	entry.vector = vector;
-
-	/*
-	 * The timer IRQ doesn't have to know that behind the
-	 * scene we may have a 8259A-master in AEOI mode ...
-	 */
-	irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
-				      "edge");
-
-	/*
-	 * Add it to the IO-APIC irq-routing table:
-	 */
-	ioapic_write_entry(ioapic_idx, pin, entry);
-}
-
-void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
+void ioapic_zap_locks(void)
 {
-	int i;
-
-	pr_debug(" NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect:\n");
-
-	for (i = 0; i <= nr_entries; i++) {
-		struct IO_APIC_route_entry entry;
-
-		entry = ioapic_read_entry(apic, i);
-
-		pr_debug(" %02x %02X  ", i, entry.dest);
-		pr_cont("%1d    %1d    %1d   %1d   %1d    "
-			"%1d    %1d    %02X\n",
-			entry.mask,
-			entry.trigger,
-			entry.irr,
-			entry.polarity,
-			entry.delivery_status,
-			entry.dest_mode,
-			entry.delivery_mode,
-			entry.vector);
-	}
+	raw_spin_lock_init(&ioapic_lock);
 }
 
-void intel_ir_io_apic_print_entries(unsigned int apic,
-				    unsigned int nr_entries)
+static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
 {
 	int i;
+	char buf[256];
+	struct IO_APIC_route_entry entry;
+	struct IR_IO_APIC_route_entry *ir_entry = (void *)&entry;
 
-	pr_debug(" NR Indx Fmt Mask Trig IRR Pol Stat Indx2 Zero Vect:\n");
-
+	printk(KERN_DEBUG "IOAPIC %d:\n", apic);
 	for (i = 0; i <= nr_entries; i++) {
-		struct IR_IO_APIC_route_entry *ir_entry;
-		struct IO_APIC_route_entry entry;
-
 		entry = ioapic_read_entry(apic, i);
-
-		ir_entry = (struct IR_IO_APIC_route_entry *)&entry;
-
-		pr_debug(" %02x %04X ", i, ir_entry->index);
-		pr_cont("%1d   %1d    %1d    %1d   %1d   "
-			"%1d    %1d     %X    %02X\n",
-			ir_entry->format,
-			ir_entry->mask,
-			ir_entry->trigger,
-			ir_entry->irr,
-			ir_entry->polarity,
-			ir_entry->delivery_status,
-			ir_entry->index2,
-			ir_entry->zero,
-			ir_entry->vector);
+		snprintf(buf, sizeof(buf),
+			 " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)",
+			 i,
+			 entry.mask == IOAPIC_MASKED ? "disabled" : "enabled ",
+			 entry.trigger == IOAPIC_LEVEL ? "level" : "edge ",
+			 entry.polarity == IOAPIC_POL_LOW ? "low " : "high",
+			 entry.vector, entry.irr, entry.delivery_status);
+		if (ir_entry->format)
+			printk(KERN_DEBUG "%s, remapped, I(%04X),  Z(%X)\n",
+			       buf, (ir_entry->index << 15) | ir_entry->index,
+			       ir_entry->zero);
+		else
+			printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n",
+			       buf,
+			       entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ?
+			       "logical " : "physical",
+			       entry.dest, entry.delivery_mode);
 	}
 }
 
-void ioapic_zap_locks(void)
-{
-	raw_spin_lock_init(&ioapic_lock);
-}
-
 static void __init print_IO_APIC(int ioapic_idx)
 {
 	union IO_APIC_reg_00 reg_00;
@@ -1451,16 +1329,13 @@ static void __init print_IO_APIC(int ioapic_idx)
 	}
 
 	printk(KERN_DEBUG ".... IRQ redirection table:\n");
-
-	x86_io_apic_ops.print_entries(ioapic_idx, reg_01.bits.entries);
+	io_apic_print_entries(ioapic_idx, reg_01.bits.entries);
 }
 
 void __init print_IO_APICs(void)
 {
 	int ioapic_idx;
-	struct irq_cfg *cfg;
 	unsigned int irq;
-	struct irq_chip *chip;
 
 	printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
 	for_each_ioapic(ioapic_idx)
@@ -1480,18 +1355,20 @@ void __init print_IO_APICs(void)
 	printk(KERN_DEBUG "IRQ to pin mappings:\n");
 	for_each_active_irq(irq) {
 		struct irq_pin_list *entry;
+		struct irq_chip *chip;
+		struct mp_chip_data *data;
 
 		chip = irq_get_chip(irq);
-		if (chip != &ioapic_chip)
+		if (chip != &ioapic_chip && chip != &ioapic_ir_chip)
 			continue;
-
-		cfg = irq_cfg(irq);
-		if (!cfg)
+		data = irq_get_chip_data(irq);
+		if (!data)
 			continue;
-		if (list_empty(&cfg->irq_2_pin))
+		if (list_empty(&data->irq_2_pin))
 			continue;
+
 		printk(KERN_DEBUG "IRQ%d ", irq);
-		for_each_irq_pin(entry, cfg->irq_2_pin)
+		for_each_irq_pin(entry, data->irq_2_pin)
 			pr_cont("-> %d:%d", entry->apic, entry->pin);
 		pr_cont("\n");
 	}
@@ -1564,15 +1441,12 @@ void native_disable_io_apic(void)
 		struct IO_APIC_route_entry entry;
 
 		memset(&entry, 0, sizeof(entry));
-		entry.mask            = 0; /* Enabled */
-		entry.trigger         = 0; /* Edge */
-		entry.irr             = 0;
-		entry.polarity        = 0; /* High */
-		entry.delivery_status = 0;
-		entry.dest_mode       = 0; /* Physical */
-		entry.delivery_mode   = dest_ExtINT; /* ExtInt */
-		entry.vector          = 0;
-		entry.dest            = read_apic_id();
+		entry.mask		= IOAPIC_UNMASKED;
+		entry.trigger		= IOAPIC_EDGE;
+		entry.polarity		= IOAPIC_POL_HIGH;
+		entry.dest_mode		= IOAPIC_DEST_MODE_PHYSICAL;
+		entry.delivery_mode	= dest_ExtINT;
+		entry.dest		= read_apic_id();
 
 		/*
 		 * Add it to the IO-APIC irq-routing table:
@@ -1582,7 +1456,6 @@ void native_disable_io_apic(void)
 
 	if (cpu_has_apic || apic_from_smp_config())
 		disconnect_bsp_APIC(ioapic_i8259.pin != -1);
-
 }
 
 /*
@@ -1792,7 +1665,6 @@ static int __init timer_irq_works(void)
  * This is not complete - we should be able to fake
  * an edge even if it isn't on the 8259A...
  */
-
 static unsigned int startup_ioapic_irq(struct irq_data *data)
 {
 	int was_pending = 0, irq = data->irq;
@@ -1804,74 +1676,22 @@ static unsigned int startup_ioapic_irq(struct irq_data *data)
 		if (legacy_pic->irq_pending(irq))
 			was_pending = 1;
 	}
-	__unmask_ioapic(irqd_cfg(data));
+	__unmask_ioapic(data->chip_data);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	return was_pending;
 }
 
-/*
- * Level and edge triggered IO-APIC interrupts need different handling,
- * so we use two separate IRQ descriptors. Edge triggered IRQs can be
- * handled with the level-triggered descriptor, but that one has slightly
- * more overhead. Level-triggered interrupts cannot be handled with the
- * edge-triggered handler, without risking IRQ storms and other ugly
- * races.
- */
-
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
-{
-	int apic, pin;
-	struct irq_pin_list *entry;
-	u8 vector = cfg->vector;
-
-	for_each_irq_pin(entry, cfg->irq_2_pin) {
-		unsigned int reg;
-
-		apic = entry->apic;
-		pin = entry->pin;
-
-		io_apic_write(apic, 0x11 + pin*2, dest);
-		reg = io_apic_read(apic, 0x10 + pin*2);
-		reg &= ~IO_APIC_REDIR_VECTOR_MASK;
-		reg |= vector;
-		io_apic_modify(apic, 0x10 + pin*2, reg);
-	}
-}
-
-int native_ioapic_set_affinity(struct irq_data *data,
-			       const struct cpumask *mask,
-			       bool force)
-{
-	unsigned int dest, irq = data->irq;
-	unsigned long flags;
-	int ret;
-
-	if (!config_enabled(CONFIG_SMP))
-		return -EPERM;
-
-	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	ret = apic_set_affinity(data, mask, &dest);
-	if (!ret) {
-		/* Only the high 8 bits are valid. */
-		dest = SET_APIC_LOGICAL_ID(dest);
-		__target_IO_APIC_irq(irq, dest, irqd_cfg(data));
-		ret = IRQ_SET_MASK_OK_NOCOPY;
-	}
-	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-	return ret;
-}
-
 atomic_t irq_mis_count;
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
+static bool io_apic_level_ack_pending(struct mp_chip_data *data)
 {
 	struct irq_pin_list *entry;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	for_each_irq_pin(entry, cfg->irq_2_pin) {
+	for_each_irq_pin(entry, data->irq_2_pin) {
 		unsigned int reg;
 		int pin;
 
@@ -1888,18 +1708,17 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
 	return false;
 }
 
-static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
+static inline bool ioapic_irqd_mask(struct irq_data *data)
 {
 	/* If we are moving the irq we need to mask it */
 	if (unlikely(irqd_is_setaffinity_pending(data))) {
-		mask_ioapic(cfg);
+		mask_ioapic_irq(data);
 		return true;
 	}
 	return false;
 }
 
-static inline void ioapic_irqd_unmask(struct irq_data *data,
-				      struct irq_cfg *cfg, bool masked)
+static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
 {
 	if (unlikely(masked)) {
 		/* Only migrate the irq if the ack has been received.
@@ -1928,31 +1747,30 @@ static inline void ioapic_irqd_unmask(struct irq_data *data,
 		 * accurate and is causing problems then it is a hardware bug
 		 * and you can go talk to the chipset vendor about it.
 		 */
-		if (!io_apic_level_ack_pending(cfg))
+		if (!io_apic_level_ack_pending(data->chip_data))
 			irq_move_masked_irq(data);
-		unmask_ioapic(cfg);
+		unmask_ioapic_irq(data);
 	}
 }
 #else
-static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
+static inline bool ioapic_irqd_mask(struct irq_data *data)
 {
 	return false;
 }
-static inline void ioapic_irqd_unmask(struct irq_data *data,
-				      struct irq_cfg *cfg, bool masked)
+static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
 {
 }
 #endif
 
-static void ack_ioapic_level(struct irq_data *data)
+static void ioapic_ack_level(struct irq_data *irq_data)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
-	int i, irq = data->irq;
+	struct irq_cfg *cfg = irqd_cfg(irq_data);
 	unsigned long v;
 	bool masked;
+	int i;
 
 	irq_complete_move(cfg);
-	masked = ioapic_irqd_mask(data, cfg);
+	masked = ioapic_irqd_mask(irq_data);
 
 	/*
 	 * It appears there is an erratum which affects at least version 0x11
@@ -2004,11 +1822,49 @@ static void ack_ioapic_level(struct irq_data *data)
 	 */
 	if (!(v & (1 << (i & 0x1f)))) {
 		atomic_inc(&irq_mis_count);
+		eoi_ioapic_pin(cfg->vector, irq_data->chip_data);
+	}
+
+	ioapic_irqd_unmask(irq_data, masked);
+}
+
+static void ioapic_ir_ack_level(struct irq_data *irq_data)
+{
+	struct mp_chip_data *data = irq_data->chip_data;
+
+	/*
+	 * Intr-remapping uses pin number as the virtual vector
+	 * in the RTE. Actual vector is programmed in
+	 * intr-remapping table entry. Hence for the io-apic
+	 * EOI we use the pin number.
+	 */
+	ack_APIC_irq();
+	eoi_ioapic_pin(data->entry.vector, data);
+}
 
-		eoi_ioapic_irq(irq, cfg);
+static int ioapic_set_affinity(struct irq_data *irq_data,
+			       const struct cpumask *mask, bool force)
+{
+	struct irq_data *parent = irq_data->parent_data;
+	struct mp_chip_data *data = irq_data->chip_data;
+	struct irq_pin_list *entry;
+	struct irq_cfg *cfg;
+	unsigned long flags;
+	int ret;
+
+	ret = parent->chip->irq_set_affinity(parent, mask, force);
+	raw_spin_lock_irqsave(&ioapic_lock, flags);
+	if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) {
+		cfg = irqd_cfg(irq_data);
+		data->entry.dest = cfg->dest_apicid;
+		data->entry.vector = cfg->vector;
+		for_each_irq_pin(entry, data->irq_2_pin)
+			__ioapic_write_entry(entry->apic, entry->pin,
+					     data->entry);
 	}
+	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 
-	ioapic_irqd_unmask(data, cfg, masked);
+	return ret;
 }
 
 static struct irq_chip ioapic_chip __read_mostly = {
@@ -2016,10 +1872,20 @@ static struct irq_chip ioapic_chip __read_mostly = {
 	.irq_startup		= startup_ioapic_irq,
 	.irq_mask		= mask_ioapic_irq,
 	.irq_unmask		= unmask_ioapic_irq,
-	.irq_ack		= apic_ack_edge,
-	.irq_eoi		= ack_ioapic_level,
-	.irq_set_affinity	= native_ioapic_set_affinity,
-	.irq_retrigger		= apic_retrigger_irq,
+	.irq_ack		= irq_chip_ack_parent,
+	.irq_eoi		= ioapic_ack_level,
+	.irq_set_affinity	= ioapic_set_affinity,
+	.flags			= IRQCHIP_SKIP_SET_WAKE,
+};
+
+static struct irq_chip ioapic_ir_chip __read_mostly = {
+	.name			= "IR-IO-APIC",
+	.irq_startup		= startup_ioapic_irq,
+	.irq_mask		= mask_ioapic_irq,
+	.irq_unmask		= unmask_ioapic_irq,
+	.irq_ack		= irq_chip_ack_parent,
+	.irq_eoi		= ioapic_ir_ack_level,
+	.irq_set_affinity	= ioapic_set_affinity,
 	.flags			= IRQCHIP_SKIP_SET_WAKE,
 };
 
@@ -2113,12 +1979,12 @@ static inline void __init unlock_ExtINT_logic(void)
 
 	memset(&entry1, 0, sizeof(entry1));
 
-	entry1.dest_mode = 0;			/* physical delivery */
-	entry1.mask = 0;			/* unmask IRQ now */
+	entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL;
+	entry1.mask = IOAPIC_UNMASKED;
 	entry1.dest = hard_smp_processor_id();
 	entry1.delivery_mode = dest_ExtINT;
 	entry1.polarity = entry0.polarity;
-	entry1.trigger = 0;
+	entry1.trigger = IOAPIC_EDGE;
 	entry1.vector = 0;
 
 	ioapic_write_entry(apic, pin, entry1);
@@ -2152,6 +2018,25 @@ static int __init disable_timer_pin_setup(char *arg)
 }
 early_param("disable_timer_pin_1", disable_timer_pin_setup);
 
+static int mp_alloc_timer_irq(int ioapic, int pin)
+{
+	int irq = -1;
+	struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+
+	if (domain) {
+		struct irq_alloc_info info;
+
+		ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0);
+		info.ioapic_id = mpc_ioapic_id(ioapic);
+		info.ioapic_pin = pin;
+		mutex_lock(&ioapic_mutex);
+		irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info);
+		mutex_unlock(&ioapic_mutex);
+	}
+
+	return irq;
+}
+
 /*
  * This code may look a bit paranoid, but it's supposed to cooperate with
  * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
@@ -2162,7 +2047,9 @@ early_param("disable_timer_pin_1", disable_timer_pin_setup);
  */
 static inline void __init check_timer(void)
 {
-	struct irq_cfg *cfg = irq_cfg(0);
+	struct irq_data *irq_data = irq_get_irq_data(0);
+	struct mp_chip_data *data = irq_data->chip_data;
+	struct irq_cfg *cfg = irqd_cfg(irq_data);
 	int node = cpu_to_node(0);
 	int apic1, pin1, apic2, pin2;
 	unsigned long flags;
@@ -2174,7 +2061,6 @@ static inline void __init check_timer(void)
 	 * get/set the timer IRQ vector:
 	 */
 	legacy_pic->mask(0);
-	assign_irq_vector(0, cfg, apic->target_cpus());
 
 	/*
 	 * As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2215,23 +2101,21 @@ static inline void __init check_timer(void)
 	}
 
 	if (pin1 != -1) {
-		/*
-		 * Ok, does IRQ0 through the IOAPIC work?
-		 */
+		/* Ok, does IRQ0 through the IOAPIC work? */
 		if (no_pin1) {
-			add_pin_to_irq_node(cfg, node, apic1, pin1);
-			setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
+			mp_alloc_timer_irq(apic1, pin1);
 		} else {
-			/* for edge trigger, setup_ioapic_irq already
-			 * leave it unmasked.
+			/*
+			 * for edge trigger, it's already unmasked,
 			 * so only need to unmask if it is level-trigger
 			 * do we really have level trigger timer?
 			 */
 			int idx;
 			idx = find_irq_entry(apic1, pin1, mp_INT);
 			if (idx != -1 && irq_trigger(idx))
-				unmask_ioapic(cfg);
+				unmask_ioapic_irq(irq_get_chip_data(0));
 		}
+		irq_domain_activate_irq(irq_data);
 		if (timer_irq_works()) {
 			if (disable_timer_pin_1 > 0)
 				clear_IO_APIC_pin(0, pin1);
@@ -2251,8 +2135,8 @@ static inline void __init check_timer(void)
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
-		replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
-		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
+		replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2);
+		irq_domain_activate_irq(irq_data);
 		legacy_pic->unmask(0);
 		if (timer_irq_works()) {
 			apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2329,36 +2213,35 @@ out:
 
 static int mp_irqdomain_create(int ioapic)
 {
-	size_t size;
+	struct irq_alloc_info info;
+	struct irq_domain *parent;
 	int hwirqs = mp_ioapic_pin_count(ioapic);
 	struct ioapic *ip = &ioapics[ioapic];
 	struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg;
 	struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
 
-	size = sizeof(struct mp_pin_info) * mp_ioapic_pin_count(ioapic);
-	ip->pin_info = kzalloc(size, GFP_KERNEL);
-	if (!ip->pin_info)
-		return -ENOMEM;
-
 	if (cfg->type == IOAPIC_DOMAIN_INVALID)
 		return 0;
 
+	init_irq_alloc_info(&info, NULL);
+	info.type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+	info.ioapic_id = mpc_ioapic_id(ioapic);
+	parent = irq_remapping_get_ir_irq_domain(&info);
+	if (!parent)
+		parent = x86_vector_domain;
+
 	ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops,
 					      (void *)(long)ioapic);
-	if(!ip->irqdomain) {
-		kfree(ip->pin_info);
-		ip->pin_info = NULL;
+	if (!ip->irqdomain)
 		return -ENOMEM;
-	}
+
+	ip->irqdomain->parent = parent;
 
 	if (cfg->type == IOAPIC_DOMAIN_LEGACY ||
 	    cfg->type == IOAPIC_DOMAIN_STRICT)
 		ioapic_dynirq_base = max(ioapic_dynirq_base,
 					 gsi_cfg->gsi_end + 1);
 
-	if (gsi_cfg->gsi_base == 0)
-		irq_set_default_host(ip->irqdomain);
-
 	return 0;
 }
 
@@ -2368,8 +2251,6 @@ static void ioapic_destroy_irqdomain(int idx)
 		irq_domain_remove(ioapics[idx].irqdomain);
 		ioapics[idx].irqdomain = NULL;
 	}
-	kfree(ioapics[idx].pin_info);
-	ioapics[idx].pin_info = NULL;
 }
 
 void __init setup_IO_APIC(void)
@@ -2399,20 +2280,6 @@ void __init setup_IO_APIC(void)
 	ioapic_initialized = 1;
 }
 
-/*
- *      Called after all the initialization is done. If we didn't find any
- *      APIC bugs then we can allow the modify fast path
- */
-
-static int __init io_apic_bug_finalize(void)
-{
-	if (sis_apic_bug == -1)
-		sis_apic_bug = 0;
-	return 0;
-}
-
-late_initcall(io_apic_bug_finalize);
-
 static void resume_ioapic_id(int ioapic_idx)
 {
 	unsigned long flags;
@@ -2451,20 +2318,6 @@ static int __init ioapic_init_ops(void)
 
 device_initcall(ioapic_init_ops);
 
-static int
-io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
-{
-	struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node);
-	int ret;
-
-	if (!cfg)
-		return -EINVAL;
-	ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin);
-	if (!ret)
-		setup_ioapic_irq(irq, cfg, attr);
-	return ret;
-}
-
 static int io_apic_get_redir_entries(int ioapic)
 {
 	union IO_APIC_reg_01	reg_01;
@@ -2692,7 +2545,7 @@ void __init setup_ioapic_dest(void)
 		else
 			mask = apic->target_cpus();
 
-		x86_io_apic_ops.set_affinity(idata, mask, false);
+		irq_set_affinity(irq, mask);
 	}
 
 }
@@ -2737,7 +2590,7 @@ static struct resource * __init ioapic_setup_resources(void)
 	return res;
 }
 
-void __init native_io_apic_init_mappings(void)
+void __init io_apic_init_mappings(void)
 {
 	unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
 	struct resource *ioapic_res;
@@ -2962,7 +2815,6 @@ int mp_unregister_ioapic(u32 gsi_base)
 {
 	int ioapic, pin;
 	int found = 0;
-	struct mp_pin_info *pin_info;
 
 	for_each_ioapic(ioapic)
 		if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) {
@@ -2975,11 +2827,17 @@ int mp_unregister_ioapic(u32 gsi_base)
 	}
 
 	for_each_pin(ioapic, pin) {
-		pin_info = mp_pin_info(ioapic, pin);
-		if (pin_info->count) {
-			pr_warn("pin%d on IOAPIC%d is still in use.\n",
-				pin, ioapic);
-			return -EBUSY;
+		u32 gsi = mp_pin_to_gsi(ioapic, pin);
+		int irq = mp_map_gsi_to_irq(gsi, 0, NULL);
+		struct mp_chip_data *data;
+
+		if (irq >= 0) {
+			data = irq_get_chip_data(irq);
+			if (data && data->count) {
+				pr_warn("pin%d on IOAPIC%d is still in use.\n",
+					pin, ioapic);
+				return -EBUSY;
+			}
 		}
 	}
 
@@ -3006,108 +2864,141 @@ int mp_ioapic_registered(u32 gsi_base)
 	return 0;
 }
 
-static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
-					int ioapic, int ioapic_pin,
-					int trigger, int polarity)
+static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data,
+				  struct irq_alloc_info *info)
 {
-	irq_attr->ioapic	= ioapic;
-	irq_attr->ioapic_pin	= ioapic_pin;
-	irq_attr->trigger	= trigger;
-	irq_attr->polarity	= polarity;
+	if (info && info->ioapic_valid) {
+		data->trigger = info->ioapic_trigger;
+		data->polarity = info->ioapic_polarity;
+	} else if (acpi_get_override_irq(gsi, &data->trigger,
+					 &data->polarity) < 0) {
+		/* PCI interrupts are always active low level triggered. */
+		data->trigger = IOAPIC_LEVEL;
+		data->polarity = IOAPIC_POL_LOW;
+	}
 }
 
-int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq,
-		     irq_hw_number_t hwirq)
+static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data,
+			   struct IO_APIC_route_entry *entry)
 {
-	int ioapic = (int)(long)domain->host_data;
-	struct mp_pin_info *info = mp_pin_info(ioapic, hwirq);
-	struct io_apic_irq_attr attr;
+	memset(entry, 0, sizeof(*entry));
+	entry->delivery_mode = apic->irq_delivery_mode;
+	entry->dest_mode     = apic->irq_dest_mode;
+	entry->dest	     = cfg->dest_apicid;
+	entry->vector	     = cfg->vector;
+	entry->trigger	     = data->trigger;
+	entry->polarity	     = data->polarity;
+	/*
+	 * Mask level triggered irqs. Edge triggered irqs are masked
+	 * by the irq core code in case they fire.
+	 */
+	if (data->trigger == IOAPIC_LEVEL)
+		entry->mask = IOAPIC_MASKED;
+	else
+		entry->mask = IOAPIC_UNMASKED;
+}
 
-	/* Get default attribute if not set by caller yet */
-	if (!info->set) {
-		u32 gsi = mp_pin_to_gsi(ioapic, hwirq);
+int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+		       unsigned int nr_irqs, void *arg)
+{
+	int ret, ioapic, pin;
+	struct irq_cfg *cfg;
+	struct irq_data *irq_data;
+	struct mp_chip_data *data;
+	struct irq_alloc_info *info = arg;
 
-		if (acpi_get_override_irq(gsi, &info->trigger,
-					  &info->polarity) < 0) {
-			/*
-			 * PCI interrupts are always polarity one level
-			 * triggered.
-			 */
-			info->trigger = 1;
-			info->polarity = 1;
-		}
-		info->node = NUMA_NO_NODE;
+	if (!info || nr_irqs > 1)
+		return -EINVAL;
+	irq_data = irq_domain_get_irq_data(domain, virq);
+	if (!irq_data)
+		return -EINVAL;
 
-		/*
-		 * setup_IO_APIC_irqs() programs all legacy IRQs with default
-		 * trigger and polarity attributes. Don't set the flag for that
-		 * case so the first legacy IRQ user could reprogram the pin
-		 * with real trigger and polarity attributes.
-		 */
-		if (virq >= nr_legacy_irqs() || info->count)
-			info->set = 1;
-	}
-	set_io_apic_irq_attr(&attr, ioapic, hwirq, info->trigger,
-			     info->polarity);
+	ioapic = mp_irqdomain_ioapic_idx(domain);
+	pin = info->ioapic_pin;
+	if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0)
+		return -EEXIST;
 
-	return io_apic_setup_irq_pin(virq, info->node, &attr);
-}
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
 
-void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq)
-{
-	struct irq_data *data = irq_get_irq_data(virq);
-	struct irq_cfg *cfg = irq_cfg(virq);
-	int ioapic = (int)(long)domain->host_data;
-	int pin = (int)data->hwirq;
+	info->ioapic_entry = &data->entry;
+	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
+	if (ret < 0) {
+		kfree(data);
+		return ret;
+	}
+
+	INIT_LIST_HEAD(&data->irq_2_pin);
+	irq_data->hwirq = info->ioapic_pin;
+	irq_data->chip = (domain->parent == x86_vector_domain) ?
+			  &ioapic_chip : &ioapic_ir_chip;
+	irq_data->chip_data = data;
+	mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info);
+
+	cfg = irqd_cfg(irq_data);
+	add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin);
+	if (info->ioapic_entry)
+		mp_setup_entry(cfg, data, info->ioapic_entry);
+	mp_register_handler(virq, data->trigger);
+	if (virq < nr_legacy_irqs())
+		legacy_pic->mask(virq);
+
+	apic_printk(APIC_VERBOSE, KERN_DEBUG
+		    "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n",
+		    ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector,
+		    virq, data->trigger, data->polarity, cfg->dest_apicid);
 
-	ioapic_mask_entry(ioapic, pin);
-	__remove_pin_from_irq(cfg, ioapic, pin);
-	WARN_ON(!list_empty(&cfg->irq_2_pin));
-	arch_teardown_hwirq(virq);
+	return 0;
 }
 
-int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node)
+void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
+		       unsigned int nr_irqs)
 {
-	int ret = 0;
-	int ioapic, pin;
-	struct mp_pin_info *info;
+	struct irq_data *irq_data;
+	struct mp_chip_data *data;
 
-	ioapic = mp_find_ioapic(gsi);
-	if (ioapic < 0)
-		return -ENODEV;
-
-	pin = mp_find_ioapic_pin(ioapic, gsi);
-	info = mp_pin_info(ioapic, pin);
-	trigger = trigger ? 1 : 0;
-	polarity = polarity ? 1 : 0;
-
-	mutex_lock(&ioapic_mutex);
-	if (!info->set) {
-		info->trigger = trigger;
-		info->polarity = polarity;
-		info->node = node;
-		info->set = 1;
-	} else if (info->trigger != trigger || info->polarity != polarity) {
-		ret = -EBUSY;
+	BUG_ON(nr_irqs != 1);
+	irq_data = irq_domain_get_irq_data(domain, virq);
+	if (irq_data && irq_data->chip_data) {
+		data = irq_data->chip_data;
+		__remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain),
+				      (int)irq_data->hwirq);
+		WARN_ON(!list_empty(&data->irq_2_pin));
+		kfree(irq_data->chip_data);
 	}
-	mutex_unlock(&ioapic_mutex);
-
-	return ret;
+	irq_domain_free_irqs_top(domain, virq, nr_irqs);
 }
 
-/* Enable IOAPIC early just for system timer */
-void __init pre_init_apic_IRQ0(void)
+void mp_irqdomain_activate(struct irq_domain *domain,
+			   struct irq_data *irq_data)
 {
-	struct io_apic_irq_attr attr = { 0, 0, 0, 0 };
+	unsigned long flags;
+	struct irq_pin_list *entry;
+	struct mp_chip_data *data = irq_data->chip_data;
 
-	printk(KERN_INFO "Early APIC setup for system timer0\n");
-#ifndef CONFIG_SMP
-	physid_set_mask_of_physid(boot_cpu_physical_apicid,
-					 &phys_cpu_present_map);
-#endif
-	setup_local_APIC();
+	raw_spin_lock_irqsave(&ioapic_lock, flags);
+	for_each_irq_pin(entry, data->irq_2_pin)
+		__ioapic_write_entry(entry->apic, entry->pin, data->entry);
+	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
 
-	io_apic_setup_irq_pin(0, 0, &attr);
-	irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
-				      "edge");
+void mp_irqdomain_deactivate(struct irq_domain *domain,
+			     struct irq_data *irq_data)
+{
+	/* It won't be called for IRQ with multiple IOAPIC pins associated */
+	ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain),
+			  (int)irq_data->hwirq);
+}
+
+int mp_irqdomain_ioapic_idx(struct irq_domain *domain)
+{
+	return (int)(long)domain->host_data;
 }
+
+const struct irq_domain_ops mp_ioapic_irqdomain_ops = {
+	.alloc		= mp_irqdomain_alloc,
+	.free		= mp_irqdomain_free,
+	.activate	= mp_irqdomain_activate,
+	.deactivate	= mp_irqdomain_deactivate,
+};
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index d6ba2d660dc52f..1a9d735e09c62f 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -3,6 +3,8 @@
  *
  * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
  *	Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu <jiang.liu@linux.intel.com>
+ *	Convert to hierarchical irqdomain
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -14,22 +16,23 @@
 #include <linux/dmar.h>
 #include <linux/hpet.h>
 #include <linux/msi.h>
+#include <asm/irqdomain.h>
 #include <asm/msidef.h>
 #include <asm/hpet.h>
 #include <asm/hw_irq.h>
 #include <asm/apic.h>
 #include <asm/irq_remapping.h>
 
-void native_compose_msi_msg(struct pci_dev *pdev,
-			    unsigned int irq, unsigned int dest,
-			    struct msi_msg *msg, u8 hpet_id)
+static struct irq_domain *msi_default_domain;
+
+static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
 {
-	struct irq_cfg *cfg = irq_cfg(irq);
+	struct irq_cfg *cfg = irqd_cfg(data);
 
 	msg->address_hi = MSI_ADDR_BASE_HI;
 
 	if (x2apic_enabled())
-		msg->address_hi |= MSI_ADDR_EXT_DEST_ID(dest);
+		msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid);
 
 	msg->address_lo =
 		MSI_ADDR_BASE_LO |
@@ -39,7 +42,7 @@ void native_compose_msi_msg(struct pci_dev *pdev,
 		((apic->irq_delivery_mode != dest_LowestPrio) ?
 			MSI_ADDR_REDIRECTION_CPU :
 			MSI_ADDR_REDIRECTION_LOWPRI) |
-		MSI_ADDR_DEST_ID(dest);
+		MSI_ADDR_DEST_ID(cfg->dest_apicid);
 
 	msg->data =
 		MSI_DATA_TRIGGER_EDGE |
@@ -50,180 +53,201 @@ void native_compose_msi_msg(struct pci_dev *pdev,
 		MSI_DATA_VECTOR(cfg->vector);
 }
 
-static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
-			   struct msi_msg *msg, u8 hpet_id)
+/*
+ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI or MSI-X Capability Structure.
+ */
+static struct irq_chip pci_msi_controller = {
+	.name			= "PCI-MSI",
+	.irq_unmask		= pci_msi_unmask_irq,
+	.irq_mask		= pci_msi_mask_irq,
+	.irq_ack		= irq_chip_ack_parent,
+	.irq_retrigger		= irq_chip_retrigger_hierarchy,
+	.irq_compose_msi_msg	= irq_msi_compose_msg,
+	.flags			= IRQCHIP_SKIP_SET_WAKE,
+};
+
+int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
-	struct irq_cfg *cfg;
-	int err;
-	unsigned dest;
+	struct irq_domain *domain;
+	struct irq_alloc_info info;
 
-	if (disable_apic)
-		return -ENXIO;
+	init_irq_alloc_info(&info, NULL);
+	info.type = X86_IRQ_ALLOC_TYPE_MSI;
+	info.msi_dev = dev;
 
-	cfg = irq_cfg(irq);
-	err = assign_irq_vector(irq, cfg, apic->target_cpus());
-	if (err)
-		return err;
+	domain = irq_remapping_get_irq_domain(&info);
+	if (domain == NULL)
+		domain = msi_default_domain;
+	if (domain == NULL)
+		return -ENOSYS;
 
-	err = apic->cpu_mask_to_apicid_and(cfg->domain,
-					   apic->target_cpus(), &dest);
-	if (err)
-		return err;
+	return pci_msi_domain_alloc_irqs(domain, dev, nvec, type);
+}
 
-	x86_msi.compose_msi_msg(pdev, irq, dest, msg, hpet_id);
+void native_teardown_msi_irq(unsigned int irq)
+{
+	irq_domain_free_irqs(irq, 1);
+}
 
-	return 0;
+static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info,
+					 msi_alloc_info_t *arg)
+{
+	return arg->msi_hwirq;
 }
 
-static int
-msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
+static int pci_msi_prepare(struct irq_domain *domain, struct device *dev,
+			   int nvec, msi_alloc_info_t *arg)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
-	struct msi_msg msg;
-	unsigned int dest;
-	int ret;
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct msi_desc *desc = first_pci_msi_entry(pdev);
+
+	init_irq_alloc_info(arg, NULL);
+	arg->msi_dev = pdev;
+	if (desc->msi_attrib.is_msix) {
+		arg->type = X86_IRQ_ALLOC_TYPE_MSIX;
+	} else {
+		arg->type = X86_IRQ_ALLOC_TYPE_MSI;
+		arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
+	}
 
-	ret = apic_set_affinity(data, mask, &dest);
-	if (ret)
-		return ret;
+	return 0;
+}
 
-	__get_cached_msi_msg(data->msi_desc, &msg);
+static void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
+{
+	arg->msi_hwirq = pci_msi_domain_calc_hwirq(arg->msi_dev, desc);
+}
+
+static struct msi_domain_ops pci_msi_domain_ops = {
+	.get_hwirq	= pci_msi_get_hwirq,
+	.msi_prepare	= pci_msi_prepare,
+	.set_desc	= pci_msi_set_desc,
+};
 
-	msg.data &= ~MSI_DATA_VECTOR_MASK;
-	msg.data |= MSI_DATA_VECTOR(cfg->vector);
-	msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
-	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+static struct msi_domain_info pci_msi_domain_info = {
+	.flags		= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+			  MSI_FLAG_PCI_MSIX,
+	.ops		= &pci_msi_domain_ops,
+	.chip		= &pci_msi_controller,
+	.handler	= handle_edge_irq,
+	.handler_name	= "edge",
+};
 
-	__pci_write_msi_msg(data->msi_desc, &msg);
+void arch_init_msi_domain(struct irq_domain *parent)
+{
+	if (disable_apic)
+		return;
 
-	return IRQ_SET_MASK_OK_NOCOPY;
+	msi_default_domain = pci_msi_create_irq_domain(NULL,
+					&pci_msi_domain_info, parent);
+	if (!msi_default_domain)
+		pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n");
 }
 
-/*
- * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
- * which implement the MSI or MSI-X Capability Structure.
- */
-static struct irq_chip msi_chip = {
-	.name			= "PCI-MSI",
+#ifdef CONFIG_IRQ_REMAP
+static struct irq_chip pci_msi_ir_controller = {
+	.name			= "IR-PCI-MSI",
 	.irq_unmask		= pci_msi_unmask_irq,
 	.irq_mask		= pci_msi_mask_irq,
-	.irq_ack		= apic_ack_edge,
-	.irq_set_affinity	= msi_set_affinity,
-	.irq_retrigger		= apic_retrigger_irq,
+	.irq_ack		= irq_chip_ack_parent,
+	.irq_retrigger		= irq_chip_retrigger_hierarchy,
+	.irq_set_vcpu_affinity	= irq_chip_set_vcpu_affinity_parent,
 	.flags			= IRQCHIP_SKIP_SET_WAKE,
 };
 
-int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
-		  unsigned int irq_base, unsigned int irq_offset)
-{
-	struct irq_chip *chip = &msi_chip;
-	struct msi_msg msg;
-	unsigned int irq = irq_base + irq_offset;
-	int ret;
-
-	ret = msi_compose_msg(dev, irq, &msg, -1);
-	if (ret < 0)
-		return ret;
-
-	irq_set_msi_desc_off(irq_base, irq_offset, msidesc);
-
-	/*
-	 * MSI-X message is written per-IRQ, the offset is always 0.
-	 * MSI message denotes a contiguous group of IRQs, written for 0th IRQ.
-	 */
-	if (!irq_offset)
-		pci_write_msi_msg(irq, &msg);
+static struct msi_domain_info pci_msi_ir_domain_info = {
+	.flags		= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+			  MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX,
+	.ops		= &pci_msi_domain_ops,
+	.chip		= &pci_msi_ir_controller,
+	.handler	= handle_edge_irq,
+	.handler_name	= "edge",
+};
 
-	setup_remapped_irq(irq, irq_cfg(irq), chip);
+struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent)
+{
+	return pci_msi_create_irq_domain(NULL, &pci_msi_ir_domain_info, parent);
+}
+#endif
 
-	irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
+#ifdef CONFIG_DMAR_TABLE
+static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
+{
+	dmar_msi_write(data->irq, msg);
+}
 
-	dev_dbg(&dev->dev, "irq %d for MSI/MSI-X\n", irq);
+static struct irq_chip dmar_msi_controller = {
+	.name			= "DMAR-MSI",
+	.irq_unmask		= dmar_msi_unmask,
+	.irq_mask		= dmar_msi_mask,
+	.irq_ack		= irq_chip_ack_parent,
+	.irq_set_affinity	= msi_domain_set_affinity,
+	.irq_retrigger		= irq_chip_retrigger_hierarchy,
+	.irq_compose_msi_msg	= irq_msi_compose_msg,
+	.irq_write_msi_msg	= dmar_msi_write_msg,
+	.flags			= IRQCHIP_SKIP_SET_WAKE,
+};
 
-	return 0;
+static irq_hw_number_t dmar_msi_get_hwirq(struct msi_domain_info *info,
+					  msi_alloc_info_t *arg)
+{
+	return arg->dmar_id;
 }
 
-int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+static int dmar_msi_init(struct irq_domain *domain,
+			 struct msi_domain_info *info, unsigned int virq,
+			 irq_hw_number_t hwirq, msi_alloc_info_t *arg)
 {
-	struct msi_desc *msidesc;
-	unsigned int irq;
-	int node, ret;
+	irq_domain_set_info(domain, virq, arg->dmar_id, info->chip, NULL,
+			    handle_edge_irq, arg->dmar_data, "edge");
 
-	/* Multiple MSI vectors only supported with interrupt remapping */
-	if (type == PCI_CAP_ID_MSI && nvec > 1)
-		return 1;
+	return 0;
+}
 
-	node = dev_to_node(&dev->dev);
+static struct msi_domain_ops dmar_msi_domain_ops = {
+	.get_hwirq	= dmar_msi_get_hwirq,
+	.msi_init	= dmar_msi_init,
+};
 
-	list_for_each_entry(msidesc, &dev->msi_list, list) {
-		irq = irq_alloc_hwirq(node);
-		if (!irq)
-			return -ENOSPC;
+static struct msi_domain_info dmar_msi_domain_info = {
+	.ops		= &dmar_msi_domain_ops,
+	.chip		= &dmar_msi_controller,
+};
 
-		ret = setup_msi_irq(dev, msidesc, irq, 0);
-		if (ret < 0) {
-			irq_free_hwirq(irq);
-			return ret;
-		}
+static struct irq_domain *dmar_get_irq_domain(void)
+{
+	static struct irq_domain *dmar_domain;
+	static DEFINE_MUTEX(dmar_lock);
 
-	}
-	return 0;
-}
+	mutex_lock(&dmar_lock);
+	if (dmar_domain == NULL)
+		dmar_domain = msi_create_irq_domain(NULL, &dmar_msi_domain_info,
+						    x86_vector_domain);
+	mutex_unlock(&dmar_lock);
 
-void native_teardown_msi_irq(unsigned int irq)
-{
-	irq_free_hwirq(irq);
+	return dmar_domain;
 }
 
-#ifdef CONFIG_DMAR_TABLE
-static int
-dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
-		      bool force)
+int dmar_alloc_hwirq(int id, int node, void *arg)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
-	unsigned int dest, irq = data->irq;
-	struct msi_msg msg;
-	int ret;
-
-	ret = apic_set_affinity(data, mask, &dest);
-	if (ret)
-		return ret;
+	struct irq_domain *domain = dmar_get_irq_domain();
+	struct irq_alloc_info info;
 
-	dmar_msi_read(irq, &msg);
+	if (!domain)
+		return -1;
 
-	msg.data &= ~MSI_DATA_VECTOR_MASK;
-	msg.data |= MSI_DATA_VECTOR(cfg->vector);
-	msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
-	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
-	msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest);
+	init_irq_alloc_info(&info, NULL);
+	info.type = X86_IRQ_ALLOC_TYPE_DMAR;
+	info.dmar_id = id;
+	info.dmar_data = arg;
 
-	dmar_msi_write(irq, &msg);
-
-	return IRQ_SET_MASK_OK_NOCOPY;
+	return irq_domain_alloc_irqs(domain, 1, node, &info);
 }
 
-static struct irq_chip dmar_msi_type = {
-	.name			= "DMAR_MSI",
-	.irq_unmask		= dmar_msi_unmask,
-	.irq_mask		= dmar_msi_mask,
-	.irq_ack		= apic_ack_edge,
-	.irq_set_affinity	= dmar_msi_set_affinity,
-	.irq_retrigger		= apic_retrigger_irq,
-	.flags			= IRQCHIP_SKIP_SET_WAKE,
-};
-
-int arch_setup_dmar_msi(unsigned int irq)
+void dmar_free_hwirq(int irq)
 {
-	int ret;
-	struct msi_msg msg;
-
-	ret = msi_compose_msg(NULL, irq, &msg, -1);
-	if (ret < 0)
-		return ret;
-	dmar_msi_write(irq, &msg);
-	irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
-				      "edge");
-	return 0;
+	irq_domain_free_irqs(irq, 1);
 }
 #endif
 
@@ -231,56 +255,103 @@ int arch_setup_dmar_msi(unsigned int irq)
  * MSI message composition
  */
 #ifdef CONFIG_HPET_TIMER
+static inline int hpet_dev_id(struct irq_domain *domain)
+{
+	struct msi_domain_info *info = msi_get_domain_info(domain);
+
+	return (int)(long)info->data;
+}
 
-static int hpet_msi_set_affinity(struct irq_data *data,
-				 const struct cpumask *mask, bool force)
+static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
-	struct msi_msg msg;
-	unsigned int dest;
-	int ret;
+	hpet_msi_write(data->handler_data, msg);
+}
 
-	ret = apic_set_affinity(data, mask, &dest);
-	if (ret)
-		return ret;
+static struct irq_chip hpet_msi_controller = {
+	.name = "HPET-MSI",
+	.irq_unmask = hpet_msi_unmask,
+	.irq_mask = hpet_msi_mask,
+	.irq_ack = irq_chip_ack_parent,
+	.irq_set_affinity = msi_domain_set_affinity,
+	.irq_retrigger = irq_chip_retrigger_hierarchy,
+	.irq_compose_msi_msg = irq_msi_compose_msg,
+	.irq_write_msi_msg = hpet_msi_write_msg,
+	.flags = IRQCHIP_SKIP_SET_WAKE,
+};
 
-	hpet_msi_read(data->handler_data, &msg);
+static irq_hw_number_t hpet_msi_get_hwirq(struct msi_domain_info *info,
+					  msi_alloc_info_t *arg)
+{
+	return arg->hpet_index;
+}
 
-	msg.data &= ~MSI_DATA_VECTOR_MASK;
-	msg.data |= MSI_DATA_VECTOR(cfg->vector);
-	msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
-	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+static int hpet_msi_init(struct irq_domain *domain,
+			 struct msi_domain_info *info, unsigned int virq,
+			 irq_hw_number_t hwirq, msi_alloc_info_t *arg)
+{
+	irq_set_status_flags(virq, IRQ_MOVE_PCNTXT);
+	irq_domain_set_info(domain, virq, arg->hpet_index, info->chip, NULL,
+			    handle_edge_irq, arg->hpet_data, "edge");
 
-	hpet_msi_write(data->handler_data, &msg);
+	return 0;
+}
 
-	return IRQ_SET_MASK_OK_NOCOPY;
+static void hpet_msi_free(struct irq_domain *domain,
+			  struct msi_domain_info *info, unsigned int virq)
+{
+	irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT);
 }
 
-static struct irq_chip hpet_msi_type = {
-	.name = "HPET_MSI",
-	.irq_unmask = hpet_msi_unmask,
-	.irq_mask = hpet_msi_mask,
-	.irq_ack = apic_ack_edge,
-	.irq_set_affinity = hpet_msi_set_affinity,
-	.irq_retrigger = apic_retrigger_irq,
-	.flags = IRQCHIP_SKIP_SET_WAKE,
+static struct msi_domain_ops hpet_msi_domain_ops = {
+	.get_hwirq	= hpet_msi_get_hwirq,
+	.msi_init	= hpet_msi_init,
+	.msi_free	= hpet_msi_free,
+};
+
+static struct msi_domain_info hpet_msi_domain_info = {
+	.ops		= &hpet_msi_domain_ops,
+	.chip		= &hpet_msi_controller,
 };
 
-int default_setup_hpet_msi(unsigned int irq, unsigned int id)
+struct irq_domain *hpet_create_irq_domain(int hpet_id)
 {
-	struct irq_chip *chip = &hpet_msi_type;
-	struct msi_msg msg;
-	int ret;
+	struct irq_domain *parent;
+	struct irq_alloc_info info;
+	struct msi_domain_info *domain_info;
+
+	if (x86_vector_domain == NULL)
+		return NULL;
+
+	domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL);
+	if (!domain_info)
+		return NULL;
+
+	*domain_info = hpet_msi_domain_info;
+	domain_info->data = (void *)(long)hpet_id;
+
+	init_irq_alloc_info(&info, NULL);
+	info.type = X86_IRQ_ALLOC_TYPE_HPET;
+	info.hpet_id = hpet_id;
+	parent = irq_remapping_get_ir_irq_domain(&info);
+	if (parent == NULL)
+		parent = x86_vector_domain;
+	else
+		hpet_msi_controller.name = "IR-HPET-MSI";
+
+	return msi_create_irq_domain(NULL, domain_info, parent);
+}
 
-	ret = msi_compose_msg(NULL, irq, &msg, id);
-	if (ret < 0)
-		return ret;
+int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev,
+		    int dev_num)
+{
+	struct irq_alloc_info info;
 
-	hpet_msi_write(irq_get_handler_data(irq), &msg);
-	irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
-	setup_remapped_irq(irq, irq_cfg(irq), chip);
+	init_irq_alloc_info(&info, NULL);
+	info.type = X86_IRQ_ALLOC_TYPE_HPET;
+	info.hpet_data = dev;
+	info.hpet_id = hpet_dev_id(domain);
+	info.hpet_index = dev_num;
 
-	irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
-	return 0;
+	return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info);
 }
 #endif
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 6cedd79145813c..28eba2d38b1570 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -3,6 +3,8 @@
  *
  * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
  *	Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu <jiang.liu@linux.intel.com>
+ *	Enable support of hierarchical irqdomains
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -11,15 +13,28 @@
 #include <linux/interrupt.h>
 #include <linux/init.h>
 #include <linux/compiler.h>
-#include <linux/irqdomain.h>
 #include <linux/slab.h>
+#include <asm/irqdomain.h>
 #include <asm/hw_irq.h>
 #include <asm/apic.h>
 #include <asm/i8259.h>
 #include <asm/desc.h>
 #include <asm/irq_remapping.h>
 
+struct apic_chip_data {
+	struct irq_cfg		cfg;
+	cpumask_var_t		domain;
+	cpumask_var_t		old_domain;
+	u8			move_in_progress : 1;
+};
+
+struct irq_domain *x86_vector_domain;
 static DEFINE_RAW_SPINLOCK(vector_lock);
+static cpumask_var_t vector_cpumask;
+static struct irq_chip lapic_controller;
+#ifdef	CONFIG_X86_IO_APIC
+static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY];
+#endif
 
 void lock_vector_lock(void)
 {
@@ -34,71 +49,59 @@ void unlock_vector_lock(void)
 	raw_spin_unlock(&vector_lock);
 }
 
-struct irq_cfg *irq_cfg(unsigned int irq)
+static struct apic_chip_data *apic_chip_data(struct irq_data *irq_data)
 {
-	return irq_get_chip_data(irq);
+	if (!irq_data)
+		return NULL;
+
+	while (irq_data->parent_data)
+		irq_data = irq_data->parent_data;
+
+	return irq_data->chip_data;
 }
 
 struct irq_cfg *irqd_cfg(struct irq_data *irq_data)
 {
-	return irq_data->chip_data;
+	struct apic_chip_data *data = apic_chip_data(irq_data);
+
+	return data ? &data->cfg : NULL;
 }
 
-static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
+struct irq_cfg *irq_cfg(unsigned int irq)
 {
-	struct irq_cfg *cfg;
+	return irqd_cfg(irq_get_irq_data(irq));
+}
 
-	cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
-	if (!cfg)
+static struct apic_chip_data *alloc_apic_chip_data(int node)
+{
+	struct apic_chip_data *data;
+
+	data = kzalloc_node(sizeof(*data), GFP_KERNEL, node);
+	if (!data)
 		return NULL;
-	if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node))
-		goto out_cfg;
-	if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node))
+	if (!zalloc_cpumask_var_node(&data->domain, GFP_KERNEL, node))
+		goto out_data;
+	if (!zalloc_cpumask_var_node(&data->old_domain, GFP_KERNEL, node))
 		goto out_domain;
-#ifdef	CONFIG_X86_IO_APIC
-	INIT_LIST_HEAD(&cfg->irq_2_pin);
-#endif
-	return cfg;
+	return data;
 out_domain:
-	free_cpumask_var(cfg->domain);
-out_cfg:
-	kfree(cfg);
+	free_cpumask_var(data->domain);
+out_data:
+	kfree(data);
 	return NULL;
 }
 
-struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
+static void free_apic_chip_data(struct apic_chip_data *data)
 {
-	int res = irq_alloc_desc_at(at, node);
-	struct irq_cfg *cfg;
-
-	if (res < 0) {
-		if (res != -EEXIST)
-			return NULL;
-		cfg = irq_cfg(at);
-		if (cfg)
-			return cfg;
+	if (data) {
+		free_cpumask_var(data->domain);
+		free_cpumask_var(data->old_domain);
+		kfree(data);
 	}
-
-	cfg = alloc_irq_cfg(at, node);
-	if (cfg)
-		irq_set_chip_data(at, cfg);
-	else
-		irq_free_desc(at);
-	return cfg;
-}
-
-static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
-{
-	if (!cfg)
-		return;
-	irq_set_chip_data(at, NULL);
-	free_cpumask_var(cfg->domain);
-	free_cpumask_var(cfg->old_domain);
-	kfree(cfg);
 }
 
-static int
-__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+static int __assign_irq_vector(int irq, struct apic_chip_data *d,
+			       const struct cpumask *mask)
 {
 	/*
 	 * NOTE! The local APIC isn't very good at handling
@@ -114,36 +117,33 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
 	static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
 	static int current_offset = VECTOR_OFFSET_START % 16;
 	int cpu, err;
-	cpumask_var_t tmp_mask;
 
-	if (cfg->move_in_progress)
+	if (d->move_in_progress)
 		return -EBUSY;
 
-	if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
-		return -ENOMEM;
-
 	/* Only try and allocate irqs on cpus that are present */
 	err = -ENOSPC;
-	cpumask_clear(cfg->old_domain);
+	cpumask_clear(d->old_domain);
 	cpu = cpumask_first_and(mask, cpu_online_mask);
 	while (cpu < nr_cpu_ids) {
 		int new_cpu, vector, offset;
 
-		apic->vector_allocation_domain(cpu, tmp_mask, mask);
+		apic->vector_allocation_domain(cpu, vector_cpumask, mask);
 
-		if (cpumask_subset(tmp_mask, cfg->domain)) {
+		if (cpumask_subset(vector_cpumask, d->domain)) {
 			err = 0;
-			if (cpumask_equal(tmp_mask, cfg->domain))
+			if (cpumask_equal(vector_cpumask, d->domain))
 				break;
 			/*
 			 * New cpumask using the vector is a proper subset of
 			 * the current in use mask. So cleanup the vector
 			 * allocation for the members that are not used anymore.
 			 */
-			cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask);
-			cfg->move_in_progress =
-			   cpumask_intersects(cfg->old_domain, cpu_online_mask);
-			cpumask_and(cfg->domain, cfg->domain, tmp_mask);
+			cpumask_andnot(d->old_domain, d->domain,
+				       vector_cpumask);
+			d->move_in_progress =
+			   cpumask_intersects(d->old_domain, cpu_online_mask);
+			cpumask_and(d->domain, d->domain, vector_cpumask);
 			break;
 		}
 
@@ -157,16 +157,18 @@ next:
 		}
 
 		if (unlikely(current_vector == vector)) {
-			cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask);
-			cpumask_andnot(tmp_mask, mask, cfg->old_domain);
-			cpu = cpumask_first_and(tmp_mask, cpu_online_mask);
+			cpumask_or(d->old_domain, d->old_domain,
+				   vector_cpumask);
+			cpumask_andnot(vector_cpumask, mask, d->old_domain);
+			cpu = cpumask_first_and(vector_cpumask,
+						cpu_online_mask);
 			continue;
 		}
 
 		if (test_bit(vector, used_vectors))
 			goto next;
 
-		for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) {
+		for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) {
 			if (per_cpu(vector_irq, new_cpu)[vector] >
 			    VECTOR_UNDEFINED)
 				goto next;
@@ -174,55 +176,73 @@ next:
 		/* Found one! */
 		current_vector = vector;
 		current_offset = offset;
-		if (cfg->vector) {
-			cpumask_copy(cfg->old_domain, cfg->domain);
-			cfg->move_in_progress =
-			   cpumask_intersects(cfg->old_domain, cpu_online_mask);
+		if (d->cfg.vector) {
+			cpumask_copy(d->old_domain, d->domain);
+			d->move_in_progress =
+			   cpumask_intersects(d->old_domain, cpu_online_mask);
 		}
-		for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
+		for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask)
 			per_cpu(vector_irq, new_cpu)[vector] = irq;
-		cfg->vector = vector;
-		cpumask_copy(cfg->domain, tmp_mask);
+		d->cfg.vector = vector;
+		cpumask_copy(d->domain, vector_cpumask);
 		err = 0;
 		break;
 	}
-	free_cpumask_var(tmp_mask);
+
+	if (!err) {
+		/* cache destination APIC IDs into cfg->dest_apicid */
+		err = apic->cpu_mask_to_apicid_and(mask, d->domain,
+						   &d->cfg.dest_apicid);
+	}
 
 	return err;
 }
 
-int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+static int assign_irq_vector(int irq, struct apic_chip_data *data,
+			     const struct cpumask *mask)
 {
 	int err;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&vector_lock, flags);
-	err = __assign_irq_vector(irq, cfg, mask);
+	err = __assign_irq_vector(irq, data, mask);
 	raw_spin_unlock_irqrestore(&vector_lock, flags);
 	return err;
 }
 
-void clear_irq_vector(int irq, struct irq_cfg *cfg)
+static int assign_irq_vector_policy(int irq, int node,
+				    struct apic_chip_data *data,
+				    struct irq_alloc_info *info)
+{
+	if (info && info->mask)
+		return assign_irq_vector(irq, data, info->mask);
+	if (node != NUMA_NO_NODE &&
+	    assign_irq_vector(irq, data, cpumask_of_node(node)) == 0)
+		return 0;
+	return assign_irq_vector(irq, data, apic->target_cpus());
+}
+
+static void clear_irq_vector(int irq, struct apic_chip_data *data)
 {
 	int cpu, vector;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&vector_lock, flags);
-	BUG_ON(!cfg->vector);
+	BUG_ON(!data->cfg.vector);
 
-	vector = cfg->vector;
-	for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
+	vector = data->cfg.vector;
+	for_each_cpu_and(cpu, data->domain, cpu_online_mask)
 		per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
 
-	cfg->vector = 0;
-	cpumask_clear(cfg->domain);
+	data->cfg.vector = 0;
+	cpumask_clear(data->domain);
 
-	if (likely(!cfg->move_in_progress)) {
+	if (likely(!data->move_in_progress)) {
 		raw_spin_unlock_irqrestore(&vector_lock, flags);
 		return;
 	}
 
-	for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
+	for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) {
 		for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
 		     vector++) {
 			if (per_cpu(vector_irq, cpu)[vector] != irq)
@@ -231,10 +251,95 @@ void clear_irq_vector(int irq, struct irq_cfg *cfg)
 			break;
 		}
 	}
-	cfg->move_in_progress = 0;
+	data->move_in_progress = 0;
 	raw_spin_unlock_irqrestore(&vector_lock, flags);
 }
 
+void init_irq_alloc_info(struct irq_alloc_info *info,
+			 const struct cpumask *mask)
+{
+	memset(info, 0, sizeof(*info));
+	info->mask = mask;
+}
+
+void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src)
+{
+	if (src)
+		*dst = *src;
+	else
+		memset(dst, 0, sizeof(*dst));
+}
+
+static void x86_vector_free_irqs(struct irq_domain *domain,
+				 unsigned int virq, unsigned int nr_irqs)
+{
+	struct irq_data *irq_data;
+	int i;
+
+	for (i = 0; i < nr_irqs; i++) {
+		irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i);
+		if (irq_data && irq_data->chip_data) {
+			clear_irq_vector(virq + i, irq_data->chip_data);
+			free_apic_chip_data(irq_data->chip_data);
+#ifdef	CONFIG_X86_IO_APIC
+			if (virq + i < nr_legacy_irqs())
+				legacy_irq_data[virq + i] = NULL;
+#endif
+			irq_domain_reset_irq_data(irq_data);
+		}
+	}
+}
+
+static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
+				 unsigned int nr_irqs, void *arg)
+{
+	struct irq_alloc_info *info = arg;
+	struct apic_chip_data *data;
+	struct irq_data *irq_data;
+	int i, err;
+
+	if (disable_apic)
+		return -ENXIO;
+
+	/* Currently vector allocator can't guarantee contiguous allocations */
+	if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1)
+		return -ENOSYS;
+
+	for (i = 0; i < nr_irqs; i++) {
+		irq_data = irq_domain_get_irq_data(domain, virq + i);
+		BUG_ON(!irq_data);
+#ifdef	CONFIG_X86_IO_APIC
+		if (virq + i < nr_legacy_irqs() && legacy_irq_data[virq + i])
+			data = legacy_irq_data[virq + i];
+		else
+#endif
+			data = alloc_apic_chip_data(irq_data->node);
+		if (!data) {
+			err = -ENOMEM;
+			goto error;
+		}
+
+		irq_data->chip = &lapic_controller;
+		irq_data->chip_data = data;
+		irq_data->hwirq = virq + i;
+		err = assign_irq_vector_policy(virq, irq_data->node, data,
+					       info);
+		if (err)
+			goto error;
+	}
+
+	return 0;
+
+error:
+	x86_vector_free_irqs(domain, virq, i + 1);
+	return err;
+}
+
+static const struct irq_domain_ops x86_vector_domain_ops = {
+	.alloc	= x86_vector_alloc_irqs,
+	.free	= x86_vector_free_irqs,
+};
+
 int __init arch_probe_nr_irqs(void)
 {
 	int nr;
@@ -258,8 +363,43 @@ int __init arch_probe_nr_irqs(void)
 	return nr_legacy_irqs();
 }
 
+#ifdef	CONFIG_X86_IO_APIC
+static void init_legacy_irqs(void)
+{
+	int i, node = cpu_to_node(0);
+	struct apic_chip_data *data;
+
+	/*
+	 * For legacy IRQ's, start with assigning irq0 to irq15 to
+	 * ISA_IRQ_VECTOR(i) for all cpu's.
+	 */
+	for (i = 0; i < nr_legacy_irqs(); i++) {
+		data = legacy_irq_data[i] = alloc_apic_chip_data(node);
+		BUG_ON(!data);
+
+		data->cfg.vector = ISA_IRQ_VECTOR(i);
+		cpumask_setall(data->domain);
+		irq_set_chip_data(i, data);
+	}
+}
+#else
+static void init_legacy_irqs(void) { }
+#endif
+
 int __init arch_early_irq_init(void)
 {
+	init_legacy_irqs();
+
+	x86_vector_domain = irq_domain_add_tree(NULL, &x86_vector_domain_ops,
+						NULL);
+	BUG_ON(x86_vector_domain == NULL);
+	irq_set_default_host(x86_vector_domain);
+
+	arch_init_msi_domain(x86_vector_domain);
+	arch_init_htirq_domain(x86_vector_domain);
+
+	BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL));
+
 	return arch_early_ioapic_init();
 }
 
@@ -267,7 +407,7 @@ static void __setup_vector_irq(int cpu)
 {
 	/* Initialize vector_irq on a new cpu */
 	int irq, vector;
-	struct irq_cfg *cfg;
+	struct apic_chip_data *data;
 
 	/*
 	 * vector_lock will make sure that we don't run into irq vector
@@ -277,13 +417,13 @@ static void __setup_vector_irq(int cpu)
 	raw_spin_lock(&vector_lock);
 	/* Mark the inuse vectors */
 	for_each_active_irq(irq) {
-		cfg = irq_cfg(irq);
-		if (!cfg)
+		data = apic_chip_data(irq_get_irq_data(irq));
+		if (!data)
 			continue;
 
-		if (!cpumask_test_cpu(cpu, cfg->domain))
+		if (!cpumask_test_cpu(cpu, data->domain))
 			continue;
-		vector = cfg->vector;
+		vector = data->cfg.vector;
 		per_cpu(vector_irq, cpu)[vector] = irq;
 	}
 	/* Mark the free vectors */
@@ -292,8 +432,8 @@ static void __setup_vector_irq(int cpu)
 		if (irq <= VECTOR_UNDEFINED)
 			continue;
 
-		cfg = irq_cfg(irq);
-		if (!cpumask_test_cpu(cpu, cfg->domain))
+		data = apic_chip_data(irq_get_irq_data(irq));
+		if (!cpumask_test_cpu(cpu, data->domain))
 			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
 	}
 	raw_spin_unlock(&vector_lock);
@@ -314,20 +454,20 @@ void setup_vector_irq(int cpu)
 	 * legacy vector to irq mapping:
 	 */
 	for (irq = 0; irq < nr_legacy_irqs(); irq++)
-		per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq;
+		per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq;
 
 	__setup_vector_irq(cpu);
 }
 
-int apic_retrigger_irq(struct irq_data *data)
+static int apic_retrigger_irq(struct irq_data *irq_data)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
+	struct apic_chip_data *data = apic_chip_data(irq_data);
 	unsigned long flags;
 	int cpu;
 
 	raw_spin_lock_irqsave(&vector_lock, flags);
-	cpu = cpumask_first_and(cfg->domain, cpu_online_mask);
-	apic->send_IPI_mask(cpumask_of(cpu), cfg->vector);
+	cpu = cpumask_first_and(data->domain, cpu_online_mask);
+	apic->send_IPI_mask(cpumask_of(cpu), data->cfg.vector);
 	raw_spin_unlock_irqrestore(&vector_lock, flags);
 
 	return 1;
@@ -340,73 +480,76 @@ void apic_ack_edge(struct irq_data *data)
 	ack_APIC_irq();
 }
 
-/*
- * Either sets data->affinity to a valid value, and returns
- * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
- * leaves data->affinity untouched.
- */
-int apic_set_affinity(struct irq_data *data, const struct cpumask *mask,
-		      unsigned int *dest_id)
+static int apic_set_affinity(struct irq_data *irq_data,
+			     const struct cpumask *dest, bool force)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
-	unsigned int irq = data->irq;
-	int err;
+	struct apic_chip_data *data = irq_data->chip_data;
+	int err, irq = irq_data->irq;
 
 	if (!config_enabled(CONFIG_SMP))
 		return -EPERM;
 
-	if (!cpumask_intersects(mask, cpu_online_mask))
+	if (!cpumask_intersects(dest, cpu_online_mask))
 		return -EINVAL;
 
-	err = assign_irq_vector(irq, cfg, mask);
-	if (err)
-		return err;
-
-	err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id);
+	err = assign_irq_vector(irq, data, dest);
 	if (err) {
-		if (assign_irq_vector(irq, cfg, data->affinity))
+		struct irq_data *top = irq_get_irq_data(irq);
+
+		if (assign_irq_vector(irq, data, top->affinity))
 			pr_err("Failed to recover vector for irq %d\n", irq);
 		return err;
 	}
 
-	cpumask_copy(data->affinity, mask);
-
-	return 0;
+	return IRQ_SET_MASK_OK;
 }
 
+static struct irq_chip lapic_controller = {
+	.irq_ack		= apic_ack_edge,
+	.irq_set_affinity	= apic_set_affinity,
+	.irq_retrigger		= apic_retrigger_irq,
+};
+
 #ifdef CONFIG_SMP
-void send_cleanup_vector(struct irq_cfg *cfg)
+static void __send_cleanup_vector(struct apic_chip_data *data)
 {
 	cpumask_var_t cleanup_mask;
 
 	if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
 		unsigned int i;
 
-		for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+		for_each_cpu_and(i, data->old_domain, cpu_online_mask)
 			apic->send_IPI_mask(cpumask_of(i),
 					    IRQ_MOVE_CLEANUP_VECTOR);
 	} else {
-		cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+		cpumask_and(cleanup_mask, data->old_domain, cpu_online_mask);
 		apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
 		free_cpumask_var(cleanup_mask);
 	}
-	cfg->move_in_progress = 0;
+	data->move_in_progress = 0;
+}
+
+void send_cleanup_vector(struct irq_cfg *cfg)
+{
+	struct apic_chip_data *data;
+
+	data = container_of(cfg, struct apic_chip_data, cfg);
+	if (data->move_in_progress)
+		__send_cleanup_vector(data);
 }
 
 asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
 {
 	unsigned vector, me;
 
-	ack_APIC_irq();
-	irq_enter();
-	exit_idle();
+	entering_ack_irq();
 
 	me = smp_processor_id();
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
 		int irq;
 		unsigned int irr;
 		struct irq_desc *desc;
-		struct irq_cfg *cfg;
+		struct apic_chip_data *data;
 
 		irq = __this_cpu_read(vector_irq[vector]);
 
@@ -417,8 +560,8 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
 		if (!desc)
 			continue;
 
-		cfg = irq_cfg(irq);
-		if (!cfg)
+		data = apic_chip_data(&desc->irq_data);
+		if (!data)
 			continue;
 
 		raw_spin_lock(&desc->lock);
@@ -427,10 +570,11 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
 		 * Check if the irq migration is in progress. If so, we
 		 * haven't received the cleanup request yet for this irq.
 		 */
-		if (cfg->move_in_progress)
+		if (data->move_in_progress)
 			goto unlock;
 
-		if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
+		if (vector == data->cfg.vector &&
+		    cpumask_test_cpu(me, data->domain))
 			goto unlock;
 
 		irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
@@ -450,20 +594,21 @@ unlock:
 		raw_spin_unlock(&desc->lock);
 	}
 
-	irq_exit();
+	exiting_irq();
 }
 
 static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
 {
 	unsigned me;
+	struct apic_chip_data *data;
 
-	if (likely(!cfg->move_in_progress))
+	data = container_of(cfg, struct apic_chip_data, cfg);
+	if (likely(!data->move_in_progress))
 		return;
 
 	me = smp_processor_id();
-
-	if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
-		send_cleanup_vector(cfg);
+	if (vector == data->cfg.vector && cpumask_test_cpu(me, data->domain))
+		__send_cleanup_vector(data);
 }
 
 void irq_complete_move(struct irq_cfg *cfg)
@@ -475,46 +620,11 @@ void irq_force_complete_move(int irq)
 {
 	struct irq_cfg *cfg = irq_cfg(irq);
 
-	if (!cfg)
-		return;
-
-	__irq_complete_move(cfg, cfg->vector);
+	if (cfg)
+		__irq_complete_move(cfg, cfg->vector);
 }
 #endif
 
-/*
- * Dynamic irq allocate and deallocation. Should be replaced by irq domains!
- */
-int arch_setup_hwirq(unsigned int irq, int node)
-{
-	struct irq_cfg *cfg;
-	unsigned long flags;
-	int ret;
-
-	cfg = alloc_irq_cfg(irq, node);
-	if (!cfg)
-		return -ENOMEM;
-
-	raw_spin_lock_irqsave(&vector_lock, flags);
-	ret = __assign_irq_vector(irq, cfg, apic->target_cpus());
-	raw_spin_unlock_irqrestore(&vector_lock, flags);
-
-	if (!ret)
-		irq_set_chip_data(irq, cfg);
-	else
-		free_irq_cfg(irq, cfg);
-	return ret;
-}
-
-void arch_teardown_hwirq(unsigned int irq)
-{
-	struct irq_cfg *cfg = irq_cfg(irq);
-
-	free_remapped_irq(irq);
-	clear_irq_vector(irq, cfg);
-	free_irq_cfg(irq, cfg);
-}
-
 static void __init print_APIC_field(int base)
 {
 	int i;
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 6fae733e919489..3ffd925655e0d4 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -21,11 +21,13 @@ early_param("x2apic_phys", set_x2apic_phys_mode);
 
 static bool x2apic_fadt_phys(void)
 {
+#ifdef CONFIG_ACPI
 	if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) &&
 		(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
 		printk(KERN_DEBUG "System requires x2apic physical mode\n");
 		return true;
 	}
+#endif
 	return false;
 }
 
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 9f6b9341950f78..8e3d22a1af9409 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -41,6 +41,25 @@ void common(void) {
 	OFFSET(pbe_orig_address, pbe, orig_address);
 	OFFSET(pbe_next, pbe, next);
 
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+	BLANK();
+	OFFSET(IA32_SIGCONTEXT_ax, sigcontext_ia32, ax);
+	OFFSET(IA32_SIGCONTEXT_bx, sigcontext_ia32, bx);
+	OFFSET(IA32_SIGCONTEXT_cx, sigcontext_ia32, cx);
+	OFFSET(IA32_SIGCONTEXT_dx, sigcontext_ia32, dx);
+	OFFSET(IA32_SIGCONTEXT_si, sigcontext_ia32, si);
+	OFFSET(IA32_SIGCONTEXT_di, sigcontext_ia32, di);
+	OFFSET(IA32_SIGCONTEXT_bp, sigcontext_ia32, bp);
+	OFFSET(IA32_SIGCONTEXT_sp, sigcontext_ia32, sp);
+	OFFSET(IA32_SIGCONTEXT_ip, sigcontext_ia32, ip);
+
+	BLANK();
+	OFFSET(TI_sysenter_return, thread_info, sysenter_return);
+
+	BLANK();
+	OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
+#endif
+
 #ifdef CONFIG_PARAVIRT
 	BLANK();
 	OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
@@ -49,7 +68,9 @@ void common(void) {
 	OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
 	OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
 	OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
+#ifdef CONFIG_X86_32
 	OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
+#endif
 	OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
 	OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
 #endif
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 47703aed74cfb7..6ce39025f467fb 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -17,17 +17,6 @@ void foo(void);
 
 void foo(void)
 {
-	OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax);
-	OFFSET(IA32_SIGCONTEXT_bx, sigcontext, bx);
-	OFFSET(IA32_SIGCONTEXT_cx, sigcontext, cx);
-	OFFSET(IA32_SIGCONTEXT_dx, sigcontext, dx);
-	OFFSET(IA32_SIGCONTEXT_si, sigcontext, si);
-	OFFSET(IA32_SIGCONTEXT_di, sigcontext, di);
-	OFFSET(IA32_SIGCONTEXT_bp, sigcontext, bp);
-	OFFSET(IA32_SIGCONTEXT_sp, sigcontext, sp);
-	OFFSET(IA32_SIGCONTEXT_ip, sigcontext, ip);
-	BLANK();
-
 	OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
 	OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor);
 	OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model);
@@ -37,10 +26,6 @@ void foo(void)
 	OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
 	BLANK();
 
-	OFFSET(TI_sysenter_return, thread_info, sysenter_return);
-	OFFSET(TI_cpu, thread_info, cpu);
-	BLANK();
-
 	OFFSET(PT_EBX, pt_regs, bx);
 	OFFSET(PT_ECX, pt_regs, cx);
 	OFFSET(PT_EDX, pt_regs, dx);
@@ -60,9 +45,6 @@ void foo(void)
 	OFFSET(PT_OLDSS,  pt_regs, ss);
 	BLANK();
 
-	OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
-	BLANK();
-
 	OFFSET(saved_context_gdt_desc, saved_context, gdt_desc);
 	BLANK();
 
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 5ce6f2da87639c..dcaab87da62980 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -29,27 +29,6 @@ int main(void)
 	BLANK();
 #endif
 
-#ifdef CONFIG_IA32_EMULATION
-	OFFSET(TI_sysenter_return, thread_info, sysenter_return);
-	BLANK();
-
-#define ENTRY(entry) OFFSET(IA32_SIGCONTEXT_ ## entry, sigcontext_ia32, entry)
-	ENTRY(ax);
-	ENTRY(bx);
-	ENTRY(cx);
-	ENTRY(dx);
-	ENTRY(si);
-	ENTRY(di);
-	ENTRY(bp);
-	ENTRY(sp);
-	ENTRY(ip);
-	BLANK();
-#undef ENTRY
-
-	OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
-	BLANK();
-#endif
-
 #define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry)
 	ENTRY(bx);
 	ENTRY(cx);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e4cf63301ff439..94e7051fba1a3f 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -520,8 +520,16 @@ static void early_init_amd(struct cpuinfo_x86 *c)
 			set_cpu_cap(c, X86_FEATURE_K6_MTRR);
 #endif
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
-	/* check CPU config space for extended APIC ID */
-	if (cpu_has_apic && c->x86 >= 0xf) {
+	/*
+	 * ApicID can always be treated as an 8-bit value for AMD APIC versions
+	 * >= 0x10, but even old K8s came out of reset with version 0x10. So, we
+	 * can safely set X86_FEATURE_EXTD_APICID unconditionally for families
+	 * after 16h.
+	 */
+	if (cpu_has_apic && c->x86 > 0x16) {
+		set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+	} else if (cpu_has_apic && c->x86 >= 0xf) {
+		/* check CPU config space for extended APIC ID */
 		unsigned int val;
 		val = read_pci_config(0, 24, 0, 0x68);
 		if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 03445346ee0aae..bd17db15a2c1ef 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -12,57 +12,11 @@
 #include <asm/bugs.h>
 #include <asm/processor.h>
 #include <asm/processor-flags.h>
-#include <asm/i387.h>
+#include <asm/fpu/internal.h>
 #include <asm/msr.h>
 #include <asm/paravirt.h>
 #include <asm/alternative.h>
 
-static double __initdata x = 4195835.0;
-static double __initdata y = 3145727.0;
-
-/*
- * This used to check for exceptions..
- * However, it turns out that to support that,
- * the XMM trap handlers basically had to
- * be buggy. So let's have a correct XMM trap
- * handler, and forget about printing out
- * some status at boot.
- *
- * We should really only care about bugs here
- * anyway. Not features.
- */
-static void __init check_fpu(void)
-{
-	s32 fdiv_bug;
-
-	kernel_fpu_begin();
-
-	/*
-	 * trap_init() enabled FXSR and company _before_ testing for FP
-	 * problems here.
-	 *
-	 * Test for the divl bug: http://en.wikipedia.org/wiki/Fdiv_bug
-	 */
-	__asm__("fninit\n\t"
-		"fldl %1\n\t"
-		"fdivl %2\n\t"
-		"fmull %2\n\t"
-		"fldl %1\n\t"
-		"fsubp %%st,%%st(1)\n\t"
-		"fistpl %0\n\t"
-		"fwait\n\t"
-		"fninit"
-		: "=m" (*&fdiv_bug)
-		: "m" (*&x), "m" (*&y));
-
-	kernel_fpu_end();
-
-	if (fdiv_bug) {
-		set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV);
-		pr_warn("Hmm, FPU with FDIV bug\n");
-	}
-}
-
 void __init check_bugs(void)
 {
 	identify_boot_cpu();
@@ -85,10 +39,5 @@ void __init check_bugs(void)
 		'0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
 	alternative_instructions();
 
-	/*
-	 * kernel_fpu_begin/end() in check_fpu() relies on the patched
-	 * alternative instructions.
-	 */
-	if (cpu_has_fpu)
-		check_fpu();
+	fpu__init_check_bugs();
 }
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a62cf04dac8ae9..1b4ce4c7d2a865 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -5,6 +5,7 @@
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/string.h>
+#include <linux/ctype.h>
 #include <linux/delay.h>
 #include <linux/sched.h>
 #include <linux/init.h>
@@ -31,8 +32,7 @@
 #include <asm/setup.h>
 #include <asm/apic.h>
 #include <asm/desc.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
 #include <asm/mtrr.h>
 #include <linux/numa.h>
 #include <asm/asm.h>
@@ -145,33 +145,6 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 } };
 EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
 
-static int __init x86_xsave_setup(char *s)
-{
-	if (strlen(s))
-		return 0;
-	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
-	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
-	setup_clear_cpu_cap(X86_FEATURE_AVX);
-	setup_clear_cpu_cap(X86_FEATURE_AVX2);
-	return 1;
-}
-__setup("noxsave", x86_xsave_setup);
-
-static int __init x86_xsaveopt_setup(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-	return 1;
-}
-__setup("noxsaveopt", x86_xsaveopt_setup);
-
-static int __init x86_xsaves_setup(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
-	return 1;
-}
-__setup("noxsaves", x86_xsaves_setup);
-
 #ifdef CONFIG_X86_32
 static int cachesize_override = -1;
 static int disable_x86_serial_nr = 1;
@@ -183,14 +156,6 @@ static int __init cachesize_setup(char *str)
 }
 __setup("cachesize=", cachesize_setup);
 
-static int __init x86_fxsr_setup(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_FXSR);
-	setup_clear_cpu_cap(X86_FEATURE_XMM);
-	return 1;
-}
-__setup("nofxsr", x86_fxsr_setup);
-
 static int __init x86_sep_setup(char *s)
 {
 	setup_clear_cpu_cap(X86_FEATURE_SEP);
@@ -419,7 +384,7 @@ static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
 static void get_model_name(struct cpuinfo_x86 *c)
 {
 	unsigned int *v;
-	char *p, *q;
+	char *p, *q, *s;
 
 	if (c->extended_cpuid_level < 0x80000004)
 		return;
@@ -430,19 +395,21 @@ static void get_model_name(struct cpuinfo_x86 *c)
 	cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
 	c->x86_model_id[48] = 0;
 
-	/*
-	 * Intel chips right-justify this string for some dumb reason;
-	 * undo that brain damage:
-	 */
-	p = q = &c->x86_model_id[0];
+	/* Trim whitespace */
+	p = q = s = &c->x86_model_id[0];
+
 	while (*p == ' ')
 		p++;
-	if (p != q) {
-		while (*p)
-			*q++ = *p++;
-		while (q <= &c->x86_model_id[48])
-			*q++ = '\0';	/* Zero-pad the rest */
+
+	while (*p) {
+		/* Note the last non-whitespace index */
+		if (!isspace(*p))
+			s = q;
+
+		*q++ = *p++;
 	}
+
+	*(s + 1) = '\0';
 }
 
 void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
@@ -759,7 +726,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 	cpu_detect(c);
 	get_cpu_vendor(c);
 	get_cpu_cap(c);
-	fpu_detect(c);
+	fpu__init_system(c);
 
 	if (this_cpu->c_early_init)
 		this_cpu->c_early_init(c);
@@ -1122,7 +1089,7 @@ void print_cpu_info(struct cpuinfo_x86 *c)
 		printk(KERN_CONT "%s ", vendor);
 
 	if (c->x86_model_id[0])
-		printk(KERN_CONT "%s", strim(c->x86_model_id));
+		printk(KERN_CONT "%s", c->x86_model_id);
 	else
 		printk(KERN_CONT "%d86", c->x86);
 
@@ -1155,10 +1122,6 @@ static __init int setup_disablecpuid(char *arg)
 }
 __setup("clearcpuid=", setup_disablecpuid);
 
-DEFINE_PER_CPU(unsigned long, kernel_stack) =
-	(unsigned long)&init_thread_union + THREAD_SIZE;
-EXPORT_PER_CPU_SYMBOL(kernel_stack);
-
 #ifdef CONFIG_X86_64
 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
 struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1,
@@ -1183,8 +1146,6 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
 
-DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
-
 /*
  * Special IST stacks which the CPU switches to when it calls
  * an IST-marked descriptor entry. Up to 7 stacks (hardware
@@ -1275,7 +1236,6 @@ DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
 EXPORT_PER_CPU_SYMBOL(current_task);
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
-DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
 
 /*
  * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
@@ -1439,7 +1399,7 @@ void cpu_init(void)
 	clear_all_debug_regs();
 	dbg_restore_debug_regs();
 
-	fpu_init();
+	fpu__init_cpu();
 
 	if (is_uv_system())
 		uv_cpu_init();
@@ -1495,7 +1455,7 @@ void cpu_init(void)
 	clear_all_debug_regs();
 	dbg_restore_debug_regs();
 
-	fpu_init();
+	fpu__init_cpu();
 }
 #endif
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 20190bdac9d58e..0868c193e50a22 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -53,9 +53,12 @@
 static DEFINE_MUTEX(mce_chrdev_read_mutex);
 
 #define rcu_dereference_check_mce(p) \
-	rcu_dereference_index_check((p), \
-			      rcu_read_lock_sched_held() || \
-			      lockdep_is_held(&mce_chrdev_read_mutex))
+({ \
+	rcu_lockdep_assert(rcu_read_lock_sched_held() || \
+			   lockdep_is_held(&mce_chrdev_read_mutex), \
+			   "suspicious rcu_dereference_check_mce() usage"); \
+	smp_load_acquire(&(p)); \
+})
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/mce.h>
@@ -1640,10 +1643,16 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 		mce_intel_feature_init(c);
 		mce_adjust_timer = cmci_intel_adjust_timer;
 		break;
-	case X86_VENDOR_AMD:
+
+	case X86_VENDOR_AMD: {
+		u32 ebx = cpuid_ebx(0x80000007);
+
 		mce_amd_feature_init(c);
-		mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
+		mce_flags.overflow_recov = !!(ebx & BIT(0));
+		mce_flags.succor	 = !!(ebx & BIT(1));
 		break;
+		}
+
 	default:
 		break;
 	}
@@ -1887,7 +1896,7 @@ out:
 static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
 {
 	poll_wait(file, &mce_chrdev_wait, wait);
-	if (rcu_access_index(mcelog.next))
+	if (READ_ONCE(mcelog.next))
 		return POLLIN | POLLRDNORM;
 	if (!mce_apei_read_done && apei_check_mce())
 		return POLLIN | POLLRDNORM;
@@ -1932,8 +1941,8 @@ void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
 }
 EXPORT_SYMBOL_GPL(register_mce_write_callback);
 
-ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
-			 size_t usize, loff_t *off)
+static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
+				size_t usize, loff_t *off)
 {
 	if (mce_write)
 		return mce_write(filp, ubuf, usize, off);
@@ -2011,11 +2020,8 @@ static int __init mcheck_enable(char *str)
 	else if (!strcmp(str, "bios_cmci_threshold"))
 		cfg->bios_cmci_threshold = true;
 	else if (isdigit(str[0])) {
-		get_option(&str, &(cfg->tolerant));
-		if (*str == ',') {
-			++str;
+		if (get_option(&str, &cfg->tolerant) == 2)
 			get_option(&str, &(cfg->monarch_timeout));
-		}
 	} else {
 		pr_info("mce argument %s ignored. Please use /sys\n", str);
 		return 0;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 55ad9b37cae853..e99b15077e9464 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -1,19 +1,13 @@
 /*
- *  (c) 2005-2012 Advanced Micro Devices, Inc.
+ *  (c) 2005-2015 Advanced Micro Devices, Inc.
  *  Your use of this code is subject to the terms and conditions of the
  *  GNU general public license version 2. See "COPYING" or
  *  http://www.gnu.org/licenses/gpl.html
  *
  *  Written by Jacob Shin - AMD, Inc.
- *
  *  Maintained by: Borislav Petkov <bp@alien8.de>
  *
- *  April 2006
- *     - added support for AMD Family 0x10 processors
- *  May 2012
- *     - major scrubbing
- *
- *  All MC4_MISCi registers are shared between multi-cores
+ *  All MC4_MISCi registers are shared between cores on a node.
  */
 #include <linux/interrupt.h>
 #include <linux/notifier.h>
@@ -32,6 +26,7 @@
 #include <asm/idle.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
+#include <asm/trace/irq_vectors.h>
 
 #define NR_BLOCKS         9
 #define THRESHOLD_MAX     0xFFF
@@ -47,6 +42,13 @@
 #define MASK_BLKPTR_LO    0xFF000000
 #define MCG_XBLK_ADDR     0xC0000400
 
+/* Deferred error settings */
+#define MSR_CU_DEF_ERR		0xC0000410
+#define MASK_DEF_LVTOFF		0x000000F0
+#define MASK_DEF_INT_TYPE	0x00000006
+#define DEF_LVT_OFF		0x2
+#define DEF_INT_TYPE_APIC	0x2
+
 static const char * const th_names[] = {
 	"load_store",
 	"insn_fetch",
@@ -60,6 +62,13 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
 static DEFINE_PER_CPU(unsigned char, bank_map);	/* see which banks are on */
 
 static void amd_threshold_interrupt(void);
+static void amd_deferred_error_interrupt(void);
+
+static void default_deferred_error_interrupt(void)
+{
+	pr_err("Unexpected deferred interrupt at vector %x\n", DEFERRED_ERROR_VECTOR);
+}
+void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
 
 /*
  * CPU Initialization
@@ -196,7 +205,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset)
 	threshold_restart_bank(&tr);
 };
 
-static int setup_APIC_mce(int reserved, int new)
+static int setup_APIC_mce_threshold(int reserved, int new)
 {
 	if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
 					      APIC_EILVT_MSG_FIX, 0))
@@ -205,6 +214,39 @@ static int setup_APIC_mce(int reserved, int new)
 	return reserved;
 }
 
+static int setup_APIC_deferred_error(int reserved, int new)
+{
+	if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR,
+					      APIC_EILVT_MSG_FIX, 0))
+		return new;
+
+	return reserved;
+}
+
+static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
+{
+	u32 low = 0, high = 0;
+	int def_offset = -1, def_new;
+
+	if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high))
+		return;
+
+	def_new = (low & MASK_DEF_LVTOFF) >> 4;
+	if (!(low & MASK_DEF_LVTOFF)) {
+		pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n");
+		def_new = DEF_LVT_OFF;
+		low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4);
+	}
+
+	def_offset = setup_APIC_deferred_error(def_offset, def_new);
+	if ((def_offset == def_new) &&
+	    (deferred_error_int_vector != amd_deferred_error_interrupt))
+		deferred_error_int_vector = amd_deferred_error_interrupt;
+
+	low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;
+	wrmsr(MSR_CU_DEF_ERR, low, high);
+}
+
 /* cpu init entry point, called from mce.c with preempt off */
 void mce_amd_feature_init(struct cpuinfo_x86 *c)
 {
@@ -252,7 +294,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
 
 			b.interrupt_enable = 1;
 			new	= (high & MASK_LVTOFF_HI) >> 20;
-			offset  = setup_APIC_mce(offset, new);
+			offset  = setup_APIC_mce_threshold(offset, new);
 
 			if ((offset == new) &&
 			    (mce_threshold_vector != amd_threshold_interrupt))
@@ -262,6 +304,73 @@ init:
 			mce_threshold_block_init(&b, offset);
 		}
 	}
+
+	if (mce_flags.succor)
+		deferred_error_interrupt_enable(c);
+}
+
+static void __log_error(unsigned int bank, bool threshold_err, u64 misc)
+{
+	struct mce m;
+	u64 status;
+
+	rdmsrl(MSR_IA32_MCx_STATUS(bank), status);
+	if (!(status & MCI_STATUS_VAL))
+		return;
+
+	mce_setup(&m);
+
+	m.status = status;
+	m.bank = bank;
+
+	if (threshold_err)
+		m.misc = misc;
+
+	if (m.status & MCI_STATUS_ADDRV)
+		rdmsrl(MSR_IA32_MCx_ADDR(bank), m.addr);
+
+	mce_log(&m);
+	wrmsrl(MSR_IA32_MCx_STATUS(bank), 0);
+}
+
+static inline void __smp_deferred_error_interrupt(void)
+{
+	inc_irq_stat(irq_deferred_error_count);
+	deferred_error_int_vector();
+}
+
+asmlinkage __visible void smp_deferred_error_interrupt(void)
+{
+	entering_irq();
+	__smp_deferred_error_interrupt();
+	exiting_ack_irq();
+}
+
+asmlinkage __visible void smp_trace_deferred_error_interrupt(void)
+{
+	entering_irq();
+	trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
+	__smp_deferred_error_interrupt();
+	trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR);
+	exiting_ack_irq();
+}
+
+/* APIC interrupt handler for deferred errors */
+static void amd_deferred_error_interrupt(void)
+{
+	u64 status;
+	unsigned int bank;
+
+	for (bank = 0; bank < mca_cfg.banks; ++bank) {
+		rdmsrl(MSR_IA32_MCx_STATUS(bank), status);
+
+		if (!(status & MCI_STATUS_VAL) ||
+		    !(status & MCI_STATUS_DEFERRED))
+			continue;
+
+		__log_error(bank, false, 0);
+		break;
+	}
 }
 
 /*
@@ -273,12 +382,12 @@ init:
  * the interrupt goes off when error_count reaches threshold_limit.
  * the handler will simply log mcelog w/ software defined bank number.
  */
+
 static void amd_threshold_interrupt(void)
 {
 	u32 low = 0, high = 0, address = 0;
 	int cpu = smp_processor_id();
 	unsigned int bank, block;
-	struct mce m;
 
 	/* assume first bank caused it */
 	for (bank = 0; bank < mca_cfg.banks; ++bank) {
@@ -321,15 +430,7 @@ static void amd_threshold_interrupt(void)
 	return;
 
 log:
-	mce_setup(&m);
-	rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status);
-	if (!(m.status & MCI_STATUS_VAL))
-		return;
-	m.misc = ((u64)high << 32) | low;
-	m.bank = bank;
-	mce_log(&m);
-
-	wrmsrl(MSR_IA32_MCx_STATUS(bank), 0);
+	__log_error(bank, true, ((u64)high << 32) | low);
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c
index 737737edbd1ef5..9208a36d0f0305 100644
--- a/arch/x86/kernel/cpu/microcode/amd_early.c
+++ b/arch/x86/kernel/cpu/microcode/amd_early.c
@@ -228,7 +228,18 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
 	}
 }
 
-void __init load_ucode_amd_bsp(void)
+static bool __init load_builtin_amd_microcode(struct cpio_data *cp, int family)
+{
+	char fw_name[36] = "amd-ucode/microcode_amd.bin";
+
+	if (family >= 0x15)
+		snprintf(fw_name, sizeof(fw_name),
+			 "amd-ucode/microcode_amd_fam%.2xh.bin", family);
+
+	return get_builtin_firmware(cp, fw_name);
+}
+
+void __init load_ucode_amd_bsp(int family)
 {
 	struct cpio_data cp;
 	void **data;
@@ -243,8 +254,10 @@ void __init load_ucode_amd_bsp(void)
 #endif
 
 	cp = find_ucode_in_initrd();
-	if (!cp.data)
-		return;
+	if (!cp.data) {
+		if (!load_builtin_amd_microcode(&cp, family))
+			return;
+	}
 
 	*data = cp.data;
 	*size = cp.size;
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 36a83617eb21cc..6236a54a63f449 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -1,74 +1,16 @@
 /*
- *	Intel CPU Microcode Update Driver for Linux
+ * CPU Microcode Update Driver for Linux
  *
- *	Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
- *		      2006	Shaohua Li <shaohua.li@intel.com>
+ * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ *	      2006	Shaohua Li <shaohua.li@intel.com>
+ *	      2013-2015	Borislav Petkov <bp@alien8.de>
  *
- *	This driver allows to upgrade microcode on Intel processors
- *	belonging to IA-32 family - PentiumPro, Pentium II,
- *	Pentium III, Xeon, Pentium 4, etc.
+ * This driver allows to upgrade microcode on x86 processors.
  *
- *	Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
- *	Software Developer's Manual
- *	Order Number 253668 or free download from:
- *
- *	http://developer.intel.com/Assets/PDF/manual/253668.pdf	
- *
- *	For more information, go to http://www.urbanmyth.org/microcode
- *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License
- *	as published by the Free Software Foundation; either version
- *	2 of the License, or (at your option) any later version.
- *
- *	1.0	16 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Initial release.
- *	1.01	18 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Added read() support + cleanups.
- *	1.02	21 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Added 'device trimming' support. open(O_WRONLY) zeroes
- *		and frees the saved copy of applied microcode.
- *	1.03	29 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Made to use devfs (/dev/cpu/microcode) + cleanups.
- *	1.04	06 Jun 2000, Simon Trimmer <simon@veritas.com>
- *		Added misc device support (now uses both devfs and misc).
- *		Added MICROCODE_IOCFREE ioctl to clear memory.
- *	1.05	09 Jun 2000, Simon Trimmer <simon@veritas.com>
- *		Messages for error cases (non Intel & no suitable microcode).
- *	1.06	03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
- *		Removed ->release(). Removed exclusive open and status bitmap.
- *		Added microcode_rwsem to serialize read()/write()/ioctl().
- *		Removed global kernel lock usage.
- *	1.07	07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
- *		Write 0 to 0x8B msr and then cpuid before reading revision,
- *		so that it works even if there were no update done by the
- *		BIOS. Otherwise, reading from 0x8B gives junk (which happened
- *		to be 0 on my machine which is why it worked even when I
- *		disabled update by the BIOS)
- *		Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
- *	1.08	11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
- *			     Tigran Aivazian <tigran@veritas.com>
- *		Intel Pentium 4 processor support and bugfixes.
- *	1.09	30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
- *		Bugfix for HT (Hyper-Threading) enabled processors
- *		whereby processor resources are shared by all logical processors
- *		in a single CPU package.
- *	1.10	28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
- *		Tigran Aivazian <tigran@veritas.com>,
- *		Serialize updates as required on HT processors due to
- *		speculative nature of implementation.
- *	1.11	22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
- *		Fix the panic when writing zero-length microcode chunk.
- *	1.12	29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
- *		Jun Nakajima <jun.nakajima@intel.com>
- *		Support for the microcode updates in the new format.
- *	1.13	10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
- *		Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- *		because we no longer hold a copy of applied microcode
- *		in kernel memory.
- *	1.14	25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
- *		Fix sigmatch() macro to handle old CPUs with pf == 0.
- *		Thanks to Stuart Swales for pointing out this bug.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c
index a413a69cbd744f..979ca78b15612f 100644
--- a/arch/x86/kernel/cpu/microcode/core_early.c
+++ b/arch/x86/kernel/cpu/microcode/core_early.c
@@ -3,6 +3,7 @@
  *
  *	Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
  *			   H Peter Anvin" <hpa@zytor.com>
+ *		  (C) 2015 Borislav Petkov <bp@alien8.de>
  *
  *	This driver allows to early upgrade microcode on Intel processors
  *	belonging to IA-32 family - PentiumPro, Pentium II,
@@ -17,6 +18,7 @@
  *	2 of the License, or (at your option) any later version.
  */
 #include <linux/module.h>
+#include <linux/firmware.h>
 #include <asm/microcode.h>
 #include <asm/microcode_intel.h>
 #include <asm/microcode_amd.h>
@@ -43,6 +45,25 @@ static bool __init check_loader_disabled_bsp(void)
 	return *res;
 }
 
+extern struct builtin_fw __start_builtin_fw[];
+extern struct builtin_fw __end_builtin_fw[];
+
+bool get_builtin_firmware(struct cpio_data *cd, const char *name)
+{
+#ifdef CONFIG_FW_LOADER
+	struct builtin_fw *b_fw;
+
+	for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) {
+		if (!strcmp(name, b_fw->name)) {
+			cd->size = b_fw->size;
+			cd->data = b_fw->data;
+			return true;
+		}
+	}
+#endif
+	return false;
+}
+
 void __init load_ucode_bsp(void)
 {
 	int vendor, family;
@@ -63,7 +84,7 @@ void __init load_ucode_bsp(void)
 		break;
 	case X86_VENDOR_AMD:
 		if (family >= 0x10)
-			load_ucode_amd_bsp();
+			load_ucode_amd_bsp(family);
 		break;
 	default:
 		break;
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index a41beadb3db9a3..969dc17eb1b4b8 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -1,74 +1,13 @@
 /*
- *	Intel CPU Microcode Update Driver for Linux
+ * Intel CPU Microcode Update Driver for Linux
  *
- *	Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
- *		      2006	Shaohua Li <shaohua.li@intel.com>
+ * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ *		 2006 Shaohua Li <shaohua.li@intel.com>
  *
- *	This driver allows to upgrade microcode on Intel processors
- *	belonging to IA-32 family - PentiumPro, Pentium II,
- *	Pentium III, Xeon, Pentium 4, etc.
- *
- *	Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
- *	Software Developer's Manual
- *	Order Number 253668 or free download from:
- *
- *	http://developer.intel.com/Assets/PDF/manual/253668.pdf	
- *
- *	For more information, go to http://www.urbanmyth.org/microcode
- *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License
- *	as published by the Free Software Foundation; either version
- *	2 of the License, or (at your option) any later version.
- *
- *	1.0	16 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Initial release.
- *	1.01	18 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Added read() support + cleanups.
- *	1.02	21 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Added 'device trimming' support. open(O_WRONLY) zeroes
- *		and frees the saved copy of applied microcode.
- *	1.03	29 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Made to use devfs (/dev/cpu/microcode) + cleanups.
- *	1.04	06 Jun 2000, Simon Trimmer <simon@veritas.com>
- *		Added misc device support (now uses both devfs and misc).
- *		Added MICROCODE_IOCFREE ioctl to clear memory.
- *	1.05	09 Jun 2000, Simon Trimmer <simon@veritas.com>
- *		Messages for error cases (non Intel & no suitable microcode).
- *	1.06	03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
- *		Removed ->release(). Removed exclusive open and status bitmap.
- *		Added microcode_rwsem to serialize read()/write()/ioctl().
- *		Removed global kernel lock usage.
- *	1.07	07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
- *		Write 0 to 0x8B msr and then cpuid before reading revision,
- *		so that it works even if there were no update done by the
- *		BIOS. Otherwise, reading from 0x8B gives junk (which happened
- *		to be 0 on my machine which is why it worked even when I
- *		disabled update by the BIOS)
- *		Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
- *	1.08	11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
- *			     Tigran Aivazian <tigran@veritas.com>
- *		Intel Pentium 4 processor support and bugfixes.
- *	1.09	30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
- *		Bugfix for HT (Hyper-Threading) enabled processors
- *		whereby processor resources are shared by all logical processors
- *		in a single CPU package.
- *	1.10	28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
- *		Tigran Aivazian <tigran@veritas.com>,
- *		Serialize updates as required on HT processors due to
- *		speculative nature of implementation.
- *	1.11	22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
- *		Fix the panic when writing zero-length microcode chunk.
- *	1.12	29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
- *		Jun Nakajima <jun.nakajima@intel.com>
- *		Support for the microcode updates in the new format.
- *	1.13	10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
- *		Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- *		because we no longer hold a copy of applied microcode
- *		in kernel memory.
- *	1.14	25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
- *		Fix sigmatch() macro to handle old CPUs with pf == 0.
- *		Thanks to Stuart Swales for pointing out this bug.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -124,7 +63,7 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
 	cpf = cpu_sig.pf;
 	crev = cpu_sig.rev;
 
-	return get_matching_microcode(csig, cpf, crev, mc_intel);
+	return has_newer_microcode(mc_intel, csig, cpf, crev);
 }
 
 static int apply_microcode_intel(int cpu)
@@ -226,7 +165,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 
 		csig = uci->cpu_sig.sig;
 		cpf = uci->cpu_sig.pf;
-		if (get_matching_microcode(csig, cpf, new_rev, mc)) {
+		if (has_newer_microcode(mc, csig, cpf, new_rev)) {
 			vfree(new_mc);
 			new_rev = mc_header.rev;
 			new_mc  = mc;
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 2f49ab4ac0ae13..10dff3f3f686df 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -59,10 +59,10 @@ load_microcode_early(struct microcode_intel **saved,
 		ucode_ptr = saved[i];
 		mc_hdr	  = (struct microcode_header_intel *)ucode_ptr;
 
-		ret = get_matching_microcode(uci->cpu_sig.sig,
-					     uci->cpu_sig.pf,
-					     new_rev,
-					     ucode_ptr);
+		ret = has_newer_microcode(ucode_ptr,
+					  uci->cpu_sig.sig,
+					  uci->cpu_sig.pf,
+					  new_rev);
 		if (!ret)
 			continue;
 
@@ -246,7 +246,7 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved,
 			     u8 *ucode_ptr, unsigned int num_saved)
 {
 	struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
-	unsigned int sig, pf, new_rev;
+	unsigned int sig, pf;
 	int found = 0, i;
 
 	mc_hdr = (struct microcode_header_intel *)ucode_ptr;
@@ -255,14 +255,13 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved,
 		mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i];
 		sig	     = mc_saved_hdr->sig;
 		pf	     = mc_saved_hdr->pf;
-		new_rev	     = mc_hdr->rev;
 
-		if (!get_matching_sig(sig, pf, new_rev, ucode_ptr))
+		if (!find_matching_signature(ucode_ptr, sig, pf))
 			continue;
 
 		found = 1;
 
-		if (!revision_is_newer(mc_hdr, new_rev))
+		if (mc_hdr->rev <= mc_saved_hdr->rev)
 			continue;
 
 		/*
@@ -522,6 +521,23 @@ out:
 EXPORT_SYMBOL_GPL(save_mc_for_early);
 #endif
 
+static bool __init load_builtin_intel_microcode(struct cpio_data *cp)
+{
+	u32 eax = 0x00000001, ebx, ecx = 0, edx;
+	int family, model, stepping;
+	char name[30];
+
+	native_cpuid(&eax, &ebx, &ecx, &edx);
+
+	family   = __x86_family(eax);
+	model    = x86_model(eax);
+	stepping = eax & 0xf;
+
+	sprintf(name, "intel-ucode/%02x-%02x-%02x", family, model, stepping);
+
+	return get_builtin_firmware(cp, name);
+}
+
 static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
 static __init enum ucode_state
 scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
@@ -540,8 +556,10 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
 	cd.size = 0;
 
 	cd = find_cpio_data(p, (void *)start, size, &offset);
-	if (!cd.data)
-		return UCODE_ERROR;
+	if (!cd.data) {
+		if (!load_builtin_intel_microcode(&cd))
+			return UCODE_ERROR;
+	}
 
 	return get_matching_model_microcode(0, start, cd.data, cd.size,
 					    mc_saved_data, initrd, uci);
diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
index cd47a510a3f174..1883d252ff7d60 100644
--- a/arch/x86/kernel/cpu/microcode/intel_lib.c
+++ b/arch/x86/kernel/cpu/microcode/intel_lib.c
@@ -31,11 +31,18 @@
 #include <asm/processor.h>
 #include <asm/msr.h>
 
-static inline int
-update_match_cpu(unsigned int csig, unsigned int cpf,
-		 unsigned int sig, unsigned int pf)
+static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
+					unsigned int s2, unsigned int p2)
 {
-	return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1;
+	if (s1 != s2)
+		return false;
+
+	/* Processor flags are either both 0 ... */
+	if (!p1 && !p2)
+		return true;
+
+	/* ... or they intersect. */
+	return p1 & p2;
 }
 
 int microcode_sanity_check(void *mc, int print_err)
@@ -124,27 +131,25 @@ EXPORT_SYMBOL_GPL(microcode_sanity_check);
 /*
  * Returns 1 if update has been found, 0 otherwise.
  */
-int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc)
+int find_matching_signature(void *mc, unsigned int csig, int cpf)
 {
-	struct microcode_header_intel *mc_header = mc;
-	struct extended_sigtable *ext_header;
-	unsigned long total_size = get_totalsize(mc_header);
-	int ext_sigcount, i;
+	struct microcode_header_intel *mc_hdr = mc;
+	struct extended_sigtable *ext_hdr;
 	struct extended_signature *ext_sig;
+	int i;
 
-	if (update_match_cpu(csig, cpf, mc_header->sig, mc_header->pf))
+	if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
 		return 1;
 
 	/* Look for ext. headers: */
-	if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
+	if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE)
 		return 0;
 
-	ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
-	ext_sigcount = ext_header->count;
-	ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+	ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE;
+	ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
 
-	for (i = 0; i < ext_sigcount; i++) {
-		if (update_match_cpu(csig, cpf, ext_sig->sig, ext_sig->pf))
+	for (i = 0; i < ext_hdr->count; i++) {
+		if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
 			return 1;
 		ext_sig++;
 	}
@@ -154,13 +159,13 @@ int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc)
 /*
  * Returns 1 if update has been found, 0 otherwise.
  */
-int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc)
+int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev)
 {
 	struct microcode_header_intel *mc_hdr = mc;
 
-	if (!revision_is_newer(mc_hdr, rev))
+	if (mc_hdr->rev <= new_rev)
 		return 0;
 
-	return get_matching_sig(csig, cpf, rev, mc);
+	return find_matching_signature(mc, csig, cpf);
 }
-EXPORT_SYMBOL_GPL(get_matching_microcode);
+EXPORT_SYMBOL_GPL(has_newer_microcode);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 939155ffdecec6..aad4bd84b475ec 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -39,14 +39,12 @@ void hyperv_vector_handler(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 
-	irq_enter();
-	exit_idle();
-
+	entering_irq();
 	inc_irq_stat(irq_hv_callback_count);
 	if (vmbus_handler)
 		vmbus_handler();
 
-	irq_exit();
+	exiting_irq();
 	set_irq_regs(old_regs);
 }
 
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 5f90b85ff22e58..70d7c93f455083 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -98,7 +98,8 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
 			continue;
 		base = range_state[i].base_pfn;
 		if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
-		    (mtrr_state.enabled & 1)) {
+		    (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+		    (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) {
 			/* Var MTRR contains UC entry below 1M? Skip it: */
 			printk(BIOS_BUG_MSG, i);
 			if (base + size <= (1<<(20-PAGE_SHIFT)))
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 7d74f7b3c6ba49..3b533cf37c745c 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -102,59 +102,76 @@ static int check_type_overlap(u8 *prev, u8 *curr)
 	return 0;
 }
 
-/*
- * Error/Semi-error returns:
- * 0xFF - when MTRR is not enabled
- * *repeat == 1 implies [start:end] spanned across MTRR range and type returned
- *		corresponds only to [start:*partial_end].
- *		Caller has to lookup again for [*partial_end:end].
+/**
+ * mtrr_type_lookup_fixed - look up memory type in MTRR fixed entries
+ *
+ * Return the MTRR fixed memory type of 'start'.
+ *
+ * MTRR fixed entries are divided into the following ways:
+ *  0x00000 - 0x7FFFF : This range is divided into eight 64KB sub-ranges
+ *  0x80000 - 0xBFFFF : This range is divided into sixteen 16KB sub-ranges
+ *  0xC0000 - 0xFFFFF : This range is divided into sixty-four 4KB sub-ranges
+ *
+ * Return Values:
+ * MTRR_TYPE_(type)  - Matched memory type
+ * MTRR_TYPE_INVALID - Unmatched
+ */
+static u8 mtrr_type_lookup_fixed(u64 start, u64 end)
+{
+	int idx;
+
+	if (start >= 0x100000)
+		return MTRR_TYPE_INVALID;
+
+	/* 0x0 - 0x7FFFF */
+	if (start < 0x80000) {
+		idx = 0;
+		idx += (start >> 16);
+		return mtrr_state.fixed_ranges[idx];
+	/* 0x80000 - 0xBFFFF */
+	} else if (start < 0xC0000) {
+		idx = 1 * 8;
+		idx += ((start - 0x80000) >> 14);
+		return mtrr_state.fixed_ranges[idx];
+	}
+
+	/* 0xC0000 - 0xFFFFF */
+	idx = 3 * 8;
+	idx += ((start - 0xC0000) >> 12);
+	return mtrr_state.fixed_ranges[idx];
+}
+
+/**
+ * mtrr_type_lookup_variable - look up memory type in MTRR variable entries
+ *
+ * Return Value:
+ * MTRR_TYPE_(type) - Matched memory type or default memory type (unmatched)
+ *
+ * Output Arguments:
+ * repeat - Set to 1 when [start:end] spanned across MTRR range and type
+ *	    returned corresponds only to [start:*partial_end].  Caller has
+ *	    to lookup again for [*partial_end:end].
+ *
+ * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the
+ *	     region is fully covered by a single MTRR entry or the default
+ *	     type.
  */
-static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
+static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end,
+				    int *repeat, u8 *uniform)
 {
 	int i;
 	u64 base, mask;
 	u8 prev_match, curr_match;
 
 	*repeat = 0;
-	if (!mtrr_state_set)
-		return 0xFF;
-
-	if (!mtrr_state.enabled)
-		return 0xFF;
+	*uniform = 1;
 
-	/* Make end inclusive end, instead of exclusive */
+	/* Make end inclusive instead of exclusive */
 	end--;
 
-	/* Look in fixed ranges. Just return the type as per start */
-	if (mtrr_state.have_fixed && (start < 0x100000)) {
-		int idx;
-
-		if (start < 0x80000) {
-			idx = 0;
-			idx += (start >> 16);
-			return mtrr_state.fixed_ranges[idx];
-		} else if (start < 0xC0000) {
-			idx = 1 * 8;
-			idx += ((start - 0x80000) >> 14);
-			return mtrr_state.fixed_ranges[idx];
-		} else if (start < 0x1000000) {
-			idx = 3 * 8;
-			idx += ((start - 0xC0000) >> 12);
-			return mtrr_state.fixed_ranges[idx];
-		}
-	}
-
-	/*
-	 * Look in variable ranges
-	 * Look of multiple ranges matching this address and pick type
-	 * as per MTRR precedence
-	 */
-	if (!(mtrr_state.enabled & 2))
-		return mtrr_state.def_type;
-
-	prev_match = 0xFF;
+	prev_match = MTRR_TYPE_INVALID;
 	for (i = 0; i < num_var_ranges; ++i) {
-		unsigned short start_state, end_state;
+		unsigned short start_state, end_state, inclusive;
 
 		if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11)))
 			continue;
@@ -166,20 +183,29 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
 
 		start_state = ((start & mask) == (base & mask));
 		end_state = ((end & mask) == (base & mask));
+		inclusive = ((start < base) && (end > base));
 
-		if (start_state != end_state) {
+		if ((start_state != end_state) || inclusive) {
 			/*
 			 * We have start:end spanning across an MTRR.
-			 * We split the region into
-			 * either
-			 * (start:mtrr_end) (mtrr_end:end)
-			 * or
-			 * (start:mtrr_start) (mtrr_start:end)
+			 * We split the region into either
+			 *
+			 * - start_state:1
+			 * (start:mtrr_end)(mtrr_end:end)
+			 * - end_state:1
+			 * (start:mtrr_start)(mtrr_start:end)
+			 * - inclusive:1
+			 * (start:mtrr_start)(mtrr_start:mtrr_end)(mtrr_end:end)
+			 *
 			 * depending on kind of overlap.
-			 * Return the type for first region and a pointer to
-			 * the start of second region so that caller will
-			 * lookup again on the second region.
-			 * Note: This way we handle multiple overlaps as well.
+			 *
+			 * Return the type of the first region and a pointer
+			 * to the start of next region so that caller will be
+			 * advised to lookup again after having adjusted start
+			 * and end.
+			 *
+			 * Note: This way we handle overlaps with multiple
+			 * entries and the default type properly.
 			 */
 			if (start_state)
 				*partial_end = base + get_mtrr_size(mask);
@@ -193,59 +219,94 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
 
 			end = *partial_end - 1; /* end is inclusive */
 			*repeat = 1;
+			*uniform = 0;
 		}
 
 		if ((start & mask) != (base & mask))
 			continue;
 
 		curr_match = mtrr_state.var_ranges[i].base_lo & 0xff;
-		if (prev_match == 0xFF) {
+		if (prev_match == MTRR_TYPE_INVALID) {
 			prev_match = curr_match;
 			continue;
 		}
 
+		*uniform = 0;
 		if (check_type_overlap(&prev_match, &curr_match))
 			return curr_match;
 	}
 
-	if (mtrr_tom2) {
-		if (start >= (1ULL<<32) && (end < mtrr_tom2))
-			return MTRR_TYPE_WRBACK;
-	}
-
-	if (prev_match != 0xFF)
+	if (prev_match != MTRR_TYPE_INVALID)
 		return prev_match;
 
 	return mtrr_state.def_type;
 }
 
-/*
- * Returns the effective MTRR type for the region
- * Error return:
- * 0xFF - when MTRR is not enabled
+/**
+ * mtrr_type_lookup - look up memory type in MTRR
+ *
+ * Return Values:
+ * MTRR_TYPE_(type)  - The effective MTRR type for the region
+ * MTRR_TYPE_INVALID - MTRR is disabled
+ *
+ * Output Argument:
+ * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the
+ *	     region is fully covered by a single MTRR entry or the default
+ *	     type.
  */
-u8 mtrr_type_lookup(u64 start, u64 end)
+u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform)
 {
-	u8 type, prev_type;
+	u8 type, prev_type, is_uniform = 1, dummy;
 	int repeat;
 	u64 partial_end;
 
-	type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
+	if (!mtrr_state_set)
+		return MTRR_TYPE_INVALID;
+
+	if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED))
+		return MTRR_TYPE_INVALID;
+
+	/*
+	 * Look up the fixed ranges first, which take priority over
+	 * the variable ranges.
+	 */
+	if ((start < 0x100000) &&
+	    (mtrr_state.have_fixed) &&
+	    (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) {
+		is_uniform = 0;
+		type = mtrr_type_lookup_fixed(start, end);
+		goto out;
+	}
+
+	/*
+	 * Look up the variable ranges.  Look of multiple ranges matching
+	 * this address and pick type as per MTRR precedence.
+	 */
+	type = mtrr_type_lookup_variable(start, end, &partial_end,
+					 &repeat, &is_uniform);
 
 	/*
 	 * Common path is with repeat = 0.
 	 * However, we can have cases where [start:end] spans across some
-	 * MTRR range. Do repeated lookups for that case here.
+	 * MTRR ranges and/or the default type.  Do repeated lookups for
+	 * that case here.
 	 */
 	while (repeat) {
 		prev_type = type;
 		start = partial_end;
-		type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
+		is_uniform = 0;
+		type = mtrr_type_lookup_variable(start, end, &partial_end,
+						 &repeat, &dummy);
 
 		if (check_type_overlap(&prev_type, &type))
-			return type;
+			goto out;
 	}
 
+	if (mtrr_tom2 && (start >= (1ULL<<32)) && (end < mtrr_tom2))
+		type = MTRR_TYPE_WRBACK;
+
+out:
+	*uniform = is_uniform;
 	return type;
 }
 
@@ -347,7 +408,9 @@ static void __init print_mtrr_state(void)
 		 mtrr_attrib_to_str(mtrr_state.def_type));
 	if (mtrr_state.have_fixed) {
 		pr_debug("MTRR fixed ranges %sabled:\n",
-			 mtrr_state.enabled & 1 ? "en" : "dis");
+			((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+			 (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ?
+			 "en" : "dis");
 		print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
 		for (i = 0; i < 2; ++i)
 			print_fixed(0x80000 + i * 0x20000, 0x04000,
@@ -360,7 +423,7 @@ static void __init print_mtrr_state(void)
 		print_fixed_last();
 	}
 	pr_debug("MTRR variable ranges %sabled:\n",
-		 mtrr_state.enabled & 2 ? "en" : "dis");
+		 mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis");
 	high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4;
 
 	for (i = 0; i < num_var_ranges; ++i) {
@@ -382,7 +445,7 @@ static void __init print_mtrr_state(void)
 }
 
 /* Grab all of the MTRR state for this CPU into *state */
-void __init get_mtrr_state(void)
+bool __init get_mtrr_state(void)
 {
 	struct mtrr_var_range *vrs;
 	unsigned long flags;
@@ -426,6 +489,8 @@ void __init get_mtrr_state(void)
 
 	post_set();
 	local_irq_restore(flags);
+
+	return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED);
 }
 
 /* Some BIOS's are messed up and don't set all MTRRs the same! */
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index ea5f363a194866..e7ed0d8ebacb15 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -59,6 +59,12 @@
 #define MTRR_TO_PHYS_WC_OFFSET 1000
 
 u32 num_var_ranges;
+static bool __mtrr_enabled;
+
+static bool mtrr_enabled(void)
+{
+	return __mtrr_enabled;
+}
 
 unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
 static DEFINE_MUTEX(mtrr_mutex);
@@ -286,7 +292,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
 	int i, replace, error;
 	mtrr_type ltype;
 
-	if (!mtrr_if)
+	if (!mtrr_enabled())
 		return -ENXIO;
 
 	error = mtrr_if->validate_add_page(base, size, type);
@@ -435,6 +441,8 @@ static int mtrr_check(unsigned long base, unsigned long size)
 int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
 	     bool increment)
 {
+	if (!mtrr_enabled())
+		return -ENODEV;
 	if (mtrr_check(base, size))
 		return -EINVAL;
 	return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
@@ -463,8 +471,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
 	unsigned long lbase, lsize;
 	int error = -EINVAL;
 
-	if (!mtrr_if)
-		return -ENXIO;
+	if (!mtrr_enabled())
+		return -ENODEV;
 
 	max = num_var_ranges;
 	/* No CPU hotplug when we change MTRR entries */
@@ -523,6 +531,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
  */
 int mtrr_del(int reg, unsigned long base, unsigned long size)
 {
+	if (!mtrr_enabled())
+		return -ENODEV;
 	if (mtrr_check(base, size))
 		return -EINVAL;
 	return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
@@ -538,6 +548,9 @@ EXPORT_SYMBOL(mtrr_del);
  * attempts to add a WC MTRR covering size bytes starting at base and
  * logs an error if this fails.
  *
+ * The called should provide a power of two size on an equivalent
+ * power of two boundary.
+ *
  * Drivers must store the return value to pass to mtrr_del_wc_if_needed,
  * but drivers should not try to interpret that return value.
  */
@@ -545,7 +558,7 @@ int arch_phys_wc_add(unsigned long base, unsigned long size)
 {
 	int ret;
 
-	if (pat_enabled)
+	if (pat_enabled() || !mtrr_enabled())
 		return 0;  /* Success!  (We don't need to do anything.) */
 
 	ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true);
@@ -577,7 +590,7 @@ void arch_phys_wc_del(int handle)
 EXPORT_SYMBOL(arch_phys_wc_del);
 
 /*
- * phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value
+ * arch_phys_wc_index - translates arch_phys_wc_add's return value
  * @handle: Return value from arch_phys_wc_add
  *
  * This will turn the return value from arch_phys_wc_add into an mtrr
@@ -587,14 +600,14 @@ EXPORT_SYMBOL(arch_phys_wc_del);
  * in printk line.  Alas there is an illegitimate use in some ancient
  * drm ioctls.
  */
-int phys_wc_to_mtrr_index(int handle)
+int arch_phys_wc_index(int handle)
 {
 	if (handle < MTRR_TO_PHYS_WC_OFFSET)
 		return -1;
 	else
 		return handle - MTRR_TO_PHYS_WC_OFFSET;
 }
-EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index);
+EXPORT_SYMBOL_GPL(arch_phys_wc_index);
 
 /*
  * HACK ALERT!
@@ -734,10 +747,12 @@ void __init mtrr_bp_init(void)
 	}
 
 	if (mtrr_if) {
+		__mtrr_enabled = true;
 		set_num_var_ranges();
 		init_table();
 		if (use_intel()) {
-			get_mtrr_state();
+			/* BIOS may override */
+			__mtrr_enabled = get_mtrr_state();
 
 			if (mtrr_cleanup(phys_addr)) {
 				changed_by_mtrr_cleanup = 1;
@@ -745,10 +760,16 @@ void __init mtrr_bp_init(void)
 			}
 		}
 	}
+
+	if (!mtrr_enabled())
+		pr_info("MTRR: Disabled\n");
 }
 
 void mtrr_ap_init(void)
 {
+	if (!mtrr_enabled())
+		return;
+
 	if (!use_intel() || mtrr_aps_delayed_init)
 		return;
 	/*
@@ -774,6 +795,9 @@ void mtrr_save_state(void)
 {
 	int first_cpu;
 
+	if (!mtrr_enabled())
+		return;
+
 	get_online_cpus();
 	first_cpu = cpumask_first(cpu_online_mask);
 	smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1);
@@ -782,6 +806,8 @@ void mtrr_save_state(void)
 
 void set_mtrr_aps_delayed_init(void)
 {
+	if (!mtrr_enabled())
+		return;
 	if (!use_intel())
 		return;
 
@@ -793,7 +819,7 @@ void set_mtrr_aps_delayed_init(void)
  */
 void mtrr_aps_init(void)
 {
-	if (!use_intel())
+	if (!use_intel() || !mtrr_enabled())
 		return;
 
 	/*
@@ -810,7 +836,7 @@ void mtrr_aps_init(void)
 
 void mtrr_bp_restore(void)
 {
-	if (!use_intel())
+	if (!use_intel() || !mtrr_enabled())
 		return;
 
 	mtrr_if->set_all();
@@ -818,7 +844,7 @@ void mtrr_bp_restore(void)
 
 static int __init mtrr_init_finialize(void)
 {
-	if (!mtrr_if)
+	if (!mtrr_enabled())
 		return 0;
 
 	if (use_intel()) {
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index df5e41f31a27e7..951884dcc43354 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -51,7 +51,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
 
 void fill_mtrr_var_range(unsigned int index,
 		u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
-void get_mtrr_state(void);
+bool get_mtrr_state(void);
 
 extern void set_mtrr_ops(const struct mtrr_ops *ops);
 
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 87848ebe2bb79a..dbe3328f8ad774 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -190,6 +190,7 @@ static bool check_hw_exists(void)
 	u64 val, val_fail, val_new= ~0;
 	int i, reg, reg_fail, ret = 0;
 	int bios_fail = 0;
+	int reg_safe = -1;
 
 	/*
 	 * Check to see if the BIOS enabled any of the counters, if so
@@ -204,6 +205,8 @@ static bool check_hw_exists(void)
 			bios_fail = 1;
 			val_fail = val;
 			reg_fail = reg;
+		} else {
+			reg_safe = i;
 		}
 	}
 
@@ -222,11 +225,22 @@ static bool check_hw_exists(void)
 	}
 
 	/*
+	 * If all the counters are enabled, the below test will always
+	 * fail.  The tools will also become useless in this scenario.
+	 * Just fail and disable the hardware counters.
+	 */
+
+	if (reg_safe == -1) {
+		reg = reg_safe;
+		goto msr_fail;
+	}
+
+	/*
 	 * Read the current value, change it and read it back to see if it
 	 * matches, this is needed to detect certain hardware emulators
 	 * (qemu/kvm) that don't trap on the MSR access and always return 0s.
 	 */
-	reg = x86_pmu_event_addr(0);
+	reg = x86_pmu_event_addr(reg_safe);
 	if (rdmsrl_safe(reg, &val))
 		goto msr_fail;
 	val ^= 0xffffUL;
@@ -611,6 +625,7 @@ struct sched_state {
 	int	event;		/* event index */
 	int	counter;	/* counter index */
 	int	unassigned;	/* number of events to be assigned left */
+	int	nr_gp;		/* number of GP counters used */
 	unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 };
 
@@ -620,27 +635,29 @@ struct sched_state {
 struct perf_sched {
 	int			max_weight;
 	int			max_events;
-	struct perf_event	**events;
-	struct sched_state	state;
+	int			max_gp;
 	int			saved_states;
+	struct event_constraint	**constraints;
+	struct sched_state	state;
 	struct sched_state	saved[SCHED_STATES_MAX];
 };
 
 /*
  * Initialize interator that runs through all events and counters.
  */
-static void perf_sched_init(struct perf_sched *sched, struct perf_event **events,
-			    int num, int wmin, int wmax)
+static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
+			    int num, int wmin, int wmax, int gpmax)
 {
 	int idx;
 
 	memset(sched, 0, sizeof(*sched));
 	sched->max_events	= num;
 	sched->max_weight	= wmax;
-	sched->events		= events;
+	sched->max_gp		= gpmax;
+	sched->constraints	= constraints;
 
 	for (idx = 0; idx < num; idx++) {
-		if (events[idx]->hw.constraint->weight == wmin)
+		if (constraints[idx]->weight == wmin)
 			break;
 	}
 
@@ -687,7 +704,7 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
 	if (sched->state.event >= sched->max_events)
 		return false;
 
-	c = sched->events[sched->state.event]->hw.constraint;
+	c = sched->constraints[sched->state.event];
 	/* Prefer fixed purpose counters */
 	if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
 		idx = INTEL_PMC_IDX_FIXED;
@@ -696,11 +713,16 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
 				goto done;
 		}
 	}
+
 	/* Grab the first unused counter starting with idx */
 	idx = sched->state.counter;
 	for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
-		if (!__test_and_set_bit(idx, sched->state.used))
+		if (!__test_and_set_bit(idx, sched->state.used)) {
+			if (sched->state.nr_gp++ >= sched->max_gp)
+				return false;
+
 			goto done;
+		}
 	}
 
 	return false;
@@ -745,7 +767,7 @@ static bool perf_sched_next_event(struct perf_sched *sched)
 			if (sched->state.weight > sched->max_weight)
 				return false;
 		}
-		c = sched->events[sched->state.event]->hw.constraint;
+		c = sched->constraints[sched->state.event];
 	} while (c->weight != sched->state.weight);
 
 	sched->state.counter = 0;	/* start with first counter */
@@ -756,12 +778,12 @@ static bool perf_sched_next_event(struct perf_sched *sched)
 /*
  * Assign a counter for each event.
  */
-int perf_assign_events(struct perf_event **events, int n,
-			int wmin, int wmax, int *assign)
+int perf_assign_events(struct event_constraint **constraints, int n,
+			int wmin, int wmax, int gpmax, int *assign)
 {
 	struct perf_sched sched;
 
-	perf_sched_init(&sched, events, n, wmin, wmax);
+	perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
 
 	do {
 		if (!perf_sched_find_counter(&sched))
@@ -788,9 +810,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 		x86_pmu.start_scheduling(cpuc);
 
 	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
-		hwc = &cpuc->event_list[i]->hw;
+		cpuc->event_constraint[i] = NULL;
 		c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
-		hwc->constraint = c;
+		cpuc->event_constraint[i] = c;
 
 		wmin = min(wmin, c->weight);
 		wmax = max(wmax, c->weight);
@@ -801,7 +823,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 	 */
 	for (i = 0; i < n; i++) {
 		hwc = &cpuc->event_list[i]->hw;
-		c = hwc->constraint;
+		c = cpuc->event_constraint[i];
 
 		/* never assigned */
 		if (hwc->idx == -1)
@@ -821,9 +843,26 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 	}
 
 	/* slow path */
-	if (i != n)
-		unsched = perf_assign_events(cpuc->event_list, n, wmin,
-					     wmax, assign);
+	if (i != n) {
+		int gpmax = x86_pmu.num_counters;
+
+		/*
+		 * Do not allow scheduling of more than half the available
+		 * generic counters.
+		 *
+		 * This helps avoid counter starvation of sibling thread by
+		 * ensuring at most half the counters cannot be in exclusive
+		 * mode. There is no designated counters for the limits. Any
+		 * N/2 counters can be used. This helps with events with
+		 * specific counter constraints.
+		 */
+		if (is_ht_workaround_enabled() && !cpuc->is_fake &&
+		    READ_ONCE(cpuc->excl_cntrs->exclusive_present))
+			gpmax /= 2;
+
+		unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
+					     wmax, gpmax, assign);
+	}
 
 	/*
 	 * In case of success (unsched = 0), mark events as committed,
@@ -840,12 +879,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 			e = cpuc->event_list[i];
 			e->hw.flags |= PERF_X86_EVENT_COMMITTED;
 			if (x86_pmu.commit_scheduling)
-				x86_pmu.commit_scheduling(cpuc, e, assign[i]);
+				x86_pmu.commit_scheduling(cpuc, i, assign[i]);
 		}
-	}
-
-	if (!assign || unsched) {
-
+	} else {
 		for (i = 0; i < n; i++) {
 			e = cpuc->event_list[i];
 			/*
@@ -1292,8 +1328,10 @@ static void x86_pmu_del(struct perf_event *event, int flags)
 		x86_pmu.put_event_constraints(cpuc, event);
 
 	/* Delete the array entry. */
-	while (++i < cpuc->n_events)
+	while (++i < cpuc->n_events) {
 		cpuc->event_list[i-1] = cpuc->event_list[i];
+		cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
+	}
 	--cpuc->n_events;
 
 	perf_event_update_userpage(event);
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 6ac5cb7a9e1483..89e6cd61e6ae39 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -74,6 +74,7 @@ struct event_constraint {
 #define PERF_X86_EVENT_EXCL		0x0040 /* HT exclusivity on counter */
 #define PERF_X86_EVENT_DYNAMIC		0x0080 /* dynamic alloc'd constraint */
 #define PERF_X86_EVENT_RDPMC_ALLOWED	0x0100 /* grant rdpmc permission */
+#define PERF_X86_EVENT_EXCL_ACCT	0x0200 /* accounted EXCL event */
 
 
 struct amd_nb {
@@ -132,10 +133,7 @@ enum intel_excl_state_type {
 };
 
 struct intel_excl_states {
-	enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
 	enum intel_excl_state_type state[X86_PMC_IDX_MAX];
-	int  num_alloc_cntrs;/* #counters allocated */
-	int  max_alloc_cntrs;/* max #counters allowed */
 	bool sched_started; /* true if scheduling has started */
 };
 
@@ -144,6 +142,11 @@ struct intel_excl_cntrs {
 
 	struct intel_excl_states states[2];
 
+	union {
+		u16	has_exclusive[2];
+		u32	exclusive_present;
+	};
+
 	int		refcnt;		/* per-core: #HT threads */
 	unsigned	core_id;	/* per-core: core id */
 };
@@ -172,7 +175,11 @@ struct cpu_hw_events {
 					     added in the current transaction */
 	int			assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
 	u64			tags[X86_PMC_IDX_MAX];
+
 	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */
+	struct event_constraint	*event_constraint[X86_PMC_IDX_MAX];
+
+	int			n_excl; /* the number of exclusive events */
 
 	unsigned int		group_flag;
 	int			is_fake;
@@ -519,12 +526,10 @@ struct x86_pmu {
 	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,
 						 struct perf_event *event);
 
-	void		(*commit_scheduling)(struct cpu_hw_events *cpuc,
-					     struct perf_event *event,
-					     int cntr);
-
 	void		(*start_scheduling)(struct cpu_hw_events *cpuc);
 
+	void		(*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr);
+
 	void		(*stop_scheduling)(struct cpu_hw_events *cpuc);
 
 	struct event_constraint *event_constraints;
@@ -717,8 +722,8 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
 
 void x86_pmu_enable_all(int added);
 
-int perf_assign_events(struct perf_event **events, int n,
-			int wmin, int wmax, int *assign);
+int perf_assign_events(struct event_constraint **constraints, int n,
+			int wmin, int wmax, int gpmax, int *assign);
 int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
 
 void x86_pmu_stop(struct perf_event *event, int flags);
@@ -929,4 +934,8 @@ static inline struct intel_shared_regs *allocate_shared_regs(int cpu)
 	return NULL;
 }
 
+static inline int is_ht_workaround_enabled(void)
+{
+	return 0;
+}
 #endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 3998131d1a6830..160d36e60c1d2d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1903,9 +1903,8 @@ static void
 intel_start_scheduling(struct cpu_hw_events *cpuc)
 {
 	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-	struct intel_excl_states *xl, *xlo;
+	struct intel_excl_states *xl;
 	int tid = cpuc->excl_thread_id;
-	int o_tid = 1 - tid; /* sibling thread */
 
 	/*
 	 * nothing needed if in group validation mode
@@ -1916,35 +1915,52 @@ intel_start_scheduling(struct cpu_hw_events *cpuc)
 	/*
 	 * no exclusion needed
 	 */
-	if (!excl_cntrs)
+	if (WARN_ON_ONCE(!excl_cntrs))
 		return;
 
-	xlo = &excl_cntrs->states[o_tid];
 	xl = &excl_cntrs->states[tid];
 
 	xl->sched_started = true;
-	xl->num_alloc_cntrs = 0;
 	/*
 	 * lock shared state until we are done scheduling
 	 * in stop_event_scheduling()
 	 * makes scheduling appear as a transaction
 	 */
-	WARN_ON_ONCE(!irqs_disabled());
 	raw_spin_lock(&excl_cntrs->lock);
+}
 
-	/*
-	 * save initial state of sibling thread
-	 */
-	memcpy(xlo->init_state, xlo->state, sizeof(xlo->init_state));
+static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
+{
+	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+	struct event_constraint *c = cpuc->event_constraint[idx];
+	struct intel_excl_states *xl;
+	int tid = cpuc->excl_thread_id;
+
+	if (cpuc->is_fake || !is_ht_workaround_enabled())
+		return;
+
+	if (WARN_ON_ONCE(!excl_cntrs))
+		return;
+
+	if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
+		return;
+
+	xl = &excl_cntrs->states[tid];
+
+	lockdep_assert_held(&excl_cntrs->lock);
+
+	if (c->flags & PERF_X86_EVENT_EXCL)
+		xl->state[cntr] = INTEL_EXCL_EXCLUSIVE;
+	else
+		xl->state[cntr] = INTEL_EXCL_SHARED;
 }
 
 static void
 intel_stop_scheduling(struct cpu_hw_events *cpuc)
 {
 	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-	struct intel_excl_states *xl, *xlo;
+	struct intel_excl_states *xl;
 	int tid = cpuc->excl_thread_id;
-	int o_tid = 1 - tid; /* sibling thread */
 
 	/*
 	 * nothing needed if in group validation mode
@@ -1954,17 +1970,11 @@ intel_stop_scheduling(struct cpu_hw_events *cpuc)
 	/*
 	 * no exclusion needed
 	 */
-	if (!excl_cntrs)
+	if (WARN_ON_ONCE(!excl_cntrs))
 		return;
 
-	xlo = &excl_cntrs->states[o_tid];
 	xl = &excl_cntrs->states[tid];
 
-	/*
-	 * make new sibling thread state visible
-	 */
-	memcpy(xlo->state, xlo->init_state, sizeof(xlo->state));
-
 	xl->sched_started = false;
 	/*
 	 * release shared state lock (acquired in intel_start_scheduling())
@@ -1976,12 +1986,10 @@ static struct event_constraint *
 intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
 			   int idx, struct event_constraint *c)
 {
-	struct event_constraint *cx;
 	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-	struct intel_excl_states *xl, *xlo;
-	int is_excl, i;
+	struct intel_excl_states *xlo;
 	int tid = cpuc->excl_thread_id;
-	int o_tid = 1 - tid; /* alternate */
+	int is_excl, i;
 
 	/*
 	 * validating a group does not require
@@ -1993,34 +2001,8 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
 	/*
 	 * no exclusion needed
 	 */
-	if (!excl_cntrs)
+	if (WARN_ON_ONCE(!excl_cntrs))
 		return c;
-	/*
-	 * event requires exclusive counter access
-	 * across HT threads
-	 */
-	is_excl = c->flags & PERF_X86_EVENT_EXCL;
-
-	/*
-	 * xl = state of current HT
-	 * xlo = state of sibling HT
-	 */
-	xl = &excl_cntrs->states[tid];
-	xlo = &excl_cntrs->states[o_tid];
-
-	/*
-	 * do not allow scheduling of more than max_alloc_cntrs
-	 * which is set to half the available generic counters.
-	 * this helps avoid counter starvation of sibling thread
-	 * by ensuring at most half the counters cannot be in
-	 * exclusive mode. There is not designated counters for the
-	 * limits. Any N/2 counters can be used. This helps with
-	 * events with specifix counter constraints
-	 */
-	if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs)
-		return &emptyconstraint;
-
-	cx = c;
 
 	/*
 	 * because we modify the constraint, we need
@@ -2031,10 +2013,7 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
 	 * been cloned (marked dynamic)
 	 */
 	if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
-
-		/* sanity check */
-		if (idx < 0)
-			return &emptyconstraint;
+		struct event_constraint *cx;
 
 		/*
 		 * grab pre-allocated constraint entry
@@ -2045,13 +2024,14 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
 		 * initialize dynamic constraint
 		 * with static constraint
 		 */
-		memcpy(cx, c, sizeof(*cx));
+		*cx = *c;
 
 		/*
 		 * mark constraint as dynamic, so we
 		 * can free it later on
 		 */
 		cx->flags |= PERF_X86_EVENT_DYNAMIC;
+		c = cx;
 	}
 
 	/*
@@ -2062,6 +2042,22 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
 	 */
 
 	/*
+	 * state of sibling HT
+	 */
+	xlo = &excl_cntrs->states[tid ^ 1];
+
+	/*
+	 * event requires exclusive counter access
+	 * across HT threads
+	 */
+	is_excl = c->flags & PERF_X86_EVENT_EXCL;
+	if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) {
+		event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT;
+		if (!cpuc->n_excl++)
+			WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
+	}
+
+	/*
 	 * Modify static constraint with current dynamic
 	 * state of thread
 	 *
@@ -2069,44 +2065,44 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
 	 * SHARED   : sibling counter measuring non-exclusive event
 	 * UNUSED   : sibling counter unused
 	 */
-	for_each_set_bit(i, cx->idxmsk, X86_PMC_IDX_MAX) {
+	for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) {
 		/*
 		 * exclusive event in sibling counter
 		 * our corresponding counter cannot be used
 		 * regardless of our event
 		 */
-		if (xl->state[i] == INTEL_EXCL_EXCLUSIVE)
-			__clear_bit(i, cx->idxmsk);
+		if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE)
+			__clear_bit(i, c->idxmsk);
 		/*
 		 * if measuring an exclusive event, sibling
 		 * measuring non-exclusive, then counter cannot
 		 * be used
 		 */
-		if (is_excl && xl->state[i] == INTEL_EXCL_SHARED)
-			__clear_bit(i, cx->idxmsk);
+		if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED)
+			__clear_bit(i, c->idxmsk);
 	}
 
 	/*
 	 * recompute actual bit weight for scheduling algorithm
 	 */
-	cx->weight = hweight64(cx->idxmsk64);
+	c->weight = hweight64(c->idxmsk64);
 
 	/*
 	 * if we return an empty mask, then switch
 	 * back to static empty constraint to avoid
 	 * the cost of freeing later on
 	 */
-	if (cx->weight == 0)
-		cx = &emptyconstraint;
+	if (c->weight == 0)
+		c = &emptyconstraint;
 
-	return cx;
+	return c;
 }
 
 static struct event_constraint *
 intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 			    struct perf_event *event)
 {
-	struct event_constraint *c1 = event->hw.constraint;
+	struct event_constraint *c1 = cpuc->event_constraint[idx];
 	struct event_constraint *c2;
 
 	/*
@@ -2132,10 +2128,8 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
 {
 	struct hw_perf_event *hwc = &event->hw;
 	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-	struct intel_excl_states *xlo, *xl;
-	unsigned long flags = 0; /* keep compiler happy */
 	int tid = cpuc->excl_thread_id;
-	int o_tid = 1 - tid;
+	struct intel_excl_states *xl;
 
 	/*
 	 * nothing needed if in group validation mode
@@ -2143,31 +2137,35 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
 	if (cpuc->is_fake)
 		return;
 
-	WARN_ON_ONCE(!excl_cntrs);
-
-	if (!excl_cntrs)
+	if (WARN_ON_ONCE(!excl_cntrs))
 		return;
 
-	xl = &excl_cntrs->states[tid];
-	xlo = &excl_cntrs->states[o_tid];
+	if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) {
+		hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT;
+		if (!--cpuc->n_excl)
+			WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
+	}
 
 	/*
-	 * put_constraint may be called from x86_schedule_events()
-	 * which already has the lock held so here make locking
-	 * conditional
+	 * If event was actually assigned, then mark the counter state as
+	 * unused now.
 	 */
-	if (!xl->sched_started)
-		raw_spin_lock_irqsave(&excl_cntrs->lock, flags);
+	if (hwc->idx >= 0) {
+		xl = &excl_cntrs->states[tid];
 
-	/*
-	 * if event was actually assigned, then mark the
-	 * counter state as unused now
-	 */
-	if (hwc->idx >= 0)
-		xlo->state[hwc->idx] = INTEL_EXCL_UNUSED;
+		/*
+		 * put_constraint may be called from x86_schedule_events()
+		 * which already has the lock held so here make locking
+		 * conditional.
+		 */
+		if (!xl->sched_started)
+			raw_spin_lock(&excl_cntrs->lock);
+
+		xl->state[hwc->idx] = INTEL_EXCL_UNUSED;
 
-	if (!xl->sched_started)
-		raw_spin_unlock_irqrestore(&excl_cntrs->lock, flags);
+		if (!xl->sched_started)
+			raw_spin_unlock(&excl_cntrs->lock);
+	}
 }
 
 static void
@@ -2188,8 +2186,6 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
 static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
 					struct perf_event *event)
 {
-	struct event_constraint *c = event->hw.constraint;
-
 	intel_put_shared_regs_event_constraints(cpuc, event);
 
 	/*
@@ -2197,48 +2193,8 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
 	 * all events are subject to and must call the
 	 * put_excl_constraints() routine
 	 */
-	if (c && cpuc->excl_cntrs)
+	if (cpuc->excl_cntrs)
 		intel_put_excl_constraints(cpuc, event);
-
-	/* cleanup dynamic constraint */
-	if (c && (c->flags & PERF_X86_EVENT_DYNAMIC))
-		event->hw.constraint = NULL;
-}
-
-static void intel_commit_scheduling(struct cpu_hw_events *cpuc,
-				    struct perf_event *event, int cntr)
-{
-	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-	struct event_constraint *c = event->hw.constraint;
-	struct intel_excl_states *xlo, *xl;
-	int tid = cpuc->excl_thread_id;
-	int o_tid = 1 - tid;
-	int is_excl;
-
-	if (cpuc->is_fake || !c)
-		return;
-
-	is_excl = c->flags & PERF_X86_EVENT_EXCL;
-
-	if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
-		return;
-
-	WARN_ON_ONCE(!excl_cntrs);
-
-	if (!excl_cntrs)
-		return;
-
-	xl = &excl_cntrs->states[tid];
-	xlo = &excl_cntrs->states[o_tid];
-
-	WARN_ON_ONCE(!raw_spin_is_locked(&excl_cntrs->lock));
-
-	if (cntr >= 0) {
-		if (is_excl)
-			xlo->init_state[cntr] = INTEL_EXCL_EXCLUSIVE;
-		else
-			xlo->init_state[cntr] = INTEL_EXCL_SHARED;
-	}
 }
 
 static void intel_pebs_aliases_core2(struct perf_event *event)
@@ -2554,19 +2510,11 @@ struct intel_shared_regs *allocate_shared_regs(int cpu)
 static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
 {
 	struct intel_excl_cntrs *c;
-	int i;
 
 	c = kzalloc_node(sizeof(struct intel_excl_cntrs),
 			 GFP_KERNEL, cpu_to_node(cpu));
 	if (c) {
 		raw_spin_lock_init(&c->lock);
-		for (i = 0; i < X86_PMC_IDX_MAX; i++) {
-			c->states[0].state[i] = INTEL_EXCL_UNUSED;
-			c->states[0].init_state[i] = INTEL_EXCL_UNUSED;
-
-			c->states[1].state[i] = INTEL_EXCL_UNUSED;
-			c->states[1].init_state[i] = INTEL_EXCL_UNUSED;
-		}
 		c->core_id = -1;
 	}
 	return c;
@@ -2621,7 +2569,7 @@ static void intel_pmu_cpu_starting(int cpu)
 	if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
 		void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
 
-		for_each_cpu(i, topology_thread_cpumask(cpu)) {
+		for_each_cpu(i, topology_sibling_cpumask(cpu)) {
 			struct intel_shared_regs *pc;
 
 			pc = per_cpu(cpu_hw_events, i).shared_regs;
@@ -2639,9 +2587,7 @@ static void intel_pmu_cpu_starting(int cpu)
 		cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
 
 	if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
-		int h = x86_pmu.num_counters >> 1;
-
-		for_each_cpu(i, topology_thread_cpumask(cpu)) {
+		for_each_cpu(i, topology_sibling_cpumask(cpu)) {
 			struct intel_excl_cntrs *c;
 
 			c = per_cpu(cpu_hw_events, i).excl_cntrs;
@@ -2654,11 +2600,6 @@ static void intel_pmu_cpu_starting(int cpu)
 		}
 		cpuc->excl_cntrs->core_id = core_id;
 		cpuc->excl_cntrs->refcnt++;
-		/*
-		 * set hard limit to half the number of generic counters
-		 */
-		cpuc->excl_cntrs->states[0].max_alloc_cntrs = h;
-		cpuc->excl_cntrs->states[1].max_alloc_cntrs = h;
 	}
 }
 
@@ -2956,8 +2897,8 @@ static __init void intel_ht_bug(void)
 {
 	x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
 
-	x86_pmu.commit_scheduling = intel_commit_scheduling;
 	x86_pmu.start_scheduling = intel_start_scheduling;
+	x86_pmu.commit_scheduling = intel_commit_scheduling;
 	x86_pmu.stop_scheduling = intel_stop_scheduling;
 }
 
@@ -3403,7 +3344,7 @@ static __init int fixup_ht_bug(void)
 	if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
 		return 0;
 
-	w = cpumask_weight(topology_thread_cpumask(cpu));
+	w = cpumask_weight(topology_sibling_cpumask(cpu));
 	if (w > 1) {
 		pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
 		return 0;
@@ -3413,8 +3354,8 @@ static __init int fixup_ht_bug(void)
 
 	x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
 
-	x86_pmu.commit_scheduling = NULL;
 	x86_pmu.start_scheduling = NULL;
+	x86_pmu.commit_scheduling = NULL;
 	x86_pmu.stop_scheduling = NULL;
 
 	watchdog_nmi_enable_all();
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index e4d1b8b738fa8a..188076161c1be5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -13,16 +13,35 @@
 #define MSR_IA32_QM_CTR		0x0c8e
 #define MSR_IA32_QM_EVTSEL	0x0c8d
 
-static unsigned int cqm_max_rmid = -1;
+static u32 cqm_max_rmid = -1;
 static unsigned int cqm_l3_scale; /* supposedly cacheline size */
 
-struct intel_cqm_state {
-	raw_spinlock_t		lock;
-	int			rmid;
-	int			cnt;
+/**
+ * struct intel_pqr_state - State cache for the PQR MSR
+ * @rmid:		The cached Resource Monitoring ID
+ * @closid:		The cached Class Of Service ID
+ * @rmid_usecnt:	The usage counter for rmid
+ *
+ * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
+ * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
+ * contains both parts, so we need to cache them.
+ *
+ * The cache also helps to avoid pointless updates if the value does
+ * not change.
+ */
+struct intel_pqr_state {
+	u32			rmid;
+	u32			closid;
+	int			rmid_usecnt;
 };
 
-static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);
+/*
+ * The cached intel_pqr_state is strictly per CPU and can never be
+ * updated from a remote CPU. Both functions which modify the state
+ * (intel_cqm_event_start and intel_cqm_event_stop) are called with
+ * interrupts disabled, which is sufficient for the protection.
+ */
+static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
 
 /*
  * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
@@ -57,7 +76,7 @@ static cpumask_t cqm_cpumask;
  * near-zero occupancy value, i.e. no cachelines are tagged with this
  * RMID, once __intel_cqm_rmid_rotate() returns.
  */
-static unsigned int intel_cqm_rotation_rmid;
+static u32 intel_cqm_rotation_rmid;
 
 #define INVALID_RMID		(-1)
 
@@ -69,7 +88,7 @@ static unsigned int intel_cqm_rotation_rmid;
  * Likewise, an rmid value of -1 is used to indicate "no rmid currently
  * assigned" and is used as part of the rotation code.
  */
-static inline bool __rmid_valid(unsigned int rmid)
+static inline bool __rmid_valid(u32 rmid)
 {
 	if (!rmid || rmid == INVALID_RMID)
 		return false;
@@ -77,7 +96,7 @@ static inline bool __rmid_valid(unsigned int rmid)
 	return true;
 }
 
-static u64 __rmid_read(unsigned int rmid)
+static u64 __rmid_read(u32 rmid)
 {
 	u64 val;
 
@@ -102,7 +121,7 @@ enum rmid_recycle_state {
 };
 
 struct cqm_rmid_entry {
-	unsigned int rmid;
+	u32 rmid;
 	enum rmid_recycle_state state;
 	struct list_head list;
 	unsigned long queue_time;
@@ -147,7 +166,7 @@ static LIST_HEAD(cqm_rmid_limbo_lru);
  */
 static struct cqm_rmid_entry **cqm_rmid_ptrs;
 
-static inline struct cqm_rmid_entry *__rmid_entry(int rmid)
+static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid)
 {
 	struct cqm_rmid_entry *entry;
 
@@ -162,7 +181,7 @@ static inline struct cqm_rmid_entry *__rmid_entry(int rmid)
  *
  * We expect to be called with cache_mutex held.
  */
-static int __get_rmid(void)
+static u32 __get_rmid(void)
 {
 	struct cqm_rmid_entry *entry;
 
@@ -177,7 +196,7 @@ static int __get_rmid(void)
 	return entry->rmid;
 }
 
-static void __put_rmid(unsigned int rmid)
+static void __put_rmid(u32 rmid)
 {
 	struct cqm_rmid_entry *entry;
 
@@ -372,7 +391,7 @@ static bool __conflict_event(struct perf_event *a, struct perf_event *b)
 }
 
 struct rmid_read {
-	unsigned int rmid;
+	u32 rmid;
 	atomic64_t value;
 };
 
@@ -381,12 +400,11 @@ static void __intel_cqm_event_count(void *info);
 /*
  * Exchange the RMID of a group of events.
  */
-static unsigned int
-intel_cqm_xchg_rmid(struct perf_event *group, unsigned int rmid)
+static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
 {
 	struct perf_event *event;
-	unsigned int old_rmid = group->hw.cqm_rmid;
 	struct list_head *head = &group->hw.cqm_group_entry;
+	u32 old_rmid = group->hw.cqm_rmid;
 
 	lockdep_assert_held(&cache_mutex);
 
@@ -451,7 +469,7 @@ static void intel_cqm_stable(void *arg)
  * If we have group events waiting for an RMID that don't conflict with
  * events already running, assign @rmid.
  */
-static bool intel_cqm_sched_in_event(unsigned int rmid)
+static bool intel_cqm_sched_in_event(u32 rmid)
 {
 	struct perf_event *leader, *event;
 
@@ -598,7 +616,7 @@ static bool intel_cqm_rmid_stabilize(unsigned int *available)
 static void __intel_cqm_pick_and_rotate(struct perf_event *next)
 {
 	struct perf_event *rotor;
-	unsigned int rmid;
+	u32 rmid;
 
 	lockdep_assert_held(&cache_mutex);
 
@@ -626,7 +644,7 @@ static void __intel_cqm_pick_and_rotate(struct perf_event *next)
 static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
 {
 	struct perf_event *group, *g;
-	unsigned int rmid;
+	u32 rmid;
 
 	lockdep_assert_held(&cache_mutex);
 
@@ -828,8 +846,8 @@ static void intel_cqm_setup_event(struct perf_event *event,
 				  struct perf_event **group)
 {
 	struct perf_event *iter;
-	unsigned int rmid;
 	bool conflict = false;
+	u32 rmid;
 
 	list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
 		rmid = iter->hw.cqm_rmid;
@@ -860,7 +878,7 @@ static void intel_cqm_setup_event(struct perf_event *event,
 static void intel_cqm_event_read(struct perf_event *event)
 {
 	unsigned long flags;
-	unsigned int rmid;
+	u32 rmid;
 	u64 val;
 
 	/*
@@ -961,55 +979,48 @@ out:
 
 static void intel_cqm_event_start(struct perf_event *event, int mode)
 {
-	struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
-	unsigned int rmid = event->hw.cqm_rmid;
-	unsigned long flags;
+	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+	u32 rmid = event->hw.cqm_rmid;
 
 	if (!(event->hw.cqm_state & PERF_HES_STOPPED))
 		return;
 
 	event->hw.cqm_state &= ~PERF_HES_STOPPED;
 
-	raw_spin_lock_irqsave(&state->lock, flags);
-
-	if (state->cnt++)
-		WARN_ON_ONCE(state->rmid != rmid);
-	else
+	if (state->rmid_usecnt++) {
+		if (!WARN_ON_ONCE(state->rmid != rmid))
+			return;
+	} else {
 		WARN_ON_ONCE(state->rmid);
+	}
 
 	state->rmid = rmid;
-	wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid);
-
-	raw_spin_unlock_irqrestore(&state->lock, flags);
+	wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
 }
 
 static void intel_cqm_event_stop(struct perf_event *event, int mode)
 {
-	struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
-	unsigned long flags;
+	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
 
 	if (event->hw.cqm_state & PERF_HES_STOPPED)
 		return;
 
 	event->hw.cqm_state |= PERF_HES_STOPPED;
 
-	raw_spin_lock_irqsave(&state->lock, flags);
 	intel_cqm_event_read(event);
 
-	if (!--state->cnt) {
+	if (!--state->rmid_usecnt) {
 		state->rmid = 0;
-		wrmsrl(MSR_IA32_PQR_ASSOC, 0);
+		wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid);
 	} else {
 		WARN_ON_ONCE(!state->rmid);
 	}
-
-	raw_spin_unlock_irqrestore(&state->lock, flags);
 }
 
 static int intel_cqm_event_add(struct perf_event *event, int mode)
 {
 	unsigned long flags;
-	unsigned int rmid;
+	u32 rmid;
 
 	raw_spin_lock_irqsave(&cache_lock, flags);
 
@@ -1024,11 +1035,6 @@ static int intel_cqm_event_add(struct perf_event *event, int mode)
 	return 0;
 }
 
-static void intel_cqm_event_del(struct perf_event *event, int mode)
-{
-	intel_cqm_event_stop(event, mode);
-}
-
 static void intel_cqm_event_destroy(struct perf_event *event)
 {
 	struct perf_event *group_other = NULL;
@@ -1057,7 +1063,7 @@ static void intel_cqm_event_destroy(struct perf_event *event)
 			list_replace(&event->hw.cqm_groups_entry,
 				     &group_other->hw.cqm_groups_entry);
 		} else {
-			unsigned int rmid = event->hw.cqm_rmid;
+			u32 rmid = event->hw.cqm_rmid;
 
 			if (__rmid_valid(rmid))
 				__put_rmid(rmid);
@@ -1221,7 +1227,7 @@ static struct pmu intel_cqm_pmu = {
 	.task_ctx_nr	     = perf_sw_context,
 	.event_init	     = intel_cqm_event_init,
 	.add		     = intel_cqm_event_add,
-	.del		     = intel_cqm_event_del,
+	.del		     = intel_cqm_event_stop,
 	.start		     = intel_cqm_event_start,
 	.stop		     = intel_cqm_event_stop,
 	.read		     = intel_cqm_event_read,
@@ -1243,12 +1249,12 @@ static inline void cqm_pick_event_reader(int cpu)
 
 static void intel_cqm_cpu_prepare(unsigned int cpu)
 {
-	struct intel_cqm_state *state = &per_cpu(cqm_state, cpu);
+	struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 
-	raw_spin_lock_init(&state->lock);
 	state->rmid = 0;
-	state->cnt  = 0;
+	state->closid = 0;
+	state->rmid_usecnt = 0;
 
 	WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
 	WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 813f75d71175e3..7f73b3553e2ee0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -706,9 +706,9 @@ void intel_pmu_pebs_disable(struct perf_event *event)
 
 	cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
 
-	if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_LDLAT)
+	if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
 		cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
-	else if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_ST)
+	else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
 		cpuc->pebs_enabled &= ~(1ULL << 63);
 
 	if (cpuc->enabled)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
index ffe666c2c6b586..59596d25cb2950 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -187,15 +187,6 @@ static bool pt_event_valid(struct perf_event *event)
  * These all are cpu affine and operate on a local PT
  */
 
-static bool pt_is_running(void)
-{
-	u64 ctl;
-
-	rdmsrl(MSR_IA32_RTIT_CTL, ctl);
-
-	return !!(ctl & RTIT_CTL_TRACEEN);
-}
-
 static void pt_config(struct perf_event *event)
 {
 	u64 reg;
@@ -609,16 +600,19 @@ static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
  * @handle:	Current output handle.
  *
  * Place INT and STOP marks to prevent overwriting old data that the consumer
- * hasn't yet collected.
+ * hasn't yet collected and waking up the consumer after a certain fraction of
+ * the buffer has filled up. Only needed and sensible for non-snapshot counters.
+ *
+ * This obviously relies on buf::head to figure out buffer markers, so it has
+ * to be called after pt_buffer_reset_offsets() and before the hardware tracing
+ * is enabled.
  */
 static int pt_buffer_reset_markers(struct pt_buffer *buf,
 				   struct perf_output_handle *handle)
 
 {
-	unsigned long idx, npages, end;
-
-	if (buf->snapshot)
-		return 0;
+	unsigned long head = local64_read(&buf->head);
+	unsigned long idx, npages, wakeup;
 
 	/* can't stop in the middle of an output region */
 	if (buf->output_off + handle->size + 1 <
@@ -634,17 +628,26 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf,
 	buf->topa_index[buf->stop_pos]->stop = 0;
 	buf->topa_index[buf->intr_pos]->intr = 0;
 
-	if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
-		npages = (handle->size + 1) >> PAGE_SHIFT;
-		end = (local64_read(&buf->head) >> PAGE_SHIFT) + npages;
-		/*if (end > handle->wakeup >> PAGE_SHIFT)
-		  end = handle->wakeup >> PAGE_SHIFT;*/
-		idx = end & (buf->nr_pages - 1);
-		buf->stop_pos = idx;
-		idx = (local64_read(&buf->head) >> PAGE_SHIFT) + npages - 1;
-		idx &= buf->nr_pages - 1;
-		buf->intr_pos = idx;
-	}
+	/* how many pages till the STOP marker */
+	npages = handle->size >> PAGE_SHIFT;
+
+	/* if it's on a page boundary, fill up one more page */
+	if (!offset_in_page(head + handle->size + 1))
+		npages++;
+
+	idx = (head >> PAGE_SHIFT) + npages;
+	idx &= buf->nr_pages - 1;
+	buf->stop_pos = idx;
+
+	wakeup = handle->wakeup >> PAGE_SHIFT;
+
+	/* in the worst case, wake up the consumer one page before hard stop */
+	idx = (head >> PAGE_SHIFT) + npages - 1;
+	if (idx > wakeup)
+		idx = wakeup;
+
+	idx &= buf->nr_pages - 1;
+	buf->intr_pos = idx;
 
 	buf->topa_index[buf->stop_pos]->stop = 1;
 	buf->topa_index[buf->intr_pos]->intr = 1;
@@ -664,7 +667,7 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
 	struct topa *cur = buf->first, *prev = buf->last;
 	struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
 		*te_prev = TOPA_ENTRY(prev, prev->last - 1);
-	int pg = 0, idx = 0, ntopa = 0;
+	int pg = 0, idx = 0;
 
 	while (pg < buf->nr_pages) {
 		int tidx;
@@ -679,9 +682,9 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
 			/* advance to next topa table */
 			idx = 0;
 			cur = list_entry(cur->list.next, struct topa, list);
-			ntopa++;
-		} else
+		} else {
 			idx++;
+		}
 		te_cur = TOPA_ENTRY(cur, idx);
 	}
 
@@ -693,7 +696,14 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
  * @head:	Write pointer (aux_head) from AUX buffer.
  *
  * Find the ToPA table and entry corresponding to given @head and set buffer's
- * "current" pointers accordingly.
+ * "current" pointers accordingly. This is done after we have obtained the
+ * current aux_head position from a successful call to perf_aux_output_begin()
+ * to make sure the hardware is writing to the right place.
+ *
+ * This function modifies buf::{cur,cur_idx,output_off} that will be programmed
+ * into PT msrs when the tracing is enabled and buf::head and buf::data_size,
+ * which are used to determine INT and STOP markers' locations by a subsequent
+ * call to pt_buffer_reset_markers().
  */
 static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
 {
@@ -891,6 +901,7 @@ void intel_pt_interrupt(void)
 		}
 
 		pt_buffer_reset_offsets(buf, pt->handle.head);
+		/* snapshot counters don't use PMI, so it's safe */
 		ret = pt_buffer_reset_markers(buf, &pt->handle);
 		if (ret) {
 			perf_aux_output_end(&pt->handle, 0, true);
@@ -913,7 +924,7 @@ static void pt_event_start(struct perf_event *event, int mode)
 	struct pt *pt = this_cpu_ptr(&pt_ctx);
 	struct pt_buffer *buf = perf_get_aux(&pt->handle);
 
-	if (pt_is_running() || !buf || pt_buffer_is_full(buf, pt)) {
+	if (!buf || pt_buffer_is_full(buf, pt)) {
 		event->hw.state = PERF_HES_STOPPED;
 		return;
 	}
@@ -944,7 +955,6 @@ static void pt_event_stop(struct perf_event *event, int mode)
 	event->hw.state = PERF_HES_STOPPED;
 
 	if (mode & PERF_EF_UPDATE) {
-		struct pt *pt = this_cpu_ptr(&pt_ctx);
 		struct pt_buffer *buf = perf_get_aux(&pt->handle);
 
 		if (!buf)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index 358c54ad20d408..5cbd4e64feb582 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -204,9 +204,8 @@ again:
 
 static void rapl_start_hrtimer(struct rapl_pmu *pmu)
 {
-	__hrtimer_start_range_ns(&pmu->hrtimer,
-			pmu->timer_interval, 0,
-			HRTIMER_MODE_REL_PINNED, 0);
+       hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
+		     HRTIMER_MODE_REL_PINNED);
 }
 
 static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index c635b8b49e931e..471fccb0a144b2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -233,9 +233,8 @@ static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
 
 void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
 {
-	__hrtimer_start_range_ns(&box->hrtimer,
-			ns_to_ktime(box->hrtimer_duration), 0,
-			HRTIMER_MODE_REL_PINNED, 0);
+	hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration),
+		      HRTIMER_MODE_REL_PINNED);
 }
 
 void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
@@ -365,9 +364,8 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int
 	bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);
 
 	for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
-		hwc = &box->event_list[i]->hw;
 		c = uncore_get_event_constraint(box, box->event_list[i]);
-		hwc->constraint = c;
+		box->event_constraint[i] = c;
 		wmin = min(wmin, c->weight);
 		wmax = max(wmax, c->weight);
 	}
@@ -375,7 +373,7 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int
 	/* fastpath, try to reuse previous register */
 	for (i = 0; i < n; i++) {
 		hwc = &box->event_list[i]->hw;
-		c = hwc->constraint;
+		c = box->event_constraint[i];
 
 		/* never assigned */
 		if (hwc->idx == -1)
@@ -395,8 +393,8 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int
 	}
 	/* slow path */
 	if (i != n)
-		ret = perf_assign_events(box->event_list, n,
-					 wmin, wmax, assign);
+		ret = perf_assign_events(box->event_constraint, n,
+					 wmin, wmax, n, assign);
 
 	if (!assign || ret) {
 		for (i = 0; i < n; i++)
@@ -922,6 +920,9 @@ static int __init uncore_pci_init(void)
 	case 69: /* Haswell Celeron */
 		ret = hsw_uncore_pci_init();
 		break;
+	case 61: /* Broadwell */
+		ret = bdw_uncore_pci_init();
+		break;
 	default:
 		return 0;
 	}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 6c8c1e7e69d85d..ebab4ba4c0d6c4 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -97,6 +97,7 @@ struct intel_uncore_box {
 	atomic_t refcnt;
 	struct perf_event *events[UNCORE_PMC_IDX_MAX];
 	struct perf_event *event_list[UNCORE_PMC_IDX_MAX];
+	struct event_constraint *event_constraint[UNCORE_PMC_IDX_MAX];
 	unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
 	u64 tags[UNCORE_PMC_IDX_MAX];
 	struct pci_dev *pci_dev;
@@ -326,6 +327,7 @@ extern struct event_constraint uncore_constraint_empty;
 int snb_uncore_pci_init(void);
 int ivb_uncore_pci_init(void);
 int hsw_uncore_pci_init(void);
+int bdw_uncore_pci_init(void);
 void snb_uncore_cpu_init(void);
 void nhm_uncore_cpu_init(void);
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index 4562e9e22c6060..b005a78c701286 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -7,6 +7,7 @@
 #define PCI_DEVICE_ID_INTEL_IVB_E3_IMC	0x0150
 #define PCI_DEVICE_ID_INTEL_HSW_IMC	0x0c00
 #define PCI_DEVICE_ID_INTEL_HSW_U_IMC	0x0a04
+#define PCI_DEVICE_ID_INTEL_BDW_IMC	0x1604
 
 /* SNB event control */
 #define SNB_UNC_CTL_EV_SEL_MASK			0x000000ff
@@ -486,6 +487,14 @@ static const struct pci_device_id hsw_uncore_pci_ids[] = {
 	{ /* end: all zeroes */ },
 };
 
+static const struct pci_device_id bdw_uncore_pci_ids[] = {
+	{ /* IMC */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+	},
+	{ /* end: all zeroes */ },
+};
+
 static struct pci_driver snb_uncore_pci_driver = {
 	.name		= "snb_uncore",
 	.id_table	= snb_uncore_pci_ids,
@@ -501,6 +510,11 @@ static struct pci_driver hsw_uncore_pci_driver = {
 	.id_table	= hsw_uncore_pci_ids,
 };
 
+static struct pci_driver bdw_uncore_pci_driver = {
+	.name		= "bdw_uncore",
+	.id_table	= bdw_uncore_pci_ids,
+};
+
 struct imc_uncore_pci_dev {
 	__u32 pci_id;
 	struct pci_driver *driver;
@@ -514,6 +528,7 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
 	IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */
 	IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver),    /* 4th Gen Core Processor */
 	IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver),  /* 4th Gen Core ULT Mobile Processor */
+	IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver),    /* 5th Gen Core U */
 	{  /* end marker */ }
 };
 
@@ -561,6 +576,11 @@ int hsw_uncore_pci_init(void)
 	return imc_uncore_pci_init();
 }
 
+int bdw_uncore_pci_init(void)
+{
+	return imc_uncore_pci_init();
+}
+
 /* end of Sandy Bridge uncore support */
 
 /* Nehalem uncore support */
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index e7d8c7608471e9..18ca99f2798b16 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -12,7 +12,8 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
 {
 #ifdef CONFIG_SMP
 	seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
-	seq_printf(m, "siblings\t: %d\n", cpumask_weight(cpu_core_mask(cpu)));
+	seq_printf(m, "siblings\t: %d\n",
+		   cpumask_weight(topology_core_cpumask(cpu)));
 	seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
 	seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
 	seq_printf(m, "apicid\t\t: %d\n", c->apicid);
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 6367a780cc8ca8..5ee771859b6f6e 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -4,7 +4,6 @@
 #include <linux/bootmem.h>
 #include <linux/export.h>
 #include <linux/io.h>
-#include <linux/irqdomain.h>
 #include <linux/interrupt.h>
 #include <linux/list.h>
 #include <linux/of.h>
@@ -17,6 +16,7 @@
 #include <linux/of_pci.h>
 #include <linux/initrd.h>
 
+#include <asm/irqdomain.h>
 #include <asm/hpet.h>
 #include <asm/apic.h>
 #include <asm/pci_x86.h>
@@ -196,38 +196,31 @@ static struct of_ioapic_type of_ioapic_type[] =
 	},
 };
 
-static int ioapic_xlate(struct irq_domain *domain,
-			struct device_node *controller,
-			const u32 *intspec, u32 intsize,
-			irq_hw_number_t *out_hwirq, u32 *out_type)
+static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+			      unsigned int nr_irqs, void *arg)
 {
+	struct of_phandle_args *irq_data = (void *)arg;
 	struct of_ioapic_type *it;
-	u32 line, idx, gsi;
+	struct irq_alloc_info tmp;
 
-	if (WARN_ON(intsize < 2))
+	if (WARN_ON(irq_data->args_count < 2))
 		return -EINVAL;
-
-	line = intspec[0];
-
-	if (intspec[1] >= ARRAY_SIZE(of_ioapic_type))
+	if (irq_data->args[1] >= ARRAY_SIZE(of_ioapic_type))
 		return -EINVAL;
 
-	it = &of_ioapic_type[intspec[1]];
+	it = &of_ioapic_type[irq_data->args[1]];
+	ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity);
+	tmp.ioapic_id = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain));
+	tmp.ioapic_pin = irq_data->args[0];
 
-	idx = (u32)(long)domain->host_data;
-	gsi = mp_pin_to_gsi(idx, line);
-	if (mp_set_gsi_attr(gsi, it->trigger, it->polarity, cpu_to_node(0)))
-		return -EBUSY;
-
-	*out_hwirq = line;
-	*out_type = it->out_type;
-	return 0;
+	return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp);
 }
 
-const struct irq_domain_ops ioapic_irq_domain_ops = {
-	.map = mp_irqdomain_map,
-	.unmap = mp_irqdomain_unmap,
-	.xlate = ioapic_xlate,
+static const struct irq_domain_ops ioapic_irq_domain_ops = {
+	.alloc		= dt_irqdomain_alloc,
+	.free		= mp_irqdomain_free,
+	.activate	= mp_irqdomain_activate,
+	.deactivate	= mp_irqdomain_deactivate,
 };
 
 static void __init dtb_add_ioapic(struct device_node *dn)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 1c309763e32197..0ac73de925d1dc 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -50,7 +50,6 @@
 #include <asm/smp.h>
 #include <asm/page_types.h>
 #include <asm/percpu.h>
-#include <asm/dwarf2.h>
 #include <asm/processor-flags.h>
 #include <asm/ftrace.h>
 #include <asm/irq_vectors.h>
@@ -113,11 +112,10 @@
 
  /* unfortunately push/pop can't be no-op */
 .macro PUSH_GS
-	pushl_cfi $0
+	pushl $0
 .endm
 .macro POP_GS pop=0
 	addl $(4 + \pop), %esp
-	CFI_ADJUST_CFA_OFFSET -(4 + \pop)
 .endm
 .macro POP_GS_EX
 .endm
@@ -137,16 +135,13 @@
 #else	/* CONFIG_X86_32_LAZY_GS */
 
 .macro PUSH_GS
-	pushl_cfi %gs
-	/*CFI_REL_OFFSET gs, 0*/
+	pushl %gs
 .endm
 
 .macro POP_GS pop=0
-98:	popl_cfi %gs
-	/*CFI_RESTORE gs*/
+98:	popl %gs
   .if \pop <> 0
 	add $\pop, %esp
-	CFI_ADJUST_CFA_OFFSET -\pop
   .endif
 .endm
 .macro POP_GS_EX
@@ -170,11 +165,9 @@
 
 .macro GS_TO_REG reg
 	movl %gs, \reg
-	/*CFI_REGISTER gs, \reg*/
 .endm
 .macro REG_TO_PTGS reg
 	movl \reg, PT_GS(%esp)
-	/*CFI_REL_OFFSET gs, PT_GS*/
 .endm
 .macro SET_KERNEL_GS reg
 	movl $(__KERNEL_STACK_CANARY), \reg
@@ -186,26 +179,16 @@
 .macro SAVE_ALL
 	cld
 	PUSH_GS
-	pushl_cfi %fs
-	/*CFI_REL_OFFSET fs, 0;*/
-	pushl_cfi %es
-	/*CFI_REL_OFFSET es, 0;*/
-	pushl_cfi %ds
-	/*CFI_REL_OFFSET ds, 0;*/
-	pushl_cfi %eax
-	CFI_REL_OFFSET eax, 0
-	pushl_cfi %ebp
-	CFI_REL_OFFSET ebp, 0
-	pushl_cfi %edi
-	CFI_REL_OFFSET edi, 0
-	pushl_cfi %esi
-	CFI_REL_OFFSET esi, 0
-	pushl_cfi %edx
-	CFI_REL_OFFSET edx, 0
-	pushl_cfi %ecx
-	CFI_REL_OFFSET ecx, 0
-	pushl_cfi %ebx
-	CFI_REL_OFFSET ebx, 0
+	pushl %fs
+	pushl %es
+	pushl %ds
+	pushl %eax
+	pushl %ebp
+	pushl %edi
+	pushl %esi
+	pushl %edx
+	pushl %ecx
+	pushl %ebx
 	movl $(__USER_DS), %edx
 	movl %edx, %ds
 	movl %edx, %es
@@ -215,30 +198,20 @@
 .endm
 
 .macro RESTORE_INT_REGS
-	popl_cfi %ebx
-	CFI_RESTORE ebx
-	popl_cfi %ecx
-	CFI_RESTORE ecx
-	popl_cfi %edx
-	CFI_RESTORE edx
-	popl_cfi %esi
-	CFI_RESTORE esi
-	popl_cfi %edi
-	CFI_RESTORE edi
-	popl_cfi %ebp
-	CFI_RESTORE ebp
-	popl_cfi %eax
-	CFI_RESTORE eax
+	popl %ebx
+	popl %ecx
+	popl %edx
+	popl %esi
+	popl %edi
+	popl %ebp
+	popl %eax
 .endm
 
 .macro RESTORE_REGS pop=0
 	RESTORE_INT_REGS
-1:	popl_cfi %ds
-	/*CFI_RESTORE ds;*/
-2:	popl_cfi %es
-	/*CFI_RESTORE es;*/
-3:	popl_cfi %fs
-	/*CFI_RESTORE fs;*/
+1:	popl %ds
+2:	popl %es
+3:	popl %fs
 	POP_GS \pop
 .pushsection .fixup, "ax"
 4:	movl $0, (%esp)
@@ -254,64 +227,27 @@
 	POP_GS_EX
 .endm
 
-.macro RING0_INT_FRAME
-	CFI_STARTPROC simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA esp, 3*4
-	/*CFI_OFFSET cs, -2*4;*/
-	CFI_OFFSET eip, -3*4
-.endm
-
-.macro RING0_EC_FRAME
-	CFI_STARTPROC simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA esp, 4*4
-	/*CFI_OFFSET cs, -2*4;*/
-	CFI_OFFSET eip, -3*4
-.endm
-
-.macro RING0_PTREGS_FRAME
-	CFI_STARTPROC simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA esp, PT_OLDESP-PT_EBX
-	/*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/
-	CFI_OFFSET eip, PT_EIP-PT_OLDESP
-	/*CFI_OFFSET es, PT_ES-PT_OLDESP;*/
-	/*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/
-	CFI_OFFSET eax, PT_EAX-PT_OLDESP
-	CFI_OFFSET ebp, PT_EBP-PT_OLDESP
-	CFI_OFFSET edi, PT_EDI-PT_OLDESP
-	CFI_OFFSET esi, PT_ESI-PT_OLDESP
-	CFI_OFFSET edx, PT_EDX-PT_OLDESP
-	CFI_OFFSET ecx, PT_ECX-PT_OLDESP
-	CFI_OFFSET ebx, PT_EBX-PT_OLDESP
-.endm
-
 ENTRY(ret_from_fork)
-	CFI_STARTPROC
-	pushl_cfi %eax
+	pushl %eax
 	call schedule_tail
 	GET_THREAD_INFO(%ebp)
-	popl_cfi %eax
-	pushl_cfi $0x0202		# Reset kernel eflags
-	popfl_cfi
+	popl %eax
+	pushl $0x0202		# Reset kernel eflags
+	popfl
 	jmp syscall_exit
-	CFI_ENDPROC
 END(ret_from_fork)
 
 ENTRY(ret_from_kernel_thread)
-	CFI_STARTPROC
-	pushl_cfi %eax
+	pushl %eax
 	call schedule_tail
 	GET_THREAD_INFO(%ebp)
-	popl_cfi %eax
-	pushl_cfi $0x0202		# Reset kernel eflags
-	popfl_cfi
+	popl %eax
+	pushl $0x0202		# Reset kernel eflags
+	popfl
 	movl PT_EBP(%esp),%eax
 	call *PT_EBX(%esp)
 	movl $0,PT_EAX(%esp)
 	jmp syscall_exit
-	CFI_ENDPROC
 ENDPROC(ret_from_kernel_thread)
 
 /*
@@ -323,7 +259,6 @@ ENDPROC(ret_from_kernel_thread)
 
 	# userspace resumption stub bypassing syscall exit tracing
 	ALIGN
-	RING0_PTREGS_FRAME
 ret_from_exception:
 	preempt_stop(CLBR_ANY)
 ret_from_intr:
@@ -367,17 +302,12 @@ need_resched:
 	jmp need_resched
 END(resume_kernel)
 #endif
-	CFI_ENDPROC
 
 /* SYSENTER_RETURN points to after the "sysenter" instruction in
    the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  */
 
 	# sysenter call handler stub
 ENTRY(ia32_sysenter_target)
-	CFI_STARTPROC simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA esp, 0
-	CFI_REGISTER esp, ebp
 	movl TSS_sysenter_sp0(%esp),%esp
 sysenter_past_esp:
 	/*
@@ -385,14 +315,11 @@ sysenter_past_esp:
 	 * enough kernel state to call TRACE_IRQS_OFF can be called - but
 	 * we immediately enable interrupts at that point anyway.
 	 */
-	pushl_cfi $__USER_DS
-	/*CFI_REL_OFFSET ss, 0*/
-	pushl_cfi %ebp
-	CFI_REL_OFFSET esp, 0
-	pushfl_cfi
+	pushl $__USER_DS
+	pushl %ebp
+	pushfl
 	orl $X86_EFLAGS_IF, (%esp)
-	pushl_cfi $__USER_CS
-	/*CFI_REL_OFFSET cs, 0*/
+	pushl $__USER_CS
 	/*
 	 * Push current_thread_info()->sysenter_return to the stack.
 	 * A tiny bit of offset fixup is necessary: TI_sysenter_return
@@ -401,10 +328,9 @@ sysenter_past_esp:
 	 * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack;
 	 * and THREAD_SIZE takes us to the bottom.
 	 */
-	pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp)
-	CFI_REL_OFFSET eip, 0
+	pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp)
 
-	pushl_cfi %eax
+	pushl %eax
 	SAVE_ALL
 	ENABLE_INTERRUPTS(CLBR_NONE)
 
@@ -453,11 +379,11 @@ sysenter_audit:
 	/* movl PT_EAX(%esp), %eax	already set, syscall number: 1st arg to audit */
 	movl PT_EBX(%esp), %edx		/* ebx/a0: 2nd arg to audit */
 	/* movl PT_ECX(%esp), %ecx	already set, a1: 3nd arg to audit */
-	pushl_cfi PT_ESI(%esp)		/* a3: 5th arg */
-	pushl_cfi PT_EDX+4(%esp)	/* a2: 4th arg */
+	pushl PT_ESI(%esp)		/* a3: 5th arg */
+	pushl PT_EDX+4(%esp)	/* a2: 4th arg */
 	call __audit_syscall_entry
-	popl_cfi %ecx /* get that remapped edx off the stack */
-	popl_cfi %ecx /* get that remapped esi off the stack */
+	popl %ecx /* get that remapped edx off the stack */
+	popl %ecx /* get that remapped esi off the stack */
 	movl PT_EAX(%esp),%eax		/* reload syscall number */
 	jmp sysenter_do_call
 
@@ -480,7 +406,6 @@ sysexit_audit:
 	jmp sysenter_exit
 #endif
 
-	CFI_ENDPROC
 .pushsection .fixup,"ax"
 2:	movl $0,PT_FS(%esp)
 	jmp 1b
@@ -491,9 +416,8 @@ ENDPROC(ia32_sysenter_target)
 
 	# system call handler stub
 ENTRY(system_call)
-	RING0_INT_FRAME			# can't unwind into user space anyway
 	ASM_CLAC
-	pushl_cfi %eax			# save orig_eax
+	pushl %eax			# save orig_eax
 	SAVE_ALL
 	GET_THREAD_INFO(%ebp)
 					# system call tracing in operation / emulation
@@ -527,7 +451,6 @@ restore_all_notrace:
 	movb PT_CS(%esp), %al
 	andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
 	cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
-	CFI_REMEMBER_STATE
 	je ldt_ss			# returning to user-space with LDT SS
 #endif
 restore_nocheck:
@@ -543,7 +466,6 @@ ENTRY(iret_exc)
 	_ASM_EXTABLE(irq_return,iret_exc)
 
 #ifdef CONFIG_X86_ESPFIX32
-	CFI_RESTORE_STATE
 ldt_ss:
 #ifdef CONFIG_PARAVIRT
 	/*
@@ -577,22 +499,19 @@ ldt_ss:
 	shr $16, %edx
 	mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
 	mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
-	pushl_cfi $__ESPFIX_SS
-	pushl_cfi %eax			/* new kernel esp */
+	pushl $__ESPFIX_SS
+	pushl %eax			/* new kernel esp */
 	/* Disable interrupts, but do not irqtrace this section: we
 	 * will soon execute iret and the tracer was already set to
 	 * the irqstate after the iret */
 	DISABLE_INTERRUPTS(CLBR_EAX)
 	lss (%esp), %esp		/* switch to espfix segment */
-	CFI_ADJUST_CFA_OFFSET -8
 	jmp restore_nocheck
 #endif
-	CFI_ENDPROC
 ENDPROC(system_call)
 
 	# perform work that needs to be done immediately before resumption
 	ALIGN
-	RING0_PTREGS_FRAME		# can't unwind into user space anyway
 work_pending:
 	testb $_TIF_NEED_RESCHED, %cl
 	jz work_notifysig
@@ -634,9 +553,9 @@ work_notifysig:				# deal with pending signals and
 #ifdef CONFIG_VM86
 	ALIGN
 work_notifysig_v86:
-	pushl_cfi %ecx			# save ti_flags for do_notify_resume
+	pushl %ecx			# save ti_flags for do_notify_resume
 	call save_v86_state		# %eax contains pt_regs pointer
-	popl_cfi %ecx
+	popl %ecx
 	movl %eax, %esp
 	jmp 1b
 #endif
@@ -666,9 +585,7 @@ syscall_exit_work:
 	call syscall_trace_leave
 	jmp resume_userspace
 END(syscall_exit_work)
-	CFI_ENDPROC
 
-	RING0_INT_FRAME			# can't unwind into user space anyway
 syscall_fault:
 	ASM_CLAC
 	GET_THREAD_INFO(%ebp)
@@ -685,7 +602,6 @@ sysenter_badsys:
 	movl $-ENOSYS,%eax
 	jmp sysenter_after_call
 END(sysenter_badsys)
-	CFI_ENDPROC
 
 .macro FIXUP_ESPFIX_STACK
 /*
@@ -701,10 +617,9 @@ END(sysenter_badsys)
 	mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
 	shl $16, %eax
 	addl %esp, %eax			/* the adjusted stack pointer */
-	pushl_cfi $__KERNEL_DS
-	pushl_cfi %eax
+	pushl $__KERNEL_DS
+	pushl %eax
 	lss (%esp), %esp		/* switch to the normal stack segment */
-	CFI_ADJUST_CFA_OFFSET -8
 #endif
 .endm
 .macro UNWIND_ESPFIX_STACK
@@ -728,13 +643,11 @@ END(sysenter_badsys)
  */
 	.align 8
 ENTRY(irq_entries_start)
-	RING0_INT_FRAME
     vector=FIRST_EXTERNAL_VECTOR
     .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
-	pushl_cfi $(~vector+0x80)	/* Note: always in signed byte range */
+	pushl $(~vector+0x80)	/* Note: always in signed byte range */
     vector=vector+1
 	jmp	common_interrupt
-	CFI_ADJUST_CFA_OFFSET -4
 	.align	8
     .endr
 END(irq_entries_start)
@@ -753,19 +666,16 @@ common_interrupt:
 	call do_IRQ
 	jmp ret_from_intr
 ENDPROC(common_interrupt)
-	CFI_ENDPROC
 
 #define BUILD_INTERRUPT3(name, nr, fn)	\
 ENTRY(name)				\
-	RING0_INT_FRAME;		\
 	ASM_CLAC;			\
-	pushl_cfi $~(nr);		\
+	pushl $~(nr);		\
 	SAVE_ALL;			\
 	TRACE_IRQS_OFF			\
 	movl %esp,%eax;			\
 	call fn;			\
 	jmp ret_from_intr;		\
-	CFI_ENDPROC;			\
 ENDPROC(name)
 
 
@@ -784,37 +694,31 @@ ENDPROC(name)
 #include <asm/entry_arch.h>
 
 ENTRY(coprocessor_error)
-	RING0_INT_FRAME
 	ASM_CLAC
-	pushl_cfi $0
-	pushl_cfi $do_coprocessor_error
+	pushl $0
+	pushl $do_coprocessor_error
 	jmp error_code
-	CFI_ENDPROC
 END(coprocessor_error)
 
 ENTRY(simd_coprocessor_error)
-	RING0_INT_FRAME
 	ASM_CLAC
-	pushl_cfi $0
+	pushl $0
 #ifdef CONFIG_X86_INVD_BUG
 	/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
-	ALTERNATIVE "pushl_cfi $do_general_protection",	\
+	ALTERNATIVE "pushl $do_general_protection",	\
 		    "pushl $do_simd_coprocessor_error", \
 		    X86_FEATURE_XMM
 #else
-	pushl_cfi $do_simd_coprocessor_error
+	pushl $do_simd_coprocessor_error
 #endif
 	jmp error_code
-	CFI_ENDPROC
 END(simd_coprocessor_error)
 
 ENTRY(device_not_available)
-	RING0_INT_FRAME
 	ASM_CLAC
-	pushl_cfi $-1			# mark this as an int
-	pushl_cfi $do_device_not_available
+	pushl $-1			# mark this as an int
+	pushl $do_device_not_available
 	jmp error_code
-	CFI_ENDPROC
 END(device_not_available)
 
 #ifdef CONFIG_PARAVIRT
@@ -830,115 +734,89 @@ END(native_irq_enable_sysexit)
 #endif
 
 ENTRY(overflow)
-	RING0_INT_FRAME
 	ASM_CLAC
-	pushl_cfi $0
-	pushl_cfi $do_overflow
+	pushl $0
+	pushl $do_overflow
 	jmp error_code
-	CFI_ENDPROC
 END(overflow)
 
 ENTRY(bounds)
-	RING0_INT_FRAME
 	ASM_CLAC
-	pushl_cfi $0
-	pushl_cfi $do_bounds
+	pushl $0
+	pushl $do_bounds
 	jmp error_code
-	CFI_ENDPROC
 END(bounds)
 
 ENTRY(invalid_op)
-	RING0_INT_FRAME
 	ASM_CLAC
-	pushl_cfi $0
-	pushl_cfi $do_invalid_op
+	pushl $0
+	pushl $do_invalid_op
 	jmp error_code
-	CFI_ENDPROC
 END(invalid_op)
 
 ENTRY(coprocessor_segment_overrun)
-	RING0_INT_FRAME
 	ASM_CLAC
-	pushl_cfi $0
-	pushl_cfi $do_coprocessor_segment_overrun
+	pushl $0
+	pushl $do_coprocessor_segment_overrun
 	jmp error_code
-	CFI_ENDPROC
 END(coprocessor_segment_overrun)
 
 ENTRY(invalid_TSS)
-	RING0_EC_FRAME
 	ASM_CLAC
-	pushl_cfi $do_invalid_TSS
+	pushl $do_invalid_TSS
 	jmp error_code
-	CFI_ENDPROC
 END(invalid_TSS)
 
 ENTRY(segment_not_present)
-	RING0_EC_FRAME
 	ASM_CLAC
-	pushl_cfi $do_segment_not_present
+	pushl $do_segment_not_present
 	jmp error_code
-	CFI_ENDPROC
 END(segment_not_present)
 
 ENTRY(stack_segment)
-	RING0_EC_FRAME
 	ASM_CLAC
-	pushl_cfi $do_stack_segment
+	pushl $do_stack_segment
 	jmp error_code
-	CFI_ENDPROC
 END(stack_segment)
 
 ENTRY(alignment_check)
-	RING0_EC_FRAME
 	ASM_CLAC
-	pushl_cfi $do_alignment_check
+	pushl $do_alignment_check
 	jmp error_code
-	CFI_ENDPROC
 END(alignment_check)
 
 ENTRY(divide_error)
-	RING0_INT_FRAME
 	ASM_CLAC
-	pushl_cfi $0			# no error code
-	pushl_cfi $do_divide_error
+	pushl $0			# no error code
+	pushl $do_divide_error
 	jmp error_code
-	CFI_ENDPROC
 END(divide_error)
 
 #ifdef CONFIG_X86_MCE
 ENTRY(machine_check)
-	RING0_INT_FRAME
 	ASM_CLAC
-	pushl_cfi $0
-	pushl_cfi machine_check_vector
+	pushl $0
+	pushl machine_check_vector
 	jmp error_code
-	CFI_ENDPROC
 END(machine_check)
 #endif
 
 ENTRY(spurious_interrupt_bug)
-	RING0_INT_FRAME
 	ASM_CLAC
-	pushl_cfi $0
-	pushl_cfi $do_spurious_interrupt_bug
+	pushl $0
+	pushl $do_spurious_interrupt_bug
 	jmp error_code
-	CFI_ENDPROC
 END(spurious_interrupt_bug)
 
 #ifdef CONFIG_XEN
 /* Xen doesn't set %esp to be precisely what the normal sysenter
    entrypoint expects, so fix it up before using the normal path. */
 ENTRY(xen_sysenter_target)
-	RING0_INT_FRAME
 	addl $5*4, %esp		/* remove xen-provided frame */
-	CFI_ADJUST_CFA_OFFSET -5*4
 	jmp sysenter_past_esp
-	CFI_ENDPROC
 
 ENTRY(xen_hypervisor_callback)
-	CFI_STARTPROC
-	pushl_cfi $-1 /* orig_ax = -1 => not a system call */
+	pushl $-1 /* orig_ax = -1 => not a system call */
 	SAVE_ALL
 	TRACE_IRQS_OFF
 
@@ -962,7 +840,6 @@ ENTRY(xen_do_upcall)
 	call xen_maybe_preempt_hcall
 #endif
 	jmp  ret_from_intr
-	CFI_ENDPROC
 ENDPROC(xen_hypervisor_callback)
 
 # Hypervisor uses this for application faults while it executes.
@@ -976,8 +853,7 @@ ENDPROC(xen_hypervisor_callback)
 # to pop the stack frame we end up in an infinite loop of failsafe callbacks.
 # We distinguish between categories by maintaining a status value in EAX.
 ENTRY(xen_failsafe_callback)
-	CFI_STARTPROC
-	pushl_cfi %eax
+	pushl %eax
 	movl $1,%eax
 1:	mov 4(%esp),%ds
 2:	mov 8(%esp),%es
@@ -986,15 +862,13 @@ ENTRY(xen_failsafe_callback)
 	/* EAX == 0 => Category 1 (Bad segment)
 	   EAX != 0 => Category 2 (Bad IRET) */
 	testl %eax,%eax
-	popl_cfi %eax
+	popl %eax
 	lea 16(%esp),%esp
-	CFI_ADJUST_CFA_OFFSET -16
 	jz 5f
 	jmp iret_exc
-5:	pushl_cfi $-1 /* orig_ax = -1 => not a system call */
+5:	pushl $-1 /* orig_ax = -1 => not a system call */
 	SAVE_ALL
 	jmp ret_from_exception
-	CFI_ENDPROC
 
 .section .fixup,"ax"
 6:	xorl %eax,%eax
@@ -1195,34 +1069,28 @@ return_to_handler:
 
 #ifdef CONFIG_TRACING
 ENTRY(trace_page_fault)
-	RING0_EC_FRAME
 	ASM_CLAC
-	pushl_cfi $trace_do_page_fault
+	pushl $trace_do_page_fault
 	jmp error_code
-	CFI_ENDPROC
 END(trace_page_fault)
 #endif
 
 ENTRY(page_fault)
-	RING0_EC_FRAME
 	ASM_CLAC
-	pushl_cfi $do_page_fault
+	pushl $do_page_fault
 	ALIGN
 error_code:
 	/* the function address is in %gs's slot on the stack */
-	pushl_cfi %fs
-	/*CFI_REL_OFFSET fs, 0*/
-	pushl_cfi %es
-	/*CFI_REL_OFFSET es, 0*/
-	pushl_cfi %ds
-	/*CFI_REL_OFFSET ds, 0*/
-	pushl_cfi_reg eax
-	pushl_cfi_reg ebp
-	pushl_cfi_reg edi
-	pushl_cfi_reg esi
-	pushl_cfi_reg edx
-	pushl_cfi_reg ecx
-	pushl_cfi_reg ebx
+	pushl %fs
+	pushl %es
+	pushl %ds
+	pushl %eax
+	pushl %ebp
+	pushl %edi
+	pushl %esi
+	pushl %edx
+	pushl %ecx
+	pushl %ebx
 	cld
 	movl $(__KERNEL_PERCPU), %ecx
 	movl %ecx, %fs
@@ -1240,7 +1108,6 @@ error_code:
 	movl %esp,%eax			# pt_regs pointer
 	call *%edi
 	jmp ret_from_exception
-	CFI_ENDPROC
 END(page_fault)
 
 /*
@@ -1261,29 +1128,24 @@ END(page_fault)
 	jne \ok
 \label:
 	movl TSS_sysenter_sp0 + \offset(%esp), %esp
-	CFI_DEF_CFA esp, 0
-	CFI_UNDEFINED eip
-	pushfl_cfi
-	pushl_cfi $__KERNEL_CS
-	pushl_cfi $sysenter_past_esp
-	CFI_REL_OFFSET eip, 0
+	pushfl
+	pushl $__KERNEL_CS
+	pushl $sysenter_past_esp
 .endm
 
 ENTRY(debug)
-	RING0_INT_FRAME
 	ASM_CLAC
 	cmpl $ia32_sysenter_target,(%esp)
 	jne debug_stack_correct
 	FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
 debug_stack_correct:
-	pushl_cfi $-1			# mark this as an int
+	pushl $-1			# mark this as an int
 	SAVE_ALL
 	TRACE_IRQS_OFF
 	xorl %edx,%edx			# error code 0
 	movl %esp,%eax			# pt_regs pointer
 	call do_debug
 	jmp ret_from_exception
-	CFI_ENDPROC
 END(debug)
 
 /*
@@ -1295,45 +1157,40 @@ END(debug)
  * fault happened on the sysenter path.
  */
 ENTRY(nmi)
-	RING0_INT_FRAME
 	ASM_CLAC
 #ifdef CONFIG_X86_ESPFIX32
-	pushl_cfi %eax
+	pushl %eax
 	movl %ss, %eax
 	cmpw $__ESPFIX_SS, %ax
-	popl_cfi %eax
+	popl %eax
 	je nmi_espfix_stack
 #endif
 	cmpl $ia32_sysenter_target,(%esp)
 	je nmi_stack_fixup
-	pushl_cfi %eax
+	pushl %eax
 	movl %esp,%eax
 	/* Do not access memory above the end of our stack page,
 	 * it might not exist.
 	 */
 	andl $(THREAD_SIZE-1),%eax
 	cmpl $(THREAD_SIZE-20),%eax
-	popl_cfi %eax
+	popl %eax
 	jae nmi_stack_correct
 	cmpl $ia32_sysenter_target,12(%esp)
 	je nmi_debug_stack_check
 nmi_stack_correct:
-	/* We have a RING0_INT_FRAME here */
-	pushl_cfi %eax
+	pushl %eax
 	SAVE_ALL
 	xorl %edx,%edx		# zero error code
 	movl %esp,%eax		# pt_regs pointer
 	call do_nmi
 	jmp restore_all_notrace
-	CFI_ENDPROC
 
 nmi_stack_fixup:
-	RING0_INT_FRAME
 	FIX_STACK 12, nmi_stack_correct, 1
 	jmp nmi_stack_correct
 
 nmi_debug_stack_check:
-	/* We have a RING0_INT_FRAME here */
 	cmpw $__KERNEL_CS,16(%esp)
 	jne nmi_stack_correct
 	cmpl $debug,(%esp)
@@ -1345,57 +1202,48 @@ nmi_debug_stack_check:
 
 #ifdef CONFIG_X86_ESPFIX32
 nmi_espfix_stack:
-	/* We have a RING0_INT_FRAME here.
-	 *
+	/*
 	 * create the pointer to lss back
 	 */
-	pushl_cfi %ss
-	pushl_cfi %esp
+	pushl %ss
+	pushl %esp
 	addl $4, (%esp)
 	/* copy the iret frame of 12 bytes */
 	.rept 3
-	pushl_cfi 16(%esp)
+	pushl 16(%esp)
 	.endr
-	pushl_cfi %eax
+	pushl %eax
 	SAVE_ALL
 	FIXUP_ESPFIX_STACK		# %eax == %esp
 	xorl %edx,%edx			# zero error code
 	call do_nmi
 	RESTORE_REGS
 	lss 12+4(%esp), %esp		# back to espfix stack
-	CFI_ADJUST_CFA_OFFSET -24
 	jmp irq_return
 #endif
-	CFI_ENDPROC
 END(nmi)
 
 ENTRY(int3)
-	RING0_INT_FRAME
 	ASM_CLAC
-	pushl_cfi $-1			# mark this as an int
+	pushl $-1			# mark this as an int
 	SAVE_ALL
 	TRACE_IRQS_OFF
 	xorl %edx,%edx		# zero error code
 	movl %esp,%eax		# pt_regs pointer
 	call do_int3
 	jmp ret_from_exception
-	CFI_ENDPROC
 END(int3)
 
 ENTRY(general_protection)
-	RING0_EC_FRAME
-	pushl_cfi $do_general_protection
+	pushl $do_general_protection
 	jmp error_code
-	CFI_ENDPROC
 END(general_protection)
 
 #ifdef CONFIG_KVM_GUEST
 ENTRY(async_page_fault)
-	RING0_EC_FRAME
 	ASM_CLAC
-	pushl_cfi $do_async_page_fault
+	pushl $do_async_page_fault
 	jmp error_code
-	CFI_ENDPROC
 END(async_page_fault)
 #endif
 
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 02c2eff7478da6..c6c11ac8b53fb1 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -19,8 +19,6 @@
  * at the top of the kernel process stack.
  *
  * Some macro usage:
- * - CFI macros are used to generate dwarf2 unwind information for better
- * backtraces. They don't change any code.
  * - ENTRY/END Define functions in the symbol table.
  * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
  * - idtentry - Define exception entry points.
@@ -30,7 +28,6 @@
 #include <asm/segment.h>
 #include <asm/cache.h>
 #include <asm/errno.h>
-#include <asm/dwarf2.h>
 #include <asm/calling.h>
 #include <asm/asm-offsets.h>
 #include <asm/msr.h>
@@ -113,61 +110,6 @@ ENDPROC(native_usergs_sysret64)
 #endif
 
 /*
- * empty frame
- */
-	.macro EMPTY_FRAME start=1 offset=0
-	.if \start
-	CFI_STARTPROC simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA rsp,8+\offset
-	.else
-	CFI_DEF_CFA_OFFSET 8+\offset
-	.endif
-	.endm
-
-/*
- * initial frame state for interrupts (and exceptions without error code)
- */
-	.macro INTR_FRAME start=1 offset=0
-	EMPTY_FRAME \start, 5*8+\offset
-	/*CFI_REL_OFFSET ss, 4*8+\offset*/
-	CFI_REL_OFFSET rsp, 3*8+\offset
-	/*CFI_REL_OFFSET rflags, 2*8+\offset*/
-	/*CFI_REL_OFFSET cs, 1*8+\offset*/
-	CFI_REL_OFFSET rip, 0*8+\offset
-	.endm
-
-/*
- * initial frame state for exceptions with error code (and interrupts
- * with vector already pushed)
- */
-	.macro XCPT_FRAME start=1 offset=0
-	INTR_FRAME \start, 1*8+\offset
-	.endm
-
-/*
- * frame that enables passing a complete pt_regs to a C function.
- */
-	.macro DEFAULT_FRAME start=1 offset=0
-	XCPT_FRAME \start, ORIG_RAX+\offset
-	CFI_REL_OFFSET rdi, RDI+\offset
-	CFI_REL_OFFSET rsi, RSI+\offset
-	CFI_REL_OFFSET rdx, RDX+\offset
-	CFI_REL_OFFSET rcx, RCX+\offset
-	CFI_REL_OFFSET rax, RAX+\offset
-	CFI_REL_OFFSET r8, R8+\offset
-	CFI_REL_OFFSET r9, R9+\offset
-	CFI_REL_OFFSET r10, R10+\offset
-	CFI_REL_OFFSET r11, R11+\offset
-	CFI_REL_OFFSET rbx, RBX+\offset
-	CFI_REL_OFFSET rbp, RBP+\offset
-	CFI_REL_OFFSET r12, R12+\offset
-	CFI_REL_OFFSET r13, R13+\offset
-	CFI_REL_OFFSET r14, R14+\offset
-	CFI_REL_OFFSET r15, R15+\offset
-	.endm
-
-/*
  * 64bit SYSCALL instruction entry. Up to 6 arguments in registers.
  *
  * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
@@ -196,12 +138,6 @@ ENDPROC(native_usergs_sysret64)
  */
 
 ENTRY(system_call)
-	CFI_STARTPROC	simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,0
-	CFI_REGISTER	rip,rcx
-	/*CFI_REGISTER	rflags,r11*/
-
 	/*
 	 * Interrupts are off on entry.
 	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
@@ -216,11 +152,11 @@ ENTRY(system_call)
 GLOBAL(system_call_after_swapgs)
 
 	movq	%rsp,PER_CPU_VAR(rsp_scratch)
-	movq	PER_CPU_VAR(kernel_stack),%rsp
+	movq	PER_CPU_VAR(cpu_current_top_of_stack),%rsp
 
 	/* Construct struct pt_regs on stack */
-	pushq_cfi $__USER_DS			/* pt_regs->ss */
-	pushq_cfi PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
+	pushq $__USER_DS			/* pt_regs->ss */
+	pushq PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
 	/*
 	 * Re-enable interrupts.
 	 * We use 'rsp_scratch' as a scratch space, hence irq-off block above
@@ -229,22 +165,20 @@ GLOBAL(system_call_after_swapgs)
 	 * with using rsp_scratch:
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	pushq_cfi	%r11			/* pt_regs->flags */
-	pushq_cfi	$__USER_CS		/* pt_regs->cs */
-	pushq_cfi	%rcx			/* pt_regs->ip */
-	CFI_REL_OFFSET rip,0
-	pushq_cfi_reg	rax			/* pt_regs->orig_ax */
-	pushq_cfi_reg	rdi			/* pt_regs->di */
-	pushq_cfi_reg	rsi			/* pt_regs->si */
-	pushq_cfi_reg	rdx			/* pt_regs->dx */
-	pushq_cfi_reg	rcx			/* pt_regs->cx */
-	pushq_cfi	$-ENOSYS		/* pt_regs->ax */
-	pushq_cfi_reg	r8			/* pt_regs->r8 */
-	pushq_cfi_reg	r9			/* pt_regs->r9 */
-	pushq_cfi_reg	r10			/* pt_regs->r10 */
-	pushq_cfi_reg	r11			/* pt_regs->r11 */
+	pushq	%r11			/* pt_regs->flags */
+	pushq	$__USER_CS		/* pt_regs->cs */
+	pushq	%rcx			/* pt_regs->ip */
+	pushq	%rax			/* pt_regs->orig_ax */
+	pushq	%rdi			/* pt_regs->di */
+	pushq	%rsi			/* pt_regs->si */
+	pushq	%rdx			/* pt_regs->dx */
+	pushq	%rcx			/* pt_regs->cx */
+	pushq	$-ENOSYS		/* pt_regs->ax */
+	pushq	%r8			/* pt_regs->r8 */
+	pushq	%r9			/* pt_regs->r9 */
+	pushq	%r10			/* pt_regs->r10 */
+	pushq	%r11			/* pt_regs->r11 */
 	sub	$(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */
-	CFI_ADJUST_CFA_OFFSET 6*8
 
 	testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz tracesys
@@ -282,13 +216,9 @@ system_call_fastpath:
 	testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz int_ret_from_sys_call_irqs_off	/* Go to the slow path */
 
-	CFI_REMEMBER_STATE
-
 	RESTORE_C_REGS_EXCEPT_RCX_R11
 	movq	RIP(%rsp),%rcx
-	CFI_REGISTER	rip,rcx
 	movq	EFLAGS(%rsp),%r11
-	/*CFI_REGISTER	rflags,r11*/
 	movq	RSP(%rsp),%rsp
 	/*
 	 * 64bit SYSRET restores rip from rcx,
@@ -307,8 +237,6 @@ system_call_fastpath:
 	 */
 	USERGS_SYSRET64
 
-	CFI_RESTORE_STATE
-
 	/* Do syscall entry tracing */
 tracesys:
 	movq %rsp, %rdi
@@ -374,9 +302,9 @@ int_careful:
 	jnc  int_very_careful
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	pushq_cfi %rdi
+	pushq %rdi
 	SCHEDULE_USER
-	popq_cfi %rdi
+	popq %rdi
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	jmp int_with_check
@@ -389,10 +317,10 @@ int_very_careful:
 	/* Check for syscall exit trace */
 	testl $_TIF_WORK_SYSCALL_EXIT,%edx
 	jz int_signal
-	pushq_cfi %rdi
+	pushq %rdi
 	leaq 8(%rsp),%rdi	# &ptregs -> arg1
 	call syscall_trace_leave
-	popq_cfi %rdi
+	popq %rdi
 	andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
 	jmp int_restore_rest
 
@@ -419,26 +347,27 @@ syscall_return:
 	 * a completely clean 64-bit userspace context.
 	 */
 	movq RCX(%rsp),%rcx
-	cmpq %rcx,RIP(%rsp)		/* RCX == RIP */
+	movq RIP(%rsp),%r11
+	cmpq %rcx,%r11			/* RCX == RIP */
 	jne opportunistic_sysret_failed
 
 	/*
 	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
 	 * in kernel space.  This essentially lets the user take over
-	 * the kernel, since userspace controls RSP.  It's not worth
-	 * testing for canonicalness exactly -- this check detects any
-	 * of the 17 high bits set, which is true for non-canonical
-	 * or kernel addresses.  (This will pessimize vsyscall=native.
-	 * Big deal.)
+	 * the kernel, since userspace controls RSP.
 	 *
-	 * If virtual addresses ever become wider, this will need
+	 * If width of "canonical tail" ever becomes variable, this will need
 	 * to be updated to remain correct on both old and new CPUs.
 	 */
 	.ifne __VIRTUAL_MASK_SHIFT - 47
 	.error "virtual address width changed -- SYSRET checks need update"
 	.endif
-	shr $__VIRTUAL_MASK_SHIFT, %rcx
-	jnz opportunistic_sysret_failed
+	/* Change top 16 bits to be the sign-extension of 47th bit */
+	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
+	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
+	/* If this changed %rcx, it was not canonical */
+	cmpq	%rcx, %r11
+	jne	opportunistic_sysret_failed
 
 	cmpq $__USER_CS,CS(%rsp)	/* CS must match SYSRET */
 	jne opportunistic_sysret_failed
@@ -474,27 +403,21 @@ syscall_return:
 	 * perf profiles.  Nothing jumps here.
 	 */
 syscall_return_via_sysret:
-	CFI_REMEMBER_STATE
-	/* r11 is already restored (see code above) */
-	RESTORE_C_REGS_EXCEPT_R11
+	/* rcx and r11 are already restored (see code above) */
+	RESTORE_C_REGS_EXCEPT_RCX_R11
 	movq RSP(%rsp),%rsp
 	USERGS_SYSRET64
-	CFI_RESTORE_STATE
 
 opportunistic_sysret_failed:
 	SWAPGS
 	jmp	restore_c_regs_and_iret
-	CFI_ENDPROC
 END(system_call)
 
 
 	.macro FORK_LIKE func
 ENTRY(stub_\func)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8		/* offset 8: return address */
 	SAVE_EXTRA_REGS 8
 	jmp sys_\func
-	CFI_ENDPROC
 END(stub_\func)
 	.endm
 
@@ -503,8 +426,6 @@ END(stub_\func)
 	FORK_LIKE  vfork
 
 ENTRY(stub_execve)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8
 	call	sys_execve
 return_from_execve:
 	testl	%eax, %eax
@@ -514,11 +435,9 @@ return_from_execve:
 1:
 	/* must use IRET code path (pt_regs->cs may have changed) */
 	addq	$8, %rsp
-	CFI_ADJUST_CFA_OFFSET -8
 	ZERO_EXTRA_REGS
 	movq	%rax,RAX(%rsp)
 	jmp	int_ret_from_sys_call
-	CFI_ENDPROC
 END(stub_execve)
 /*
  * Remaining execve stubs are only 7 bytes long.
@@ -526,47 +445,25 @@ END(stub_execve)
  */
 	.align	8
 GLOBAL(stub_execveat)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8
 	call	sys_execveat
 	jmp	return_from_execve
-	CFI_ENDPROC
 END(stub_execveat)
 
-#ifdef CONFIG_X86_X32_ABI
+#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION)
 	.align	8
 GLOBAL(stub_x32_execve)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8
-	call	compat_sys_execve
-	jmp	return_from_execve
-	CFI_ENDPROC
-END(stub_x32_execve)
-	.align	8
-GLOBAL(stub_x32_execveat)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8
-	call	compat_sys_execveat
-	jmp	return_from_execve
-	CFI_ENDPROC
-END(stub_x32_execveat)
-#endif
-
-#ifdef CONFIG_IA32_EMULATION
-	.align	8
 GLOBAL(stub32_execve)
-	CFI_STARTPROC
 	call	compat_sys_execve
 	jmp	return_from_execve
-	CFI_ENDPROC
 END(stub32_execve)
+END(stub_x32_execve)
 	.align	8
+GLOBAL(stub_x32_execveat)
 GLOBAL(stub32_execveat)
-	CFI_STARTPROC
 	call	compat_sys_execveat
 	jmp	return_from_execve
-	CFI_ENDPROC
 END(stub32_execveat)
+END(stub_x32_execveat)
 #endif
 
 /*
@@ -574,8 +471,6 @@ END(stub32_execveat)
  * This cannot be done with SYSRET, so use the IRET return path instead.
  */
 ENTRY(stub_rt_sigreturn)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8
 	/*
 	 * SAVE_EXTRA_REGS result is not normally needed:
 	 * sigreturn overwrites all pt_regs->GPREGS.
@@ -587,21 +482,16 @@ ENTRY(stub_rt_sigreturn)
 	call sys_rt_sigreturn
 return_from_stub:
 	addq	$8, %rsp
-	CFI_ADJUST_CFA_OFFSET -8
 	RESTORE_EXTRA_REGS
 	movq %rax,RAX(%rsp)
 	jmp int_ret_from_sys_call
-	CFI_ENDPROC
 END(stub_rt_sigreturn)
 
 #ifdef CONFIG_X86_X32_ABI
 ENTRY(stub_x32_rt_sigreturn)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8
 	SAVE_EXTRA_REGS 8
 	call sys32_x32_rt_sigreturn
 	jmp  return_from_stub
-	CFI_ENDPROC
 END(stub_x32_rt_sigreturn)
 #endif
 
@@ -611,18 +501,17 @@ END(stub_x32_rt_sigreturn)
  * rdi: prev task we switched from
  */
 ENTRY(ret_from_fork)
-	DEFAULT_FRAME
 
 	LOCK ; btr $TIF_FORK,TI_flags(%r8)
 
-	pushq_cfi $0x0002
-	popfq_cfi				# reset kernel eflags
+	pushq $0x0002
+	popfq				# reset kernel eflags
 
 	call schedule_tail			# rdi: 'prev' task parameter
 
 	RESTORE_EXTRA_REGS
 
-	testl $3,CS(%rsp)			# from kernel_thread?
+	testb	$3, CS(%rsp)			# from kernel_thread?
 
 	/*
 	 * By the time we get here, we have no idea whether our pt_regs,
@@ -640,7 +529,6 @@ ENTRY(ret_from_fork)
 	movl $0, RAX(%rsp)
 	RESTORE_EXTRA_REGS
 	jmp int_ret_from_sys_call
-	CFI_ENDPROC
 END(ret_from_fork)
 
 /*
@@ -649,16 +537,13 @@ END(ret_from_fork)
  */
 	.align 8
 ENTRY(irq_entries_start)
-	INTR_FRAME
     vector=FIRST_EXTERNAL_VECTOR
     .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
-	pushq_cfi $(~vector+0x80)	/* Note: always in signed byte range */
+	pushq $(~vector+0x80)	/* Note: always in signed byte range */
     vector=vector+1
 	jmp	common_interrupt
-	CFI_ADJUST_CFA_OFFSET -8
 	.align	8
     .endr
-	CFI_ENDPROC
 END(irq_entries_start)
 
 /*
@@ -686,8 +571,8 @@ END(irq_entries_start)
 
 	leaq -RBP(%rsp),%rdi	/* arg1 for \func (pointer to pt_regs) */
 
-	testl $3, CS-RBP(%rsp)
-	je 1f
+	testb	$3, CS-RBP(%rsp)
+	jz	1f
 	SWAPGS
 1:
 	/*
@@ -700,17 +585,7 @@ END(irq_entries_start)
 	movq %rsp, %rsi
 	incl PER_CPU_VAR(irq_count)
 	cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
-	CFI_DEF_CFA_REGISTER	rsi
 	pushq %rsi
-	/*
-	 * For debugger:
-	 * "CFA (Current Frame Address) is the value on stack + offset"
-	 */
-	CFI_ESCAPE	0x0f /* DW_CFA_def_cfa_expression */, 6, \
-			0x77 /* DW_OP_breg7 (rsp) */, 0, \
-			0x06 /* DW_OP_deref */, \
-			0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \
-			0x22 /* DW_OP_plus */
 	/* We entered an interrupt context - irqs are off: */
 	TRACE_IRQS_OFF
 
@@ -723,7 +598,6 @@ END(irq_entries_start)
 	 */
 	.p2align CONFIG_X86_L1_CACHE_SHIFT
 common_interrupt:
-	XCPT_FRAME
 	ASM_CLAC
 	addq $-0x80,(%rsp)		/* Adjust vector to [-256,-1] range */
 	interrupt do_IRQ
@@ -735,16 +609,13 @@ ret_from_intr:
 
 	/* Restore saved previous stack */
 	popq %rsi
-	CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */
 	/* return code expects complete pt_regs - adjust rsp accordingly: */
 	leaq -RBP(%rsi),%rsp
-	CFI_DEF_CFA_REGISTER	rsp
-	CFI_ADJUST_CFA_OFFSET	RBP
 
-	testl $3,CS(%rsp)
-	je retint_kernel
+	testb	$3, CS(%rsp)
+	jz	retint_kernel
 	/* Interrupt came from user space */
-
+retint_user:
 	GET_THREAD_INFO(%rcx)
 	/*
 	 * %rcx: thread info. Interrupts off.
@@ -755,7 +626,6 @@ retint_check:
 	LOCKDEP_SYS_EXIT_IRQ
 	movl TI_flags(%rcx),%edx
 	andl %edi,%edx
-	CFI_REMEMBER_STATE
 	jnz  retint_careful
 
 retint_swapgs:		/* return to user-space */
@@ -819,8 +689,8 @@ native_irq_return_iret:
 
 #ifdef CONFIG_X86_ESPFIX64
 native_irq_return_ldt:
-	pushq_cfi %rax
-	pushq_cfi %rdi
+	pushq %rax
+	pushq %rdi
 	SWAPGS
 	movq PER_CPU_VAR(espfix_waddr),%rdi
 	movq %rax,(0*8)(%rdi)	/* RAX */
@@ -835,24 +705,23 @@ native_irq_return_ldt:
 	movq (5*8)(%rsp),%rax	/* RSP */
 	movq %rax,(4*8)(%rdi)
 	andl $0xffff0000,%eax
-	popq_cfi %rdi
+	popq %rdi
 	orq PER_CPU_VAR(espfix_stack),%rax
 	SWAPGS
 	movq %rax,%rsp
-	popq_cfi %rax
+	popq %rax
 	jmp native_irq_return_iret
 #endif
 
 	/* edi: workmask, edx: work */
 retint_careful:
-	CFI_RESTORE_STATE
 	bt    $TIF_NEED_RESCHED,%edx
 	jnc   retint_signal
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	pushq_cfi %rdi
+	pushq %rdi
 	SCHEDULE_USER
-	popq_cfi %rdi
+	popq %rdi
 	GET_THREAD_INFO(%rcx)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
@@ -874,7 +743,6 @@ retint_signal:
 	GET_THREAD_INFO(%rcx)
 	jmp retint_with_reschedule
 
-	CFI_ENDPROC
 END(common_interrupt)
 
 /*
@@ -882,13 +750,11 @@ END(common_interrupt)
  */
 .macro apicinterrupt3 num sym do_sym
 ENTRY(\sym)
-	INTR_FRAME
 	ASM_CLAC
-	pushq_cfi $~(\num)
+	pushq $~(\num)
 .Lcommon_\sym:
 	interrupt \do_sym
 	jmp ret_from_intr
-	CFI_ENDPROC
 END(\sym)
 .endm
 
@@ -928,6 +794,8 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \
 #ifdef CONFIG_HAVE_KVM
 apicinterrupt3 POSTED_INTR_VECTOR \
 	kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
+apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR \
+	kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi
 #endif
 
 #ifdef CONFIG_X86_MCE_THRESHOLD
@@ -935,6 +803,11 @@ apicinterrupt THRESHOLD_APIC_VECTOR \
 	threshold_interrupt smp_threshold_interrupt
 #endif
 
+#ifdef CONFIG_X86_MCE_AMD
+apicinterrupt DEFERRED_ERROR_VECTOR \
+	deferred_error_interrupt smp_deferred_error_interrupt
+#endif
+
 #ifdef CONFIG_X86_THERMAL_VECTOR
 apicinterrupt THERMAL_APIC_VECTOR \
 	thermal_interrupt smp_thermal_interrupt
@@ -971,25 +844,18 @@ ENTRY(\sym)
 	.error "using shift_ist requires paranoid=1"
 	.endif
 
-	.if \has_error_code
-	XCPT_FRAME
-	.else
-	INTR_FRAME
-	.endif
-
 	ASM_CLAC
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
 
 	.ifeq \has_error_code
-	pushq_cfi $-1			/* ORIG_RAX: no syscall to restart */
+	pushq $-1			/* ORIG_RAX: no syscall to restart */
 	.endif
 
 	ALLOC_PT_GPREGS_ON_STACK
 
 	.if \paranoid
 	.if \paranoid == 1
-	CFI_REMEMBER_STATE
-	testl $3, CS(%rsp)		/* If coming from userspace, switch */
+	testb	$3, CS(%rsp)		/* If coming from userspace, switch */
 	jnz 1f				/* stacks. */
 	.endif
 	call paranoid_entry
@@ -998,8 +864,6 @@ ENTRY(\sym)
 	.endif
 	/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
 
-	DEFAULT_FRAME 0
-
 	.if \paranoid
 	.if \shift_ist != -1
 	TRACE_IRQS_OFF_DEBUG		/* reload IDT in case of recursion */
@@ -1035,7 +899,6 @@ ENTRY(\sym)
 	.endif
 
 	.if \paranoid == 1
-	CFI_RESTORE_STATE
 	/*
 	 * Paranoid entry from userspace.  Switch stacks and treat it
 	 * as a normal entry.  This means that paranoid handlers
@@ -1044,7 +907,6 @@ ENTRY(\sym)
 1:
 	call error_entry
 
-	DEFAULT_FRAME 0
 
 	movq %rsp,%rdi			/* pt_regs pointer */
 	call sync_regs
@@ -1063,8 +925,6 @@ ENTRY(\sym)
 
 	jmp error_exit			/* %ebx: no swapgs flag */
 	.endif
-
-	CFI_ENDPROC
 END(\sym)
 .endm
 
@@ -1097,17 +957,15 @@ idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
 	/* Reload gs selector with exception handling */
 	/* edi:  new selector */
 ENTRY(native_load_gs_index)
-	CFI_STARTPROC
-	pushfq_cfi
+	pushfq
 	DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
 	SWAPGS
 gs_change:
 	movl %edi,%gs
 2:	mfence		/* workaround */
 	SWAPGS
-	popfq_cfi
+	popfq
 	ret
-	CFI_ENDPROC
 END(native_load_gs_index)
 
 	_ASM_EXTABLE(gs_change,bad_gs)
@@ -1122,22 +980,15 @@ bad_gs:
 
 /* Call softirq on interrupt stack. Interrupts are off. */
 ENTRY(do_softirq_own_stack)
-	CFI_STARTPROC
-	pushq_cfi %rbp
-	CFI_REL_OFFSET rbp,0
+	pushq %rbp
 	mov  %rsp,%rbp
-	CFI_DEF_CFA_REGISTER rbp
 	incl PER_CPU_VAR(irq_count)
 	cmove PER_CPU_VAR(irq_stack_ptr),%rsp
 	push  %rbp			# backlink for old unwinder
 	call __do_softirq
 	leaveq
-	CFI_RESTORE		rbp
-	CFI_DEF_CFA_REGISTER	rsp
-	CFI_ADJUST_CFA_OFFSET   -8
 	decl PER_CPU_VAR(irq_count)
 	ret
-	CFI_ENDPROC
 END(do_softirq_own_stack)
 
 #ifdef CONFIG_XEN
@@ -1157,28 +1008,22 @@ idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
  * activation and restart the handler using the previous one.
  */
 ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)
-	CFI_STARTPROC
 /*
  * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
  * see the correct pointer to the pt_regs
  */
 	movq %rdi, %rsp            # we don't return, adjust the stack frame
-	CFI_ENDPROC
-	DEFAULT_FRAME
 11:	incl PER_CPU_VAR(irq_count)
 	movq %rsp,%rbp
-	CFI_DEF_CFA_REGISTER rbp
 	cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
 	pushq %rbp			# backlink for old unwinder
 	call xen_evtchn_do_upcall
 	popq %rsp
-	CFI_DEF_CFA_REGISTER rsp
 	decl PER_CPU_VAR(irq_count)
 #ifndef CONFIG_PREEMPT
 	call xen_maybe_preempt_hcall
 #endif
 	jmp  error_exit
-	CFI_ENDPROC
 END(xen_do_hypervisor_callback)
 
 /*
@@ -1195,51 +1040,35 @@ END(xen_do_hypervisor_callback)
  * with its current contents: any discrepancy means we in category 1.
  */
 ENTRY(xen_failsafe_callback)
-	INTR_FRAME 1 (6*8)
-	/*CFI_REL_OFFSET gs,GS*/
-	/*CFI_REL_OFFSET fs,FS*/
-	/*CFI_REL_OFFSET es,ES*/
-	/*CFI_REL_OFFSET ds,DS*/
-	CFI_REL_OFFSET r11,8
-	CFI_REL_OFFSET rcx,0
-	movw %ds,%cx
+	movl %ds,%ecx
 	cmpw %cx,0x10(%rsp)
-	CFI_REMEMBER_STATE
 	jne 1f
-	movw %es,%cx
+	movl %es,%ecx
 	cmpw %cx,0x18(%rsp)
 	jne 1f
-	movw %fs,%cx
+	movl %fs,%ecx
 	cmpw %cx,0x20(%rsp)
 	jne 1f
-	movw %gs,%cx
+	movl %gs,%ecx
 	cmpw %cx,0x28(%rsp)
 	jne 1f
 	/* All segments match their saved values => Category 2 (Bad IRET). */
 	movq (%rsp),%rcx
-	CFI_RESTORE rcx
 	movq 8(%rsp),%r11
-	CFI_RESTORE r11
 	addq $0x30,%rsp
-	CFI_ADJUST_CFA_OFFSET -0x30
-	pushq_cfi $0	/* RIP */
-	pushq_cfi %r11
-	pushq_cfi %rcx
+	pushq $0	/* RIP */
+	pushq %r11
+	pushq %rcx
 	jmp general_protection
-	CFI_RESTORE_STATE
 1:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
 	movq (%rsp),%rcx
-	CFI_RESTORE rcx
 	movq 8(%rsp),%r11
-	CFI_RESTORE r11
 	addq $0x30,%rsp
-	CFI_ADJUST_CFA_OFFSET -0x30
-	pushq_cfi $-1 /* orig_ax = -1 => not a system call */
+	pushq $-1 /* orig_ax = -1 => not a system call */
 	ALLOC_PT_GPREGS_ON_STACK
 	SAVE_C_REGS
 	SAVE_EXTRA_REGS
 	jmp error_exit
-	CFI_ENDPROC
 END(xen_failsafe_callback)
 
 apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
@@ -1275,7 +1104,6 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(
  * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
  */
 ENTRY(paranoid_entry)
-	XCPT_FRAME 1 15*8
 	cld
 	SAVE_C_REGS 8
 	SAVE_EXTRA_REGS 8
@@ -1287,7 +1115,6 @@ ENTRY(paranoid_entry)
 	SWAPGS
 	xorl %ebx,%ebx
 1:	ret
-	CFI_ENDPROC
 END(paranoid_entry)
 
 /*
@@ -1302,7 +1129,6 @@ END(paranoid_entry)
  */
 /* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
 ENTRY(paranoid_exit)
-	DEFAULT_FRAME
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF_DEBUG
 	testl %ebx,%ebx				/* swapgs needed? */
@@ -1317,7 +1143,6 @@ paranoid_exit_restore:
 	RESTORE_C_REGS
 	REMOVE_PT_GPREGS_FROM_STACK 8
 	INTERRUPT_RETURN
-	CFI_ENDPROC
 END(paranoid_exit)
 
 /*
@@ -1325,13 +1150,12 @@ END(paranoid_exit)
  * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
  */
 ENTRY(error_entry)
-	XCPT_FRAME 1 15*8
 	cld
 	SAVE_C_REGS 8
 	SAVE_EXTRA_REGS 8
 	xorl %ebx,%ebx
-	testl $3,CS+8(%rsp)
-	je error_kernelspace
+	testb	$3, CS+8(%rsp)
+	jz	error_kernelspace
 error_swapgs:
 	SWAPGS
 error_sti:
@@ -1345,7 +1169,6 @@ error_sti:
 	 * for these here too.
 	 */
 error_kernelspace:
-	CFI_REL_OFFSET rcx, RCX+8
 	incl %ebx
 	leaq native_irq_return_iret(%rip),%rcx
 	cmpq %rcx,RIP+8(%rsp)
@@ -1369,32 +1192,22 @@ error_bad_iret:
 	mov %rax,%rsp
 	decl %ebx	/* Return to usergs */
 	jmp error_sti
-	CFI_ENDPROC
 END(error_entry)
 
 
 /* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
 ENTRY(error_exit)
-	DEFAULT_FRAME
 	movl %ebx,%eax
 	RESTORE_EXTRA_REGS
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	GET_THREAD_INFO(%rcx)
 	testl %eax,%eax
-	jne retint_kernel
-	LOCKDEP_SYS_EXIT_IRQ
-	movl TI_flags(%rcx),%edx
-	movl $_TIF_WORK_MASK,%edi
-	andl %edi,%edx
-	jnz retint_careful
-	jmp retint_swapgs
-	CFI_ENDPROC
+	jnz retint_kernel
+	jmp retint_user
 END(error_exit)
 
 /* Runs on exception stack */
 ENTRY(nmi)
-	INTR_FRAME
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	/*
 	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
@@ -1429,8 +1242,7 @@ ENTRY(nmi)
 	 */
 
 	/* Use %rdx as our temp variable throughout */
-	pushq_cfi %rdx
-	CFI_REL_OFFSET rdx, 0
+	pushq %rdx
 
 	/*
 	 * If %cs was not the kernel segment, then the NMI triggered in user
@@ -1464,8 +1276,6 @@ ENTRY(nmi)
 	jb	first_nmi
 	/* Ah, it is within the NMI stack, treat it as nested */
 
-	CFI_REMEMBER_STATE
-
 nested_nmi:
 	/*
 	 * Do nothing if we interrupted the fixup in repeat_nmi.
@@ -1483,26 +1293,22 @@ nested_nmi:
 	/* Set up the interrupted NMIs stack to jump to repeat_nmi */
 	leaq -1*8(%rsp), %rdx
 	movq %rdx, %rsp
-	CFI_ADJUST_CFA_OFFSET 1*8
 	leaq -10*8(%rsp), %rdx
-	pushq_cfi $__KERNEL_DS
-	pushq_cfi %rdx
-	pushfq_cfi
-	pushq_cfi $__KERNEL_CS
-	pushq_cfi $repeat_nmi
+	pushq $__KERNEL_DS
+	pushq %rdx
+	pushfq
+	pushq $__KERNEL_CS
+	pushq $repeat_nmi
 
 	/* Put stack back */
 	addq $(6*8), %rsp
-	CFI_ADJUST_CFA_OFFSET -6*8
 
 nested_nmi_out:
-	popq_cfi %rdx
-	CFI_RESTORE rdx
+	popq %rdx
 
 	/* No need to check faults here */
 	INTERRUPT_RETURN
 
-	CFI_RESTORE_STATE
 first_nmi:
 	/*
 	 * Because nested NMIs will use the pushed location that we
@@ -1541,22 +1347,19 @@ first_nmi:
 	 */
 	/* Do not pop rdx, nested NMIs will corrupt that part of the stack */
 	movq (%rsp), %rdx
-	CFI_RESTORE rdx
 
 	/* Set the NMI executing variable on the stack. */
-	pushq_cfi $1
+	pushq $1
 
 	/*
 	 * Leave room for the "copied" frame
 	 */
 	subq $(5*8), %rsp
-	CFI_ADJUST_CFA_OFFSET 5*8
 
 	/* Copy the stack frame to the Saved frame */
 	.rept 5
-	pushq_cfi 11*8(%rsp)
+	pushq 11*8(%rsp)
 	.endr
-	CFI_DEF_CFA_OFFSET 5*8
 
 	/* Everything up to here is safe from nested NMIs */
 
@@ -1579,12 +1382,10 @@ repeat_nmi:
 
 	/* Make another copy, this one may be modified by nested NMIs */
 	addq $(10*8), %rsp
-	CFI_ADJUST_CFA_OFFSET -10*8
 	.rept 5
-	pushq_cfi -6*8(%rsp)
+	pushq -6*8(%rsp)
 	.endr
 	subq $(5*8), %rsp
-	CFI_DEF_CFA_OFFSET 5*8
 end_repeat_nmi:
 
 	/*
@@ -1592,7 +1393,7 @@ end_repeat_nmi:
 	 * NMI if the first NMI took an exception and reset our iret stack
 	 * so that we repeat another NMI.
 	 */
-	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
+	pushq $-1		/* ORIG_RAX: no syscall to restart */
 	ALLOC_PT_GPREGS_ON_STACK
 
 	/*
@@ -1603,7 +1404,6 @@ end_repeat_nmi:
 	 * exceptions might do.
 	 */
 	call paranoid_entry
-	DEFAULT_FRAME 0
 
 	/*
 	 * Save off the CR2 register. If we take a page fault in the NMI then
@@ -1627,7 +1427,6 @@ end_repeat_nmi:
 	je 1f
 	movq %r12, %cr2
 1:
-	
 	testl %ebx,%ebx				/* swapgs needed? */
 	jnz nmi_restore
 nmi_swapgs:
@@ -1641,13 +1440,10 @@ nmi_restore:
 	/* Clear the NMI executing stack variable */
 	movq $0, 5*8(%rsp)
 	jmp irq_return
-	CFI_ENDPROC
 END(nmi)
 
 ENTRY(ignore_sysret)
-	CFI_STARTPROC
 	mov $-ENOSYS,%eax
 	sysret
-	CFI_ENDPROC
 END(ignore_sysret)
 
diff --git a/arch/x86/kernel/fpu/Makefile b/arch/x86/kernel/fpu/Makefile
new file mode 100644
index 00000000000000..68279efb811a3c
--- /dev/null
+++ b/arch/x86/kernel/fpu/Makefile
@@ -0,0 +1,5 @@
+#
+# Build rules for the FPU support code:
+#
+
+obj-y				+= init.o bugs.o core.o regset.o signal.o xstate.o
diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c
new file mode 100644
index 00000000000000..dd9ca9b60ff3a4
--- /dev/null
+++ b/arch/x86/kernel/fpu/bugs.c
@@ -0,0 +1,71 @@
+/*
+ * x86 FPU bug checks:
+ */
+#include <asm/fpu/internal.h>
+
+/*
+ * Boot time CPU/FPU FDIV bug detection code:
+ */
+
+static double __initdata x = 4195835.0;
+static double __initdata y = 3145727.0;
+
+/*
+ * This used to check for exceptions..
+ * However, it turns out that to support that,
+ * the XMM trap handlers basically had to
+ * be buggy. So let's have a correct XMM trap
+ * handler, and forget about printing out
+ * some status at boot.
+ *
+ * We should really only care about bugs here
+ * anyway. Not features.
+ */
+static void __init check_fpu(void)
+{
+	u32 cr0_saved;
+	s32 fdiv_bug;
+
+	/* We might have CR0::TS set already, clear it: */
+	cr0_saved = read_cr0();
+	write_cr0(cr0_saved & ~X86_CR0_TS);
+
+	kernel_fpu_begin();
+
+	/*
+	 * trap_init() enabled FXSR and company _before_ testing for FP
+	 * problems here.
+	 *
+	 * Test for the divl bug: http://en.wikipedia.org/wiki/Fdiv_bug
+	 */
+	__asm__("fninit\n\t"
+		"fldl %1\n\t"
+		"fdivl %2\n\t"
+		"fmull %2\n\t"
+		"fldl %1\n\t"
+		"fsubp %%st,%%st(1)\n\t"
+		"fistpl %0\n\t"
+		"fwait\n\t"
+		"fninit"
+		: "=m" (*&fdiv_bug)
+		: "m" (*&x), "m" (*&y));
+
+	kernel_fpu_end();
+
+	write_cr0(cr0_saved);
+
+	if (fdiv_bug) {
+		set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV);
+		pr_warn("Hmm, FPU with FDIV bug\n");
+	}
+}
+
+void __init fpu__init_check_bugs(void)
+{
+	/*
+	 * kernel_fpu_begin/end() in check_fpu() relies on the patched
+	 * alternative instructions.
+	 */
+	if (cpu_has_fpu)
+		check_fpu();
+}
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
new file mode 100644
index 00000000000000..79de954626fd97
--- /dev/null
+++ b/arch/x86/kernel/fpu/core.c
@@ -0,0 +1,523 @@
+/*
+ *  Copyright (C) 1994 Linus Torvalds
+ *
+ *  Pentium III FXSR, SSE support
+ *  General FPU state handling cleanups
+ *	Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+#include <asm/fpu/internal.h>
+#include <asm/fpu/regset.h>
+#include <asm/fpu/signal.h>
+#include <asm/traps.h>
+
+#include <linux/hardirq.h>
+
+/*
+ * Represents the initial FPU state. It's mostly (but not completely) zeroes,
+ * depending on the FPU hardware format:
+ */
+union fpregs_state init_fpstate __read_mostly;
+
+/*
+ * Track whether the kernel is using the FPU state
+ * currently.
+ *
+ * This flag is used:
+ *
+ *   - by IRQ context code to potentially use the FPU
+ *     if it's unused.
+ *
+ *   - to debug kernel_fpu_begin()/end() correctness
+ */
+static DEFINE_PER_CPU(bool, in_kernel_fpu);
+
+/*
+ * Track which context is using the FPU on the CPU:
+ */
+DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
+
+static void kernel_fpu_disable(void)
+{
+	WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
+	this_cpu_write(in_kernel_fpu, true);
+}
+
+static void kernel_fpu_enable(void)
+{
+	WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
+	this_cpu_write(in_kernel_fpu, false);
+}
+
+static bool kernel_fpu_disabled(void)
+{
+	return this_cpu_read(in_kernel_fpu);
+}
+
+/*
+ * Were we in an interrupt that interrupted kernel mode?
+ *
+ * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that
+ * pair does nothing at all: the thread must not have fpu (so
+ * that we don't try to save the FPU state), and TS must
+ * be set (so that the clts/stts pair does nothing that is
+ * visible in the interrupted kernel thread).
+ *
+ * Except for the eagerfpu case when we return true; in the likely case
+ * the thread has FPU but we are not going to set/clear TS.
+ */
+static bool interrupted_kernel_fpu_idle(void)
+{
+	if (kernel_fpu_disabled())
+		return false;
+
+	if (use_eager_fpu())
+		return true;
+
+	return !current->thread.fpu.fpregs_active && (read_cr0() & X86_CR0_TS);
+}
+
+/*
+ * Were we in user mode (or vm86 mode) when we were
+ * interrupted?
+ *
+ * Doing kernel_fpu_begin/end() is ok if we are running
+ * in an interrupt context from user mode - we'll just
+ * save the FPU state as required.
+ */
+static bool interrupted_user_mode(void)
+{
+	struct pt_regs *regs = get_irq_regs();
+	return regs && user_mode(regs);
+}
+
+/*
+ * Can we use the FPU in kernel mode with the
+ * whole "kernel_fpu_begin/end()" sequence?
+ *
+ * It's always ok in process context (ie "not interrupt")
+ * but it is sometimes ok even from an irq.
+ */
+bool irq_fpu_usable(void)
+{
+	return !in_interrupt() ||
+		interrupted_user_mode() ||
+		interrupted_kernel_fpu_idle();
+}
+EXPORT_SYMBOL(irq_fpu_usable);
+
+void __kernel_fpu_begin(void)
+{
+	struct fpu *fpu = &current->thread.fpu;
+
+	WARN_ON_FPU(!irq_fpu_usable());
+
+	kernel_fpu_disable();
+
+	if (fpu->fpregs_active) {
+		copy_fpregs_to_fpstate(fpu);
+	} else {
+		this_cpu_write(fpu_fpregs_owner_ctx, NULL);
+		__fpregs_activate_hw();
+	}
+}
+EXPORT_SYMBOL(__kernel_fpu_begin);
+
+void __kernel_fpu_end(void)
+{
+	struct fpu *fpu = &current->thread.fpu;
+
+	if (fpu->fpregs_active)
+		copy_kernel_to_fpregs(&fpu->state);
+	else
+		__fpregs_deactivate_hw();
+
+	kernel_fpu_enable();
+}
+EXPORT_SYMBOL(__kernel_fpu_end);
+
+void kernel_fpu_begin(void)
+{
+	preempt_disable();
+	__kernel_fpu_begin();
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_begin);
+
+void kernel_fpu_end(void)
+{
+	__kernel_fpu_end();
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_end);
+
+/*
+ * CR0::TS save/restore functions:
+ */
+int irq_ts_save(void)
+{
+	/*
+	 * If in process context and not atomic, we can take a spurious DNA fault.
+	 * Otherwise, doing clts() in process context requires disabling preemption
+	 * or some heavy lifting like kernel_fpu_begin()
+	 */
+	if (!in_atomic())
+		return 0;
+
+	if (read_cr0() & X86_CR0_TS) {
+		clts();
+		return 1;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(irq_ts_save);
+
+void irq_ts_restore(int TS_state)
+{
+	if (TS_state)
+		stts();
+}
+EXPORT_SYMBOL_GPL(irq_ts_restore);
+
+/*
+ * Save the FPU state (mark it for reload if necessary):
+ *
+ * This only ever gets called for the current task.
+ */
+void fpu__save(struct fpu *fpu)
+{
+	WARN_ON_FPU(fpu != &current->thread.fpu);
+
+	preempt_disable();
+	if (fpu->fpregs_active) {
+		if (!copy_fpregs_to_fpstate(fpu))
+			fpregs_deactivate(fpu);
+	}
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(fpu__save);
+
+/*
+ * Legacy x87 fpstate state init:
+ */
+static inline void fpstate_init_fstate(struct fregs_state *fp)
+{
+	fp->cwd = 0xffff037fu;
+	fp->swd = 0xffff0000u;
+	fp->twd = 0xffffffffu;
+	fp->fos = 0xffff0000u;
+}
+
+void fpstate_init(union fpregs_state *state)
+{
+	if (!cpu_has_fpu) {
+		fpstate_init_soft(&state->soft);
+		return;
+	}
+
+	memset(state, 0, xstate_size);
+
+	if (cpu_has_fxsr)
+		fpstate_init_fxstate(&state->fxsave);
+	else
+		fpstate_init_fstate(&state->fsave);
+}
+EXPORT_SYMBOL_GPL(fpstate_init);
+
+/*
+ * Copy the current task's FPU state to a new task's FPU context.
+ *
+ * In both the 'eager' and the 'lazy' case we save hardware registers
+ * directly to the destination buffer.
+ */
+static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu)
+{
+	WARN_ON_FPU(src_fpu != &current->thread.fpu);
+
+	/*
+	 * Don't let 'init optimized' areas of the XSAVE area
+	 * leak into the child task:
+	 */
+	if (use_eager_fpu())
+		memset(&dst_fpu->state.xsave, 0, xstate_size);
+
+	/*
+	 * Save current FPU registers directly into the child
+	 * FPU context, without any memory-to-memory copying.
+	 *
+	 * If the FPU context got destroyed in the process (FNSAVE
+	 * done on old CPUs) then copy it back into the source
+	 * context and mark the current task for lazy restore.
+	 *
+	 * We have to do all this with preemption disabled,
+	 * mostly because of the FNSAVE case, because in that
+	 * case we must not allow preemption in the window
+	 * between the FNSAVE and us marking the context lazy.
+	 *
+	 * It shouldn't be an issue as even FNSAVE is plenty
+	 * fast in terms of critical section length.
+	 */
+	preempt_disable();
+	if (!copy_fpregs_to_fpstate(dst_fpu)) {
+		memcpy(&src_fpu->state, &dst_fpu->state, xstate_size);
+		fpregs_deactivate(src_fpu);
+	}
+	preempt_enable();
+}
+
+int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
+{
+	dst_fpu->counter = 0;
+	dst_fpu->fpregs_active = 0;
+	dst_fpu->last_cpu = -1;
+
+	if (src_fpu->fpstate_active)
+		fpu_copy(dst_fpu, src_fpu);
+
+	return 0;
+}
+
+/*
+ * Activate the current task's in-memory FPU context,
+ * if it has not been used before:
+ */
+void fpu__activate_curr(struct fpu *fpu)
+{
+	WARN_ON_FPU(fpu != &current->thread.fpu);
+
+	if (!fpu->fpstate_active) {
+		fpstate_init(&fpu->state);
+
+		/* Safe to do for the current task: */
+		fpu->fpstate_active = 1;
+	}
+}
+EXPORT_SYMBOL_GPL(fpu__activate_curr);
+
+/*
+ * This function must be called before we read a task's fpstate.
+ *
+ * If the task has not used the FPU before then initialize its
+ * fpstate.
+ *
+ * If the task has used the FPU before then save it.
+ */
+void fpu__activate_fpstate_read(struct fpu *fpu)
+{
+	/*
+	 * If fpregs are active (in the current CPU), then
+	 * copy them to the fpstate:
+	 */
+	if (fpu->fpregs_active) {
+		fpu__save(fpu);
+	} else {
+		if (!fpu->fpstate_active) {
+			fpstate_init(&fpu->state);
+
+			/* Safe to do for current and for stopped child tasks: */
+			fpu->fpstate_active = 1;
+		}
+	}
+}
+
+/*
+ * This function must be called before we write a task's fpstate.
+ *
+ * If the task has used the FPU before then unlazy it.
+ * If the task has not used the FPU before then initialize its fpstate.
+ *
+ * After this function call, after registers in the fpstate are
+ * modified and the child task has woken up, the child task will
+ * restore the modified FPU state from the modified context. If we
+ * didn't clear its lazy status here then the lazy in-registers
+ * state pending on its former CPU could be restored, corrupting
+ * the modifications.
+ */
+void fpu__activate_fpstate_write(struct fpu *fpu)
+{
+	/*
+	 * Only stopped child tasks can be used to modify the FPU
+	 * state in the fpstate buffer:
+	 */
+	WARN_ON_FPU(fpu == &current->thread.fpu);
+
+	if (fpu->fpstate_active) {
+		/* Invalidate any lazy state: */
+		fpu->last_cpu = -1;
+	} else {
+		fpstate_init(&fpu->state);
+
+		/* Safe to do for stopped child tasks: */
+		fpu->fpstate_active = 1;
+	}
+}
+
+/*
+ * 'fpu__restore()' is called to copy FPU registers from
+ * the FPU fpstate to the live hw registers and to activate
+ * access to the hardware registers, so that FPU instructions
+ * can be used afterwards.
+ *
+ * Must be called with kernel preemption disabled (for example
+ * with local interrupts disabled, as it is in the case of
+ * do_device_not_available()).
+ */
+void fpu__restore(struct fpu *fpu)
+{
+	fpu__activate_curr(fpu);
+
+	/* Avoid __kernel_fpu_begin() right after fpregs_activate() */
+	kernel_fpu_disable();
+	fpregs_activate(fpu);
+	copy_kernel_to_fpregs(&fpu->state);
+	fpu->counter++;
+	kernel_fpu_enable();
+}
+EXPORT_SYMBOL_GPL(fpu__restore);
+
+/*
+ * Drops current FPU state: deactivates the fpregs and
+ * the fpstate. NOTE: it still leaves previous contents
+ * in the fpregs in the eager-FPU case.
+ *
+ * This function can be used in cases where we know that
+ * a state-restore is coming: either an explicit one,
+ * or a reschedule.
+ */
+void fpu__drop(struct fpu *fpu)
+{
+	preempt_disable();
+	fpu->counter = 0;
+
+	if (fpu->fpregs_active) {
+		/* Ignore delayed exceptions from user space */
+		asm volatile("1: fwait\n"
+			     "2:\n"
+			     _ASM_EXTABLE(1b, 2b));
+		fpregs_deactivate(fpu);
+	}
+
+	fpu->fpstate_active = 0;
+
+	preempt_enable();
+}
+
+/*
+ * Clear FPU registers by setting them up from
+ * the init fpstate:
+ */
+static inline void copy_init_fpstate_to_fpregs(void)
+{
+	if (use_xsave())
+		copy_kernel_to_xregs(&init_fpstate.xsave, -1);
+	else
+		copy_kernel_to_fxregs(&init_fpstate.fxsave);
+}
+
+/*
+ * Clear the FPU state back to init state.
+ *
+ * Called by sys_execve(), by the signal handler code and by various
+ * error paths.
+ */
+void fpu__clear(struct fpu *fpu)
+{
+	WARN_ON_FPU(fpu != &current->thread.fpu); /* Almost certainly an anomaly */
+
+	if (!use_eager_fpu()) {
+		/* FPU state will be reallocated lazily at the first use. */
+		fpu__drop(fpu);
+	} else {
+		if (!fpu->fpstate_active) {
+			fpu__activate_curr(fpu);
+			user_fpu_begin();
+		}
+		copy_init_fpstate_to_fpregs();
+	}
+}
+
+/*
+ * x87 math exception handling:
+ */
+
+static inline unsigned short get_fpu_cwd(struct fpu *fpu)
+{
+	if (cpu_has_fxsr) {
+		return fpu->state.fxsave.cwd;
+	} else {
+		return (unsigned short)fpu->state.fsave.cwd;
+	}
+}
+
+static inline unsigned short get_fpu_swd(struct fpu *fpu)
+{
+	if (cpu_has_fxsr) {
+		return fpu->state.fxsave.swd;
+	} else {
+		return (unsigned short)fpu->state.fsave.swd;
+	}
+}
+
+static inline unsigned short get_fpu_mxcsr(struct fpu *fpu)
+{
+	if (cpu_has_xmm) {
+		return fpu->state.fxsave.mxcsr;
+	} else {
+		return MXCSR_DEFAULT;
+	}
+}
+
+int fpu__exception_code(struct fpu *fpu, int trap_nr)
+{
+	int err;
+
+	if (trap_nr == X86_TRAP_MF) {
+		unsigned short cwd, swd;
+		/*
+		 * (~cwd & swd) will mask out exceptions that are not set to unmasked
+		 * status.  0x3f is the exception bits in these regs, 0x200 is the
+		 * C1 reg you need in case of a stack fault, 0x040 is the stack
+		 * fault bit.  We should only be taking one exception at a time,
+		 * so if this combination doesn't produce any single exception,
+		 * then we have a bad program that isn't synchronizing its FPU usage
+		 * and it will suffer the consequences since we won't be able to
+		 * fully reproduce the context of the exception
+		 */
+		cwd = get_fpu_cwd(fpu);
+		swd = get_fpu_swd(fpu);
+
+		err = swd & ~cwd;
+	} else {
+		/*
+		 * The SIMD FPU exceptions are handled a little differently, as there
+		 * is only a single status/control register.  Thus, to determine which
+		 * unmasked exception was caught we must mask the exception mask bits
+		 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
+		 */
+		unsigned short mxcsr = get_fpu_mxcsr(fpu);
+		err = ~(mxcsr >> 7) & mxcsr;
+	}
+
+	if (err & 0x001) {	/* Invalid op */
+		/*
+		 * swd & 0x240 == 0x040: Stack Underflow
+		 * swd & 0x240 == 0x240: Stack Overflow
+		 * User must clear the SF bit (0x40) if set
+		 */
+		return FPE_FLTINV;
+	} else if (err & 0x004) { /* Divide by Zero */
+		return FPE_FLTDIV;
+	} else if (err & 0x008) { /* Overflow */
+		return FPE_FLTOVF;
+	} else if (err & 0x012) { /* Denormal, Underflow */
+		return FPE_FLTUND;
+	} else if (err & 0x020) { /* Precision */
+		return FPE_FLTRES;
+	}
+
+	/*
+	 * If we're using IRQ 13, or supposedly even some trap
+	 * X86_TRAP_MF implementations, it's possible
+	 * we get a spurious trap, which is not an error.
+	 */
+	return 0;
+}
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
new file mode 100644
index 00000000000000..fc878fee6a512a
--- /dev/null
+++ b/arch/x86/kernel/fpu/init.c
@@ -0,0 +1,354 @@
+/*
+ * x86 FPU boot time init code:
+ */
+#include <asm/fpu/internal.h>
+#include <asm/tlbflush.h>
+
+/*
+ * Initialize the TS bit in CR0 according to the style of context-switches
+ * we are using:
+ */
+static void fpu__init_cpu_ctx_switch(void)
+{
+	if (!cpu_has_eager_fpu)
+		stts();
+	else
+		clts();
+}
+
+/*
+ * Initialize the registers found in all CPUs, CR0 and CR4:
+ */
+static void fpu__init_cpu_generic(void)
+{
+	unsigned long cr0;
+	unsigned long cr4_mask = 0;
+
+	if (cpu_has_fxsr)
+		cr4_mask |= X86_CR4_OSFXSR;
+	if (cpu_has_xmm)
+		cr4_mask |= X86_CR4_OSXMMEXCPT;
+	if (cr4_mask)
+		cr4_set_bits(cr4_mask);
+
+	cr0 = read_cr0();
+	cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
+	if (!cpu_has_fpu)
+		cr0 |= X86_CR0_EM;
+	write_cr0(cr0);
+
+	/* Flush out any pending x87 state: */
+	asm volatile ("fninit");
+}
+
+/*
+ * Enable all supported FPU features. Called when a CPU is brought online:
+ */
+void fpu__init_cpu(void)
+{
+	fpu__init_cpu_generic();
+	fpu__init_cpu_xstate();
+	fpu__init_cpu_ctx_switch();
+}
+
+/*
+ * The earliest FPU detection code.
+ *
+ * Set the X86_FEATURE_FPU CPU-capability bit based on
+ * trying to execute an actual sequence of FPU instructions:
+ */
+static void fpu__init_system_early_generic(struct cpuinfo_x86 *c)
+{
+	unsigned long cr0;
+	u16 fsw, fcw;
+
+	fsw = fcw = 0xffff;
+
+	cr0 = read_cr0();
+	cr0 &= ~(X86_CR0_TS | X86_CR0_EM);
+	write_cr0(cr0);
+
+	asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
+		     : "+m" (fsw), "+m" (fcw));
+
+	if (fsw == 0 && (fcw & 0x103f) == 0x003f)
+		set_cpu_cap(c, X86_FEATURE_FPU);
+	else
+		clear_cpu_cap(c, X86_FEATURE_FPU);
+
+#ifndef CONFIG_MATH_EMULATION
+	if (!cpu_has_fpu) {
+		pr_emerg("x86/fpu: Giving up, no FPU found and no math emulation present\n");
+		for (;;)
+			asm volatile("hlt");
+	}
+#endif
+}
+
+/*
+ * Boot time FPU feature detection code:
+ */
+unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
+
+static void __init fpu__init_system_mxcsr(void)
+{
+	unsigned int mask = 0;
+
+	if (cpu_has_fxsr) {
+		struct fxregs_state fx_tmp __aligned(32) = { };
+
+		asm volatile("fxsave %0" : "+m" (fx_tmp));
+
+		mask = fx_tmp.mxcsr_mask;
+
+		/*
+		 * If zero then use the default features mask,
+		 * which has all features set, except the
+		 * denormals-are-zero feature bit:
+		 */
+		if (mask == 0)
+			mask = 0x0000ffbf;
+	}
+	mxcsr_feature_mask &= mask;
+}
+
+/*
+ * Once per bootup FPU initialization sequences that will run on most x86 CPUs:
+ */
+static void __init fpu__init_system_generic(void)
+{
+	/*
+	 * Set up the legacy init FPU context. (xstate init might overwrite this
+	 * with a more modern format, if the CPU supports it.)
+	 */
+	fpstate_init_fxstate(&init_fpstate.fxsave);
+
+	fpu__init_system_mxcsr();
+}
+
+/*
+ * Size of the FPU context state. All tasks in the system use the
+ * same context size, regardless of what portion they use.
+ * This is inherent to the XSAVE architecture which puts all state
+ * components into a single, continuous memory block:
+ */
+unsigned int xstate_size;
+EXPORT_SYMBOL_GPL(xstate_size);
+
+/*
+ * Set up the xstate_size based on the legacy FPU context size.
+ *
+ * We set this up first, and later it will be overwritten by
+ * fpu__init_system_xstate() if the CPU knows about xstates.
+ */
+static void __init fpu__init_system_xstate_size_legacy(void)
+{
+	static int on_boot_cpu = 1;
+
+	WARN_ON_FPU(!on_boot_cpu);
+	on_boot_cpu = 0;
+
+	/*
+	 * Note that xstate_size might be overwriten later during
+	 * fpu__init_system_xstate().
+	 */
+
+	if (!cpu_has_fpu) {
+		/*
+		 * Disable xsave as we do not support it if i387
+		 * emulation is enabled.
+		 */
+		setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+		setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+		xstate_size = sizeof(struct swregs_state);
+	} else {
+		if (cpu_has_fxsr)
+			xstate_size = sizeof(struct fxregs_state);
+		else
+			xstate_size = sizeof(struct fregs_state);
+	}
+	/*
+	 * Quirk: we don't yet handle the XSAVES* instructions
+	 * correctly, as we don't correctly convert between
+	 * standard and compacted format when interfacing
+	 * with user-space - so disable it for now.
+	 *
+	 * The difference is small: with recent CPUs the
+	 * compacted format is only marginally smaller than
+	 * the standard FPU state format.
+	 *
+	 * ( This is easy to backport while we are fixing
+	 *   XSAVES* support. )
+	 */
+	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+}
+
+/*
+ * FPU context switching strategies:
+ *
+ * Against popular belief, we don't do lazy FPU saves, due to the
+ * task migration complications it brings on SMP - we only do
+ * lazy FPU restores.
+ *
+ * 'lazy' is the traditional strategy, which is based on setting
+ * CR0::TS to 1 during context-switch (instead of doing a full
+ * restore of the FPU state), which causes the first FPU instruction
+ * after the context switch (whenever it is executed) to fault - at
+ * which point we lazily restore the FPU state into FPU registers.
+ *
+ * Tasks are of course under no obligation to execute FPU instructions,
+ * so it can easily happen that another context-switch occurs without
+ * a single FPU instruction being executed. If we eventually switch
+ * back to the original task (that still owns the FPU) then we have
+ * not only saved the restores along the way, but we also have the
+ * FPU ready to be used for the original task.
+ *
+ * 'eager' switching is used on modern CPUs, there we switch the FPU
+ * state during every context switch, regardless of whether the task
+ * has used FPU instructions in that time slice or not. This is done
+ * because modern FPU context saving instructions are able to optimize
+ * state saving and restoration in hardware: they can detect both
+ * unused and untouched FPU state and optimize accordingly.
+ *
+ * [ Note that even in 'lazy' mode we might optimize context switches
+ *   to use 'eager' restores, if we detect that a task is using the FPU
+ *   frequently. See the fpu->counter logic in fpu/internal.h for that. ]
+ */
+static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
+
+static int __init eager_fpu_setup(char *s)
+{
+	if (!strcmp(s, "on"))
+		eagerfpu = ENABLE;
+	else if (!strcmp(s, "off"))
+		eagerfpu = DISABLE;
+	else if (!strcmp(s, "auto"))
+		eagerfpu = AUTO;
+	return 1;
+}
+__setup("eagerfpu=", eager_fpu_setup);
+
+/*
+ * Pick the FPU context switching strategy:
+ */
+static void __init fpu__init_system_ctx_switch(void)
+{
+	static bool on_boot_cpu = 1;
+
+	WARN_ON_FPU(!on_boot_cpu);
+	on_boot_cpu = 0;
+
+	WARN_ON_FPU(current->thread.fpu.fpstate_active);
+	current_thread_info()->status = 0;
+
+	/* Auto enable eagerfpu for xsaveopt */
+	if (cpu_has_xsaveopt && eagerfpu != DISABLE)
+		eagerfpu = ENABLE;
+
+	if (xfeatures_mask & XSTATE_EAGER) {
+		if (eagerfpu == DISABLE) {
+			pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
+			       xfeatures_mask & XSTATE_EAGER);
+			xfeatures_mask &= ~XSTATE_EAGER;
+		} else {
+			eagerfpu = ENABLE;
+		}
+	}
+
+	if (eagerfpu == ENABLE)
+		setup_force_cpu_cap(X86_FEATURE_EAGER_FPU);
+
+	printk(KERN_INFO "x86/fpu: Using '%s' FPU context switches.\n", eagerfpu == ENABLE ? "eager" : "lazy");
+}
+
+/*
+ * Called on the boot CPU once per system bootup, to set up the initial
+ * FPU state that is later cloned into all processes:
+ */
+void __init fpu__init_system(struct cpuinfo_x86 *c)
+{
+	fpu__init_system_early_generic(c);
+
+	/*
+	 * The FPU has to be operational for some of the
+	 * later FPU init activities:
+	 */
+	fpu__init_cpu();
+
+	/*
+	 * But don't leave CR0::TS set yet, as some of the FPU setup
+	 * methods depend on being able to execute FPU instructions
+	 * that will fault on a set TS, such as the FXSAVE in
+	 * fpu__init_system_mxcsr().
+	 */
+	clts();
+
+	fpu__init_system_generic();
+	fpu__init_system_xstate_size_legacy();
+	fpu__init_system_xstate();
+
+	fpu__init_system_ctx_switch();
+}
+
+/*
+ * Boot parameter to turn off FPU support and fall back to math-emu:
+ */
+static int __init no_387(char *s)
+{
+	setup_clear_cpu_cap(X86_FEATURE_FPU);
+	return 1;
+}
+__setup("no387", no_387);
+
+/*
+ * Disable all xstate CPU features:
+ */
+static int __init x86_noxsave_setup(char *s)
+{
+	if (strlen(s))
+		return 0;
+
+	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+	setup_clear_cpu_cap(X86_FEATURE_AVX);
+	setup_clear_cpu_cap(X86_FEATURE_AVX2);
+
+	return 1;
+}
+__setup("noxsave", x86_noxsave_setup);
+
+/*
+ * Disable the XSAVEOPT instruction specifically:
+ */
+static int __init x86_noxsaveopt_setup(char *s)
+{
+	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+
+	return 1;
+}
+__setup("noxsaveopt", x86_noxsaveopt_setup);
+
+/*
+ * Disable the XSAVES instruction:
+ */
+static int __init x86_noxsaves_setup(char *s)
+{
+	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+
+	return 1;
+}
+__setup("noxsaves", x86_noxsaves_setup);
+
+/*
+ * Disable FX save/restore and SSE support:
+ */
+static int __init x86_nofxsr_setup(char *s)
+{
+	setup_clear_cpu_cap(X86_FEATURE_FXSR);
+	setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
+	setup_clear_cpu_cap(X86_FEATURE_XMM);
+
+	return 1;
+}
+__setup("nofxsr", x86_nofxsr_setup);
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
new file mode 100644
index 00000000000000..dc60810c1c74c6
--- /dev/null
+++ b/arch/x86/kernel/fpu/regset.c
@@ -0,0 +1,356 @@
+/*
+ * FPU register's regset abstraction, for ptrace, core dumps, etc.
+ */
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
+
+/*
+ * The xstateregs_active() routine is the same as the regset_fpregs_active() routine,
+ * as the "regset->n" for the xstate regset will be updated based on the feature
+ * capabilites supported by the xsave.
+ */
+int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset)
+{
+	struct fpu *target_fpu = &target->thread.fpu;
+
+	return target_fpu->fpstate_active ? regset->n : 0;
+}
+
+int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset)
+{
+	struct fpu *target_fpu = &target->thread.fpu;
+
+	return (cpu_has_fxsr && target_fpu->fpstate_active) ? regset->n : 0;
+}
+
+int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
+		unsigned int pos, unsigned int count,
+		void *kbuf, void __user *ubuf)
+{
+	struct fpu *fpu = &target->thread.fpu;
+
+	if (!cpu_has_fxsr)
+		return -ENODEV;
+
+	fpu__activate_fpstate_read(fpu);
+	fpstate_sanitize_xstate(fpu);
+
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				   &fpu->state.fxsave, 0, -1);
+}
+
+int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
+		unsigned int pos, unsigned int count,
+		const void *kbuf, const void __user *ubuf)
+{
+	struct fpu *fpu = &target->thread.fpu;
+	int ret;
+
+	if (!cpu_has_fxsr)
+		return -ENODEV;
+
+	fpu__activate_fpstate_write(fpu);
+	fpstate_sanitize_xstate(fpu);
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &fpu->state.fxsave, 0, -1);
+
+	/*
+	 * mxcsr reserved bits must be masked to zero for security reasons.
+	 */
+	fpu->state.fxsave.mxcsr &= mxcsr_feature_mask;
+
+	/*
+	 * update the header bits in the xsave header, indicating the
+	 * presence of FP and SSE state.
+	 */
+	if (cpu_has_xsave)
+		fpu->state.xsave.header.xfeatures |= XSTATE_FPSSE;
+
+	return ret;
+}
+
+int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
+		unsigned int pos, unsigned int count,
+		void *kbuf, void __user *ubuf)
+{
+	struct fpu *fpu = &target->thread.fpu;
+	struct xregs_state *xsave;
+	int ret;
+
+	if (!cpu_has_xsave)
+		return -ENODEV;
+
+	fpu__activate_fpstate_read(fpu);
+
+	xsave = &fpu->state.xsave;
+
+	/*
+	 * Copy the 48bytes defined by the software first into the xstate
+	 * memory layout in the thread struct, so that we can copy the entire
+	 * xstateregs to the user using one user_regset_copyout().
+	 */
+	memcpy(&xsave->i387.sw_reserved,
+		xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
+	/*
+	 * Copy the xstate memory layout.
+	 */
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
+	return ret;
+}
+
+int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
+		  unsigned int pos, unsigned int count,
+		  const void *kbuf, const void __user *ubuf)
+{
+	struct fpu *fpu = &target->thread.fpu;
+	struct xregs_state *xsave;
+	int ret;
+
+	if (!cpu_has_xsave)
+		return -ENODEV;
+
+	fpu__activate_fpstate_write(fpu);
+
+	xsave = &fpu->state.xsave;
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
+	/*
+	 * mxcsr reserved bits must be masked to zero for security reasons.
+	 */
+	xsave->i387.mxcsr &= mxcsr_feature_mask;
+	xsave->header.xfeatures &= xfeatures_mask;
+	/*
+	 * These bits must be zero.
+	 */
+	memset(&xsave->header.reserved, 0, 48);
+
+	return ret;
+}
+
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+
+/*
+ * FPU tag word conversions.
+ */
+
+static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
+{
+	unsigned int tmp; /* to avoid 16 bit prefixes in the code */
+
+	/* Transform each pair of bits into 01 (valid) or 00 (empty) */
+	tmp = ~twd;
+	tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
+	/* and move the valid bits to the lower byte. */
+	tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
+	tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
+	tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
+
+	return tmp;
+}
+
+#define FPREG_ADDR(f, n)	((void *)&(f)->st_space + (n) * 16)
+#define FP_EXP_TAG_VALID	0
+#define FP_EXP_TAG_ZERO		1
+#define FP_EXP_TAG_SPECIAL	2
+#define FP_EXP_TAG_EMPTY	3
+
+static inline u32 twd_fxsr_to_i387(struct fxregs_state *fxsave)
+{
+	struct _fpxreg *st;
+	u32 tos = (fxsave->swd >> 11) & 7;
+	u32 twd = (unsigned long) fxsave->twd;
+	u32 tag;
+	u32 ret = 0xffff0000u;
+	int i;
+
+	for (i = 0; i < 8; i++, twd >>= 1) {
+		if (twd & 0x1) {
+			st = FPREG_ADDR(fxsave, (i - tos) & 7);
+
+			switch (st->exponent & 0x7fff) {
+			case 0x7fff:
+				tag = FP_EXP_TAG_SPECIAL;
+				break;
+			case 0x0000:
+				if (!st->significand[0] &&
+				    !st->significand[1] &&
+				    !st->significand[2] &&
+				    !st->significand[3])
+					tag = FP_EXP_TAG_ZERO;
+				else
+					tag = FP_EXP_TAG_SPECIAL;
+				break;
+			default:
+				if (st->significand[3] & 0x8000)
+					tag = FP_EXP_TAG_VALID;
+				else
+					tag = FP_EXP_TAG_SPECIAL;
+				break;
+			}
+		} else {
+			tag = FP_EXP_TAG_EMPTY;
+		}
+		ret |= tag << (2 * i);
+	}
+	return ret;
+}
+
+/*
+ * FXSR floating point environment conversions.
+ */
+
+void
+convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
+{
+	struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave;
+	struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
+	struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
+	int i;
+
+	env->cwd = fxsave->cwd | 0xffff0000u;
+	env->swd = fxsave->swd | 0xffff0000u;
+	env->twd = twd_fxsr_to_i387(fxsave);
+
+#ifdef CONFIG_X86_64
+	env->fip = fxsave->rip;
+	env->foo = fxsave->rdp;
+	/*
+	 * should be actually ds/cs at fpu exception time, but
+	 * that information is not available in 64bit mode.
+	 */
+	env->fcs = task_pt_regs(tsk)->cs;
+	if (tsk == current) {
+		savesegment(ds, env->fos);
+	} else {
+		env->fos = tsk->thread.ds;
+	}
+	env->fos |= 0xffff0000;
+#else
+	env->fip = fxsave->fip;
+	env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
+	env->foo = fxsave->foo;
+	env->fos = fxsave->fos;
+#endif
+
+	for (i = 0; i < 8; ++i)
+		memcpy(&to[i], &from[i], sizeof(to[0]));
+}
+
+void convert_to_fxsr(struct task_struct *tsk,
+		     const struct user_i387_ia32_struct *env)
+
+{
+	struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave;
+	struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
+	struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
+	int i;
+
+	fxsave->cwd = env->cwd;
+	fxsave->swd = env->swd;
+	fxsave->twd = twd_i387_to_fxsr(env->twd);
+	fxsave->fop = (u16) ((u32) env->fcs >> 16);
+#ifdef CONFIG_X86_64
+	fxsave->rip = env->fip;
+	fxsave->rdp = env->foo;
+	/* cs and ds ignored */
+#else
+	fxsave->fip = env->fip;
+	fxsave->fcs = (env->fcs & 0xffff);
+	fxsave->foo = env->foo;
+	fxsave->fos = env->fos;
+#endif
+
+	for (i = 0; i < 8; ++i)
+		memcpy(&to[i], &from[i], sizeof(from[0]));
+}
+
+int fpregs_get(struct task_struct *target, const struct user_regset *regset,
+	       unsigned int pos, unsigned int count,
+	       void *kbuf, void __user *ubuf)
+{
+	struct fpu *fpu = &target->thread.fpu;
+	struct user_i387_ia32_struct env;
+
+	fpu__activate_fpstate_read(fpu);
+
+	if (!static_cpu_has(X86_FEATURE_FPU))
+		return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
+
+	if (!cpu_has_fxsr)
+		return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+					   &fpu->state.fsave, 0,
+					   -1);
+
+	fpstate_sanitize_xstate(fpu);
+
+	if (kbuf && pos == 0 && count == sizeof(env)) {
+		convert_from_fxsr(kbuf, target);
+		return 0;
+	}
+
+	convert_from_fxsr(&env, target);
+
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
+}
+
+int fpregs_set(struct task_struct *target, const struct user_regset *regset,
+	       unsigned int pos, unsigned int count,
+	       const void *kbuf, const void __user *ubuf)
+{
+	struct fpu *fpu = &target->thread.fpu;
+	struct user_i387_ia32_struct env;
+	int ret;
+
+	fpu__activate_fpstate_write(fpu);
+	fpstate_sanitize_xstate(fpu);
+
+	if (!static_cpu_has(X86_FEATURE_FPU))
+		return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
+
+	if (!cpu_has_fxsr)
+		return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+					  &fpu->state.fsave, 0,
+					  -1);
+
+	if (pos > 0 || count < sizeof(env))
+		convert_from_fxsr(&env, target);
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
+	if (!ret)
+		convert_to_fxsr(target, &env);
+
+	/*
+	 * update the header bit in the xsave header, indicating the
+	 * presence of FP.
+	 */
+	if (cpu_has_xsave)
+		fpu->state.xsave.header.xfeatures |= XSTATE_FP;
+	return ret;
+}
+
+/*
+ * FPU state for core dumps.
+ * This is only used for a.out dumps now.
+ * It is declared generically using elf_fpregset_t (which is
+ * struct user_i387_struct) but is in fact only used for 32-bit
+ * dumps, so on 64-bit it is really struct user_i387_ia32_struct.
+ */
+int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu)
+{
+	struct task_struct *tsk = current;
+	struct fpu *fpu = &tsk->thread.fpu;
+	int fpvalid;
+
+	fpvalid = fpu->fpstate_active;
+	if (fpvalid)
+		fpvalid = !fpregs_get(tsk, NULL,
+				      0, sizeof(struct user_i387_ia32_struct),
+				      ufpu, NULL);
+
+	return fpvalid;
+}
+EXPORT_SYMBOL(dump_fpu);
+
+#endif	/* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
new file mode 100644
index 00000000000000..50ec9af1bd5189
--- /dev/null
+++ b/arch/x86/kernel/fpu/signal.c
@@ -0,0 +1,404 @@
+/*
+ * FPU signal frame handling routines.
+ */
+
+#include <linux/compat.h>
+#include <linux/cpu.h>
+
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
+
+#include <asm/sigframe.h>
+
+static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32;
+
+/*
+ * Check for the presence of extended state information in the
+ * user fpstate pointer in the sigcontext.
+ */
+static inline int check_for_xstate(struct fxregs_state __user *buf,
+				   void __user *fpstate,
+				   struct _fpx_sw_bytes *fx_sw)
+{
+	int min_xstate_size = sizeof(struct fxregs_state) +
+			      sizeof(struct xstate_header);
+	unsigned int magic2;
+
+	if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw)))
+		return -1;
+
+	/* Check for the first magic field and other error scenarios. */
+	if (fx_sw->magic1 != FP_XSTATE_MAGIC1 ||
+	    fx_sw->xstate_size < min_xstate_size ||
+	    fx_sw->xstate_size > xstate_size ||
+	    fx_sw->xstate_size > fx_sw->extended_size)
+		return -1;
+
+	/*
+	 * Check for the presence of second magic word at the end of memory
+	 * layout. This detects the case where the user just copied the legacy
+	 * fpstate layout with out copying the extended state information
+	 * in the memory layout.
+	 */
+	if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))
+	    || magic2 != FP_XSTATE_MAGIC2)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Signal frame handlers.
+ */
+static inline int save_fsave_header(struct task_struct *tsk, void __user *buf)
+{
+	if (use_fxsr()) {
+		struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
+		struct user_i387_ia32_struct env;
+		struct _fpstate_ia32 __user *fp = buf;
+
+		convert_from_fxsr(&env, tsk);
+
+		if (__copy_to_user(buf, &env, sizeof(env)) ||
+		    __put_user(xsave->i387.swd, &fp->status) ||
+		    __put_user(X86_FXSR_MAGIC, &fp->magic))
+			return -1;
+	} else {
+		struct fregs_state __user *fp = buf;
+		u32 swd;
+		if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status))
+			return -1;
+	}
+
+	return 0;
+}
+
+static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
+{
+	struct xregs_state __user *x = buf;
+	struct _fpx_sw_bytes *sw_bytes;
+	u32 xfeatures;
+	int err;
+
+	/* Setup the bytes not touched by the [f]xsave and reserved for SW. */
+	sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved;
+	err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes));
+
+	if (!use_xsave())
+		return err;
+
+	err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size));
+
+	/*
+	 * Read the xfeatures which we copied (directly from the cpu or
+	 * from the state in task struct) to the user buffers.
+	 */
+	err |= __get_user(xfeatures, (__u32 *)&x->header.xfeatures);
+
+	/*
+	 * For legacy compatible, we always set FP/SSE bits in the bit
+	 * vector while saving the state to the user context. This will
+	 * enable us capturing any changes(during sigreturn) to
+	 * the FP/SSE bits by the legacy applications which don't touch
+	 * xfeatures in the xsave header.
+	 *
+	 * xsave aware apps can change the xfeatures in the xsave
+	 * header as well as change any contents in the memory layout.
+	 * xrestore as part of sigreturn will capture all the changes.
+	 */
+	xfeatures |= XSTATE_FPSSE;
+
+	err |= __put_user(xfeatures, (__u32 *)&x->header.xfeatures);
+
+	return err;
+}
+
+static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
+{
+	int err;
+
+	if (use_xsave())
+		err = copy_xregs_to_user(buf);
+	else if (use_fxsr())
+		err = copy_fxregs_to_user((struct fxregs_state __user *) buf);
+	else
+		err = copy_fregs_to_user((struct fregs_state __user *) buf);
+
+	if (unlikely(err) && __clear_user(buf, xstate_size))
+		err = -EFAULT;
+	return err;
+}
+
+/*
+ * Save the fpu, extended register state to the user signal frame.
+ *
+ * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save
+ *  state is copied.
+ *  'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'.
+ *
+ *	buf == buf_fx for 64-bit frames and 32-bit fsave frame.
+ *	buf != buf_fx for 32-bit frames with fxstate.
+ *
+ * If the fpu, extended register state is live, save the state directly
+ * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise,
+ * copy the thread's fpu state to the user frame starting at 'buf_fx'.
+ *
+ * If this is a 32-bit frame with fxstate, put a fsave header before
+ * the aligned state at 'buf_fx'.
+ *
+ * For [f]xsave state, update the SW reserved fields in the [f]xsave frame
+ * indicating the absence/presence of the extended state to the user.
+ */
+int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
+{
+	struct xregs_state *xsave = &current->thread.fpu.state.xsave;
+	struct task_struct *tsk = current;
+	int ia32_fxstate = (buf != buf_fx);
+
+	ia32_fxstate &= (config_enabled(CONFIG_X86_32) ||
+			 config_enabled(CONFIG_IA32_EMULATION));
+
+	if (!access_ok(VERIFY_WRITE, buf, size))
+		return -EACCES;
+
+	if (!static_cpu_has(X86_FEATURE_FPU))
+		return fpregs_soft_get(current, NULL, 0,
+			sizeof(struct user_i387_ia32_struct), NULL,
+			(struct _fpstate_ia32 __user *) buf) ? -1 : 1;
+
+	if (fpregs_active()) {
+		/* Save the live register state to the user directly. */
+		if (copy_fpregs_to_sigframe(buf_fx))
+			return -1;
+		/* Update the thread's fxstate to save the fsave header. */
+		if (ia32_fxstate)
+			copy_fxregs_to_kernel(&tsk->thread.fpu);
+	} else {
+		fpstate_sanitize_xstate(&tsk->thread.fpu);
+		if (__copy_to_user(buf_fx, xsave, xstate_size))
+			return -1;
+	}
+
+	/* Save the fsave header for the 32-bit frames. */
+	if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf))
+		return -1;
+
+	if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate))
+		return -1;
+
+	return 0;
+}
+
+static inline void
+sanitize_restored_xstate(struct task_struct *tsk,
+			 struct user_i387_ia32_struct *ia32_env,
+			 u64 xfeatures, int fx_only)
+{
+	struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
+	struct xstate_header *header = &xsave->header;
+
+	if (use_xsave()) {
+		/* These bits must be zero. */
+		memset(header->reserved, 0, 48);
+
+		/*
+		 * Init the state that is not present in the memory
+		 * layout and not enabled by the OS.
+		 */
+		if (fx_only)
+			header->xfeatures = XSTATE_FPSSE;
+		else
+			header->xfeatures &= (xfeatures_mask & xfeatures);
+	}
+
+	if (use_fxsr()) {
+		/*
+		 * mscsr reserved bits must be masked to zero for security
+		 * reasons.
+		 */
+		xsave->i387.mxcsr &= mxcsr_feature_mask;
+
+		convert_to_fxsr(tsk, ia32_env);
+	}
+}
+
+/*
+ * Restore the extended state if present. Otherwise, restore the FP/SSE state.
+ */
+static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only)
+{
+	if (use_xsave()) {
+		if ((unsigned long)buf % 64 || fx_only) {
+			u64 init_bv = xfeatures_mask & ~XSTATE_FPSSE;
+			copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
+			return copy_user_to_fxregs(buf);
+		} else {
+			u64 init_bv = xfeatures_mask & ~xbv;
+			if (unlikely(init_bv))
+				copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
+			return copy_user_to_xregs(buf, xbv);
+		}
+	} else if (use_fxsr()) {
+		return copy_user_to_fxregs(buf);
+	} else
+		return copy_user_to_fregs(buf);
+}
+
+static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
+{
+	int ia32_fxstate = (buf != buf_fx);
+	struct task_struct *tsk = current;
+	struct fpu *fpu = &tsk->thread.fpu;
+	int state_size = xstate_size;
+	u64 xfeatures = 0;
+	int fx_only = 0;
+
+	ia32_fxstate &= (config_enabled(CONFIG_X86_32) ||
+			 config_enabled(CONFIG_IA32_EMULATION));
+
+	if (!buf) {
+		fpu__clear(fpu);
+		return 0;
+	}
+
+	if (!access_ok(VERIFY_READ, buf, size))
+		return -EACCES;
+
+	fpu__activate_curr(fpu);
+
+	if (!static_cpu_has(X86_FEATURE_FPU))
+		return fpregs_soft_set(current, NULL,
+				       0, sizeof(struct user_i387_ia32_struct),
+				       NULL, buf) != 0;
+
+	if (use_xsave()) {
+		struct _fpx_sw_bytes fx_sw_user;
+		if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) {
+			/*
+			 * Couldn't find the extended state information in the
+			 * memory layout. Restore just the FP/SSE and init all
+			 * the other extended state.
+			 */
+			state_size = sizeof(struct fxregs_state);
+			fx_only = 1;
+		} else {
+			state_size = fx_sw_user.xstate_size;
+			xfeatures = fx_sw_user.xfeatures;
+		}
+	}
+
+	if (ia32_fxstate) {
+		/*
+		 * For 32-bit frames with fxstate, copy the user state to the
+		 * thread's fpu state, reconstruct fxstate from the fsave
+		 * header. Sanitize the copied state etc.
+		 */
+		struct fpu *fpu = &tsk->thread.fpu;
+		struct user_i387_ia32_struct env;
+		int err = 0;
+
+		/*
+		 * Drop the current fpu which clears fpu->fpstate_active. This ensures
+		 * that any context-switch during the copy of the new state,
+		 * avoids the intermediate state from getting restored/saved.
+		 * Thus avoiding the new restored state from getting corrupted.
+		 * We will be ready to restore/save the state only after
+		 * fpu->fpstate_active is again set.
+		 */
+		fpu__drop(fpu);
+
+		if (__copy_from_user(&fpu->state.xsave, buf_fx, state_size) ||
+		    __copy_from_user(&env, buf, sizeof(env))) {
+			fpstate_init(&fpu->state);
+			err = -1;
+		} else {
+			sanitize_restored_xstate(tsk, &env, xfeatures, fx_only);
+		}
+
+		fpu->fpstate_active = 1;
+		if (use_eager_fpu()) {
+			preempt_disable();
+			fpu__restore(fpu);
+			preempt_enable();
+		}
+
+		return err;
+	} else {
+		/*
+		 * For 64-bit frames and 32-bit fsave frames, restore the user
+		 * state to the registers directly (with exceptions handled).
+		 */
+		user_fpu_begin();
+		if (copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only)) {
+			fpu__clear(fpu);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+static inline int xstate_sigframe_size(void)
+{
+	return use_xsave() ? xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size;
+}
+
+/*
+ * Restore FPU state from a sigframe:
+ */
+int fpu__restore_sig(void __user *buf, int ia32_frame)
+{
+	void __user *buf_fx = buf;
+	int size = xstate_sigframe_size();
+
+	if (ia32_frame && use_fxsr()) {
+		buf_fx = buf + sizeof(struct fregs_state);
+		size += sizeof(struct fregs_state);
+	}
+
+	return __fpu__restore_sig(buf, buf_fx, size);
+}
+
+unsigned long
+fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
+		     unsigned long *buf_fx, unsigned long *size)
+{
+	unsigned long frame_size = xstate_sigframe_size();
+
+	*buf_fx = sp = round_down(sp - frame_size, 64);
+	if (ia32_frame && use_fxsr()) {
+		frame_size += sizeof(struct fregs_state);
+		sp -= sizeof(struct fregs_state);
+	}
+
+	*size = frame_size;
+
+	return sp;
+}
+/*
+ * Prepare the SW reserved portion of the fxsave memory layout, indicating
+ * the presence of the extended state information in the memory layout
+ * pointed by the fpstate pointer in the sigcontext.
+ * This will be saved when ever the FP and extended state context is
+ * saved on the user stack during the signal handler delivery to the user.
+ */
+void fpu__init_prepare_fx_sw_frame(void)
+{
+	int fsave_header_size = sizeof(struct fregs_state);
+	int size = xstate_size + FP_XSTATE_MAGIC2_SIZE;
+
+	if (config_enabled(CONFIG_X86_32))
+		size += fsave_header_size;
+
+	fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
+	fx_sw_reserved.extended_size = size;
+	fx_sw_reserved.xfeatures = xfeatures_mask;
+	fx_sw_reserved.xstate_size = xstate_size;
+
+	if (config_enabled(CONFIG_IA32_EMULATION)) {
+		fx_sw_reserved_ia32 = fx_sw_reserved;
+		fx_sw_reserved_ia32.extended_size += fsave_header_size;
+	}
+}
+
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
new file mode 100644
index 00000000000000..a580eb5c7e5284
--- /dev/null
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -0,0 +1,400 @@
+/*
+ * xsave/xrstor support.
+ *
+ * Author: Suresh Siddha <suresh.b.siddha@intel.com>
+ */
+#include <linux/compat.h>
+#include <linux/cpu.h>
+
+#include <asm/fpu/api.h>
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
+
+#include <asm/tlbflush.h>
+
+static const char *xfeature_names[] =
+{
+	"x87 floating point registers"	,
+	"SSE registers"			,
+	"AVX registers"			,
+	"MPX bounds registers"		,
+	"MPX CSR"			,
+	"AVX-512 opmask"		,
+	"AVX-512 Hi256"			,
+	"AVX-512 ZMM_Hi256"		,
+	"unknown xstate feature"	,
+};
+
+/*
+ * Mask of xstate features supported by the CPU and the kernel:
+ */
+u64 xfeatures_mask __read_mostly;
+
+static unsigned int xstate_offsets[XFEATURES_NR_MAX], xstate_sizes[XFEATURES_NR_MAX];
+static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8];
+
+/* The number of supported xfeatures in xfeatures_mask: */
+static unsigned int xfeatures_nr;
+
+/*
+ * Return whether the system supports a given xfeature.
+ *
+ * Also return the name of the (most advanced) feature that the caller requested:
+ */
+int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
+{
+	u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask;
+
+	if (unlikely(feature_name)) {
+		long xfeature_idx, max_idx;
+		u64 xfeatures_print;
+		/*
+		 * So we use FLS here to be able to print the most advanced
+		 * feature that was requested but is missing. So if a driver
+		 * asks about "XSTATE_SSE | XSTATE_YMM" we'll print the
+		 * missing AVX feature - this is the most informative message
+		 * to users:
+		 */
+		if (xfeatures_missing)
+			xfeatures_print = xfeatures_missing;
+		else
+			xfeatures_print = xfeatures_needed;
+
+		xfeature_idx = fls64(xfeatures_print)-1;
+		max_idx = ARRAY_SIZE(xfeature_names)-1;
+		xfeature_idx = min(xfeature_idx, max_idx);
+
+		*feature_name = xfeature_names[xfeature_idx];
+	}
+
+	if (xfeatures_missing)
+		return 0;
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
+
+/*
+ * When executing XSAVEOPT (or other optimized XSAVE instructions), if
+ * a processor implementation detects that an FPU state component is still
+ * (or is again) in its initialized state, it may clear the corresponding
+ * bit in the header.xfeatures field, and can skip the writeout of registers
+ * to the corresponding memory layout.
+ *
+ * This means that when the bit is zero, the state component might still contain
+ * some previous - non-initialized register state.
+ *
+ * Before writing xstate information to user-space we sanitize those components,
+ * to always ensure that the memory layout of a feature will be in the init state
+ * if the corresponding header bit is zero. This is to ensure that user-space doesn't
+ * see some stale state in the memory layout during signal handling, debugging etc.
+ */
+void fpstate_sanitize_xstate(struct fpu *fpu)
+{
+	struct fxregs_state *fx = &fpu->state.fxsave;
+	int feature_bit;
+	u64 xfeatures;
+
+	if (!use_xsaveopt())
+		return;
+
+	xfeatures = fpu->state.xsave.header.xfeatures;
+
+	/*
+	 * None of the feature bits are in init state. So nothing else
+	 * to do for us, as the memory layout is up to date.
+	 */
+	if ((xfeatures & xfeatures_mask) == xfeatures_mask)
+		return;
+
+	/*
+	 * FP is in init state
+	 */
+	if (!(xfeatures & XSTATE_FP)) {
+		fx->cwd = 0x37f;
+		fx->swd = 0;
+		fx->twd = 0;
+		fx->fop = 0;
+		fx->rip = 0;
+		fx->rdp = 0;
+		memset(&fx->st_space[0], 0, 128);
+	}
+
+	/*
+	 * SSE is in init state
+	 */
+	if (!(xfeatures & XSTATE_SSE))
+		memset(&fx->xmm_space[0], 0, 256);
+
+	/*
+	 * First two features are FPU and SSE, which above we handled
+	 * in a special way already:
+	 */
+	feature_bit = 0x2;
+	xfeatures = (xfeatures_mask & ~xfeatures) >> 2;
+
+	/*
+	 * Update all the remaining memory layouts according to their
+	 * standard xstate layout, if their header bit is in the init
+	 * state:
+	 */
+	while (xfeatures) {
+		if (xfeatures & 0x1) {
+			int offset = xstate_offsets[feature_bit];
+			int size = xstate_sizes[feature_bit];
+
+			memcpy((void *)fx + offset,
+			       (void *)&init_fpstate.xsave + offset,
+			       size);
+		}
+
+		xfeatures >>= 1;
+		feature_bit++;
+	}
+}
+
+/*
+ * Enable the extended processor state save/restore feature.
+ * Called once per CPU onlining.
+ */
+void fpu__init_cpu_xstate(void)
+{
+	if (!cpu_has_xsave || !xfeatures_mask)
+		return;
+
+	cr4_set_bits(X86_CR4_OSXSAVE);
+	xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
+}
+
+/*
+ * Record the offsets and sizes of various xstates contained
+ * in the XSAVE state memory layout.
+ *
+ * ( Note that certain features might be non-present, for them
+ *   we'll have 0 offset and 0 size. )
+ */
+static void __init setup_xstate_features(void)
+{
+	u32 eax, ebx, ecx, edx, leaf;
+
+	xfeatures_nr = fls64(xfeatures_mask);
+
+	for (leaf = 2; leaf < xfeatures_nr; leaf++) {
+		cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx);
+
+		xstate_offsets[leaf] = ebx;
+		xstate_sizes[leaf] = eax;
+
+		printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %04x, xstate_sizes[%d]: %04x\n", leaf, ebx, leaf, eax);
+		leaf++;
+	}
+}
+
+static void __init print_xstate_feature(u64 xstate_mask)
+{
+	const char *feature_name;
+
+	if (cpu_has_xfeatures(xstate_mask, &feature_name))
+		pr_info("x86/fpu: Supporting XSAVE feature 0x%02Lx: '%s'\n", xstate_mask, feature_name);
+}
+
+/*
+ * Print out all the supported xstate features:
+ */
+static void __init print_xstate_features(void)
+{
+	print_xstate_feature(XSTATE_FP);
+	print_xstate_feature(XSTATE_SSE);
+	print_xstate_feature(XSTATE_YMM);
+	print_xstate_feature(XSTATE_BNDREGS);
+	print_xstate_feature(XSTATE_BNDCSR);
+	print_xstate_feature(XSTATE_OPMASK);
+	print_xstate_feature(XSTATE_ZMM_Hi256);
+	print_xstate_feature(XSTATE_Hi16_ZMM);
+}
+
+/*
+ * This function sets up offsets and sizes of all extended states in
+ * xsave area. This supports both standard format and compacted format
+ * of the xsave aread.
+ */
+static void __init setup_xstate_comp(void)
+{
+	unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8];
+	int i;
+
+	/*
+	 * The FP xstates and SSE xstates are legacy states. They are always
+	 * in the fixed offsets in the xsave area in either compacted form
+	 * or standard form.
+	 */
+	xstate_comp_offsets[0] = 0;
+	xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space);
+
+	if (!cpu_has_xsaves) {
+		for (i = 2; i < xfeatures_nr; i++) {
+			if (test_bit(i, (unsigned long *)&xfeatures_mask)) {
+				xstate_comp_offsets[i] = xstate_offsets[i];
+				xstate_comp_sizes[i] = xstate_sizes[i];
+			}
+		}
+		return;
+	}
+
+	xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+
+	for (i = 2; i < xfeatures_nr; i++) {
+		if (test_bit(i, (unsigned long *)&xfeatures_mask))
+			xstate_comp_sizes[i] = xstate_sizes[i];
+		else
+			xstate_comp_sizes[i] = 0;
+
+		if (i > 2)
+			xstate_comp_offsets[i] = xstate_comp_offsets[i-1]
+					+ xstate_comp_sizes[i-1];
+
+	}
+}
+
+/*
+ * setup the xstate image representing the init state
+ */
+static void __init setup_init_fpu_buf(void)
+{
+	static int on_boot_cpu = 1;
+
+	WARN_ON_FPU(!on_boot_cpu);
+	on_boot_cpu = 0;
+
+	if (!cpu_has_xsave)
+		return;
+
+	setup_xstate_features();
+	print_xstate_features();
+
+	if (cpu_has_xsaves) {
+		init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask;
+		init_fpstate.xsave.header.xfeatures = xfeatures_mask;
+	}
+
+	/*
+	 * Init all the features state with header_bv being 0x0
+	 */
+	copy_kernel_to_xregs_booting(&init_fpstate.xsave);
+
+	/*
+	 * Dump the init state again. This is to identify the init state
+	 * of any feature which is not represented by all zero's.
+	 */
+	copy_xregs_to_kernel_booting(&init_fpstate.xsave);
+}
+
+/*
+ * Calculate total size of enabled xstates in XCR0/xfeatures_mask.
+ */
+static void __init init_xstate_size(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+	int i;
+
+	if (!cpu_has_xsaves) {
+		cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
+		xstate_size = ebx;
+		return;
+	}
+
+	xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+	for (i = 2; i < 64; i++) {
+		if (test_bit(i, (unsigned long *)&xfeatures_mask)) {
+			cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
+			xstate_size += eax;
+		}
+	}
+}
+
+/*
+ * Enable and initialize the xsave feature.
+ * Called once per system bootup.
+ */
+void __init fpu__init_system_xstate(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+	static int on_boot_cpu = 1;
+
+	WARN_ON_FPU(!on_boot_cpu);
+	on_boot_cpu = 0;
+
+	if (!cpu_has_xsave) {
+		pr_info("x86/fpu: Legacy x87 FPU detected.\n");
+		return;
+	}
+
+	if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
+		WARN_ON_FPU(1);
+		return;
+	}
+
+	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
+	xfeatures_mask = eax + ((u64)edx << 32);
+
+	if ((xfeatures_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
+		pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask);
+		BUG();
+	}
+
+	/* Support only the state known to the OS: */
+	xfeatures_mask = xfeatures_mask & XCNTXT_MASK;
+
+	/* Enable xstate instructions to be able to continue with initialization: */
+	fpu__init_cpu_xstate();
+
+	/* Recompute the context size for enabled features: */
+	init_xstate_size();
+
+	update_regset_xstate_info(xstate_size, xfeatures_mask);
+	fpu__init_prepare_fx_sw_frame();
+	setup_init_fpu_buf();
+	setup_xstate_comp();
+
+	pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is 0x%x bytes, using '%s' format.\n",
+		xfeatures_mask,
+		xstate_size,
+		cpu_has_xsaves ? "compacted" : "standard");
+}
+
+/*
+ * Restore minimal FPU state after suspend:
+ */
+void fpu__resume_cpu(void)
+{
+	/*
+	 * Restore XCR0 on xsave capable CPUs:
+	 */
+	if (cpu_has_xsave)
+		xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
+}
+
+/*
+ * Given the xsave area and a state inside, this function returns the
+ * address of the state.
+ *
+ * This is the API that is called to get xstate address in either
+ * standard format or compacted format of xsave area.
+ *
+ * Inputs:
+ *	xsave: base address of the xsave area;
+ *	xstate: state which is defined in xsave.h (e.g. XSTATE_FP, XSTATE_SSE,
+ *	etc.)
+ * Output:
+ *	address of the state in the xsave area.
+ */
+void *get_xsave_addr(struct xregs_state *xsave, int xstate)
+{
+	int feature = fls64(xstate) - 1;
+	if (!test_bit(feature, (unsigned long *)&xfeatures_mask))
+		return NULL;
+
+	return (void *)xsave + xstate_comp_offsets[feature];
+}
+EXPORT_SYMBOL_GPL(get_xsave_addr);
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 2b55ee6db053c7..5a4668136e9892 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -167,7 +167,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
 	clear_bss();
 
 	for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
-		set_intr_gate(i, early_idt_handlers[i]);
+		set_intr_gate(i, early_idt_handler_array[i]);
 	load_idt((const struct desc_ptr *)&idt_descr);
 
 	copy_bootdata(__va(real_mode_data));
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index d031bad9e07ead..544dec4cc605dd 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -478,21 +478,22 @@ is486:
 __INIT
 setup_once:
 	/*
-	 * Set up a idt with 256 entries pointing to ignore_int,
-	 * interrupt gates. It doesn't actually load idt - that needs
-	 * to be done on each CPU. Interrupts are enabled elsewhere,
-	 * when we can be relatively sure everything is ok.
+	 * Set up a idt with 256 interrupt gates that push zero if there
+	 * is no error code and then jump to early_idt_handler_common.
+	 * It doesn't actually load the idt - that needs to be done on
+	 * each CPU. Interrupts are enabled elsewhere, when we can be
+	 * relatively sure everything is ok.
 	 */
 
 	movl $idt_table,%edi
-	movl $early_idt_handlers,%eax
+	movl $early_idt_handler_array,%eax
 	movl $NUM_EXCEPTION_VECTORS,%ecx
 1:
 	movl %eax,(%edi)
 	movl %eax,4(%edi)
 	/* interrupt gate, dpl=0, present */
 	movl $(0x8E000000 + __KERNEL_CS),2(%edi)
-	addl $9,%eax
+	addl $EARLY_IDT_HANDLER_SIZE,%eax
 	addl $8,%edi
 	loop 1b
 
@@ -524,30 +525,32 @@ setup_once:
 	andl $0,setup_once_ref	/* Once is enough, thanks */
 	ret
 
-ENTRY(early_idt_handlers)
+ENTRY(early_idt_handler_array)
 	# 36(%esp) %eflags
 	# 32(%esp) %cs
 	# 28(%esp) %eip
 	# 24(%rsp) error code
 	i = 0
 	.rept NUM_EXCEPTION_VECTORS
-	.if (EXCEPTION_ERRCODE_MASK >> i) & 1
-	ASM_NOP2
-	.else
+	.ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
 	pushl $0		# Dummy error code, to make stack frame uniform
 	.endif
 	pushl $i		# 20(%esp) Vector number
-	jmp early_idt_handler
+	jmp early_idt_handler_common
 	i = i + 1
+	.fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
 	.endr
-ENDPROC(early_idt_handlers)
+ENDPROC(early_idt_handler_array)
 	
-	/* This is global to keep gas from relaxing the jumps */
-ENTRY(early_idt_handler)
+early_idt_handler_common:
+	/*
+	 * The stack is the hardware frame, an error code or zero, and the
+	 * vector number.
+	 */
 	cld
 
 	cmpl $2,(%esp)		# X86_TRAP_NMI
-	je is_nmi		# Ignore NMI
+	je .Lis_nmi		# Ignore NMI
 
 	cmpl $2,%ss:early_recursion_flag
 	je hlt_loop
@@ -600,10 +603,10 @@ ex_entry:
 	pop %ecx
 	pop %eax
 	decl %ss:early_recursion_flag
-is_nmi:
+.Lis_nmi:
 	addl $8,%esp		/* drop vector number and error code */
 	iret
-ENDPROC(early_idt_handler)
+ENDPROC(early_idt_handler_common)
 
 /* This is the default interrupt "handler" :-) */
 	ALIGN
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index ae6588b301c248..e5c27f729a3840 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -321,30 +321,32 @@ bad_address:
 	jmp bad_address
 
 	__INIT
-	.globl early_idt_handlers
-early_idt_handlers:
+ENTRY(early_idt_handler_array)
 	# 104(%rsp) %rflags
 	#  96(%rsp) %cs
 	#  88(%rsp) %rip
 	#  80(%rsp) error code
 	i = 0
 	.rept NUM_EXCEPTION_VECTORS
-	.if (EXCEPTION_ERRCODE_MASK >> i) & 1
-	ASM_NOP2
-	.else
+	.ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
 	pushq $0		# Dummy error code, to make stack frame uniform
 	.endif
 	pushq $i		# 72(%rsp) Vector number
-	jmp early_idt_handler
+	jmp early_idt_handler_common
 	i = i + 1
+	.fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
 	.endr
+ENDPROC(early_idt_handler_array)
 
-/* This is global to keep gas from relaxing the jumps */
-ENTRY(early_idt_handler)
+early_idt_handler_common:
+	/*
+	 * The stack is the hardware frame, an error code or zero, and the
+	 * vector number.
+	 */
 	cld
 
 	cmpl $2,(%rsp)		# X86_TRAP_NMI
-	je is_nmi		# Ignore NMI
+	je .Lis_nmi		# Ignore NMI
 
 	cmpl $2,early_recursion_flag(%rip)
 	jz  1f
@@ -409,10 +411,10 @@ ENTRY(early_idt_handler)
 	popq %rcx
 	popq %rax
 	decl early_recursion_flag(%rip)
-is_nmi:
+.Lis_nmi:
 	addq $16,%rsp		# drop vector number and error code
 	INTERRUPT_RETURN
-ENDPROC(early_idt_handler)
+ENDPROC(early_idt_handler_common)
 
 	__INITDATA
 
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 3acbff4716b088..e2449cf38b06eb 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -12,6 +12,7 @@
 #include <linux/pm.h>
 #include <linux/io.h>
 
+#include <asm/irqdomain.h>
 #include <asm/fixmap.h>
 #include <asm/hpet.h>
 #include <asm/time.h>
@@ -305,8 +306,6 @@ static void hpet_legacy_clockevent_register(void)
 	printk(KERN_DEBUG "hpet clockevent registered\n");
 }
 
-static int hpet_setup_msi_irq(unsigned int irq);
-
 static void hpet_set_mode(enum clock_event_mode mode,
 			  struct clock_event_device *evt, int timer)
 {
@@ -357,7 +356,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
 			hpet_enable_legacy_int();
 		} else {
 			struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
-			hpet_setup_msi_irq(hdev->irq);
+			irq_domain_activate_irq(irq_get_irq_data(hdev->irq));
 			disable_irq(hdev->irq);
 			irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
 			enable_irq(hdev->irq);
@@ -423,6 +422,7 @@ static int hpet_legacy_next_event(unsigned long delta,
 
 static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
 static struct hpet_dev	*hpet_devs;
+static struct irq_domain *hpet_domain;
 
 void hpet_msi_unmask(struct irq_data *data)
 {
@@ -473,31 +473,6 @@ static int hpet_msi_next_event(unsigned long delta,
 	return hpet_next_event(delta, evt, hdev->num);
 }
 
-static int hpet_setup_msi_irq(unsigned int irq)
-{
-	if (x86_msi.setup_hpet_msi(irq, hpet_blockid)) {
-		irq_free_hwirq(irq);
-		return -EINVAL;
-	}
-	return 0;
-}
-
-static int hpet_assign_irq(struct hpet_dev *dev)
-{
-	unsigned int irq = irq_alloc_hwirq(-1);
-
-	if (!irq)
-		return -EINVAL;
-
-	irq_set_handler_data(irq, dev);
-
-	if (hpet_setup_msi_irq(irq))
-		return -EINVAL;
-
-	dev->irq = irq;
-	return 0;
-}
-
 static irqreturn_t hpet_interrupt_handler(int irq, void *data)
 {
 	struct hpet_dev *dev = (struct hpet_dev *)data;
@@ -540,9 +515,6 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
 	if (!(hdev->flags & HPET_DEV_VALID))
 		return;
 
-	if (hpet_setup_msi_irq(hdev->irq))
-		return;
-
 	hdev->cpu = cpu;
 	per_cpu(cpu_hpet_dev, cpu) = hdev;
 	evt->name = hdev->name;
@@ -574,7 +546,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
 	unsigned int id;
 	unsigned int num_timers;
 	unsigned int num_timers_used = 0;
-	int i;
+	int i, irq;
 
 	if (hpet_msi_disable)
 		return;
@@ -587,6 +559,10 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
 	num_timers++; /* Value read out starts from 0 */
 	hpet_print_config();
 
+	hpet_domain = hpet_create_irq_domain(hpet_blockid);
+	if (!hpet_domain)
+		return;
+
 	hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL);
 	if (!hpet_devs)
 		return;
@@ -601,15 +577,16 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
 		if (!(cfg & HPET_TN_FSB_CAP))
 			continue;
 
+		irq = hpet_assign_irq(hpet_domain, hdev, hdev->num);
+		if (irq < 0)
+			continue;
+
+		sprintf(hdev->name, "hpet%d", i);
+		hdev->num = i;
+		hdev->irq = irq;
 		hdev->flags = 0;
 		if (cfg & HPET_TN_PERIODIC_CAP)
 			hdev->flags |= HPET_DEV_PERI_CAP;
-		hdev->num = i;
-
-		sprintf(hdev->name, "hpet%d", i);
-		if (hpet_assign_irq(hdev))
-			continue;
-
 		hdev->flags |= HPET_DEV_FSB_CAP;
 		hdev->flags |= HPET_DEV_VALID;
 		num_timers_used++;
@@ -709,10 +686,6 @@ static int hpet_cpuhp_notify(struct notifier_block *n,
 }
 #else
 
-static int hpet_setup_msi_irq(unsigned int irq)
-{
-	return 0;
-}
 static void hpet_msi_capability_lookup(unsigned int start_timer)
 {
 	return;
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
deleted file mode 100644
index 6185d314121901..00000000000000
--- a/arch/x86/kernel/i387.c
+++ /dev/null
@@ -1,671 +0,0 @@
-/*
- *  Copyright (C) 1994 Linus Torvalds
- *
- *  Pentium III FXSR, SSE support
- *  General FPU state handling cleanups
- *	Gareth Hughes <gareth@valinux.com>, May 2000
- */
-#include <linux/module.h>
-#include <linux/regset.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-
-#include <asm/sigcontext.h>
-#include <asm/processor.h>
-#include <asm/math_emu.h>
-#include <asm/tlbflush.h>
-#include <asm/uaccess.h>
-#include <asm/ptrace.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
-#include <asm/user.h>
-
-static DEFINE_PER_CPU(bool, in_kernel_fpu);
-
-void kernel_fpu_disable(void)
-{
-	WARN_ON(this_cpu_read(in_kernel_fpu));
-	this_cpu_write(in_kernel_fpu, true);
-}
-
-void kernel_fpu_enable(void)
-{
-	this_cpu_write(in_kernel_fpu, false);
-}
-
-/*
- * Were we in an interrupt that interrupted kernel mode?
- *
- * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that
- * pair does nothing at all: the thread must not have fpu (so
- * that we don't try to save the FPU state), and TS must
- * be set (so that the clts/stts pair does nothing that is
- * visible in the interrupted kernel thread).
- *
- * Except for the eagerfpu case when we return true; in the likely case
- * the thread has FPU but we are not going to set/clear TS.
- */
-static inline bool interrupted_kernel_fpu_idle(void)
-{
-	if (this_cpu_read(in_kernel_fpu))
-		return false;
-
-	if (use_eager_fpu())
-		return true;
-
-	return !__thread_has_fpu(current) &&
-		(read_cr0() & X86_CR0_TS);
-}
-
-/*
- * Were we in user mode (or vm86 mode) when we were
- * interrupted?
- *
- * Doing kernel_fpu_begin/end() is ok if we are running
- * in an interrupt context from user mode - we'll just
- * save the FPU state as required.
- */
-static inline bool interrupted_user_mode(void)
-{
-	struct pt_regs *regs = get_irq_regs();
-	return regs && user_mode(regs);
-}
-
-/*
- * Can we use the FPU in kernel mode with the
- * whole "kernel_fpu_begin/end()" sequence?
- *
- * It's always ok in process context (ie "not interrupt")
- * but it is sometimes ok even from an irq.
- */
-bool irq_fpu_usable(void)
-{
-	return !in_interrupt() ||
-		interrupted_user_mode() ||
-		interrupted_kernel_fpu_idle();
-}
-EXPORT_SYMBOL(irq_fpu_usable);
-
-void __kernel_fpu_begin(void)
-{
-	struct task_struct *me = current;
-
-	this_cpu_write(in_kernel_fpu, true);
-
-	if (__thread_has_fpu(me)) {
-		__save_init_fpu(me);
-	} else {
-		this_cpu_write(fpu_owner_task, NULL);
-		if (!use_eager_fpu())
-			clts();
-	}
-}
-EXPORT_SYMBOL(__kernel_fpu_begin);
-
-void __kernel_fpu_end(void)
-{
-	struct task_struct *me = current;
-
-	if (__thread_has_fpu(me)) {
-		if (WARN_ON(restore_fpu_checking(me)))
-			fpu_reset_state(me);
-	} else if (!use_eager_fpu()) {
-		stts();
-	}
-
-	this_cpu_write(in_kernel_fpu, false);
-}
-EXPORT_SYMBOL(__kernel_fpu_end);
-
-void unlazy_fpu(struct task_struct *tsk)
-{
-	preempt_disable();
-	if (__thread_has_fpu(tsk)) {
-		if (use_eager_fpu()) {
-			__save_fpu(tsk);
-		} else {
-			__save_init_fpu(tsk);
-			__thread_fpu_end(tsk);
-		}
-	}
-	preempt_enable();
-}
-EXPORT_SYMBOL(unlazy_fpu);
-
-unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
-unsigned int xstate_size;
-EXPORT_SYMBOL_GPL(xstate_size);
-static struct i387_fxsave_struct fx_scratch;
-
-static void mxcsr_feature_mask_init(void)
-{
-	unsigned long mask = 0;
-
-	if (cpu_has_fxsr) {
-		memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct));
-		asm volatile("fxsave %0" : "+m" (fx_scratch));
-		mask = fx_scratch.mxcsr_mask;
-		if (mask == 0)
-			mask = 0x0000ffbf;
-	}
-	mxcsr_feature_mask &= mask;
-}
-
-static void init_thread_xstate(void)
-{
-	/*
-	 * Note that xstate_size might be overwriten later during
-	 * xsave_init().
-	 */
-
-	if (!cpu_has_fpu) {
-		/*
-		 * Disable xsave as we do not support it if i387
-		 * emulation is enabled.
-		 */
-		setup_clear_cpu_cap(X86_FEATURE_XSAVE);
-		setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-		xstate_size = sizeof(struct i387_soft_struct);
-		return;
-	}
-
-	if (cpu_has_fxsr)
-		xstate_size = sizeof(struct i387_fxsave_struct);
-	else
-		xstate_size = sizeof(struct i387_fsave_struct);
-
-	/*
-	 * Quirk: we don't yet handle the XSAVES* instructions
-	 * correctly, as we don't correctly convert between
-	 * standard and compacted format when interfacing
-	 * with user-space - so disable it for now.
-	 *
-	 * The difference is small: with recent CPUs the
-	 * compacted format is only marginally smaller than
-	 * the standard FPU state format.
-	 *
-	 * ( This is easy to backport while we are fixing
-	 *   XSAVES* support. )
-	 */
-	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
-}
-
-/*
- * Called at bootup to set up the initial FPU state that is later cloned
- * into all processes.
- */
-
-void fpu_init(void)
-{
-	unsigned long cr0;
-	unsigned long cr4_mask = 0;
-
-#ifndef CONFIG_MATH_EMULATION
-	if (!cpu_has_fpu) {
-		pr_emerg("No FPU found and no math emulation present\n");
-		pr_emerg("Giving up\n");
-		for (;;)
-			asm volatile("hlt");
-	}
-#endif
-	if (cpu_has_fxsr)
-		cr4_mask |= X86_CR4_OSFXSR;
-	if (cpu_has_xmm)
-		cr4_mask |= X86_CR4_OSXMMEXCPT;
-	if (cr4_mask)
-		cr4_set_bits(cr4_mask);
-
-	cr0 = read_cr0();
-	cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
-	if (!cpu_has_fpu)
-		cr0 |= X86_CR0_EM;
-	write_cr0(cr0);
-
-	/*
-	 * init_thread_xstate is only called once to avoid overriding
-	 * xstate_size during boot time or during CPU hotplug.
-	 */
-	if (xstate_size == 0)
-		init_thread_xstate();
-
-	mxcsr_feature_mask_init();
-	xsave_init();
-	eager_fpu_init();
-}
-
-void fpu_finit(struct fpu *fpu)
-{
-	if (!cpu_has_fpu) {
-		finit_soft_fpu(&fpu->state->soft);
-		return;
-	}
-
-	memset(fpu->state, 0, xstate_size);
-
-	if (cpu_has_fxsr) {
-		fx_finit(&fpu->state->fxsave);
-	} else {
-		struct i387_fsave_struct *fp = &fpu->state->fsave;
-		fp->cwd = 0xffff037fu;
-		fp->swd = 0xffff0000u;
-		fp->twd = 0xffffffffu;
-		fp->fos = 0xffff0000u;
-	}
-}
-EXPORT_SYMBOL_GPL(fpu_finit);
-
-/*
- * The _current_ task is using the FPU for the first time
- * so initialize it and set the mxcsr to its default
- * value at reset if we support XMM instructions and then
- * remember the current task has used the FPU.
- */
-int init_fpu(struct task_struct *tsk)
-{
-	int ret;
-
-	if (tsk_used_math(tsk)) {
-		if (cpu_has_fpu && tsk == current)
-			unlazy_fpu(tsk);
-		task_disable_lazy_fpu_restore(tsk);
-		return 0;
-	}
-
-	/*
-	 * Memory allocation at the first usage of the FPU and other state.
-	 */
-	ret = fpu_alloc(&tsk->thread.fpu);
-	if (ret)
-		return ret;
-
-	fpu_finit(&tsk->thread.fpu);
-
-	set_stopped_child_used_math(tsk);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(init_fpu);
-
-/*
- * The xstateregs_active() routine is the same as the fpregs_active() routine,
- * as the "regset->n" for the xstate regset will be updated based on the feature
- * capabilites supported by the xsave.
- */
-int fpregs_active(struct task_struct *target, const struct user_regset *regset)
-{
-	return tsk_used_math(target) ? regset->n : 0;
-}
-
-int xfpregs_active(struct task_struct *target, const struct user_regset *regset)
-{
-	return (cpu_has_fxsr && tsk_used_math(target)) ? regset->n : 0;
-}
-
-int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
-		unsigned int pos, unsigned int count,
-		void *kbuf, void __user *ubuf)
-{
-	int ret;
-
-	if (!cpu_has_fxsr)
-		return -ENODEV;
-
-	ret = init_fpu(target);
-	if (ret)
-		return ret;
-
-	sanitize_i387_state(target);
-
-	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-				   &target->thread.fpu.state->fxsave, 0, -1);
-}
-
-int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
-		unsigned int pos, unsigned int count,
-		const void *kbuf, const void __user *ubuf)
-{
-	int ret;
-
-	if (!cpu_has_fxsr)
-		return -ENODEV;
-
-	ret = init_fpu(target);
-	if (ret)
-		return ret;
-
-	sanitize_i387_state(target);
-
-	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-				 &target->thread.fpu.state->fxsave, 0, -1);
-
-	/*
-	 * mxcsr reserved bits must be masked to zero for security reasons.
-	 */
-	target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
-
-	/*
-	 * update the header bits in the xsave header, indicating the
-	 * presence of FP and SSE state.
-	 */
-	if (cpu_has_xsave)
-		target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
-
-	return ret;
-}
-
-int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
-		unsigned int pos, unsigned int count,
-		void *kbuf, void __user *ubuf)
-{
-	struct xsave_struct *xsave;
-	int ret;
-
-	if (!cpu_has_xsave)
-		return -ENODEV;
-
-	ret = init_fpu(target);
-	if (ret)
-		return ret;
-
-	xsave = &target->thread.fpu.state->xsave;
-
-	/*
-	 * Copy the 48bytes defined by the software first into the xstate
-	 * memory layout in the thread struct, so that we can copy the entire
-	 * xstateregs to the user using one user_regset_copyout().
-	 */
-	memcpy(&xsave->i387.sw_reserved,
-		xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
-	/*
-	 * Copy the xstate memory layout.
-	 */
-	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
-	return ret;
-}
-
-int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
-		  unsigned int pos, unsigned int count,
-		  const void *kbuf, const void __user *ubuf)
-{
-	struct xsave_struct *xsave;
-	int ret;
-
-	if (!cpu_has_xsave)
-		return -ENODEV;
-
-	ret = init_fpu(target);
-	if (ret)
-		return ret;
-
-	xsave = &target->thread.fpu.state->xsave;
-
-	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
-	/*
-	 * mxcsr reserved bits must be masked to zero for security reasons.
-	 */
-	xsave->i387.mxcsr &= mxcsr_feature_mask;
-	xsave->xsave_hdr.xstate_bv &= pcntxt_mask;
-	/*
-	 * These bits must be zero.
-	 */
-	memset(&xsave->xsave_hdr.reserved, 0, 48);
-	return ret;
-}
-
-#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
-
-/*
- * FPU tag word conversions.
- */
-
-static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
-{
-	unsigned int tmp; /* to avoid 16 bit prefixes in the code */
-
-	/* Transform each pair of bits into 01 (valid) or 00 (empty) */
-	tmp = ~twd;
-	tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
-	/* and move the valid bits to the lower byte. */
-	tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
-	tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
-	tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
-
-	return tmp;
-}
-
-#define FPREG_ADDR(f, n)	((void *)&(f)->st_space + (n) * 16)
-#define FP_EXP_TAG_VALID	0
-#define FP_EXP_TAG_ZERO		1
-#define FP_EXP_TAG_SPECIAL	2
-#define FP_EXP_TAG_EMPTY	3
-
-static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
-{
-	struct _fpxreg *st;
-	u32 tos = (fxsave->swd >> 11) & 7;
-	u32 twd = (unsigned long) fxsave->twd;
-	u32 tag;
-	u32 ret = 0xffff0000u;
-	int i;
-
-	for (i = 0; i < 8; i++, twd >>= 1) {
-		if (twd & 0x1) {
-			st = FPREG_ADDR(fxsave, (i - tos) & 7);
-
-			switch (st->exponent & 0x7fff) {
-			case 0x7fff:
-				tag = FP_EXP_TAG_SPECIAL;
-				break;
-			case 0x0000:
-				if (!st->significand[0] &&
-				    !st->significand[1] &&
-				    !st->significand[2] &&
-				    !st->significand[3])
-					tag = FP_EXP_TAG_ZERO;
-				else
-					tag = FP_EXP_TAG_SPECIAL;
-				break;
-			default:
-				if (st->significand[3] & 0x8000)
-					tag = FP_EXP_TAG_VALID;
-				else
-					tag = FP_EXP_TAG_SPECIAL;
-				break;
-			}
-		} else {
-			tag = FP_EXP_TAG_EMPTY;
-		}
-		ret |= tag << (2 * i);
-	}
-	return ret;
-}
-
-/*
- * FXSR floating point environment conversions.
- */
-
-void
-convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
-{
-	struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
-	struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
-	struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
-	int i;
-
-	env->cwd = fxsave->cwd | 0xffff0000u;
-	env->swd = fxsave->swd | 0xffff0000u;
-	env->twd = twd_fxsr_to_i387(fxsave);
-
-#ifdef CONFIG_X86_64
-	env->fip = fxsave->rip;
-	env->foo = fxsave->rdp;
-	/*
-	 * should be actually ds/cs at fpu exception time, but
-	 * that information is not available in 64bit mode.
-	 */
-	env->fcs = task_pt_regs(tsk)->cs;
-	if (tsk == current) {
-		savesegment(ds, env->fos);
-	} else {
-		env->fos = tsk->thread.ds;
-	}
-	env->fos |= 0xffff0000;
-#else
-	env->fip = fxsave->fip;
-	env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
-	env->foo = fxsave->foo;
-	env->fos = fxsave->fos;
-#endif
-
-	for (i = 0; i < 8; ++i)
-		memcpy(&to[i], &from[i], sizeof(to[0]));
-}
-
-void convert_to_fxsr(struct task_struct *tsk,
-		     const struct user_i387_ia32_struct *env)
-
-{
-	struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
-	struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
-	struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
-	int i;
-
-	fxsave->cwd = env->cwd;
-	fxsave->swd = env->swd;
-	fxsave->twd = twd_i387_to_fxsr(env->twd);
-	fxsave->fop = (u16) ((u32) env->fcs >> 16);
-#ifdef CONFIG_X86_64
-	fxsave->rip = env->fip;
-	fxsave->rdp = env->foo;
-	/* cs and ds ignored */
-#else
-	fxsave->fip = env->fip;
-	fxsave->fcs = (env->fcs & 0xffff);
-	fxsave->foo = env->foo;
-	fxsave->fos = env->fos;
-#endif
-
-	for (i = 0; i < 8; ++i)
-		memcpy(&to[i], &from[i], sizeof(from[0]));
-}
-
-int fpregs_get(struct task_struct *target, const struct user_regset *regset,
-	       unsigned int pos, unsigned int count,
-	       void *kbuf, void __user *ubuf)
-{
-	struct user_i387_ia32_struct env;
-	int ret;
-
-	ret = init_fpu(target);
-	if (ret)
-		return ret;
-
-	if (!static_cpu_has(X86_FEATURE_FPU))
-		return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
-
-	if (!cpu_has_fxsr)
-		return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-					   &target->thread.fpu.state->fsave, 0,
-					   -1);
-
-	sanitize_i387_state(target);
-
-	if (kbuf && pos == 0 && count == sizeof(env)) {
-		convert_from_fxsr(kbuf, target);
-		return 0;
-	}
-
-	convert_from_fxsr(&env, target);
-
-	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
-}
-
-int fpregs_set(struct task_struct *target, const struct user_regset *regset,
-	       unsigned int pos, unsigned int count,
-	       const void *kbuf, const void __user *ubuf)
-{
-	struct user_i387_ia32_struct env;
-	int ret;
-
-	ret = init_fpu(target);
-	if (ret)
-		return ret;
-
-	sanitize_i387_state(target);
-
-	if (!static_cpu_has(X86_FEATURE_FPU))
-		return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
-
-	if (!cpu_has_fxsr)
-		return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-					  &target->thread.fpu.state->fsave, 0,
-					  -1);
-
-	if (pos > 0 || count < sizeof(env))
-		convert_from_fxsr(&env, target);
-
-	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
-	if (!ret)
-		convert_to_fxsr(target, &env);
-
-	/*
-	 * update the header bit in the xsave header, indicating the
-	 * presence of FP.
-	 */
-	if (cpu_has_xsave)
-		target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
-	return ret;
-}
-
-/*
- * FPU state for core dumps.
- * This is only used for a.out dumps now.
- * It is declared generically using elf_fpregset_t (which is
- * struct user_i387_struct) but is in fact only used for 32-bit
- * dumps, so on 64-bit it is really struct user_i387_ia32_struct.
- */
-int dump_fpu(struct pt_regs *regs, struct user_i387_struct *fpu)
-{
-	struct task_struct *tsk = current;
-	int fpvalid;
-
-	fpvalid = !!used_math();
-	if (fpvalid)
-		fpvalid = !fpregs_get(tsk, NULL,
-				      0, sizeof(struct user_i387_ia32_struct),
-				      fpu, NULL);
-
-	return fpvalid;
-}
-EXPORT_SYMBOL(dump_fpu);
-
-#endif	/* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
-
-static int __init no_387(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_FPU);
-	return 1;
-}
-
-__setup("no387", no_387);
-
-void fpu_detect(struct cpuinfo_x86 *c)
-{
-	unsigned long cr0;
-	u16 fsw, fcw;
-
-	fsw = fcw = 0xffff;
-
-	cr0 = read_cr0();
-	cr0 &= ~(X86_CR0_TS | X86_CR0_EM);
-	write_cr0(cr0);
-
-	asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
-		     : "+m" (fsw), "+m" (fcw));
-
-	if (fsw == 0 && (fcw & 0x103f) == 0x003f)
-		set_cpu_cap(c, X86_FEATURE_FPU);
-	else
-		clear_cpu_cap(c, X86_FEATURE_FPU);
-
-	/* The final cr0 value is set in fpu_init() */
-}
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index e7cc5370cd2fca..16cb827a5b2774 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -329,8 +329,8 @@ static void init_8259A(int auto_eoi)
 	 */
 	outb_pic(0x11, PIC_MASTER_CMD);	/* ICW1: select 8259A-1 init */
 
-	/* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
-	outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
+	/* ICW2: 8259A-1 IR0-7 mapped to ISA_IRQ_VECTOR(0) */
+	outb_pic(ISA_IRQ_VECTOR(0), PIC_MASTER_IMR);
 
 	/* 8259A-1 (the master) has a slave on IR2 */
 	outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);
@@ -342,8 +342,8 @@ static void init_8259A(int auto_eoi)
 
 	outb_pic(0x11, PIC_SLAVE_CMD);	/* ICW1: select 8259A-2 init */
 
-	/* ICW2: 8259A-2 IR0-7 mapped to IRQ8_VECTOR */
-	outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
+	/* ICW2: 8259A-2 IR0-7 mapped to ISA_IRQ_VECTOR(8) */
+	outb_pic(ISA_IRQ_VECTOR(8), PIC_SLAVE_IMR);
 	/* 8259A-2 is a slave on master's IR2 */
 	outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
 	/* (slave's support for AEOI in flat mode is to be investigated) */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index e5952c22553241..88b366487b0e44 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -22,6 +22,12 @@
 #define CREATE_TRACE_POINTS
 #include <asm/trace/irq_vectors.h>
 
+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
+
+DEFINE_PER_CPU(struct pt_regs *, irq_regs);
+EXPORT_PER_CPU_SYMBOL(irq_regs);
+
 atomic_t irq_err_count;
 
 /* Function pointer for generic interrupt vector handling */
@@ -116,6 +122,12 @@ int arch_show_interrupts(struct seq_file *p, int prec)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
 	seq_puts(p, "  Threshold APIC interrupts\n");
 #endif
+#ifdef CONFIG_X86_MCE_AMD
+	seq_printf(p, "%*s: ", prec, "DFR");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->irq_deferred_error_count);
+	seq_puts(p, "  Deferred Error APIC interrupts\n");
+#endif
 #ifdef CONFIG_X86_MCE
 	seq_printf(p, "%*s: ", prec, "MCE");
 	for_each_online_cpu(j)
@@ -136,6 +148,18 @@ int arch_show_interrupts(struct seq_file *p, int prec)
 #if defined(CONFIG_X86_IO_APIC)
 	seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
 #endif
+#ifdef CONFIG_HAVE_KVM
+	seq_printf(p, "%*s: ", prec, "PIN");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis);
+	seq_puts(p, "  Posted-interrupt notification event\n");
+
+	seq_printf(p, "%*s: ", prec, "PIW");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ",
+			   irq_stats(j)->kvm_posted_intr_wakeup_ipis);
+	seq_puts(p, "  Posted-interrupt wakeup event\n");
+#endif
 	return 0;
 }
 
@@ -192,8 +216,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 	unsigned vector = ~regs->orig_ax;
 	unsigned irq;
 
-	irq_enter();
-	exit_idle();
+	entering_irq();
 
 	irq = __this_cpu_read(vector_irq[vector]);
 
@@ -209,7 +232,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 		}
 	}
 
-	irq_exit();
+	exiting_irq();
 
 	set_irq_regs(old_regs);
 	return 1;
@@ -237,6 +260,18 @@ __visible void smp_x86_platform_ipi(struct pt_regs *regs)
 }
 
 #ifdef CONFIG_HAVE_KVM
+static void dummy_handler(void) {}
+static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler;
+
+void kvm_set_posted_intr_wakeup_handler(void (*handler)(void))
+{
+	if (handler)
+		kvm_posted_intr_wakeup_handler = handler;
+	else
+		kvm_posted_intr_wakeup_handler = dummy_handler;
+}
+EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler);
+
 /*
  * Handler for POSTED_INTERRUPT_VECTOR.
  */
@@ -244,16 +279,23 @@ __visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 
-	ack_APIC_irq();
-
-	irq_enter();
-
-	exit_idle();
-
+	entering_ack_irq();
 	inc_irq_stat(kvm_posted_intr_ipis);
+	exiting_irq();
+	set_irq_regs(old_regs);
+}
 
-	irq_exit();
+/*
+ * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
+ */
+__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
 
+	entering_ack_irq();
+	inc_irq_stat(kvm_posted_intr_wakeup_ipis);
+	kvm_posted_intr_wakeup_handler();
+	exiting_irq();
 	set_irq_regs(old_regs);
 }
 #endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index f9fd86a7fcc7d1..cd74f5978ab97a 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -21,12 +21,6 @@
 
 #include <asm/apic.h>
 
-DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
-EXPORT_PER_CPU_SYMBOL(irq_stat);
-
-DEFINE_PER_CPU(struct pt_regs *, irq_regs);
-EXPORT_PER_CPU_SYMBOL(irq_regs);
-
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 
 int sysctl_panic_on_stackoverflow __read_mostly;
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 394e643d7830fc..bc4604e500a328 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -20,12 +20,6 @@
 #include <asm/idle.h>
 #include <asm/apic.h>
 
-DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
-EXPORT_PER_CPU_SYMBOL(irq_stat);
-
-DEFINE_PER_CPU(struct pt_regs *, irq_regs);
-EXPORT_PER_CPU_SYMBOL(irq_regs);
-
 int sysctl_panic_on_stackoverflow;
 
 /*
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
index 15d741ddfeeb7c..dc5fa6a1e8d640 100644
--- a/arch/x86/kernel/irq_work.c
+++ b/arch/x86/kernel/irq_work.c
@@ -10,12 +10,6 @@
 #include <asm/apic.h>
 #include <asm/trace/irq_vectors.h>
 
-static inline void irq_work_entering_irq(void)
-{
-	irq_enter();
-	ack_APIC_irq();
-}
-
 static inline void __smp_irq_work_interrupt(void)
 {
 	inc_irq_stat(apic_irq_work_irqs);
@@ -24,14 +18,14 @@ static inline void __smp_irq_work_interrupt(void)
 
 __visible void smp_irq_work_interrupt(struct pt_regs *regs)
 {
-	irq_work_entering_irq();
+	ipi_entering_ack_irq();
 	__smp_irq_work_interrupt();
 	exiting_irq();
 }
 
 __visible void smp_trace_irq_work_interrupt(struct pt_regs *regs)
 {
-	irq_work_entering_irq();
+	ipi_entering_ack_irq();
 	trace_irq_work_entry(IRQ_WORK_VECTOR);
 	__smp_irq_work_interrupt();
 	trace_irq_work_exit(IRQ_WORK_VECTOR);
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index cd10a64372647c..a3a5e158ed6955 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -86,7 +86,7 @@ void __init init_IRQ(void)
 	int i;
 
 	/*
-	 * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15.
+	 * On cpu 0, Assign ISA_IRQ_VECTOR(irq) to IRQ 0..15.
 	 * If these IRQ's are handled by legacy interrupt-controllers like PIC,
 	 * then this configuration will likely be static after the boot. If
 	 * these IRQ's are handled by more mordern controllers like IO-APIC,
@@ -94,7 +94,7 @@ void __init init_IRQ(void)
 	 * irq's migrate etc.
 	 */
 	for (i = 0; i < nr_legacy_irqs(); i++)
-		per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i;
+		per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = i;
 
 	x86_init.irqs.intr_init();
 }
@@ -135,6 +135,10 @@ static void __init apic_intr_init(void)
 	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
 #endif
 
+#ifdef CONFIG_X86_MCE_AMD
+	alloc_intr_gate(DEFERRED_ERROR_VECTOR, deferred_error_interrupt);
+#endif
+
 #ifdef CONFIG_X86_LOCAL_APIC
 	/* self generated IPI for local APIC timer */
 	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
@@ -144,6 +148,8 @@ static void __init apic_intr_init(void)
 #ifdef CONFIG_HAVE_KVM
 	/* IPI for KVM to deliver posted interrupt */
 	alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi);
+	/* IPI for KVM to deliver interrupt to wake up tasks */
+	alloc_intr_gate(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi);
 #endif
 
 	/* IPI vectors for APIC spurious and error interrupts */
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 9435620062df30..1681504e44a4c3 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -584,6 +584,39 @@ static void kvm_kick_cpu(int cpu)
 	kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
 }
 
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+
+#include <asm/qspinlock.h>
+
+static void kvm_wait(u8 *ptr, u8 val)
+{
+	unsigned long flags;
+
+	if (in_nmi())
+		return;
+
+	local_irq_save(flags);
+
+	if (READ_ONCE(*ptr) != val)
+		goto out;
+
+	/*
+	 * halt until it's our turn and kicked. Note that we do safe halt
+	 * for irq enabled case to avoid hang when lock info is overwritten
+	 * in irq spinlock slowpath and no spurious interrupt occur to save us.
+	 */
+	if (arch_irqs_disabled_flags(flags))
+		halt();
+	else
+		safe_halt();
+
+out:
+	local_irq_restore(flags);
+}
+
+#else /* !CONFIG_QUEUED_SPINLOCKS */
+
 enum kvm_contention_stat {
 	TAKEN_SLOW,
 	TAKEN_SLOW_PICKUP,
@@ -817,6 +850,8 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
 	}
 }
 
+#endif /* !CONFIG_QUEUED_SPINLOCKS */
+
 /*
  * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
  */
@@ -828,8 +863,16 @@ void __init kvm_spinlock_init(void)
 	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
 		return;
 
+#ifdef CONFIG_QUEUED_SPINLOCKS
+	__pv_init_lock_hash();
+	pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+	pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
+	pv_lock_ops.wait = kvm_wait;
+	pv_lock_ops.kick = kvm_kick_cpu;
+#else /* !CONFIG_QUEUED_SPINLOCKS */
 	pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning);
 	pv_lock_ops.unlock_kick = kvm_unlock_kick;
+#endif
 }
 
 static __init int kvm_spinlock_init_jump(void)
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 2d2a237f2c7369..30ca7607cbbbbc 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -19,8 +19,8 @@
 #include <linux/module.h>
 #include <linux/smp.h>
 #include <linux/pci.h>
-#include <linux/irqdomain.h>
 
+#include <asm/irqdomain.h>
 #include <asm/mtrr.h>
 #include <asm/mpspec.h>
 #include <asm/pgalloc.h>
@@ -113,11 +113,6 @@ static void __init MP_bus_info(struct mpc_bus *m)
 		pr_warn("Unknown bustype %s - ignoring\n", str);
 }
 
-static struct irq_domain_ops mp_ioapic_irqdomain_ops = {
-	.map = mp_irqdomain_map,
-	.unmap = mp_irqdomain_unmap,
-};
-
 static void __init MP_ioapic_info(struct mpc_ioapic *m)
 {
 	struct ioapic_domain_cfg cfg = {
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index bbb6c7316341f8..33ee3e0efd65bc 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -8,11 +8,33 @@
 
 #include <asm/paravirt.h>
 
+#ifdef CONFIG_QUEUED_SPINLOCKS
+__visible void __native_queued_spin_unlock(struct qspinlock *lock)
+{
+	native_queued_spin_unlock(lock);
+}
+
+PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock);
+
+bool pv_is_native_spin_unlock(void)
+{
+	return pv_lock_ops.queued_spin_unlock.func ==
+		__raw_callee_save___native_queued_spin_unlock;
+}
+#endif
+
 struct pv_lock_ops pv_lock_ops = {
 #ifdef CONFIG_SMP
+#ifdef CONFIG_QUEUED_SPINLOCKS
+	.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
+	.queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
+	.wait = paravirt_nop,
+	.kick = paravirt_nop,
+#else /* !CONFIG_QUEUED_SPINLOCKS */
 	.lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop),
 	.unlock_kick = paravirt_nop,
-#endif
+#endif /* !CONFIG_QUEUED_SPINLOCKS */
+#endif /* SMP */
 };
 EXPORT_SYMBOL(pv_lock_ops);
 
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c614dd492f5f72..58bcfb67c01f1f 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -154,7 +154,9 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
 		ret = paravirt_patch_ident_64(insnbuf, len);
 
 	else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
+#ifdef CONFIG_X86_32
 		 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
+#endif
 		 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
 		 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64))
 		/* If operation requires a jmp, then jmp */
@@ -371,7 +373,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
 
 	.load_sp0 = native_load_sp0,
 
-#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+#if defined(CONFIG_X86_32)
 	.irq_enable_sysexit = native_irq_enable_sysexit,
 #endif
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index d9f32e6d6ab654..e1b013696dde58 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -12,6 +12,10 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
 DEF_NATIVE(pv_cpu_ops, clts, "clts");
 DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
 
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
+#endif
+
 unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
 {
 	/* arg in %eax, return in %eax */
@@ -24,6 +28,8 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
 	return 0;
 }
 
+extern bool pv_is_native_spin_unlock(void);
+
 unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		      unsigned long addr, unsigned len)
 {
@@ -47,14 +53,22 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		PATCH_SITE(pv_mmu_ops, write_cr3);
 		PATCH_SITE(pv_cpu_ops, clts);
 		PATCH_SITE(pv_cpu_ops, read_tsc);
-
-	patch_site:
-		ret = paravirt_patch_insns(ibuf, len, start, end);
-		break;
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+		case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
+			if (pv_is_native_spin_unlock()) {
+				start = start_pv_lock_ops_queued_spin_unlock;
+				end   = end_pv_lock_ops_queued_spin_unlock;
+				goto patch_site;
+			}
+#endif
 
 	default:
 		ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
 		break;
+
+patch_site:
+		ret = paravirt_patch_insns(ibuf, len, start, end);
+		break;
 	}
 #undef PATCH_SITE
 	return ret;
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index a1da6737ba5b80..8aa05583bc42de 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -21,6 +21,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
 DEF_NATIVE(, mov32, "mov %edi, %eax");
 DEF_NATIVE(, mov64, "mov %rdi, %rax");
 
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
+#endif
+
 unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
 {
 	return paravirt_patch_insns(insnbuf, len,
@@ -33,6 +37,8 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
 				    start__mov64, end__mov64);
 }
 
+extern bool pv_is_native_spin_unlock(void);
+
 unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		      unsigned long addr, unsigned len)
 {
@@ -49,7 +55,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		PATCH_SITE(pv_irq_ops, save_fl);
 		PATCH_SITE(pv_irq_ops, irq_enable);
 		PATCH_SITE(pv_irq_ops, irq_disable);
-		PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
 		PATCH_SITE(pv_cpu_ops, usergs_sysret32);
 		PATCH_SITE(pv_cpu_ops, usergs_sysret64);
 		PATCH_SITE(pv_cpu_ops, swapgs);
@@ -59,14 +64,22 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		PATCH_SITE(pv_cpu_ops, clts);
 		PATCH_SITE(pv_mmu_ops, flush_tlb_single);
 		PATCH_SITE(pv_cpu_ops, wbinvd);
-
-	patch_site:
-		ret = paravirt_patch_insns(ibuf, len, start, end);
-		break;
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+		case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
+			if (pv_is_native_spin_unlock()) {
+				start = start_pv_lock_ops_queued_spin_unlock;
+				end   = end_pv_lock_ops_queued_spin_unlock;
+				goto patch_site;
+			}
+#endif
 
 	default:
 		ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
 		break;
+
+patch_site:
+		ret = paravirt_patch_insns(ibuf, len, start, end);
+		break;
 	}
 #undef PATCH_SITE
 	return ret;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index a25e202bb319ca..353972c1946cd3 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -140,6 +140,51 @@ void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr,
 		free_pages((unsigned long)vaddr, get_order(size));
 }
 
+void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
+		      gfp_t gfp, struct dma_attrs *attrs)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+	void *memory;
+
+	gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+
+	if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
+		return memory;
+
+	if (!dev)
+		dev = &x86_dma_fallback_dev;
+
+	if (!is_device_dma_capable(dev))
+		return NULL;
+
+	if (!ops->alloc)
+		return NULL;
+
+	memory = ops->alloc(dev, size, dma_handle,
+			    dma_alloc_coherent_gfp_flags(dev, gfp), attrs);
+	debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
+
+	return memory;
+}
+EXPORT_SYMBOL(dma_alloc_attrs);
+
+void dma_free_attrs(struct device *dev, size_t size,
+		    void *vaddr, dma_addr_t bus,
+		    struct dma_attrs *attrs)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+
+	WARN_ON(irqs_disabled());       /* for portability */
+
+	if (dma_release_from_coherent(dev, get_order(size), vaddr))
+		return;
+
+	debug_dma_free_coherent(dev, size, vaddr, bus);
+	if (ops->free)
+		ops->free(dev, size, vaddr, bus, attrs);
+}
+EXPORT_SYMBOL(dma_free_attrs);
+
 /*
  * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
  * parameter documentation.
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 6e338e3b1dc04c..9cad694ed7c4d6 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -25,8 +25,7 @@
 #include <asm/idle.h>
 #include <asm/uaccess.h>
 #include <asm/mwait.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
 #include <asm/debugreg.h>
 #include <asm/nmi.h>
 #include <asm/tlbflush.h>
@@ -76,9 +75,6 @@ void idle_notifier_unregister(struct notifier_block *n)
 EXPORT_SYMBOL_GPL(idle_notifier_unregister);
 #endif
 
-struct kmem_cache *task_xstate_cachep;
-EXPORT_SYMBOL_GPL(task_xstate_cachep);
-
 /*
  * this gets called so that we can store lazy state into memory and copy the
  * current task into the new thread.
@@ -87,36 +83,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
 	*dst = *src;
 
-	dst->thread.fpu_counter = 0;
-	dst->thread.fpu.has_fpu = 0;
-	dst->thread.fpu.state = NULL;
-	task_disable_lazy_fpu_restore(dst);
-	if (tsk_used_math(src)) {
-		int err = fpu_alloc(&dst->thread.fpu);
-		if (err)
-			return err;
-		fpu_copy(dst, src);
-	}
-	return 0;
-}
-
-void free_thread_xstate(struct task_struct *tsk)
-{
-	fpu_free(&tsk->thread.fpu);
-}
-
-void arch_release_task_struct(struct task_struct *tsk)
-{
-	free_thread_xstate(tsk);
-}
-
-void arch_task_cache_init(void)
-{
-        task_xstate_cachep =
-        	kmem_cache_create("task_xstate", xstate_size,
-				  __alignof__(union thread_xstate),
-				  SLAB_PANIC | SLAB_NOTRACK, NULL);
-	setup_xstate_comp();
+	return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
 }
 
 /*
@@ -127,6 +94,7 @@ void exit_thread(void)
 	struct task_struct *me = current;
 	struct thread_struct *t = &me->thread;
 	unsigned long *bp = t->io_bitmap_ptr;
+	struct fpu *fpu = &t->fpu;
 
 	if (bp) {
 		struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
@@ -142,7 +110,7 @@ void exit_thread(void)
 		kfree(bp);
 	}
 
-	drop_fpu(me);
+	fpu__drop(fpu);
 }
 
 void flush_thread(void)
@@ -152,19 +120,7 @@ void flush_thread(void)
 	flush_ptrace_hw_breakpoint(tsk);
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
 
-	if (!use_eager_fpu()) {
-		/* FPU state will be reallocated lazily at the first use. */
-		drop_fpu(tsk);
-		free_thread_xstate(tsk);
-	} else {
-		if (!tsk_used_math(tsk)) {
-			/* kthread execs. TODO: cleanup this horror. */
-			if (WARN_ON(init_fpu(tsk)))
-				force_sig(SIGKILL, tsk);
-			user_fpu_begin();
-		}
-		restore_init_xstate();
-	}
+	fpu__clear(&tsk->thread.fpu);
 }
 
 static void hard_disable_TSC(void)
@@ -445,11 +401,10 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
 }
 
 /*
- * MONITOR/MWAIT with no hints, used for default default C1 state.
- * This invokes MWAIT with interrutps enabled and no flags,
- * which is backwards compatible with the original MWAIT implementation.
+ * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
+ * with interrupts enabled and no flags, which is backwards compatible with the
+ * original MWAIT implementation.
  */
-
 static void mwait_idle(void)
 {
 	if (!current_set_polling_and_test()) {
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8ed2106b06da63..c09c99ccf3e33f 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -39,8 +39,7 @@
 #include <asm/pgtable.h>
 #include <asm/ldt.h>
 #include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
 #include <asm/desc.h>
 #ifdef CONFIG_MATH_EMULATION
 #include <asm/math_emu.h>
@@ -242,14 +241,16 @@ __visible __notrace_funcgraph struct task_struct *
 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
 	struct thread_struct *prev = &prev_p->thread,
-				 *next = &next_p->thread;
+			     *next = &next_p->thread;
+	struct fpu *prev_fpu = &prev->fpu;
+	struct fpu *next_fpu = &next->fpu;
 	int cpu = smp_processor_id();
 	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
-	fpu_switch_t fpu;
+	fpu_switch_t fpu_switch;
 
 	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
 
-	fpu = switch_fpu_prepare(prev_p, next_p, cpu);
+	fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);
 
 	/*
 	 * Save away %gs. No need to save %fs, as it was saved on the
@@ -296,19 +297,16 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 * Leave lazy mode, flushing any hypercalls made here.
 	 * This must be done before restoring TLS segments so
 	 * the GDT and LDT are properly updated, and must be
-	 * done before math_state_restore, so the TS bit is up
+	 * done before fpu__restore(), so the TS bit is up
 	 * to date.
 	 */
 	arch_end_context_switch(next_p);
 
 	/*
-	 * Reload esp0, kernel_stack, and current_top_of_stack.  This changes
+	 * Reload esp0 and cpu_current_top_of_stack.  This changes
 	 * current_thread_info().
 	 */
 	load_sp0(tss, next);
-	this_cpu_write(kernel_stack,
-		       (unsigned long)task_stack_page(next_p) +
-		       THREAD_SIZE);
 	this_cpu_write(cpu_current_top_of_stack,
 		       (unsigned long)task_stack_page(next_p) +
 		       THREAD_SIZE);
@@ -319,7 +317,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	if (prev->gs | next->gs)
 		lazy_load_gs(next->gs);
 
-	switch_fpu_finish(next_p, fpu);
+	switch_fpu_finish(next_fpu, fpu_switch);
 
 	this_cpu_write(current_task, next_p);
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ddfdbf74f1744c..843f92e4c7110c 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -38,8 +38,7 @@
 
 #include <asm/pgtable.h>
 #include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
 #include <asm/mmu_context.h>
 #include <asm/prctl.h>
 #include <asm/desc.h>
@@ -274,12 +273,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
 	struct thread_struct *prev = &prev_p->thread;
 	struct thread_struct *next = &next_p->thread;
+	struct fpu *prev_fpu = &prev->fpu;
+	struct fpu *next_fpu = &next->fpu;
 	int cpu = smp_processor_id();
 	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
 	unsigned fsindex, gsindex;
-	fpu_switch_t fpu;
+	fpu_switch_t fpu_switch;
 
-	fpu = switch_fpu_prepare(prev_p, next_p, cpu);
+	fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);
 
 	/* We must save %fs and %gs before load_TLS() because
 	 * %fs and %gs may be cleared by load_TLS().
@@ -299,7 +300,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 * Leave lazy mode, flushing any hypercalls made here.  This
 	 * must be done after loading TLS entries in the GDT but before
 	 * loading segments that might reference them, and and it must
-	 * be done before math_state_restore, so the TS bit is up to
+	 * be done before fpu__restore(), so the TS bit is up to
 	 * date.
 	 */
 	arch_end_context_switch(next_p);
@@ -391,7 +392,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
 	prev->gsindex = gsindex;
 
-	switch_fpu_finish(next_p, fpu);
+	switch_fpu_finish(next_fpu, fpu_switch);
 
 	/*
 	 * Switch the PDA and FPU contexts.
@@ -409,9 +410,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	/* Reload esp0 and ss1.  This changes current_thread_info(). */
 	load_sp0(tss, next);
 
-	this_cpu_write(kernel_stack,
-		(unsigned long)task_stack_page(next_p) + THREAD_SIZE);
-
 	/*
 	 * Now maybe reload the debug registers and handle I/O bitmaps
 	 */
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index a7bc794807195a..9be72bc3613f80 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -11,7 +11,6 @@
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/ptrace.h>
-#include <linux/regset.h>
 #include <linux/tracehook.h>
 #include <linux/user.h>
 #include <linux/elf.h>
@@ -28,8 +27,9 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
 #include <asm/debugreg.h>
 #include <asm/ldt.h>
 #include <asm/desc.h>
@@ -1297,7 +1297,7 @@ static struct user_regset x86_64_regsets[] __read_mostly = {
 		.core_note_type = NT_PRFPREG,
 		.n = sizeof(struct user_i387_struct) / sizeof(long),
 		.size = sizeof(long), .align = sizeof(long),
-		.active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
+		.active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set
 	},
 	[REGSET_XSTATE] = {
 		.core_note_type = NT_X86_XSTATE,
@@ -1338,13 +1338,13 @@ static struct user_regset x86_32_regsets[] __read_mostly = {
 		.core_note_type = NT_PRFPREG,
 		.n = sizeof(struct user_i387_ia32_struct) / sizeof(u32),
 		.size = sizeof(u32), .align = sizeof(u32),
-		.active = fpregs_active, .get = fpregs_get, .set = fpregs_set
+		.active = regset_fpregs_active, .get = fpregs_get, .set = fpregs_set
 	},
 	[REGSET_XFP] = {
 		.core_note_type = NT_PRXFPREG,
 		.n = sizeof(struct user32_fxsr_struct) / sizeof(u32),
 		.size = sizeof(u32), .align = sizeof(u32),
-		.active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
+		.active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set
 	},
 	[REGSET_XSTATE] = {
 		.core_note_type = NT_X86_XSTATE,
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d74ac33290ae3e..8d04a7594a0363 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1222,8 +1222,7 @@ void __init setup_arch(char **cmdline_p)
 	init_cpu_to_node();
 
 	init_apic_mappings();
-	if (x86_io_apic_ops.init)
-		x86_io_apic_ops.init();
+	io_apic_init_mappings();
 
 	kvm_guest_init();
 
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 1ea14fd53933ba..206996c1669db3 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -26,8 +26,8 @@
 
 #include <asm/processor.h>
 #include <asm/ucontext.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
 #include <asm/vdso.h>
 #include <asm/mce.h>
 #include <asm/sighandling.h>
@@ -103,7 +103,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
 		get_user_ex(buf, &sc->fpstate);
 	} get_user_catch(err);
 
-	err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32));
+	err |= fpu__restore_sig(buf, config_enabled(CONFIG_X86_32));
 
 	force_iret();
 
@@ -199,6 +199,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
 	unsigned long sp = regs->sp;
 	unsigned long buf_fx = 0;
 	int onsigstack = on_sig_stack(sp);
+	struct fpu *fpu = &current->thread.fpu;
 
 	/* redzone */
 	if (config_enabled(CONFIG_X86_64))
@@ -218,9 +219,9 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
 		}
 	}
 
-	if (used_math()) {
-		sp = alloc_mathframe(sp, config_enabled(CONFIG_X86_32),
-				     &buf_fx, &math_size);
+	if (fpu->fpstate_active) {
+		sp = fpu__alloc_mathframe(sp, config_enabled(CONFIG_X86_32),
+					  &buf_fx, &math_size);
 		*fpstate = (void __user *)sp;
 	}
 
@@ -234,8 +235,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
 		return (void __user *)-1L;
 
 	/* save i387 and extended state */
-	if (used_math() &&
-	    save_xstate_sig(*fpstate, (void __user *)buf_fx, math_size) < 0)
+	if (fpu->fpstate_active &&
+	    copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size) < 0)
 		return (void __user *)-1L;
 
 	return (void __user *)sp;
@@ -593,6 +594,22 @@ badframe:
 	return 0;
 }
 
+static inline int is_ia32_compat_frame(void)
+{
+	return config_enabled(CONFIG_IA32_EMULATION) &&
+	       test_thread_flag(TIF_IA32);
+}
+
+static inline int is_ia32_frame(void)
+{
+	return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame();
+}
+
+static inline int is_x32_frame(void)
+{
+	return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32);
+}
+
 static int
 setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
 {
@@ -617,6 +634,7 @@ static void
 handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 {
 	bool stepping, failed;
+	struct fpu *fpu = &current->thread.fpu;
 
 	/* Are we from a system call? */
 	if (syscall_get_nr(current, regs) >= 0) {
@@ -665,8 +683,8 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 		/*
 		 * Ensure the signal handler starts with the new fpu state.
 		 */
-		if (used_math())
-			fpu_reset_state(current);
+		if (fpu->fpstate_active)
+			fpu__clear(fpu);
 	}
 	signal_setup_done(failed, ksig, stepping);
 }
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index be8e1bde07aa47..15aaa69bbb5eff 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -170,8 +170,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
 
 asmlinkage __visible void smp_reboot_interrupt(void)
 {
-	ack_APIC_irq();
-	irq_enter();
+	ipi_entering_ack_irq();
 	stop_this_cpu(NULL);
 	irq_exit();
 }
@@ -265,12 +264,6 @@ __visible void smp_reschedule_interrupt(struct pt_regs *regs)
 	 */
 }
 
-static inline void smp_entering_irq(void)
-{
-	ack_APIC_irq();
-	irq_enter();
-}
-
 __visible void smp_trace_reschedule_interrupt(struct pt_regs *regs)
 {
 	/*
@@ -279,7 +272,7 @@ __visible void smp_trace_reschedule_interrupt(struct pt_regs *regs)
 	 * scheduler_ipi(). This is OK, since those functions are allowed
 	 * to nest.
 	 */
-	smp_entering_irq();
+	ipi_entering_ack_irq();
 	trace_reschedule_entry(RESCHEDULE_VECTOR);
 	__smp_reschedule_interrupt();
 	trace_reschedule_exit(RESCHEDULE_VECTOR);
@@ -297,14 +290,14 @@ static inline void __smp_call_function_interrupt(void)
 
 __visible void smp_call_function_interrupt(struct pt_regs *regs)
 {
-	smp_entering_irq();
+	ipi_entering_ack_irq();
 	__smp_call_function_interrupt();
 	exiting_irq();
 }
 
 __visible void smp_trace_call_function_interrupt(struct pt_regs *regs)
 {
-	smp_entering_irq();
+	ipi_entering_ack_irq();
 	trace_call_function_entry(CALL_FUNCTION_VECTOR);
 	__smp_call_function_interrupt();
 	trace_call_function_exit(CALL_FUNCTION_VECTOR);
@@ -319,14 +312,14 @@ static inline void __smp_call_function_single_interrupt(void)
 
 __visible void smp_call_function_single_interrupt(struct pt_regs *regs)
 {
-	smp_entering_irq();
+	ipi_entering_ack_irq();
 	__smp_call_function_single_interrupt();
 	exiting_irq();
 }
 
 __visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs)
 {
-	smp_entering_irq();
+	ipi_entering_ack_irq();
 	trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
 	__smp_call_function_single_interrupt();
 	trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 50e547eac8cd4b..8add66b22f333c 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -68,8 +68,7 @@
 #include <asm/mwait.h>
 #include <asm/apic.h>
 #include <asm/io_apic.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
 #include <asm/setup.h>
 #include <asm/uv/uv.h>
 #include <linux/mc146818rtc.h>
@@ -314,10 +313,10 @@ topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
 		cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
 }
 
-#define link_mask(_m, c1, c2)						\
+#define link_mask(mfunc, c1, c2)					\
 do {									\
-	cpumask_set_cpu((c1), cpu_##_m##_mask(c2));			\
-	cpumask_set_cpu((c2), cpu_##_m##_mask(c1));			\
+	cpumask_set_cpu((c1), mfunc(c2));				\
+	cpumask_set_cpu((c2), mfunc(c1));				\
 } while (0)
 
 static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
@@ -398,9 +397,9 @@ void set_cpu_sibling_map(int cpu)
 	cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
 
 	if (!has_mp) {
-		cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
+		cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
 		cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
-		cpumask_set_cpu(cpu, cpu_core_mask(cpu));
+		cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
 		c->booted_cores = 1;
 		return;
 	}
@@ -409,32 +408,34 @@ void set_cpu_sibling_map(int cpu)
 		o = &cpu_data(i);
 
 		if ((i == cpu) || (has_smt && match_smt(c, o)))
-			link_mask(sibling, cpu, i);
+			link_mask(topology_sibling_cpumask, cpu, i);
 
 		if ((i == cpu) || (has_mp && match_llc(c, o)))
-			link_mask(llc_shared, cpu, i);
+			link_mask(cpu_llc_shared_mask, cpu, i);
 
 	}
 
 	/*
 	 * This needs a separate iteration over the cpus because we rely on all
-	 * cpu_sibling_mask links to be set-up.
+	 * topology_sibling_cpumask links to be set-up.
 	 */
 	for_each_cpu(i, cpu_sibling_setup_mask) {
 		o = &cpu_data(i);
 
 		if ((i == cpu) || (has_mp && match_die(c, o))) {
-			link_mask(core, cpu, i);
+			link_mask(topology_core_cpumask, cpu, i);
 
 			/*
 			 *  Does this new cpu bringup a new core?
 			 */
-			if (cpumask_weight(cpu_sibling_mask(cpu)) == 1) {
+			if (cpumask_weight(
+			    topology_sibling_cpumask(cpu)) == 1) {
 				/*
 				 * for each core in package, increment
 				 * the booted_cores for this new cpu
 				 */
-				if (cpumask_first(cpu_sibling_mask(i)) == i)
+				if (cpumask_first(
+				    topology_sibling_cpumask(i)) == i)
 					c->booted_cores++;
 				/*
 				 * increment the core count for all
@@ -514,6 +515,40 @@ void __inquire_remote_apic(int apicid)
 }
 
 /*
+ * The Multiprocessor Specification 1.4 (1997) example code suggests
+ * that there should be a 10ms delay between the BSP asserting INIT
+ * and de-asserting INIT, when starting a remote processor.
+ * But that slows boot and resume on modern processors, which include
+ * many cores and don't require that delay.
+ *
+ * Cmdline "init_cpu_udelay=" is available to over-ride this delay.
+ * Modern processor families are quirked to remove the delay entirely.
+ */
+#define UDELAY_10MS_DEFAULT 10000
+
+static unsigned int init_udelay = UDELAY_10MS_DEFAULT;
+
+static int __init cpu_init_udelay(char *str)
+{
+	get_option(&str, &init_udelay);
+
+	return 0;
+}
+early_param("cpu_init_udelay", cpu_init_udelay);
+
+static void __init smp_quirk_init_udelay(void)
+{
+	/* if cmdline changed it from default, leave it alone */
+	if (init_udelay != UDELAY_10MS_DEFAULT)
+		return;
+
+	/* if modern processor, use no delay */
+	if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) ||
+	    ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF)))
+		init_udelay = 0;
+}
+
+/*
  * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
  * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
  * won't ... remember to clear down the APIC, etc later.
@@ -555,7 +590,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip)
 static int
 wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 {
-	unsigned long send_status, accept_status = 0;
+	unsigned long send_status = 0, accept_status = 0;
 	int maxlvt, num_starts, j;
 
 	maxlvt = lapic_get_maxlvt();
@@ -583,7 +618,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 	pr_debug("Waiting for send to finish...\n");
 	send_status = safe_apic_wait_icr_idle();
 
-	mdelay(10);
+	udelay(init_udelay);
 
 	pr_debug("Deasserting INIT\n");
 
@@ -651,6 +686,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 		 * Give the other CPU some time to accept the IPI.
 		 */
 		udelay(200);
+
 		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP.  */
 			apic_write(APIC_ESR, 0);
 		accept_status = (apic_read(APIC_ESR) & 0xEF);
@@ -792,8 +828,6 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle)
 	clear_tsk_thread_flag(idle, TIF_FORK);
 	initial_gs = per_cpu_offset(cpu);
 #endif
-	per_cpu(kernel_stack, cpu) =
-		(unsigned long)task_stack_page(idle) + THREAD_SIZE;
 }
 
 /*
@@ -1009,8 +1043,8 @@ static __init void disable_smp(void)
 		physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
 	else
 		physid_set_mask_of_physid(0, &phys_cpu_present_map);
-	cpumask_set_cpu(0, cpu_sibling_mask(0));
-	cpumask_set_cpu(0, cpu_core_mask(0));
+	cpumask_set_cpu(0, topology_sibling_cpumask(0));
+	cpumask_set_cpu(0, topology_core_cpumask(0));
 }
 
 enum {
@@ -1176,6 +1210,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 		uv_system_init();
 
 	set_mtrr_aps_delayed_init();
+
+	smp_quirk_init_udelay();
 }
 
 void arch_enable_nonboot_cpus_begin(void)
@@ -1293,22 +1329,22 @@ static void remove_siblinginfo(int cpu)
 	int sibling;
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 
-	for_each_cpu(sibling, cpu_core_mask(cpu)) {
-		cpumask_clear_cpu(cpu, cpu_core_mask(sibling));
+	for_each_cpu(sibling, topology_core_cpumask(cpu)) {
+		cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
 		/*/
 		 * last thread sibling in this cpu core going down
 		 */
-		if (cpumask_weight(cpu_sibling_mask(cpu)) == 1)
+		if (cpumask_weight(topology_sibling_cpumask(cpu)) == 1)
 			cpu_data(sibling).booted_cores--;
 	}
 
-	for_each_cpu(sibling, cpu_sibling_mask(cpu))
-		cpumask_clear_cpu(cpu, cpu_sibling_mask(sibling));
+	for_each_cpu(sibling, topology_sibling_cpumask(cpu))
+		cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
 	for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
 		cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
 	cpumask_clear(cpu_llc_shared_mask(cpu));
-	cpumask_clear(cpu_sibling_mask(cpu));
-	cpumask_clear(cpu_core_mask(cpu));
+	cpumask_clear(topology_sibling_cpumask(cpu));
+	cpumask_clear(topology_core_cpumask(cpu));
 	c->phys_proc_id = 0;
 	c->cpu_core_id = 0;
 	cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 324ab524768756..b5e7687668c503 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -54,8 +54,7 @@
 #include <asm/ftrace.h>
 #include <asm/traps.h>
 #include <asm/desc.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
 #include <asm/mce.h>
 #include <asm/fixmap.h>
 #include <asm/mach_traps.h>
@@ -372,7 +371,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 {
 	struct task_struct *tsk = current;
-	struct xsave_struct *xsave_buf;
+	struct xregs_state *xsave_buf;
 	enum ctx_state prev_state;
 	struct bndcsr *bndcsr;
 	siginfo_t *info;
@@ -396,8 +395,8 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 	 * It is not directly accessible, though, so we need to
 	 * do an xsave and then pull it out of the xsave buffer.
 	 */
-	fpu_save_init(&tsk->thread.fpu);
-	xsave_buf = &(tsk->thread.fpu.state->xsave);
+	copy_fpregs_to_fpstate(&tsk->thread.fpu);
+	xsave_buf = &(tsk->thread.fpu.state.xsave);
 	bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR);
 	if (!bndcsr)
 		goto exit_trap;
@@ -709,8 +708,8 @@ NOKPROBE_SYMBOL(do_debug);
 static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 {
 	struct task_struct *task = current;
+	struct fpu *fpu = &task->thread.fpu;
 	siginfo_t info;
-	unsigned short err;
 	char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
 						"simd exception";
 
@@ -718,8 +717,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 		return;
 	conditional_sti(regs);
 
-	if (!user_mode(regs))
-	{
+	if (!user_mode(regs)) {
 		if (!fixup_exception(regs)) {
 			task->thread.error_code = error_code;
 			task->thread.trap_nr = trapnr;
@@ -731,62 +729,20 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 	/*
 	 * Save the info for the exception handler and clear the error.
 	 */
-	unlazy_fpu(task);
-	task->thread.trap_nr = trapnr;
+	fpu__save(fpu);
+
+	task->thread.trap_nr	= trapnr;
 	task->thread.error_code = error_code;
-	info.si_signo = SIGFPE;
-	info.si_errno = 0;
-	info.si_addr = (void __user *)uprobe_get_trap_addr(regs);
-	if (trapnr == X86_TRAP_MF) {
-		unsigned short cwd, swd;
-		/*
-		 * (~cwd & swd) will mask out exceptions that are not set to unmasked
-		 * status.  0x3f is the exception bits in these regs, 0x200 is the
-		 * C1 reg you need in case of a stack fault, 0x040 is the stack
-		 * fault bit.  We should only be taking one exception at a time,
-		 * so if this combination doesn't produce any single exception,
-		 * then we have a bad program that isn't synchronizing its FPU usage
-		 * and it will suffer the consequences since we won't be able to
-		 * fully reproduce the context of the exception
-		 */
-		cwd = get_fpu_cwd(task);
-		swd = get_fpu_swd(task);
+	info.si_signo		= SIGFPE;
+	info.si_errno		= 0;
+	info.si_addr		= (void __user *)uprobe_get_trap_addr(regs);
 
-		err = swd & ~cwd;
-	} else {
-		/*
-		 * The SIMD FPU exceptions are handled a little differently, as there
-		 * is only a single status/control register.  Thus, to determine which
-		 * unmasked exception was caught we must mask the exception mask bits
-		 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
-		 */
-		unsigned short mxcsr = get_fpu_mxcsr(task);
-		err = ~(mxcsr >> 7) & mxcsr;
-	}
+	info.si_code = fpu__exception_code(fpu, trapnr);
 
-	if (err & 0x001) {	/* Invalid op */
-		/*
-		 * swd & 0x240 == 0x040: Stack Underflow
-		 * swd & 0x240 == 0x240: Stack Overflow
-		 * User must clear the SF bit (0x40) if set
-		 */
-		info.si_code = FPE_FLTINV;
-	} else if (err & 0x004) { /* Divide by Zero */
-		info.si_code = FPE_FLTDIV;
-	} else if (err & 0x008) { /* Overflow */
-		info.si_code = FPE_FLTOVF;
-	} else if (err & 0x012) { /* Denormal, Underflow */
-		info.si_code = FPE_FLTUND;
-	} else if (err & 0x020) { /* Precision */
-		info.si_code = FPE_FLTRES;
-	} else {
-		/*
-		 * If we're using IRQ 13, or supposedly even some trap
-		 * X86_TRAP_MF implementations, it's possible
-		 * we get a spurious trap, which is not an error.
-		 */
+	/* Retry when we get spurious exceptions: */
+	if (!info.si_code)
 		return;
-	}
+
 	force_sig_info(SIGFPE, &info, task);
 }
 
@@ -813,62 +769,8 @@ dotraplinkage void
 do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
 {
 	conditional_sti(regs);
-#if 0
-	/* No need to warn about this any longer. */
-	pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
-#endif
-}
-
-asmlinkage __visible void __attribute__((weak)) smp_thermal_interrupt(void)
-{
-}
-
-asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void)
-{
 }
 
-/*
- * 'math_state_restore()' saves the current math information in the
- * old math state array, and gets the new ones from the current task
- *
- * Careful.. There are problems with IBM-designed IRQ13 behaviour.
- * Don't touch unless you *really* know how it works.
- *
- * Must be called with kernel preemption disabled (eg with local
- * local interrupts as in the case of do_device_not_available).
- */
-void math_state_restore(void)
-{
-	struct task_struct *tsk = current;
-
-	if (!tsk_used_math(tsk)) {
-		local_irq_enable();
-		/*
-		 * does a slab alloc which can sleep
-		 */
-		if (init_fpu(tsk)) {
-			/*
-			 * ran out of memory!
-			 */
-			do_group_exit(SIGKILL);
-			return;
-		}
-		local_irq_disable();
-	}
-
-	/* Avoid __kernel_fpu_begin() right after __thread_fpu_begin() */
-	kernel_fpu_disable();
-	__thread_fpu_begin(tsk);
-	if (unlikely(restore_fpu_checking(tsk))) {
-		fpu_reset_state(tsk);
-		force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
-	} else {
-		tsk->thread.fpu_counter++;
-	}
-	kernel_fpu_enable();
-}
-EXPORT_SYMBOL_GPL(math_state_restore);
-
 dotraplinkage void
 do_device_not_available(struct pt_regs *regs, long error_code)
 {
@@ -889,7 +791,7 @@ do_device_not_available(struct pt_regs *regs, long error_code)
 		return;
 	}
 #endif
-	math_state_restore(); /* interrupts still off */
+	fpu__restore(&current->thread.fpu); /* interrupts still off */
 #ifdef CONFIG_X86_32
 	conditional_sti(regs);
 #endif
@@ -997,8 +899,8 @@ void __init trap_init(void)
 #endif
 
 #ifdef CONFIG_X86_32
-	set_system_trap_gate(SYSCALL_VECTOR, &system_call);
-	set_bit(SYSCALL_VECTOR, used_vectors);
+	set_system_trap_gate(IA32_SYSCALL_VECTOR, &system_call);
+	set_bit(IA32_SYSCALL_VECTOR, used_vectors);
 #endif
 
 	/*
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 26488487bc61e8..dd8d0791dfb502 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -113,7 +113,7 @@ static void check_tsc_warp(unsigned int timeout)
  */
 static inline unsigned int loop_timeout(int cpu)
 {
-	return (cpumask_weight(cpu_core_mask(cpu)) > 1) ? 2 : 20;
+	return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20;
 }
 
 /*
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 234b0722de5352..3cee10abf01d15 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -111,11 +111,9 @@ EXPORT_SYMBOL_GPL(x86_platform);
 #if defined(CONFIG_PCI_MSI)
 struct x86_msi_ops x86_msi = {
 	.setup_msi_irqs		= native_setup_msi_irqs,
-	.compose_msi_msg	= native_compose_msi_msg,
 	.teardown_msi_irq	= native_teardown_msi_irq,
 	.teardown_msi_irqs	= default_teardown_msi_irqs,
 	.restore_msi_irqs	= default_restore_msi_irqs,
-	.setup_hpet_msi		= default_setup_hpet_msi,
 };
 
 /* MSI arch specific hooks */
@@ -141,13 +139,6 @@ void arch_restore_msi_irqs(struct pci_dev *dev)
 #endif
 
 struct x86_io_apic_ops x86_io_apic_ops = {
-	.init			= native_io_apic_init_mappings,
 	.read			= native_io_apic_read,
-	.write			= native_io_apic_write,
-	.modify			= native_io_apic_modify,
 	.disable		= native_disable_io_apic,
-	.print_entries		= native_io_apic_print_entries,
-	.set_affinity		= native_ioapic_set_affinity,
-	.setup_entry		= native_setup_ioapic_entry,
-	.eoi_ioapic_pin		= native_eoi_ioapic_pin,
 };
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
deleted file mode 100644
index 87a815b85f3e5c..00000000000000
--- a/arch/x86/kernel/xsave.c
+++ /dev/null
@@ -1,724 +0,0 @@
-/*
- * xsave/xrstor support.
- *
- * Author: Suresh Siddha <suresh.b.siddha@intel.com>
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/bootmem.h>
-#include <linux/compat.h>
-#include <linux/cpu.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
-#include <asm/sigframe.h>
-#include <asm/tlbflush.h>
-#include <asm/xcr.h>
-
-/*
- * Supported feature mask by the CPU and the kernel.
- */
-u64 pcntxt_mask;
-
-/*
- * Represents init state for the supported extended state.
- */
-struct xsave_struct *init_xstate_buf;
-
-static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32;
-static unsigned int *xstate_offsets, *xstate_sizes;
-static unsigned int xstate_comp_offsets[sizeof(pcntxt_mask)*8];
-static unsigned int xstate_features;
-
-/*
- * If a processor implementation discern that a processor state component is
- * in its initialized state it may modify the corresponding bit in the
- * xsave_hdr.xstate_bv as '0', with out modifying the corresponding memory
- * layout in the case of xsaveopt. While presenting the xstate information to
- * the user, we always ensure that the memory layout of a feature will be in
- * the init state if the corresponding header bit is zero. This is to ensure
- * that the user doesn't see some stale state in the memory layout during
- * signal handling, debugging etc.
- */
-void __sanitize_i387_state(struct task_struct *tsk)
-{
-	struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
-	int feature_bit = 0x2;
-	u64 xstate_bv;
-
-	if (!fx)
-		return;
-
-	xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
-
-	/*
-	 * None of the feature bits are in init state. So nothing else
-	 * to do for us, as the memory layout is up to date.
-	 */
-	if ((xstate_bv & pcntxt_mask) == pcntxt_mask)
-		return;
-
-	/*
-	 * FP is in init state
-	 */
-	if (!(xstate_bv & XSTATE_FP)) {
-		fx->cwd = 0x37f;
-		fx->swd = 0;
-		fx->twd = 0;
-		fx->fop = 0;
-		fx->rip = 0;
-		fx->rdp = 0;
-		memset(&fx->st_space[0], 0, 128);
-	}
-
-	/*
-	 * SSE is in init state
-	 */
-	if (!(xstate_bv & XSTATE_SSE))
-		memset(&fx->xmm_space[0], 0, 256);
-
-	xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2;
-
-	/*
-	 * Update all the other memory layouts for which the corresponding
-	 * header bit is in the init state.
-	 */
-	while (xstate_bv) {
-		if (xstate_bv & 0x1) {
-			int offset = xstate_offsets[feature_bit];
-			int size = xstate_sizes[feature_bit];
-
-			memcpy(((void *) fx) + offset,
-			       ((void *) init_xstate_buf) + offset,
-			       size);
-		}
-
-		xstate_bv >>= 1;
-		feature_bit++;
-	}
-}
-
-/*
- * Check for the presence of extended state information in the
- * user fpstate pointer in the sigcontext.
- */
-static inline int check_for_xstate(struct i387_fxsave_struct __user *buf,
-				   void __user *fpstate,
-				   struct _fpx_sw_bytes *fx_sw)
-{
-	int min_xstate_size = sizeof(struct i387_fxsave_struct) +
-			      sizeof(struct xsave_hdr_struct);
-	unsigned int magic2;
-
-	if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw)))
-		return -1;
-
-	/* Check for the first magic field and other error scenarios. */
-	if (fx_sw->magic1 != FP_XSTATE_MAGIC1 ||
-	    fx_sw->xstate_size < min_xstate_size ||
-	    fx_sw->xstate_size > xstate_size ||
-	    fx_sw->xstate_size > fx_sw->extended_size)
-		return -1;
-
-	/*
-	 * Check for the presence of second magic word at the end of memory
-	 * layout. This detects the case where the user just copied the legacy
-	 * fpstate layout with out copying the extended state information
-	 * in the memory layout.
-	 */
-	if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))
-	    || magic2 != FP_XSTATE_MAGIC2)
-		return -1;
-
-	return 0;
-}
-
-/*
- * Signal frame handlers.
- */
-static inline int save_fsave_header(struct task_struct *tsk, void __user *buf)
-{
-	if (use_fxsr()) {
-		struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave;
-		struct user_i387_ia32_struct env;
-		struct _fpstate_ia32 __user *fp = buf;
-
-		convert_from_fxsr(&env, tsk);
-
-		if (__copy_to_user(buf, &env, sizeof(env)) ||
-		    __put_user(xsave->i387.swd, &fp->status) ||
-		    __put_user(X86_FXSR_MAGIC, &fp->magic))
-			return -1;
-	} else {
-		struct i387_fsave_struct __user *fp = buf;
-		u32 swd;
-		if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status))
-			return -1;
-	}
-
-	return 0;
-}
-
-static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
-{
-	struct xsave_struct __user *x = buf;
-	struct _fpx_sw_bytes *sw_bytes;
-	u32 xstate_bv;
-	int err;
-
-	/* Setup the bytes not touched by the [f]xsave and reserved for SW. */
-	sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved;
-	err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes));
-
-	if (!use_xsave())
-		return err;
-
-	err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size));
-
-	/*
-	 * Read the xstate_bv which we copied (directly from the cpu or
-	 * from the state in task struct) to the user buffers.
-	 */
-	err |= __get_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv);
-
-	/*
-	 * For legacy compatible, we always set FP/SSE bits in the bit
-	 * vector while saving the state to the user context. This will
-	 * enable us capturing any changes(during sigreturn) to
-	 * the FP/SSE bits by the legacy applications which don't touch
-	 * xstate_bv in the xsave header.
-	 *
-	 * xsave aware apps can change the xstate_bv in the xsave
-	 * header as well as change any contents in the memory layout.
-	 * xrestore as part of sigreturn will capture all the changes.
-	 */
-	xstate_bv |= XSTATE_FPSSE;
-
-	err |= __put_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv);
-
-	return err;
-}
-
-static inline int save_user_xstate(struct xsave_struct __user *buf)
-{
-	int err;
-
-	if (use_xsave())
-		err = xsave_user(buf);
-	else if (use_fxsr())
-		err = fxsave_user((struct i387_fxsave_struct __user *) buf);
-	else
-		err = fsave_user((struct i387_fsave_struct __user *) buf);
-
-	if (unlikely(err) && __clear_user(buf, xstate_size))
-		err = -EFAULT;
-	return err;
-}
-
-/*
- * Save the fpu, extended register state to the user signal frame.
- *
- * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save
- *  state is copied.
- *  'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'.
- *
- *	buf == buf_fx for 64-bit frames and 32-bit fsave frame.
- *	buf != buf_fx for 32-bit frames with fxstate.
- *
- * If the fpu, extended register state is live, save the state directly
- * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise,
- * copy the thread's fpu state to the user frame starting at 'buf_fx'.
- *
- * If this is a 32-bit frame with fxstate, put a fsave header before
- * the aligned state at 'buf_fx'.
- *
- * For [f]xsave state, update the SW reserved fields in the [f]xsave frame
- * indicating the absence/presence of the extended state to the user.
- */
-int save_xstate_sig(void __user *buf, void __user *buf_fx, int size)
-{
-	struct xsave_struct *xsave = &current->thread.fpu.state->xsave;
-	struct task_struct *tsk = current;
-	int ia32_fxstate = (buf != buf_fx);
-
-	ia32_fxstate &= (config_enabled(CONFIG_X86_32) ||
-			 config_enabled(CONFIG_IA32_EMULATION));
-
-	if (!access_ok(VERIFY_WRITE, buf, size))
-		return -EACCES;
-
-	if (!static_cpu_has(X86_FEATURE_FPU))
-		return fpregs_soft_get(current, NULL, 0,
-			sizeof(struct user_i387_ia32_struct), NULL,
-			(struct _fpstate_ia32 __user *) buf) ? -1 : 1;
-
-	if (user_has_fpu()) {
-		/* Save the live register state to the user directly. */
-		if (save_user_xstate(buf_fx))
-			return -1;
-		/* Update the thread's fxstate to save the fsave header. */
-		if (ia32_fxstate)
-			fpu_fxsave(&tsk->thread.fpu);
-	} else {
-		sanitize_i387_state(tsk);
-		if (__copy_to_user(buf_fx, xsave, xstate_size))
-			return -1;
-	}
-
-	/* Save the fsave header for the 32-bit frames. */
-	if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf))
-		return -1;
-
-	if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate))
-		return -1;
-
-	return 0;
-}
-
-static inline void
-sanitize_restored_xstate(struct task_struct *tsk,
-			 struct user_i387_ia32_struct *ia32_env,
-			 u64 xstate_bv, int fx_only)
-{
-	struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave;
-	struct xsave_hdr_struct *xsave_hdr = &xsave->xsave_hdr;
-
-	if (use_xsave()) {
-		/* These bits must be zero. */
-		memset(xsave_hdr->reserved, 0, 48);
-
-		/*
-		 * Init the state that is not present in the memory
-		 * layout and not enabled by the OS.
-		 */
-		if (fx_only)
-			xsave_hdr->xstate_bv = XSTATE_FPSSE;
-		else
-			xsave_hdr->xstate_bv &= (pcntxt_mask & xstate_bv);
-	}
-
-	if (use_fxsr()) {
-		/*
-		 * mscsr reserved bits must be masked to zero for security
-		 * reasons.
-		 */
-		xsave->i387.mxcsr &= mxcsr_feature_mask;
-
-		convert_to_fxsr(tsk, ia32_env);
-	}
-}
-
-/*
- * Restore the extended state if present. Otherwise, restore the FP/SSE state.
- */
-static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only)
-{
-	if (use_xsave()) {
-		if ((unsigned long)buf % 64 || fx_only) {
-			u64 init_bv = pcntxt_mask & ~XSTATE_FPSSE;
-			xrstor_state(init_xstate_buf, init_bv);
-			return fxrstor_user(buf);
-		} else {
-			u64 init_bv = pcntxt_mask & ~xbv;
-			if (unlikely(init_bv))
-				xrstor_state(init_xstate_buf, init_bv);
-			return xrestore_user(buf, xbv);
-		}
-	} else if (use_fxsr()) {
-		return fxrstor_user(buf);
-	} else
-		return frstor_user(buf);
-}
-
-int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
-{
-	int ia32_fxstate = (buf != buf_fx);
-	struct task_struct *tsk = current;
-	int state_size = xstate_size;
-	u64 xstate_bv = 0;
-	int fx_only = 0;
-
-	ia32_fxstate &= (config_enabled(CONFIG_X86_32) ||
-			 config_enabled(CONFIG_IA32_EMULATION));
-
-	if (!buf) {
-		fpu_reset_state(tsk);
-		return 0;
-	}
-
-	if (!access_ok(VERIFY_READ, buf, size))
-		return -EACCES;
-
-	if (!used_math() && init_fpu(tsk))
-		return -1;
-
-	if (!static_cpu_has(X86_FEATURE_FPU))
-		return fpregs_soft_set(current, NULL,
-				       0, sizeof(struct user_i387_ia32_struct),
-				       NULL, buf) != 0;
-
-	if (use_xsave()) {
-		struct _fpx_sw_bytes fx_sw_user;
-		if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) {
-			/*
-			 * Couldn't find the extended state information in the
-			 * memory layout. Restore just the FP/SSE and init all
-			 * the other extended state.
-			 */
-			state_size = sizeof(struct i387_fxsave_struct);
-			fx_only = 1;
-		} else {
-			state_size = fx_sw_user.xstate_size;
-			xstate_bv = fx_sw_user.xstate_bv;
-		}
-	}
-
-	if (ia32_fxstate) {
-		/*
-		 * For 32-bit frames with fxstate, copy the user state to the
-		 * thread's fpu state, reconstruct fxstate from the fsave
-		 * header. Sanitize the copied state etc.
-		 */
-		struct fpu *fpu = &tsk->thread.fpu;
-		struct user_i387_ia32_struct env;
-		int err = 0;
-
-		/*
-		 * Drop the current fpu which clears used_math(). This ensures
-		 * that any context-switch during the copy of the new state,
-		 * avoids the intermediate state from getting restored/saved.
-		 * Thus avoiding the new restored state from getting corrupted.
-		 * We will be ready to restore/save the state only after
-		 * set_used_math() is again set.
-		 */
-		drop_fpu(tsk);
-
-		if (__copy_from_user(&fpu->state->xsave, buf_fx, state_size) ||
-		    __copy_from_user(&env, buf, sizeof(env))) {
-			fpu_finit(fpu);
-			err = -1;
-		} else {
-			sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only);
-		}
-
-		set_used_math();
-		if (use_eager_fpu()) {
-			preempt_disable();
-			math_state_restore();
-			preempt_enable();
-		}
-
-		return err;
-	} else {
-		/*
-		 * For 64-bit frames and 32-bit fsave frames, restore the user
-		 * state to the registers directly (with exceptions handled).
-		 */
-		user_fpu_begin();
-		if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) {
-			fpu_reset_state(tsk);
-			return -1;
-		}
-	}
-
-	return 0;
-}
-
-/*
- * Prepare the SW reserved portion of the fxsave memory layout, indicating
- * the presence of the extended state information in the memory layout
- * pointed by the fpstate pointer in the sigcontext.
- * This will be saved when ever the FP and extended state context is
- * saved on the user stack during the signal handler delivery to the user.
- */
-static void prepare_fx_sw_frame(void)
-{
-	int fsave_header_size = sizeof(struct i387_fsave_struct);
-	int size = xstate_size + FP_XSTATE_MAGIC2_SIZE;
-
-	if (config_enabled(CONFIG_X86_32))
-		size += fsave_header_size;
-
-	fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
-	fx_sw_reserved.extended_size = size;
-	fx_sw_reserved.xstate_bv = pcntxt_mask;
-	fx_sw_reserved.xstate_size = xstate_size;
-
-	if (config_enabled(CONFIG_IA32_EMULATION)) {
-		fx_sw_reserved_ia32 = fx_sw_reserved;
-		fx_sw_reserved_ia32.extended_size += fsave_header_size;
-	}
-}
-
-/*
- * Enable the extended processor state save/restore feature
- */
-static inline void xstate_enable(void)
-{
-	cr4_set_bits(X86_CR4_OSXSAVE);
-	xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
-}
-
-/*
- * Record the offsets and sizes of different state managed by the xsave
- * memory layout.
- */
-static void __init setup_xstate_features(void)
-{
-	int eax, ebx, ecx, edx, leaf = 0x2;
-
-	xstate_features = fls64(pcntxt_mask);
-	xstate_offsets = alloc_bootmem(xstate_features * sizeof(int));
-	xstate_sizes = alloc_bootmem(xstate_features * sizeof(int));
-
-	do {
-		cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx);
-
-		if (eax == 0)
-			break;
-
-		xstate_offsets[leaf] = ebx;
-		xstate_sizes[leaf] = eax;
-
-		leaf++;
-	} while (1);
-}
-
-/*
- * This function sets up offsets and sizes of all extended states in
- * xsave area. This supports both standard format and compacted format
- * of the xsave aread.
- *
- * Input: void
- * Output: void
- */
-void setup_xstate_comp(void)
-{
-	unsigned int xstate_comp_sizes[sizeof(pcntxt_mask)*8];
-	int i;
-
-	/*
-	 * The FP xstates and SSE xstates are legacy states. They are always
-	 * in the fixed offsets in the xsave area in either compacted form
-	 * or standard form.
-	 */
-	xstate_comp_offsets[0] = 0;
-	xstate_comp_offsets[1] = offsetof(struct i387_fxsave_struct, xmm_space);
-
-	if (!cpu_has_xsaves) {
-		for (i = 2; i < xstate_features; i++) {
-			if (test_bit(i, (unsigned long *)&pcntxt_mask)) {
-				xstate_comp_offsets[i] = xstate_offsets[i];
-				xstate_comp_sizes[i] = xstate_sizes[i];
-			}
-		}
-		return;
-	}
-
-	xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE;
-
-	for (i = 2; i < xstate_features; i++) {
-		if (test_bit(i, (unsigned long *)&pcntxt_mask))
-			xstate_comp_sizes[i] = xstate_sizes[i];
-		else
-			xstate_comp_sizes[i] = 0;
-
-		if (i > 2)
-			xstate_comp_offsets[i] = xstate_comp_offsets[i-1]
-					+ xstate_comp_sizes[i-1];
-
-	}
-}
-
-/*
- * setup the xstate image representing the init state
- */
-static void __init setup_init_fpu_buf(void)
-{
-	/*
-	 * Setup init_xstate_buf to represent the init state of
-	 * all the features managed by the xsave
-	 */
-	init_xstate_buf = alloc_bootmem_align(xstate_size,
-					      __alignof__(struct xsave_struct));
-	fx_finit(&init_xstate_buf->i387);
-
-	if (!cpu_has_xsave)
-		return;
-
-	setup_xstate_features();
-
-	if (cpu_has_xsaves) {
-		init_xstate_buf->xsave_hdr.xcomp_bv =
-						(u64)1 << 63 | pcntxt_mask;
-		init_xstate_buf->xsave_hdr.xstate_bv = pcntxt_mask;
-	}
-
-	/*
-	 * Init all the features state with header_bv being 0x0
-	 */
-	xrstor_state_booting(init_xstate_buf, -1);
-	/*
-	 * Dump the init state again. This is to identify the init state
-	 * of any feature which is not represented by all zero's.
-	 */
-	xsave_state_booting(init_xstate_buf, -1);
-}
-
-static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
-static int __init eager_fpu_setup(char *s)
-{
-	if (!strcmp(s, "on"))
-		eagerfpu = ENABLE;
-	else if (!strcmp(s, "off"))
-		eagerfpu = DISABLE;
-	else if (!strcmp(s, "auto"))
-		eagerfpu = AUTO;
-	return 1;
-}
-__setup("eagerfpu=", eager_fpu_setup);
-
-
-/*
- * Calculate total size of enabled xstates in XCR0/pcntxt_mask.
- */
-static void __init init_xstate_size(void)
-{
-	unsigned int eax, ebx, ecx, edx;
-	int i;
-
-	if (!cpu_has_xsaves) {
-		cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
-		xstate_size = ebx;
-		return;
-	}
-
-	xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
-	for (i = 2; i < 64; i++) {
-		if (test_bit(i, (unsigned long *)&pcntxt_mask)) {
-			cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
-			xstate_size += eax;
-		}
-	}
-}
-
-/*
- * Enable and initialize the xsave feature.
- */
-static void __init xstate_enable_boot_cpu(void)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
-		WARN(1, KERN_ERR "XSTATE_CPUID missing\n");
-		return;
-	}
-
-	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
-	pcntxt_mask = eax + ((u64)edx << 32);
-
-	if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
-		pr_err("FP/SSE not shown under xsave features 0x%llx\n",
-		       pcntxt_mask);
-		BUG();
-	}
-
-	/*
-	 * Support only the state known to OS.
-	 */
-	pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
-
-	xstate_enable();
-
-	/*
-	 * Recompute the context size for enabled features
-	 */
-	init_xstate_size();
-
-	update_regset_xstate_info(xstate_size, pcntxt_mask);
-	prepare_fx_sw_frame();
-	setup_init_fpu_buf();
-
-	/* Auto enable eagerfpu for xsaveopt */
-	if (cpu_has_xsaveopt && eagerfpu != DISABLE)
-		eagerfpu = ENABLE;
-
-	if (pcntxt_mask & XSTATE_EAGER) {
-		if (eagerfpu == DISABLE) {
-			pr_err("eagerfpu not present, disabling some xstate features: 0x%llx\n",
-					pcntxt_mask & XSTATE_EAGER);
-			pcntxt_mask &= ~XSTATE_EAGER;
-		} else {
-			eagerfpu = ENABLE;
-		}
-	}
-
-	pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x using %s\n",
-		pcntxt_mask, xstate_size,
-		cpu_has_xsaves ? "compacted form" : "standard form");
-}
-
-/*
- * For the very first instance, this calls xstate_enable_boot_cpu();
- * for all subsequent instances, this calls xstate_enable().
- *
- * This is somewhat obfuscated due to the lack of powerful enough
- * overrides for the section checks.
- */
-void xsave_init(void)
-{
-	static __refdata void (*next_func)(void) = xstate_enable_boot_cpu;
-	void (*this_func)(void);
-
-	if (!cpu_has_xsave)
-		return;
-
-	this_func = next_func;
-	next_func = xstate_enable;
-	this_func();
-}
-
-/*
- * setup_init_fpu_buf() is __init and it is OK to call it here because
- * init_xstate_buf will be unset only once during boot.
- */
-void __init_refok eager_fpu_init(void)
-{
-	WARN_ON(used_math());
-	current_thread_info()->status = 0;
-
-	if (eagerfpu == ENABLE)
-		setup_force_cpu_cap(X86_FEATURE_EAGER_FPU);
-
-	if (!cpu_has_eager_fpu) {
-		stts();
-		return;
-	}
-
-	if (!init_xstate_buf)
-		setup_init_fpu_buf();
-}
-
-/*
- * Given the xsave area and a state inside, this function returns the
- * address of the state.
- *
- * This is the API that is called to get xstate address in either
- * standard format or compacted format of xsave area.
- *
- * Inputs:
- *	xsave: base address of the xsave area;
- *	xstate: state which is defined in xsave.h (e.g. XSTATE_FP, XSTATE_SSE,
- *	etc.)
- * Output:
- *	address of the state in the xsave area.
- */
-void *get_xsave_addr(struct xsave_struct *xsave, int xstate)
-{
-	int feature = fls64(xstate) - 1;
-	if (!test_bit(feature, (unsigned long *)&pcntxt_mask))
-		return NULL;
-
-	return (void *)xsave + xstate_comp_offsets[feature];
-}
-EXPORT_SYMBOL_GPL(get_xsave_addr);
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 1d08ad3582d07f..9f705e618af574 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -16,10 +16,8 @@
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
-#include <asm/i387.h> /* For use_eager_fpu.  Ugh! */
-#include <asm/fpu-internal.h> /* For use_eager_fpu.  Ugh! */
 #include <asm/user.h>
-#include <asm/xsave.h>
+#include <asm/fpu/xstate.h>
 #include "cpuid.h"
 #include "lapic.h"
 #include "mmu.h"
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2d73807f0d317f..e11dd59398f157 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -40,8 +40,7 @@
 #include <asm/vmx.h>
 #include <asm/virtext.h>
 #include <asm/mce.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
+#include <asm/fpu/internal.h>
 #include <asm/perf_event.h>
 #include <asm/debugreg.h>
 #include <asm/kexec.h>
@@ -1883,7 +1882,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 	 * If the FPU is not active (through the host task or
 	 * the guest vcpu), then restore the cr0.TS bit.
 	 */
-	if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded)
+	if (!fpregs_active() && !vmx->vcpu.guest_fpu_loaded)
 		stts();
 	load_gdt(this_cpu_ptr(&host_gdt));
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ea306adbbc1360..26eaeb522cab21 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -59,9 +59,8 @@
 #include <asm/desc.h>
 #include <asm/mtrr.h>
 #include <asm/mce.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h> /* Ugh! */
-#include <asm/xcr.h>
+#include <linux/kernel_stat.h>
+#include <asm/fpu/internal.h> /* Ugh! */
 #include <asm/pvclock.h>
 #include <asm/div64.h>
 
@@ -3194,8 +3193,8 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
 
 static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
 {
-	struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave;
-	u64 xstate_bv = xsave->xsave_hdr.xstate_bv;
+	struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
+	u64 xstate_bv = xsave->header.xfeatures;
 	u64 valid;
 
 	/*
@@ -3230,7 +3229,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
 
 static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
 {
-	struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave;
+	struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
 	u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
 	u64 valid;
 
@@ -3241,9 +3240,9 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
 	memcpy(xsave, src, XSAVE_HDR_OFFSET);
 
 	/* Set XSTATE_BV and possibly XCOMP_BV.  */
-	xsave->xsave_hdr.xstate_bv = xstate_bv;
+	xsave->header.xfeatures = xstate_bv;
 	if (cpu_has_xsaves)
-		xsave->xsave_hdr.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
+		xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
 
 	/*
 	 * Copy each region from the non-compacted offset to the
@@ -3275,8 +3274,8 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
 		fill_xsave((u8 *) guest_xsave->region, vcpu);
 	} else {
 		memcpy(guest_xsave->region,
-			&vcpu->arch.guest_fpu.state->fxsave,
-			sizeof(struct i387_fxsave_struct));
+			&vcpu->arch.guest_fpu.state.fxsave,
+			sizeof(struct fxregs_state));
 		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
 			XSTATE_FPSSE;
 	}
@@ -3300,8 +3299,8 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
 	} else {
 		if (xstate_bv & ~XSTATE_FPSSE)
 			return -EINVAL;
-		memcpy(&vcpu->arch.guest_fpu.state->fxsave,
-			guest_xsave->region, sizeof(struct i387_fxsave_struct));
+		memcpy(&vcpu->arch.guest_fpu.state.fxsave,
+			guest_xsave->region, sizeof(struct fxregs_state));
 	}
 	return 0;
 }
@@ -6597,11 +6596,11 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
+	struct fpu *fpu = &current->thread.fpu;
 	int r;
 	sigset_t sigsaved;
 
-	if (!tsk_used_math(current) && init_fpu(current))
-		return -ENOMEM;
+	fpu__activate_curr(fpu);
 
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
@@ -6971,8 +6970,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-	struct i387_fxsave_struct *fxsave =
-			&vcpu->arch.guest_fpu.state->fxsave;
+	struct fxregs_state *fxsave =
+			&vcpu->arch.guest_fpu.state.fxsave;
 
 	memcpy(fpu->fpr, fxsave->st_space, 128);
 	fpu->fcw = fxsave->cwd;
@@ -6988,8 +6987,8 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 
 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-	struct i387_fxsave_struct *fxsave =
-			&vcpu->arch.guest_fpu.state->fxsave;
+	struct fxregs_state *fxsave =
+			&vcpu->arch.guest_fpu.state.fxsave;
 
 	memcpy(fxsave->st_space, fpu->fpr, 128);
 	fxsave->cwd = fpu->fcw;
@@ -7003,17 +7002,11 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 	return 0;
 }
 
-int fx_init(struct kvm_vcpu *vcpu)
+static void fx_init(struct kvm_vcpu *vcpu)
 {
-	int err;
-
-	err = fpu_alloc(&vcpu->arch.guest_fpu);
-	if (err)
-		return err;
-
-	fpu_finit(&vcpu->arch.guest_fpu);
+	fpstate_init(&vcpu->arch.guest_fpu.state);
 	if (cpu_has_xsaves)
-		vcpu->arch.guest_fpu.state->xsave.xsave_hdr.xcomp_bv =
+		vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv =
 			host_xcr0 | XSTATE_COMPACTION_ENABLED;
 
 	/*
@@ -7022,14 +7015,6 @@ int fx_init(struct kvm_vcpu *vcpu)
 	vcpu->arch.xcr0 = XSTATE_FP;
 
 	vcpu->arch.cr0 |= X86_CR0_ET;
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(fx_init);
-
-static void fx_free(struct kvm_vcpu *vcpu)
-{
-	fpu_free(&vcpu->arch.guest_fpu);
 }
 
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
@@ -7045,7 +7030,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 	kvm_put_guest_xcr0(vcpu);
 	vcpu->guest_fpu_loaded = 1;
 	__kernel_fpu_begin();
-	fpu_restore_checking(&vcpu->arch.guest_fpu);
+	__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state);
 	trace_kvm_fpu(1);
 }
 
@@ -7057,7 +7042,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 		return;
 
 	vcpu->guest_fpu_loaded = 0;
-	fpu_save_init(&vcpu->arch.guest_fpu);
+	copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
 	__kernel_fpu_end();
 	++vcpu->stat.fpu_reload;
 	if (!vcpu->arch.eager_fpu)
@@ -7071,7 +7056,6 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 	kvmclock_reset(vcpu);
 
 	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
-	fx_free(vcpu);
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
@@ -7137,7 +7121,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_mmu_unload(vcpu);
 	vcpu_put(vcpu);
 
-	fx_free(vcpu);
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
@@ -7363,9 +7346,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 		goto fail_free_mce_banks;
 	}
 
-	r = fx_init(vcpu);
-	if (r)
-		goto fail_free_wbinvd_dirty_mask;
+	fx_init(vcpu);
 
 	vcpu->arch.ia32_tsc_adjust_msr = 0x0;
 	vcpu->arch.pv_time_enabled = false;
@@ -7379,8 +7360,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	kvm_pmu_init(vcpu);
 
 	return 0;
-fail_free_wbinvd_dirty_mask:
-	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
+
 fail_free_mce_banks:
 	kfree(vcpu->arch.mce_banks);
 fail_free_lapic:
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 8f9a133cc09934..f2dc08c003eb0b 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -70,7 +70,7 @@
 #include <asm/e820.h>
 #include <asm/mce.h>
 #include <asm/io.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 #include <asm/stackprotector.h>
 #include <asm/reboot.h>		/* for struct machine_ops */
 #include <asm/kvm_para.h>
@@ -90,7 +90,7 @@ struct lguest_data lguest_data = {
 	.noirq_iret = (u32)lguest_noirq_iret,
 	.kernel_address = PAGE_OFFSET,
 	.blocked_interrupts = { 1 }, /* Block timer interrupts */
-	.syscall_vec = SYSCALL_VECTOR,
+	.syscall_vec = IA32_SYSCALL_VECTOR,
 };
 
 /*G:037
@@ -866,7 +866,7 @@ static void __init lguest_init_IRQ(void)
 	for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) {
 		/* Some systems map "vectors" to interrupts weirdly.  Not us! */
 		__this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR);
-		if (i != SYSCALL_VECTOR)
+		if (i != IA32_SYSCALL_VECTOR)
 			set_intr_gate(i, irq_entries_start +
 					8 * (i - FIRST_EXTERNAL_VECTOR));
 	}
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 1530afb07c8544..982989d282ff57 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -40,6 +40,6 @@ else
         lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
         lib-y += clear_page_64.o copy_page_64.o
         lib-y += memmove_64.o memset_64.o
-        lib-y += copy_user_64.o copy_user_nocache_64.o
+        lib-y += copy_user_64.o
 	lib-y += cmpxchg16b_emu.o
 endif
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index 00933d5e992f7a..9b0ca8fe80fce9 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -11,26 +11,23 @@
 
 #include <linux/linkage.h>
 #include <asm/alternative-asm.h>
-#include <asm/dwarf2.h>
 
 /* if you want SMP support, implement these with real spinlocks */
 .macro LOCK reg
-	pushfl_cfi
+	pushfl
 	cli
 .endm
 
 .macro UNLOCK reg
-	popfl_cfi
+	popfl
 .endm
 
 #define BEGIN(op) \
 .macro endp; \
-	CFI_ENDPROC; \
 ENDPROC(atomic64_##op##_386); \
 .purgem endp; \
 .endm; \
 ENTRY(atomic64_##op##_386); \
-	CFI_STARTPROC; \
 	LOCK v;
 
 #define ENDP endp
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index 082a85167a5b68..db3ae85440ff79 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -11,7 +11,6 @@
 
 #include <linux/linkage.h>
 #include <asm/alternative-asm.h>
-#include <asm/dwarf2.h>
 
 .macro read64 reg
 	movl %ebx, %eax
@@ -22,16 +21,11 @@
 .endm
 
 ENTRY(atomic64_read_cx8)
-	CFI_STARTPROC
-
 	read64 %ecx
 	ret
-	CFI_ENDPROC
 ENDPROC(atomic64_read_cx8)
 
 ENTRY(atomic64_set_cx8)
-	CFI_STARTPROC
-
 1:
 /* we don't need LOCK_PREFIX since aligned 64-bit writes
  * are atomic on 586 and newer */
@@ -39,28 +33,23 @@ ENTRY(atomic64_set_cx8)
 	jne 1b
 
 	ret
-	CFI_ENDPROC
 ENDPROC(atomic64_set_cx8)
 
 ENTRY(atomic64_xchg_cx8)
-	CFI_STARTPROC
-
 1:
 	LOCK_PREFIX
 	cmpxchg8b (%esi)
 	jne 1b
 
 	ret
-	CFI_ENDPROC
 ENDPROC(atomic64_xchg_cx8)
 
 .macro addsub_return func ins insc
 ENTRY(atomic64_\func\()_return_cx8)
-	CFI_STARTPROC
-	pushl_cfi_reg ebp
-	pushl_cfi_reg ebx
-	pushl_cfi_reg esi
-	pushl_cfi_reg edi
+	pushl %ebp
+	pushl %ebx
+	pushl %esi
+	pushl %edi
 
 	movl %eax, %esi
 	movl %edx, %edi
@@ -79,12 +68,11 @@ ENTRY(atomic64_\func\()_return_cx8)
 10:
 	movl %ebx, %eax
 	movl %ecx, %edx
-	popl_cfi_reg edi
-	popl_cfi_reg esi
-	popl_cfi_reg ebx
-	popl_cfi_reg ebp
+	popl %edi
+	popl %esi
+	popl %ebx
+	popl %ebp
 	ret
-	CFI_ENDPROC
 ENDPROC(atomic64_\func\()_return_cx8)
 .endm
 
@@ -93,8 +81,7 @@ addsub_return sub sub sbb
 
 .macro incdec_return func ins insc
 ENTRY(atomic64_\func\()_return_cx8)
-	CFI_STARTPROC
-	pushl_cfi_reg ebx
+	pushl %ebx
 
 	read64 %esi
 1:
@@ -109,9 +96,8 @@ ENTRY(atomic64_\func\()_return_cx8)
 10:
 	movl %ebx, %eax
 	movl %ecx, %edx
-	popl_cfi_reg ebx
+	popl %ebx
 	ret
-	CFI_ENDPROC
 ENDPROC(atomic64_\func\()_return_cx8)
 .endm
 
@@ -119,8 +105,7 @@ incdec_return inc add adc
 incdec_return dec sub sbb
 
 ENTRY(atomic64_dec_if_positive_cx8)
-	CFI_STARTPROC
-	pushl_cfi_reg ebx
+	pushl %ebx
 
 	read64 %esi
 1:
@@ -136,18 +121,16 @@ ENTRY(atomic64_dec_if_positive_cx8)
 2:
 	movl %ebx, %eax
 	movl %ecx, %edx
-	popl_cfi_reg ebx
+	popl %ebx
 	ret
-	CFI_ENDPROC
 ENDPROC(atomic64_dec_if_positive_cx8)
 
 ENTRY(atomic64_add_unless_cx8)
-	CFI_STARTPROC
-	pushl_cfi_reg ebp
-	pushl_cfi_reg ebx
+	pushl %ebp
+	pushl %ebx
 /* these just push these two parameters on the stack */
-	pushl_cfi_reg edi
-	pushl_cfi_reg ecx
+	pushl %edi
+	pushl %ecx
 
 	movl %eax, %ebp
 	movl %edx, %edi
@@ -168,21 +151,18 @@ ENTRY(atomic64_add_unless_cx8)
 	movl $1, %eax
 3:
 	addl $8, %esp
-	CFI_ADJUST_CFA_OFFSET -8
-	popl_cfi_reg ebx
-	popl_cfi_reg ebp
+	popl %ebx
+	popl %ebp
 	ret
 4:
 	cmpl %edx, 4(%esp)
 	jne 2b
 	xorl %eax, %eax
 	jmp 3b
-	CFI_ENDPROC
 ENDPROC(atomic64_add_unless_cx8)
 
 ENTRY(atomic64_inc_not_zero_cx8)
-	CFI_STARTPROC
-	pushl_cfi_reg ebx
+	pushl %ebx
 
 	read64 %esi
 1:
@@ -199,7 +179,6 @@ ENTRY(atomic64_inc_not_zero_cx8)
 
 	movl $1, %eax
 3:
-	popl_cfi_reg ebx
+	popl %ebx
 	ret
-	CFI_ENDPROC
 ENDPROC(atomic64_inc_not_zero_cx8)
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index 9bc944a9127481..c1e6232098531f 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -26,7 +26,6 @@
  */
 
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 #include <asm/errno.h>
 #include <asm/asm.h>
 				
@@ -50,9 +49,8 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
 	   * alignment for the unrolled loop.
 	   */		
 ENTRY(csum_partial)
-	CFI_STARTPROC
-	pushl_cfi_reg esi
-	pushl_cfi_reg ebx
+	pushl %esi
+	pushl %ebx
 	movl 20(%esp),%eax	# Function arg: unsigned int sum
 	movl 16(%esp),%ecx	# Function arg: int len
 	movl 12(%esp),%esi	# Function arg: unsigned char *buff
@@ -129,10 +127,9 @@ ENTRY(csum_partial)
 	jz 8f
 	roll $8, %eax
 8:
-	popl_cfi_reg ebx
-	popl_cfi_reg esi
+	popl %ebx
+	popl %esi
 	ret
-	CFI_ENDPROC
 ENDPROC(csum_partial)
 
 #else
@@ -140,9 +137,8 @@ ENDPROC(csum_partial)
 /* Version for PentiumII/PPro */
 
 ENTRY(csum_partial)
-	CFI_STARTPROC
-	pushl_cfi_reg esi
-	pushl_cfi_reg ebx
+	pushl %esi
+	pushl %ebx
 	movl 20(%esp),%eax	# Function arg: unsigned int sum
 	movl 16(%esp),%ecx	# Function arg: int len
 	movl 12(%esp),%esi	# Function arg:	const unsigned char *buf
@@ -249,10 +245,9 @@ ENTRY(csum_partial)
 	jz 90f
 	roll $8, %eax
 90: 
-	popl_cfi_reg ebx
-	popl_cfi_reg esi
+	popl %ebx
+	popl %esi
 	ret
-	CFI_ENDPROC
 ENDPROC(csum_partial)
 				
 #endif
@@ -287,12 +282,10 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst,
 #define FP		12
 		
 ENTRY(csum_partial_copy_generic)
-	CFI_STARTPROC
 	subl  $4,%esp	
-	CFI_ADJUST_CFA_OFFSET 4
-	pushl_cfi_reg edi
-	pushl_cfi_reg esi
-	pushl_cfi_reg ebx
+	pushl %edi
+	pushl %esi
+	pushl %ebx
 	movl ARGBASE+16(%esp),%eax	# sum
 	movl ARGBASE+12(%esp),%ecx	# len
 	movl ARGBASE+4(%esp),%esi	# src
@@ -401,12 +394,11 @@ DST(	movb %cl, (%edi)	)
 
 .previous
 
-	popl_cfi_reg ebx
-	popl_cfi_reg esi
-	popl_cfi_reg edi
-	popl_cfi %ecx			# equivalent to addl $4,%esp
+	popl %ebx
+	popl %esi
+	popl %edi
+	popl %ecx			# equivalent to addl $4,%esp
 	ret	
-	CFI_ENDPROC
 ENDPROC(csum_partial_copy_generic)
 
 #else
@@ -426,10 +418,9 @@ ENDPROC(csum_partial_copy_generic)
 #define ARGBASE 12
 		
 ENTRY(csum_partial_copy_generic)
-	CFI_STARTPROC
-	pushl_cfi_reg ebx
-	pushl_cfi_reg edi
-	pushl_cfi_reg esi
+	pushl %ebx
+	pushl %edi
+	pushl %esi
 	movl ARGBASE+4(%esp),%esi	#src
 	movl ARGBASE+8(%esp),%edi	#dst	
 	movl ARGBASE+12(%esp),%ecx	#len
@@ -489,11 +480,10 @@ DST(	movb %dl, (%edi)         )
 	jmp  7b			
 .previous				
 
-	popl_cfi_reg esi
-	popl_cfi_reg edi
-	popl_cfi_reg ebx
+	popl %esi
+	popl %edi
+	popl %ebx
 	ret
-	CFI_ENDPROC
 ENDPROC(csum_partial_copy_generic)
 				
 #undef ROUND
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index e67e579c93bdf7..a2fe51b00ccefc 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,5 +1,4 @@
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 #include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
 
@@ -15,7 +14,6 @@
  * %rdi	- page
  */
 ENTRY(clear_page)
-	CFI_STARTPROC
 
 	ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \
 		      "jmp clear_page_c_e", X86_FEATURE_ERMS
@@ -24,11 +22,9 @@ ENTRY(clear_page)
 	xorl %eax,%eax
 	rep stosq
 	ret
-	CFI_ENDPROC
 ENDPROC(clear_page)
 
 ENTRY(clear_page_orig)
-	CFI_STARTPROC
 
 	xorl   %eax,%eax
 	movl   $4096/64,%ecx
@@ -48,14 +44,11 @@ ENTRY(clear_page_orig)
 	jnz	.Lloop
 	nop
 	ret
-	CFI_ENDPROC
 ENDPROC(clear_page_orig)
 
 ENTRY(clear_page_c_e)
-	CFI_STARTPROC
 	movl $4096,%ecx
 	xorl %eax,%eax
 	rep stosb
 	ret
-	CFI_ENDPROC
 ENDPROC(clear_page_c_e)
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
index 40a172541ee2cb..9b330242e74081 100644
--- a/arch/x86/lib/cmpxchg16b_emu.S
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -6,7 +6,6 @@
  *
  */
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 #include <asm/percpu.h>
 
 .text
@@ -21,7 +20,6 @@
  * %al  : Operation successful
  */
 ENTRY(this_cpu_cmpxchg16b_emu)
-CFI_STARTPROC
 
 #
 # Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not
@@ -32,7 +30,7 @@ CFI_STARTPROC
 # *atomic* on a single cpu (as provided by the this_cpu_xx class of
 # macros).
 #
-	pushfq_cfi
+	pushfq
 	cli
 
 	cmpq PER_CPU_VAR((%rsi)), %rax
@@ -43,17 +41,13 @@ CFI_STARTPROC
 	movq %rbx, PER_CPU_VAR((%rsi))
 	movq %rcx, PER_CPU_VAR(8(%rsi))
 
-	CFI_REMEMBER_STATE
-	popfq_cfi
+	popfq
 	mov $1, %al
 	ret
 
-	CFI_RESTORE_STATE
 .Lnot_same:
-	popfq_cfi
+	popfq
 	xor %al,%al
 	ret
 
-CFI_ENDPROC
-
 ENDPROC(this_cpu_cmpxchg16b_emu)
diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S
index b4807fce517760..ad53497784904b 100644
--- a/arch/x86/lib/cmpxchg8b_emu.S
+++ b/arch/x86/lib/cmpxchg8b_emu.S
@@ -7,7 +7,6 @@
  */
 
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 
 .text
 
@@ -20,14 +19,13 @@
  * %ecx : high 32 bits of new value
  */
 ENTRY(cmpxchg8b_emu)
-CFI_STARTPROC
 
 #
 # Emulate 'cmpxchg8b (%esi)' on UP except we don't
 # set the whole ZF thing (caller will just compare
 # eax:edx with the expected value)
 #
-	pushfl_cfi
+	pushfl
 	cli
 
 	cmpl  (%esi), %eax
@@ -38,18 +36,15 @@ CFI_STARTPROC
 	movl %ebx,  (%esi)
 	movl %ecx, 4(%esi)
 
-	CFI_REMEMBER_STATE
-	popfl_cfi
+	popfl
 	ret
 
-	CFI_RESTORE_STATE
 .Lnot_same:
 	movl  (%esi), %eax
 .Lhalf_same:
 	movl 4(%esi), %edx
 
-	popfl_cfi
+	popfl
 	ret
 
-CFI_ENDPROC
 ENDPROC(cmpxchg8b_emu)
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 8239dbcbf98455..009f98216b7eb3 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -1,7 +1,6 @@
 /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
 
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 #include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
 
@@ -13,22 +12,16 @@
  */
 	ALIGN
 ENTRY(copy_page)
-	CFI_STARTPROC
 	ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
 	movl	$4096/8, %ecx
 	rep	movsq
 	ret
-	CFI_ENDPROC
 ENDPROC(copy_page)
 
 ENTRY(copy_page_regs)
-	CFI_STARTPROC
 	subq	$2*8,	%rsp
-	CFI_ADJUST_CFA_OFFSET 2*8
 	movq	%rbx,	(%rsp)
-	CFI_REL_OFFSET rbx, 0
 	movq	%r12,	1*8(%rsp)
-	CFI_REL_OFFSET r12, 1*8
 
 	movl	$(4096/64)-5,	%ecx
 	.p2align 4
@@ -87,11 +80,7 @@ ENTRY(copy_page_regs)
 	jnz	.Loop2
 
 	movq	(%rsp), %rbx
-	CFI_RESTORE rbx
 	movq	1*8(%rsp), %r12
-	CFI_RESTORE r12
 	addq	$2*8, %rsp
-	CFI_ADJUST_CFA_OFFSET -2*8
 	ret
-	CFI_ENDPROC
 ENDPROC(copy_page_regs)
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index fa997dfaef242f..982ce34f4a9bf6 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -7,7 +7,6 @@
  */
 
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 #include <asm/current.h>
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
@@ -16,33 +15,8 @@
 #include <asm/asm.h>
 #include <asm/smap.h>
 
-	.macro ALIGN_DESTINATION
-	/* check for bad alignment of destination */
-	movl %edi,%ecx
-	andl $7,%ecx
-	jz 102f				/* already aligned */
-	subl $8,%ecx
-	negl %ecx
-	subl %ecx,%edx
-100:	movb (%rsi),%al
-101:	movb %al,(%rdi)
-	incq %rsi
-	incq %rdi
-	decl %ecx
-	jnz 100b
-102:
-	.section .fixup,"ax"
-103:	addl %ecx,%edx			/* ecx is zerorest also */
-	jmp copy_user_handle_tail
-	.previous
-
-	_ASM_EXTABLE(100b,103b)
-	_ASM_EXTABLE(101b,103b)
-	.endm
-
 /* Standard copy_to_user with segment limit checking */
 ENTRY(_copy_to_user)
-	CFI_STARTPROC
 	GET_THREAD_INFO(%rax)
 	movq %rdi,%rcx
 	addq %rdx,%rcx
@@ -54,12 +28,10 @@ ENTRY(_copy_to_user)
 		      X86_FEATURE_REP_GOOD,			\
 		      "jmp copy_user_enhanced_fast_string",	\
 		      X86_FEATURE_ERMS
-	CFI_ENDPROC
 ENDPROC(_copy_to_user)
 
 /* Standard copy_from_user with segment limit checking */
 ENTRY(_copy_from_user)
-	CFI_STARTPROC
 	GET_THREAD_INFO(%rax)
 	movq %rsi,%rcx
 	addq %rdx,%rcx
@@ -71,14 +43,12 @@ ENTRY(_copy_from_user)
 		      X86_FEATURE_REP_GOOD,			\
 		      "jmp copy_user_enhanced_fast_string",	\
 		      X86_FEATURE_ERMS
-	CFI_ENDPROC
 ENDPROC(_copy_from_user)
 
 	.section .fixup,"ax"
 	/* must zero dest */
 ENTRY(bad_from_user)
 bad_from_user:
-	CFI_STARTPROC
 	movl %edx,%ecx
 	xorl %eax,%eax
 	rep
@@ -86,7 +56,6 @@ bad_from_user:
 bad_to_user:
 	movl %edx,%eax
 	ret
-	CFI_ENDPROC
 ENDPROC(bad_from_user)
 	.previous
 
@@ -104,7 +73,6 @@ ENDPROC(bad_from_user)
  * eax uncopied bytes or 0 if successful.
  */
 ENTRY(copy_user_generic_unrolled)
-	CFI_STARTPROC
 	ASM_STAC
 	cmpl $8,%edx
 	jb 20f		/* less then 8 bytes, go to byte copy loop */
@@ -186,7 +154,6 @@ ENTRY(copy_user_generic_unrolled)
 	_ASM_EXTABLE(19b,40b)
 	_ASM_EXTABLE(21b,50b)
 	_ASM_EXTABLE(22b,50b)
-	CFI_ENDPROC
 ENDPROC(copy_user_generic_unrolled)
 
 /* Some CPUs run faster using the string copy instructions.
@@ -208,7 +175,6 @@ ENDPROC(copy_user_generic_unrolled)
  * eax uncopied bytes or 0 if successful.
  */
 ENTRY(copy_user_generic_string)
-	CFI_STARTPROC
 	ASM_STAC
 	cmpl $8,%edx
 	jb 2f		/* less than 8 bytes, go to byte copy loop */
@@ -233,7 +199,6 @@ ENTRY(copy_user_generic_string)
 
 	_ASM_EXTABLE(1b,11b)
 	_ASM_EXTABLE(3b,12b)
-	CFI_ENDPROC
 ENDPROC(copy_user_generic_string)
 
 /*
@@ -249,7 +214,6 @@ ENDPROC(copy_user_generic_string)
  * eax uncopied bytes or 0 if successful.
  */
 ENTRY(copy_user_enhanced_fast_string)
-	CFI_STARTPROC
 	ASM_STAC
 	movl %edx,%ecx
 1:	rep
@@ -264,5 +228,94 @@ ENTRY(copy_user_enhanced_fast_string)
 	.previous
 
 	_ASM_EXTABLE(1b,12b)
-	CFI_ENDPROC
 ENDPROC(copy_user_enhanced_fast_string)
+
+/*
+ * copy_user_nocache - Uncached memory copy with exception handling
+ * This will force destination/source out of cache for more performance.
+ */
+ENTRY(__copy_user_nocache)
+	ASM_STAC
+	cmpl $8,%edx
+	jb 20f		/* less then 8 bytes, go to byte copy loop */
+	ALIGN_DESTINATION
+	movl %edx,%ecx
+	andl $63,%edx
+	shrl $6,%ecx
+	jz 17f
+1:	movq (%rsi),%r8
+2:	movq 1*8(%rsi),%r9
+3:	movq 2*8(%rsi),%r10
+4:	movq 3*8(%rsi),%r11
+5:	movnti %r8,(%rdi)
+6:	movnti %r9,1*8(%rdi)
+7:	movnti %r10,2*8(%rdi)
+8:	movnti %r11,3*8(%rdi)
+9:	movq 4*8(%rsi),%r8
+10:	movq 5*8(%rsi),%r9
+11:	movq 6*8(%rsi),%r10
+12:	movq 7*8(%rsi),%r11
+13:	movnti %r8,4*8(%rdi)
+14:	movnti %r9,5*8(%rdi)
+15:	movnti %r10,6*8(%rdi)
+16:	movnti %r11,7*8(%rdi)
+	leaq 64(%rsi),%rsi
+	leaq 64(%rdi),%rdi
+	decl %ecx
+	jnz 1b
+17:	movl %edx,%ecx
+	andl $7,%edx
+	shrl $3,%ecx
+	jz 20f
+18:	movq (%rsi),%r8
+19:	movnti %r8,(%rdi)
+	leaq 8(%rsi),%rsi
+	leaq 8(%rdi),%rdi
+	decl %ecx
+	jnz 18b
+20:	andl %edx,%edx
+	jz 23f
+	movl %edx,%ecx
+21:	movb (%rsi),%al
+22:	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 21b
+23:	xorl %eax,%eax
+	ASM_CLAC
+	sfence
+	ret
+
+	.section .fixup,"ax"
+30:	shll $6,%ecx
+	addl %ecx,%edx
+	jmp 60f
+40:	lea (%rdx,%rcx,8),%rdx
+	jmp 60f
+50:	movl %ecx,%edx
+60:	sfence
+	jmp copy_user_handle_tail
+	.previous
+
+	_ASM_EXTABLE(1b,30b)
+	_ASM_EXTABLE(2b,30b)
+	_ASM_EXTABLE(3b,30b)
+	_ASM_EXTABLE(4b,30b)
+	_ASM_EXTABLE(5b,30b)
+	_ASM_EXTABLE(6b,30b)
+	_ASM_EXTABLE(7b,30b)
+	_ASM_EXTABLE(8b,30b)
+	_ASM_EXTABLE(9b,30b)
+	_ASM_EXTABLE(10b,30b)
+	_ASM_EXTABLE(11b,30b)
+	_ASM_EXTABLE(12b,30b)
+	_ASM_EXTABLE(13b,30b)
+	_ASM_EXTABLE(14b,30b)
+	_ASM_EXTABLE(15b,30b)
+	_ASM_EXTABLE(16b,30b)
+	_ASM_EXTABLE(18b,40b)
+	_ASM_EXTABLE(19b,40b)
+	_ASM_EXTABLE(21b,50b)
+	_ASM_EXTABLE(22b,50b)
+ENDPROC(__copy_user_nocache)
diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S
deleted file mode 100644
index 6a4f43c2d9e6d6..00000000000000
--- a/arch/x86/lib/copy_user_nocache_64.S
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
- * Copyright 2002 Andi Kleen, SuSE Labs.
- * Subject to the GNU Public License v2.
- *
- * Functions to copy from and to user space.
- */
-
-#include <linux/linkage.h>
-#include <asm/dwarf2.h>
-
-#define FIX_ALIGNMENT 1
-
-#include <asm/current.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/asm.h>
-#include <asm/smap.h>
-
-	.macro ALIGN_DESTINATION
-#ifdef FIX_ALIGNMENT
-	/* check for bad alignment of destination */
-	movl %edi,%ecx
-	andl $7,%ecx
-	jz 102f				/* already aligned */
-	subl $8,%ecx
-	negl %ecx
-	subl %ecx,%edx
-100:	movb (%rsi),%al
-101:	movb %al,(%rdi)
-	incq %rsi
-	incq %rdi
-	decl %ecx
-	jnz 100b
-102:
-	.section .fixup,"ax"
-103:	addl %ecx,%edx			/* ecx is zerorest also */
-	jmp copy_user_handle_tail
-	.previous
-
-	_ASM_EXTABLE(100b,103b)
-	_ASM_EXTABLE(101b,103b)
-#endif
-	.endm
-
-/*
- * copy_user_nocache - Uncached memory copy with exception handling
- * This will force destination/source out of cache for more performance.
- */
-ENTRY(__copy_user_nocache)
-	CFI_STARTPROC
-	ASM_STAC
-	cmpl $8,%edx
-	jb 20f		/* less then 8 bytes, go to byte copy loop */
-	ALIGN_DESTINATION
-	movl %edx,%ecx
-	andl $63,%edx
-	shrl $6,%ecx
-	jz 17f
-1:	movq (%rsi),%r8
-2:	movq 1*8(%rsi),%r9
-3:	movq 2*8(%rsi),%r10
-4:	movq 3*8(%rsi),%r11
-5:	movnti %r8,(%rdi)
-6:	movnti %r9,1*8(%rdi)
-7:	movnti %r10,2*8(%rdi)
-8:	movnti %r11,3*8(%rdi)
-9:	movq 4*8(%rsi),%r8
-10:	movq 5*8(%rsi),%r9
-11:	movq 6*8(%rsi),%r10
-12:	movq 7*8(%rsi),%r11
-13:	movnti %r8,4*8(%rdi)
-14:	movnti %r9,5*8(%rdi)
-15:	movnti %r10,6*8(%rdi)
-16:	movnti %r11,7*8(%rdi)
-	leaq 64(%rsi),%rsi
-	leaq 64(%rdi),%rdi
-	decl %ecx
-	jnz 1b
-17:	movl %edx,%ecx
-	andl $7,%edx
-	shrl $3,%ecx
-	jz 20f
-18:	movq (%rsi),%r8
-19:	movnti %r8,(%rdi)
-	leaq 8(%rsi),%rsi
-	leaq 8(%rdi),%rdi
-	decl %ecx
-	jnz 18b
-20:	andl %edx,%edx
-	jz 23f
-	movl %edx,%ecx
-21:	movb (%rsi),%al
-22:	movb %al,(%rdi)
-	incq %rsi
-	incq %rdi
-	decl %ecx
-	jnz 21b
-23:	xorl %eax,%eax
-	ASM_CLAC
-	sfence
-	ret
-
-	.section .fixup,"ax"
-30:	shll $6,%ecx
-	addl %ecx,%edx
-	jmp 60f
-40:	lea (%rdx,%rcx,8),%rdx
-	jmp 60f
-50:	movl %ecx,%edx
-60:	sfence
-	jmp copy_user_handle_tail
-	.previous
-
-	_ASM_EXTABLE(1b,30b)
-	_ASM_EXTABLE(2b,30b)
-	_ASM_EXTABLE(3b,30b)
-	_ASM_EXTABLE(4b,30b)
-	_ASM_EXTABLE(5b,30b)
-	_ASM_EXTABLE(6b,30b)
-	_ASM_EXTABLE(7b,30b)
-	_ASM_EXTABLE(8b,30b)
-	_ASM_EXTABLE(9b,30b)
-	_ASM_EXTABLE(10b,30b)
-	_ASM_EXTABLE(11b,30b)
-	_ASM_EXTABLE(12b,30b)
-	_ASM_EXTABLE(13b,30b)
-	_ASM_EXTABLE(14b,30b)
-	_ASM_EXTABLE(15b,30b)
-	_ASM_EXTABLE(16b,30b)
-	_ASM_EXTABLE(18b,40b)
-	_ASM_EXTABLE(19b,40b)
-	_ASM_EXTABLE(21b,50b)
-	_ASM_EXTABLE(22b,50b)
-	CFI_ENDPROC
-ENDPROC(__copy_user_nocache)
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index 9734182966f3be..7e48807b2fa198 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -6,7 +6,6 @@
  * for more details. No warranty for anything given at all.
  */
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 #include <asm/errno.h>
 #include <asm/asm.h>
 
@@ -47,23 +46,16 @@
 
 
 ENTRY(csum_partial_copy_generic)
-	CFI_STARTPROC
 	cmpl	$3*64, %edx
 	jle	.Lignore
 
 .Lignore:
 	subq  $7*8, %rsp
-	CFI_ADJUST_CFA_OFFSET 7*8
 	movq  %rbx, 2*8(%rsp)
-	CFI_REL_OFFSET rbx, 2*8
 	movq  %r12, 3*8(%rsp)
-	CFI_REL_OFFSET r12, 3*8
 	movq  %r14, 4*8(%rsp)
-	CFI_REL_OFFSET r14, 4*8
 	movq  %r13, 5*8(%rsp)
-	CFI_REL_OFFSET r13, 5*8
 	movq  %rbp, 6*8(%rsp)
-	CFI_REL_OFFSET rbp, 6*8
 
 	movq  %r8, (%rsp)
 	movq  %r9, 1*8(%rsp)
@@ -206,22 +198,14 @@ ENTRY(csum_partial_copy_generic)
 	addl %ebx, %eax
 	adcl %r9d, %eax		/* carry */
 
-	CFI_REMEMBER_STATE
 .Lende:
 	movq 2*8(%rsp), %rbx
-	CFI_RESTORE rbx
 	movq 3*8(%rsp), %r12
-	CFI_RESTORE r12
 	movq 4*8(%rsp), %r14
-	CFI_RESTORE r14
 	movq 5*8(%rsp), %r13
-	CFI_RESTORE r13
 	movq 6*8(%rsp), %rbp
-	CFI_RESTORE rbp
 	addq $7*8, %rsp
-	CFI_ADJUST_CFA_OFFSET -7*8
 	ret
-	CFI_RESTORE_STATE
 
 	/* Exception handlers. Very simple, zeroing is done in the wrappers */
 .Lbad_source:
@@ -237,5 +221,4 @@ ENTRY(csum_partial_copy_generic)
 	jz   .Lende
 	movl $-EFAULT, (%rax)
 	jmp .Lende
-	CFI_ENDPROC
 ENDPROC(csum_partial_copy_generic)
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index a4512359656aea..46668cda4ffdfd 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -26,7 +26,6 @@
  */
 
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 #include <asm/page_types.h>
 #include <asm/errno.h>
 #include <asm/asm-offsets.h>
@@ -36,7 +35,6 @@
 
 	.text
 ENTRY(__get_user_1)
-	CFI_STARTPROC
 	GET_THREAD_INFO(%_ASM_DX)
 	cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
 	jae bad_get_user
@@ -45,11 +43,9 @@ ENTRY(__get_user_1)
 	xor %eax,%eax
 	ASM_CLAC
 	ret
-	CFI_ENDPROC
 ENDPROC(__get_user_1)
 
 ENTRY(__get_user_2)
-	CFI_STARTPROC
 	add $1,%_ASM_AX
 	jc bad_get_user
 	GET_THREAD_INFO(%_ASM_DX)
@@ -60,11 +56,9 @@ ENTRY(__get_user_2)
 	xor %eax,%eax
 	ASM_CLAC
 	ret
-	CFI_ENDPROC
 ENDPROC(__get_user_2)
 
 ENTRY(__get_user_4)
-	CFI_STARTPROC
 	add $3,%_ASM_AX
 	jc bad_get_user
 	GET_THREAD_INFO(%_ASM_DX)
@@ -75,11 +69,9 @@ ENTRY(__get_user_4)
 	xor %eax,%eax
 	ASM_CLAC
 	ret
-	CFI_ENDPROC
 ENDPROC(__get_user_4)
 
 ENTRY(__get_user_8)
-	CFI_STARTPROC
 #ifdef CONFIG_X86_64
 	add $7,%_ASM_AX
 	jc bad_get_user
@@ -104,28 +96,23 @@ ENTRY(__get_user_8)
 	ASM_CLAC
 	ret
 #endif
-	CFI_ENDPROC
 ENDPROC(__get_user_8)
 
 
 bad_get_user:
-	CFI_STARTPROC
 	xor %edx,%edx
 	mov $(-EFAULT),%_ASM_AX
 	ASM_CLAC
 	ret
-	CFI_ENDPROC
 END(bad_get_user)
 
 #ifdef CONFIG_X86_32
 bad_get_user_8:
-	CFI_STARTPROC
 	xor %edx,%edx
 	xor %ecx,%ecx
 	mov $(-EFAULT),%_ASM_AX
 	ASM_CLAC
 	ret
-	CFI_ENDPROC
 END(bad_get_user_8)
 #endif
 
diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S
index 05a95e713da885..33147fef3452ce 100644
--- a/arch/x86/lib/iomap_copy_64.S
+++ b/arch/x86/lib/iomap_copy_64.S
@@ -16,15 +16,12 @@
  */
 
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 
 /*
  * override generic version in lib/iomap_copy.c
  */
 ENTRY(__iowrite32_copy)
-	CFI_STARTPROC
 	movl %edx,%ecx
 	rep movsd
 	ret
-	CFI_ENDPROC
 ENDPROC(__iowrite32_copy)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index b046664f5a1ccf..16698bba87deb9 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -2,7 +2,6 @@
 
 #include <linux/linkage.h>
 #include <asm/cpufeature.h>
-#include <asm/dwarf2.h>
 #include <asm/alternative-asm.h>
 
 /*
@@ -53,7 +52,6 @@ ENTRY(memcpy_erms)
 ENDPROC(memcpy_erms)
 
 ENTRY(memcpy_orig)
-	CFI_STARTPROC
 	movq %rdi, %rax
 
 	cmpq $0x20, %rdx
@@ -178,5 +176,4 @@ ENTRY(memcpy_orig)
 
 .Lend:
 	retq
-	CFI_ENDPROC
 ENDPROC(memcpy_orig)
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 0f8a0d0331b917..ca2afdd6d98ed2 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -6,7 +6,6 @@
  *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
  */
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 #include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
 
@@ -27,7 +26,6 @@
 
 ENTRY(memmove)
 ENTRY(__memmove)
-	CFI_STARTPROC
 
 	/* Handle more 32 bytes in loop */
 	mov %rdi, %rax
@@ -207,6 +205,5 @@ ENTRY(__memmove)
 	movb %r11b, (%rdi)
 13:
 	retq
-	CFI_ENDPROC
 ENDPROC(__memmove)
 ENDPROC(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 93118fb239762b..2661fad0582716 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -1,7 +1,6 @@
 /* Copyright 2002 Andi Kleen, SuSE Labs */
 
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 #include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
 
@@ -66,7 +65,6 @@ ENTRY(memset_erms)
 ENDPROC(memset_erms)
 
 ENTRY(memset_orig)
-	CFI_STARTPROC
 	movq %rdi,%r10
 
 	/* expand byte value  */
@@ -78,7 +76,6 @@ ENTRY(memset_orig)
 	movl  %edi,%r9d
 	andl  $7,%r9d
 	jnz  .Lbad_alignment
-	CFI_REMEMBER_STATE
 .Lafter_bad_alignment:
 
 	movq  %rdx,%rcx
@@ -128,7 +125,6 @@ ENTRY(memset_orig)
 	movq	%r10,%rax
 	ret
 
-	CFI_RESTORE_STATE
 .Lbad_alignment:
 	cmpq $7,%rdx
 	jbe	.Lhandle_7
@@ -139,5 +135,4 @@ ENTRY(memset_orig)
 	subq %r8,%rdx
 	jmp .Lafter_bad_alignment
 .Lfinal:
-	CFI_ENDPROC
 ENDPROC(memset_orig)
diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c
index c9f2d9ba8dd8c2..e5e3ed8dc0798b 100644
--- a/arch/x86/lib/mmx_32.c
+++ b/arch/x86/lib/mmx_32.c
@@ -22,7 +22,7 @@
 #include <linux/sched.h>
 #include <linux/types.h>
 
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 #include <asm/asm.h>
 
 void *_mmx_memcpy(void *to, const void *from, size_t len)
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
index 3ca5218fbece0c..c81556409bbb87 100644
--- a/arch/x86/lib/msr-reg.S
+++ b/arch/x86/lib/msr-reg.S
@@ -1,6 +1,5 @@
 #include <linux/linkage.h>
 #include <linux/errno.h>
-#include <asm/dwarf2.h>
 #include <asm/asm.h>
 #include <asm/msr.h>
 
@@ -13,9 +12,8 @@
  */
 .macro op_safe_regs op
 ENTRY(\op\()_safe_regs)
-	CFI_STARTPROC
-	pushq_cfi_reg rbx
-	pushq_cfi_reg rbp
+	pushq %rbx
+	pushq %rbp
 	movq	%rdi, %r10	/* Save pointer */
 	xorl	%r11d, %r11d	/* Return value */
 	movl    (%rdi), %eax
@@ -25,7 +23,6 @@ ENTRY(\op\()_safe_regs)
 	movl    20(%rdi), %ebp
 	movl    24(%rdi), %esi
 	movl    28(%rdi), %edi
-	CFI_REMEMBER_STATE
 1:	\op
 2:	movl    %eax, (%r10)
 	movl	%r11d, %eax	/* Return value */
@@ -35,16 +32,14 @@ ENTRY(\op\()_safe_regs)
 	movl    %ebp, 20(%r10)
 	movl    %esi, 24(%r10)
 	movl    %edi, 28(%r10)
-	popq_cfi_reg rbp
-	popq_cfi_reg rbx
+	popq %rbp
+	popq %rbx
 	ret
 3:
-	CFI_RESTORE_STATE
 	movl    $-EIO, %r11d
 	jmp     2b
 
 	_ASM_EXTABLE(1b, 3b)
-	CFI_ENDPROC
 ENDPROC(\op\()_safe_regs)
 .endm
 
@@ -52,13 +47,12 @@ ENDPROC(\op\()_safe_regs)
 
 .macro op_safe_regs op
 ENTRY(\op\()_safe_regs)
-	CFI_STARTPROC
-	pushl_cfi_reg ebx
-	pushl_cfi_reg ebp
-	pushl_cfi_reg esi
-	pushl_cfi_reg edi
-	pushl_cfi $0              /* Return value */
-	pushl_cfi %eax
+	pushl %ebx
+	pushl %ebp
+	pushl %esi
+	pushl %edi
+	pushl $0              /* Return value */
+	pushl %eax
 	movl    4(%eax), %ecx
 	movl    8(%eax), %edx
 	movl    12(%eax), %ebx
@@ -66,32 +60,28 @@ ENTRY(\op\()_safe_regs)
 	movl    24(%eax), %esi
 	movl    28(%eax), %edi
 	movl    (%eax), %eax
-	CFI_REMEMBER_STATE
 1:	\op
-2:	pushl_cfi %eax
+2:	pushl %eax
 	movl    4(%esp), %eax
-	popl_cfi (%eax)
+	popl (%eax)
 	addl    $4, %esp
-	CFI_ADJUST_CFA_OFFSET -4
 	movl    %ecx, 4(%eax)
 	movl    %edx, 8(%eax)
 	movl    %ebx, 12(%eax)
 	movl    %ebp, 20(%eax)
 	movl    %esi, 24(%eax)
 	movl    %edi, 28(%eax)
-	popl_cfi %eax
-	popl_cfi_reg edi
-	popl_cfi_reg esi
-	popl_cfi_reg ebp
-	popl_cfi_reg ebx
+	popl %eax
+	popl %edi
+	popl %esi
+	popl %ebp
+	popl %ebx
 	ret
 3:
-	CFI_RESTORE_STATE
 	movl    $-EIO, 4(%esp)
 	jmp     2b
 
 	_ASM_EXTABLE(1b, 3b)
-	CFI_ENDPROC
 ENDPROC(\op\()_safe_regs)
 .endm
 
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index fc6ba17a7eec2a..e0817a12d32362 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -11,7 +11,6 @@
  * return value.
  */
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 #include <asm/thread_info.h>
 #include <asm/errno.h>
 #include <asm/asm.h>
@@ -30,11 +29,9 @@
  * as they get called from within inline assembly.
  */
 
-#define ENTER	CFI_STARTPROC ; \
-		GET_THREAD_INFO(%_ASM_BX)
+#define ENTER	GET_THREAD_INFO(%_ASM_BX)
 #define EXIT	ASM_CLAC ;	\
-		ret ;		\
-		CFI_ENDPROC
+		ret
 
 .text
 ENTRY(__put_user_1)
@@ -87,7 +84,6 @@ ENTRY(__put_user_8)
 ENDPROC(__put_user_8)
 
 bad_put_user:
-	CFI_STARTPROC
 	movl $-EFAULT,%eax
 	EXIT
 END(bad_put_user)
diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S
index 2322abe4da3b01..40027db991405b 100644
--- a/arch/x86/lib/rwsem.S
+++ b/arch/x86/lib/rwsem.S
@@ -15,7 +15,6 @@
 
 #include <linux/linkage.h>
 #include <asm/alternative-asm.h>
-#include <asm/dwarf2.h>
 
 #define __ASM_HALF_REG(reg)	__ASM_SEL(reg, e##reg)
 #define __ASM_HALF_SIZE(inst)	__ASM_SEL(inst##w, inst##l)
@@ -34,10 +33,10 @@
  */
 
 #define save_common_regs \
-	pushl_cfi_reg ecx
+	pushl %ecx
 
 #define restore_common_regs \
-	popl_cfi_reg ecx
+	popl %ecx
 
 	/* Avoid uglifying the argument copying x86-64 needs to do. */
 	.macro movq src, dst
@@ -64,50 +63,45 @@
  */
 
 #define save_common_regs \
-	pushq_cfi_reg rdi; \
-	pushq_cfi_reg rsi; \
-	pushq_cfi_reg rcx; \
-	pushq_cfi_reg r8;  \
-	pushq_cfi_reg r9;  \
-	pushq_cfi_reg r10; \
-	pushq_cfi_reg r11
+	pushq %rdi; \
+	pushq %rsi; \
+	pushq %rcx; \
+	pushq %r8;  \
+	pushq %r9;  \
+	pushq %r10; \
+	pushq %r11
 
 #define restore_common_regs \
-	popq_cfi_reg r11; \
-	popq_cfi_reg r10; \
-	popq_cfi_reg r9; \
-	popq_cfi_reg r8; \
-	popq_cfi_reg rcx; \
-	popq_cfi_reg rsi; \
-	popq_cfi_reg rdi
+	popq %r11; \
+	popq %r10; \
+	popq %r9; \
+	popq %r8; \
+	popq %rcx; \
+	popq %rsi; \
+	popq %rdi
 
 #endif
 
 /* Fix up special calling conventions */
 ENTRY(call_rwsem_down_read_failed)
-	CFI_STARTPROC
 	save_common_regs
-	__ASM_SIZE(push,_cfi_reg) __ASM_REG(dx)
+	__ASM_SIZE(push,) %__ASM_REG(dx)
 	movq %rax,%rdi
 	call rwsem_down_read_failed
-	__ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx)
+	__ASM_SIZE(pop,) %__ASM_REG(dx)
 	restore_common_regs
 	ret
-	CFI_ENDPROC
 ENDPROC(call_rwsem_down_read_failed)
 
 ENTRY(call_rwsem_down_write_failed)
-	CFI_STARTPROC
 	save_common_regs
 	movq %rax,%rdi
 	call rwsem_down_write_failed
 	restore_common_regs
 	ret
-	CFI_ENDPROC
 ENDPROC(call_rwsem_down_write_failed)
 
 ENTRY(call_rwsem_wake)
-	CFI_STARTPROC
 	/* do nothing if still outstanding active readers */
 	__ASM_HALF_SIZE(dec) %__ASM_HALF_REG(dx)
 	jnz 1f
@@ -116,17 +110,14 @@ ENTRY(call_rwsem_wake)
 	call rwsem_wake
 	restore_common_regs
 1:	ret
-	CFI_ENDPROC
 ENDPROC(call_rwsem_wake)
 
 ENTRY(call_rwsem_downgrade_wake)
-	CFI_STARTPROC
 	save_common_regs
-	__ASM_SIZE(push,_cfi_reg) __ASM_REG(dx)
+	__ASM_SIZE(push,) %__ASM_REG(dx)
 	movq %rax,%rdi
 	call rwsem_downgrade_wake
-	__ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx)
+	__ASM_SIZE(pop,) %__ASM_REG(dx)
 	restore_common_regs
 	ret
-	CFI_ENDPROC
 ENDPROC(call_rwsem_downgrade_wake)
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
index 5eb715087b804e..e9acf5f4fc923c 100644
--- a/arch/x86/lib/thunk_32.S
+++ b/arch/x86/lib/thunk_32.S
@@ -6,16 +6,14 @@
  */
 	#include <linux/linkage.h>
 	#include <asm/asm.h>
-	#include <asm/dwarf2.h>
 
 	/* put return address in eax (arg1) */
 	.macro THUNK name, func, put_ret_addr_in_eax=0
 	.globl \name
 \name:
-	CFI_STARTPROC
-	pushl_cfi_reg eax
-	pushl_cfi_reg ecx
-	pushl_cfi_reg edx
+	pushl %eax
+	pushl %ecx
+	pushl %edx
 
 	.if \put_ret_addr_in_eax
 	/* Place EIP in the arg1 */
@@ -23,11 +21,10 @@
 	.endif
 
 	call \func
-	popl_cfi_reg edx
-	popl_cfi_reg ecx
-	popl_cfi_reg eax
+	popl %edx
+	popl %ecx
+	popl %eax
 	ret
-	CFI_ENDPROC
 	_ASM_NOKPROBE(\name)
 	.endm
 
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index f89ba4e93025de..10f555e435e171 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -6,7 +6,6 @@
  * Subject to the GNU public license, v.2. No warranty of any kind.
  */
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 #include <asm/calling.h>
 #include <asm/asm.h>
 
@@ -14,27 +13,25 @@
 	.macro THUNK name, func, put_ret_addr_in_rdi=0
 	.globl \name
 \name:
-	CFI_STARTPROC
 
 	/* this one pushes 9 elems, the next one would be %rIP */
-	pushq_cfi_reg rdi
-	pushq_cfi_reg rsi
-	pushq_cfi_reg rdx
-	pushq_cfi_reg rcx
-	pushq_cfi_reg rax
-	pushq_cfi_reg r8
-	pushq_cfi_reg r9
-	pushq_cfi_reg r10
-	pushq_cfi_reg r11
+	pushq %rdi
+	pushq %rsi
+	pushq %rdx
+	pushq %rcx
+	pushq %rax
+	pushq %r8
+	pushq %r9
+	pushq %r10
+	pushq %r11
 
 	.if \put_ret_addr_in_rdi
 	/* 9*8(%rsp) is return addr on stack */
-	movq_cfi_restore 9*8, rdi
+	movq 9*8(%rsp), %rdi
 	.endif
 
 	call \func
 	jmp  restore
-	CFI_ENDPROC
 	_ASM_NOKPROBE(\name)
 	.endm
 
@@ -57,19 +54,16 @@
 #if defined(CONFIG_TRACE_IRQFLAGS) \
  || defined(CONFIG_DEBUG_LOCK_ALLOC) \
  || defined(CONFIG_PREEMPT)
-	CFI_STARTPROC
-	CFI_ADJUST_CFA_OFFSET 9*8
 restore:
-	popq_cfi_reg r11
-	popq_cfi_reg r10
-	popq_cfi_reg r9
-	popq_cfi_reg r8
-	popq_cfi_reg rax
-	popq_cfi_reg rcx
-	popq_cfi_reg rdx
-	popq_cfi_reg rsi
-	popq_cfi_reg rdi
+	popq %r11
+	popq %r10
+	popq %r9
+	popq %r8
+	popq %rax
+	popq %rcx
+	popq %rdx
+	popq %rsi
+	popq %rdi
 	ret
-	CFI_ENDPROC
 	_ASM_NOKPROBE(restore)
 #endif
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index e2f5e21c03b304..91d93b95bd8685 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -647,7 +647,8 @@ EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
  * @from: Source address, in kernel space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from kernel space to user space.
  *
@@ -668,7 +669,8 @@ EXPORT_SYMBOL(_copy_to_user);
  * @from: Source address, in user space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from user space to kernel space.
  *
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index dc8adad10a2f38..dd76a05729b010 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -30,7 +30,7 @@ static void fclex(void)
 }
 
 /* Needs to be externally visible */
-void finit_soft_fpu(struct i387_soft_struct *soft)
+void fpstate_init_soft(struct swregs_state *soft)
 {
 	struct address *oaddr, *iaddr;
 	memset(soft, 0, sizeof(*soft));
@@ -52,7 +52,7 @@ void finit_soft_fpu(struct i387_soft_struct *soft)
 
 void finit(void)
 {
-	finit_soft_fpu(&current->thread.fpu.state->soft);
+	fpstate_init_soft(&current->thread.fpu.state.soft);
 }
 
 /*
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index 9b868124128d79..f37e84ab49f38e 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -31,7 +31,7 @@
 #include <asm/traps.h>
 #include <asm/desc.h>
 #include <asm/user.h>
-#include <asm/i387.h>
+#include <asm/fpu/internal.h>
 
 #include "fpu_system.h"
 #include "fpu_emu.h"
@@ -147,13 +147,9 @@ void math_emulate(struct math_emu_info *info)
 	unsigned long code_base = 0;
 	unsigned long code_limit = 0;	/* Initialized to stop compiler warnings */
 	struct desc_struct code_descriptor;
+	struct fpu *fpu = &current->thread.fpu;
 
-	if (!used_math()) {
-		if (init_fpu(current)) {
-			do_group_exit(SIGKILL);
-			return;
-		}
-	}
+	fpu__activate_curr(fpu);
 
 #ifdef RE_ENTRANT_CHECKING
 	if (emulating) {
@@ -673,7 +669,7 @@ void math_abort(struct math_emu_info *info, unsigned int signal)
 #endif /* PARANOID */
 }
 
-#define S387 ((struct i387_soft_struct *)s387)
+#define S387 ((struct swregs_state *)s387)
 #define sstatus_word() \
   ((S387->swd & ~SW_Top & 0xffff) | ((S387->ftop << SW_Top_Shift) & SW_Top))
 
@@ -682,14 +678,14 @@ int fpregs_soft_set(struct task_struct *target,
 		    unsigned int pos, unsigned int count,
 		    const void *kbuf, const void __user *ubuf)
 {
-	struct i387_soft_struct *s387 = &target->thread.fpu.state->soft;
+	struct swregs_state *s387 = &target->thread.fpu.state.soft;
 	void *space = s387->st_space;
 	int ret;
 	int offset, other, i, tags, regnr, tag, newtop;
 
 	RE_ENTRANT_CHECK_OFF;
 	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, s387, 0,
-				 offsetof(struct i387_soft_struct, st_space));
+				 offsetof(struct swregs_state, st_space));
 	RE_ENTRANT_CHECK_ON;
 
 	if (ret)
@@ -734,7 +730,7 @@ int fpregs_soft_get(struct task_struct *target,
 		    unsigned int pos, unsigned int count,
 		    void *kbuf, void __user *ubuf)
 {
-	struct i387_soft_struct *s387 = &target->thread.fpu.state->soft;
+	struct swregs_state *s387 = &target->thread.fpu.state.soft;
 	const void *space = s387->st_space;
 	int ret;
 	int offset = (S387->ftop & 7) * 10, other = 80 - offset;
@@ -752,7 +748,7 @@ int fpregs_soft_get(struct task_struct *target,
 #endif /* PECULIAR_486 */
 
 	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, s387, 0,
-				  offsetof(struct i387_soft_struct, st_space));
+				  offsetof(struct swregs_state, st_space));
 
 	/* Copy all registers in stack order. */
 	if (!ret)
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h
index 2c614410a5f397..9ccecb61a4fa12 100644
--- a/arch/x86/math-emu/fpu_system.h
+++ b/arch/x86/math-emu/fpu_system.h
@@ -31,7 +31,7 @@
 #define SEG_EXPAND_DOWN(s)	(((s).b & ((1 << 11) | (1 << 10))) \
 				 == (1 << 10))
 
-#define I387			(current->thread.fpu.state)
+#define I387			(&current->thread.fpu.state)
 #define FPU_info		(I387->soft.info)
 
 #define FPU_CS			(*(unsigned short *) &(FPU_info->regs->cs))
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 181c53bac3a7ee..9dc909841739bf 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -13,6 +13,7 @@
 #include <linux/hugetlb.h>		/* hstate_index_to_shift	*/
 #include <linux/prefetch.h>		/* prefetchw			*/
 #include <linux/context_tracking.h>	/* exception_enter(), ...	*/
+#include <linux/uaccess.h>		/* faulthandler_disabled()	*/
 
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
 #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
@@ -1126,9 +1127,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 
 	/*
 	 * If we're in an interrupt, have no user context or are running
-	 * in an atomic region then we must not take the fault:
+	 * in a region with pagefaults disabled then we must not take the fault
 	 */
-	if (unlikely(in_atomic() || !mm)) {
+	if (unlikely(faulthandler_disabled() || !mm)) {
 		bad_area_nosemaphore(regs, error_code, address);
 		return;
 	}
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 4500142bc4aa46..eecb207a203708 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -35,7 +35,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
 	unsigned long vaddr;
 	int idx, type;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 
 	if (!PageHighMem(page))
@@ -100,6 +100,7 @@ void __kunmap_atomic(void *kvaddr)
 #endif
 
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
 
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 9ca35fc60cfeaa..37d2ba2ec27fad 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -59,6 +59,7 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
 	unsigned long vaddr;
 	int idx, type;
 
+	preempt_disable();
 	pagefault_disable();
 
 	type = kmap_atomic_idx_push();
@@ -82,7 +83,7 @@ iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
 	 * MTRR is UC or WC.  UC_MINUS gets the real intention, of the
 	 * user, which is "WC if the MTRR is WC, UC if you can't do that."
 	 */
-	if (!pat_enabled && pgprot_val(prot) ==
+	if (!pat_enabled() && pgprot_val(prot) ==
 	    (__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_WC)))
 		prot = __pgprot(__PAGE_KERNEL |
 				cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS));
@@ -117,5 +118,6 @@ iounmap_atomic(void __iomem *kvaddr)
 	}
 
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(iounmap_atomic);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 70e7444c683516..938609e259d825 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -234,10 +234,11 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
 {
 	/*
 	 * Ideally, this should be:
-	 *	pat_enabled ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS;
+	 *	pat_enabled() ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS;
 	 *
 	 * Till we fix all X drivers to use ioremap_wc(), we will use
-	 * UC MINUS.
+	 * UC MINUS. Drivers that are certain they need or can already
+	 * be converted over to strong UC can use ioremap_uc().
 	 */
 	enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
 
@@ -247,6 +248,39 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
 EXPORT_SYMBOL(ioremap_nocache);
 
 /**
+ * ioremap_uc     -   map bus memory into CPU space as strongly uncachable
+ * @phys_addr:    bus address of the memory
+ * @size:      size of the resource to map
+ *
+ * ioremap_uc performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * This version of ioremap ensures that the memory is marked with a strong
+ * preference as completely uncachable on the CPU when possible. For non-PAT
+ * systems this ends up setting page-attribute flags PCD=1, PWT=1. For PAT
+ * systems this will set the PAT entry for the pages as strong UC.  This call
+ * will honor existing caching rules from things like the PCI bus. Note that
+ * there are other caches and buffers on many busses. In particular driver
+ * authors should read up on PCI writes.
+ *
+ * It's useful if some control registers are in such an area and
+ * write combining or read caching is not desirable:
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size)
+{
+	enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
+
+	return __ioremap_caller(phys_addr, size, pcm,
+				__builtin_return_address(0));
+}
+EXPORT_SYMBOL_GPL(ioremap_uc);
+
+/**
  * ioremap_wc	-	map memory into CPU space write combined
  * @phys_addr:	bus address of the memory
  * @size:	size of the resource to map
@@ -258,7 +292,7 @@ EXPORT_SYMBOL(ioremap_nocache);
  */
 void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 {
-	if (pat_enabled)
+	if (pat_enabled())
 		return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
 					__builtin_return_address(0));
 	else
@@ -331,7 +365,7 @@ void iounmap(volatile void __iomem *addr)
 }
 EXPORT_SYMBOL(iounmap);
 
-int arch_ioremap_pud_supported(void)
+int __init arch_ioremap_pud_supported(void)
 {
 #ifdef CONFIG_X86_64
 	return cpu_has_gbpages;
@@ -340,7 +374,7 @@ int arch_ioremap_pud_supported(void)
 #endif
 }
 
-int arch_ioremap_pmd_supported(void)
+int __init arch_ioremap_pmd_supported(void)
 {
 	return cpu_has_pse;
 }
@@ -353,18 +387,18 @@ void *xlate_dev_mem_ptr(phys_addr_t phys)
 {
 	unsigned long start  = phys &  PAGE_MASK;
 	unsigned long offset = phys & ~PAGE_MASK;
-	unsigned long vaddr;
+	void *vaddr;
 
 	/* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
 	if (page_is_ram(start >> PAGE_SHIFT))
 		return __va(phys);
 
-	vaddr = (unsigned long)ioremap_cache(start, PAGE_SIZE);
+	vaddr = ioremap_cache(start, PAGE_SIZE);
 	/* Only add the offset on success and return NULL if the ioremap() failed: */
 	if (vaddr)
 		vaddr += offset;
 
-	return (void *)vaddr;
+	return vaddr;
 }
 
 void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
@@ -373,7 +407,6 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
 		return;
 
 	iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
-	return;
 }
 
 static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index c439ec47821601..2e0dfd39bd2280 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -10,13 +10,12 @@
 #include <linux/syscalls.h>
 #include <linux/sched/sysctl.h>
 
-#include <asm/i387.h>
 #include <asm/insn.h>
 #include <asm/mman.h>
 #include <asm/mmu_context.h>
 #include <asm/mpx.h>
 #include <asm/processor.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
 
 static const char *mpx_mapping_name(struct vm_area_struct *vma)
 {
@@ -273,7 +272,7 @@ bad_opcode:
  * The caller is expected to kfree() the returned siginfo_t.
  */
 siginfo_t *mpx_generate_siginfo(struct pt_regs *regs,
-				struct xsave_struct *xsave_buf)
+				struct xregs_state *xsave_buf)
 {
 	struct bndreg *bndregs, *bndreg;
 	siginfo_t *info = NULL;
@@ -358,8 +357,8 @@ static __user void *task_get_bounds_dir(struct task_struct *tsk)
 	 * The bounds directory pointer is stored in a register
 	 * only accessible if we first do an xsave.
 	 */
-	fpu_save_init(&tsk->thread.fpu);
-	bndcsr = get_xsave_addr(&tsk->thread.fpu.state->xsave, XSTATE_BNDCSR);
+	copy_fpregs_to_fpstate(&tsk->thread.fpu);
+	bndcsr = get_xsave_addr(&tsk->thread.fpu.state.xsave, XSTATE_BNDCSR);
 	if (!bndcsr)
 		return MPX_INVALID_BOUNDS_DIR;
 
@@ -390,7 +389,7 @@ int mpx_enable_management(struct task_struct *tsk)
 	 * directory into XSAVE/XRSTOR Save Area and enable MPX through
 	 * XRSTOR instruction.
 	 *
-	 * fpu_xsave() is expected to be very expensive. Storing the bounds
+	 * copy_xregs_to_kernel() is expected to be very expensive. Storing the bounds
 	 * directory here means that we do not have to do xsave in the unmap
 	 * path; we can just use mm->bd_addr instead.
 	 */
@@ -498,7 +497,7 @@ out_unmap:
  * bound table is 16KB. With 64-bit mode, the size of BD is 2GB,
  * and the size of each bound table is 4MB.
  */
-static int do_mpx_bt_fault(struct xsave_struct *xsave_buf)
+static int do_mpx_bt_fault(struct xregs_state *xsave_buf)
 {
 	unsigned long bd_entry, bd_base;
 	struct bndcsr *bndcsr;
@@ -526,7 +525,7 @@ static int do_mpx_bt_fault(struct xsave_struct *xsave_buf)
 	return allocate_bt((long __user *)bd_entry);
 }
 
-int mpx_handle_bd_fault(struct xsave_struct *xsave_buf)
+int mpx_handle_bd_fault(struct xregs_state *xsave_buf)
 {
 	/*
 	 * Userspace never asked us to manage the bounds tables,
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 89af288ec6740c..70d221fe2eb409 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -129,16 +129,15 @@ within(unsigned long addr, unsigned long start, unsigned long end)
  */
 void clflush_cache_range(void *vaddr, unsigned int size)
 {
-	void *vend = vaddr + size - 1;
+	unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
+	void *vend = vaddr + size;
+	void *p;
 
 	mb();
 
-	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
-		clflushopt(vaddr);
-	/*
-	 * Flush any possible final partial cacheline:
-	 */
-	clflushopt(vend);
+	for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
+	     p < vend; p += boot_cpu_data.x86_clflush_size)
+		clflushopt(p);
 
 	mb();
 }
@@ -418,13 +417,11 @@ phys_addr_t slow_virt_to_phys(void *__virt_addr)
 	phys_addr_t phys_addr;
 	unsigned long offset;
 	enum pg_level level;
-	unsigned long psize;
 	unsigned long pmask;
 	pte_t *pte;
 
 	pte = lookup_address(virt_addr, &level);
 	BUG_ON(!pte);
-	psize = page_level_size(level);
 	pmask = page_level_mask(level);
 	offset = virt_addr & ~pmask;
 	phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
@@ -1468,6 +1465,9 @@ int _set_memory_uc(unsigned long addr, int numpages)
 {
 	/*
 	 * for now UC MINUS. see comments in ioremap_nocache()
+	 * If you really need strong UC use ioremap_uc(), but note
+	 * that you cannot override IO areas with set_memory_*() as
+	 * these helpers cannot work with IO memory.
 	 */
 	return change_page_attr_set(&addr, numpages,
 				    cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
@@ -1571,7 +1571,7 @@ int set_memory_wc(unsigned long addr, int numpages)
 {
 	int ret;
 
-	if (!pat_enabled)
+	if (!pat_enabled())
 		return set_memory_uc(addr, numpages);
 
 	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 35af6771a95ad6..a1c96544099d27 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -33,13 +33,15 @@
 #include "pat_internal.h"
 #include "mm_internal.h"
 
-#ifdef CONFIG_X86_PAT
-int __read_mostly pat_enabled = 1;
+#undef pr_fmt
+#define pr_fmt(fmt) "" fmt
+
+static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT);
 
 static inline void pat_disable(const char *reason)
 {
-	pat_enabled = 0;
-	printk(KERN_INFO "%s\n", reason);
+	__pat_enabled = 0;
+	pr_info("x86/PAT: %s\n", reason);
 }
 
 static int __init nopat(char *str)
@@ -48,13 +50,12 @@ static int __init nopat(char *str)
 	return 0;
 }
 early_param("nopat", nopat);
-#else
-static inline void pat_disable(const char *reason)
+
+bool pat_enabled(void)
 {
-	(void)reason;
+	return !!__pat_enabled;
 }
-#endif
-
+EXPORT_SYMBOL_GPL(pat_enabled);
 
 int pat_debug_enable;
 
@@ -188,7 +189,7 @@ void pat_init_cache_modes(void)
 					   pat_msg + 4 * i);
 		update_cache_mode_entry(i, cache);
 	}
-	pr_info("PAT configuration [0-7]: %s\n", pat_msg);
+	pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg);
 }
 
 #define PAT(x, y)	((u64)PAT_ ## y << ((x)*8))
@@ -198,7 +199,7 @@ void pat_init(void)
 	u64 pat;
 	bool boot_cpu = !boot_pat_state;
 
-	if (!pat_enabled)
+	if (!pat_enabled())
 		return;
 
 	if (!cpu_has_pat) {
@@ -211,8 +212,7 @@ void pat_init(void)
 			 * switched to PAT on the boot CPU. We have no way to
 			 * undo PAT.
 			 */
-			printk(KERN_ERR "PAT enabled, "
-			       "but not supported by secondary CPU\n");
+			pr_err("x86/PAT: PAT enabled, but not supported by secondary CPU\n");
 			BUG();
 		}
 	}
@@ -267,9 +267,9 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end,
 	 * request is for WB.
 	 */
 	if (req_type == _PAGE_CACHE_MODE_WB) {
-		u8 mtrr_type;
+		u8 mtrr_type, uniform;
 
-		mtrr_type = mtrr_type_lookup(start, end);
+		mtrr_type = mtrr_type_lookup(start, end, &uniform);
 		if (mtrr_type != MTRR_TYPE_WRBACK)
 			return _PAGE_CACHE_MODE_UC_MINUS;
 
@@ -347,7 +347,7 @@ static int reserve_ram_pages_type(u64 start, u64 end,
 		page = pfn_to_page(pfn);
 		type = get_page_memtype(page);
 		if (type != -1) {
-			pr_info("reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
+			pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
 				start, end - 1, type, req_type);
 			if (new_type)
 				*new_type = type;
@@ -400,7 +400,7 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
 
 	BUG_ON(start >= end); /* end is exclusive */
 
-	if (!pat_enabled) {
+	if (!pat_enabled()) {
 		/* This is identical to page table setting without PAT */
 		if (new_type) {
 			if (req_type == _PAGE_CACHE_MODE_WC)
@@ -451,9 +451,9 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
 
 	err = rbt_memtype_check_insert(new, new_type);
 	if (err) {
-		printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
-		       start, end - 1,
-		       cattr_name(new->type), cattr_name(req_type));
+		pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
+			start, end - 1,
+			cattr_name(new->type), cattr_name(req_type));
 		kfree(new);
 		spin_unlock(&memtype_lock);
 
@@ -475,7 +475,7 @@ int free_memtype(u64 start, u64 end)
 	int is_range_ram;
 	struct memtype *entry;
 
-	if (!pat_enabled)
+	if (!pat_enabled())
 		return 0;
 
 	/* Low ISA region is always mapped WB. No need to track */
@@ -497,8 +497,8 @@ int free_memtype(u64 start, u64 end)
 	spin_unlock(&memtype_lock);
 
 	if (!entry) {
-		printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
-		       current->comm, current->pid, start, end - 1);
+		pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
+			current->comm, current->pid, start, end - 1);
 		return -EINVAL;
 	}
 
@@ -623,13 +623,13 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
 	u64 to = from + size;
 	u64 cursor = from;
 
-	if (!pat_enabled)
+	if (!pat_enabled())
 		return 1;
 
 	while (cursor < to) {
 		if (!devmem_is_allowed(pfn)) {
-			printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
-			       current->comm, from, to - 1);
+			pr_info("x86/PAT: Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
+				current->comm, from, to - 1);
 			return 0;
 		}
 		cursor += PAGE_SIZE;
@@ -659,7 +659,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 	 * caching for the high addresses through the KEN pin, but
 	 * we maintain the tradition of paranoia in this code.
 	 */
-	if (!pat_enabled &&
+	if (!pat_enabled() &&
 	    !(boot_cpu_has(X86_FEATURE_MTRR) ||
 	      boot_cpu_has(X86_FEATURE_K6_MTRR) ||
 	      boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
@@ -698,8 +698,7 @@ int kernel_map_sync_memtype(u64 base, unsigned long size,
 				size;
 
 	if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) {
-		printk(KERN_INFO "%s:%d ioremap_change_attr failed %s "
-			"for [mem %#010Lx-%#010Lx]\n",
+		pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n",
 			current->comm, current->pid,
 			cattr_name(pcm),
 			base, (unsigned long long)(base + size-1));
@@ -729,12 +728,12 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
 	 * the type requested matches the type of first page in the range.
 	 */
 	if (is_ram) {
-		if (!pat_enabled)
+		if (!pat_enabled())
 			return 0;
 
 		pcm = lookup_memtype(paddr);
 		if (want_pcm != pcm) {
-			printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
+			pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
 				current->comm, current->pid,
 				cattr_name(want_pcm),
 				(unsigned long long)paddr,
@@ -755,13 +754,12 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
 		if (strict_prot ||
 		    !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
 			free_memtype(paddr, paddr + size);
-			printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
-				" for [mem %#010Lx-%#010Lx], got %s\n",
-				current->comm, current->pid,
-				cattr_name(want_pcm),
-				(unsigned long long)paddr,
-				(unsigned long long)(paddr + size - 1),
-				cattr_name(pcm));
+			pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
+			       current->comm, current->pid,
+			       cattr_name(want_pcm),
+			       (unsigned long long)paddr,
+			       (unsigned long long)(paddr + size - 1),
+			       cattr_name(pcm));
 			return -EINVAL;
 		}
 		/*
@@ -844,7 +842,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
 		return ret;
 	}
 
-	if (!pat_enabled)
+	if (!pat_enabled())
 		return 0;
 
 	/*
@@ -872,7 +870,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
 {
 	enum page_cache_mode pcm;
 
-	if (!pat_enabled)
+	if (!pat_enabled())
 		return 0;
 
 	/* Set prot based on lookup */
@@ -913,7 +911,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
 
 pgprot_t pgprot_writecombine(pgprot_t prot)
 {
-	if (pat_enabled)
+	if (pat_enabled())
 		return __pgprot(pgprot_val(prot) |
 				cachemode2protval(_PAGE_CACHE_MODE_WC));
 	else
@@ -996,7 +994,7 @@ static const struct file_operations memtype_fops = {
 
 static int __init pat_memtype_list_init(void)
 {
-	if (pat_enabled) {
+	if (pat_enabled()) {
 		debugfs_create_file("pat_memtype_list", S_IRUSR,
 				    arch_debugfs_dir, NULL, &memtype_fops);
 	}
diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h
index f6411620305d89..a739bfc40690b7 100644
--- a/arch/x86/mm/pat_internal.h
+++ b/arch/x86/mm/pat_internal.h
@@ -4,7 +4,7 @@
 extern int pat_debug_enable;
 
 #define dprintk(fmt, arg...) \
-	do { if (pat_debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
+	do { if (pat_debug_enable) pr_info("x86/PAT: " fmt, ##arg); } while (0)
 
 struct memtype {
 	u64			start;
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index 6582adcc8bd935..63931080366aaa 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -160,9 +160,9 @@ success:
 	return 0;
 
 failure:
-	printk(KERN_INFO "%s:%d conflicting memory types "
-		"%Lx-%Lx %s<->%s\n", current->comm, current->pid, start,
-		end, cattr_name(found_type), cattr_name(match->type));
+	pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
+		current->comm, current->pid, start, end,
+		cattr_name(found_type), cattr_name(match->type));
 	return -EBUSY;
 }
 
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 0b97d2c75df3d5..fb0a9dd1d6e46f 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -563,16 +563,31 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
 }
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+/**
+ * pud_set_huge - setup kernel PUD mapping
+ *
+ * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
+ * function sets up a huge page only if any of the following conditions are met:
+ *
+ * - MTRRs are disabled, or
+ *
+ * - MTRRs are enabled and the range is completely covered by a single MTRR, or
+ *
+ * - MTRRs are enabled and the corresponding MTRR memory type is WB, which
+ *   has no effect on the requested PAT memory type.
+ *
+ * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
+ * page mapping attempt fails.
+ *
+ * Returns 1 on success and 0 on failure.
+ */
 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
 {
-	u8 mtrr;
+	u8 mtrr, uniform;
 
-	/*
-	 * Do not use a huge page when the range is covered by non-WB type
-	 * of MTRRs.
-	 */
-	mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE);
-	if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
+	mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
+	if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
+	    (mtrr != MTRR_TYPE_WRBACK))
 		return 0;
 
 	prot = pgprot_4k_2_large(prot);
@@ -584,17 +599,24 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
 	return 1;
 }
 
+/**
+ * pmd_set_huge - setup kernel PMD mapping
+ *
+ * See text over pud_set_huge() above.
+ *
+ * Returns 1 on success and 0 on failure.
+ */
 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
 {
-	u8 mtrr;
+	u8 mtrr, uniform;
 
-	/*
-	 * Do not use a huge page when the range is covered by non-WB type
-	 * of MTRRs.
-	 */
-	mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE);
-	if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
+	mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
+	if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
+	    (mtrr != MTRR_TYPE_WRBACK)) {
+		pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
+			     __func__, addr, addr + PMD_SIZE);
 		return 0;
+	}
 
 	prot = pgprot_4k_2_large(prot);
 
@@ -605,6 +627,11 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
 	return 1;
 }
 
+/**
+ * pud_clear_huge - clear kernel PUD mapping when it is set
+ *
+ * Returns 1 on success and 0 on failure (no PUD map is found).
+ */
 int pud_clear_huge(pud_t *pud)
 {
 	if (pud_large(*pud)) {
@@ -615,6 +642,11 @@ int pud_clear_huge(pud_t *pud)
 	return 0;
 }
 
+/**
+ * pmd_clear_huge - clear kernel PMD mapping when it is set
+ *
+ * Returns 1 on success and 0 on failure (no PMD map is found).
+ */
 int pmd_clear_huge(pmd_t *pmd)
 {
 	if (pmd_large(*pmd)) {
diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S
index 6440221ced0d49..4093216b379131 100644
--- a/arch/x86/net/bpf_jit.S
+++ b/arch/x86/net/bpf_jit.S
@@ -8,7 +8,6 @@
  * of the License.
  */
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 
 /*
  * Calling convention :
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 349c0d32cc0b14..0a9f2caf358ff7 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -429,12 +429,12 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
  	 * Caller can followup with UC MINUS request and add a WC mtrr if there
  	 * is a free mtrr slot.
  	 */
-	if (!pat_enabled && write_combine)
+	if (!pat_enabled() && write_combine)
 		return -EINVAL;
 
-	if (pat_enabled && write_combine)
+	if (pat_enabled() && write_combine)
 		prot |= cachemode2protval(_PAGE_CACHE_MODE_WC);
-	else if (pat_enabled || boot_cpu_data.x86 > 3)
+	else if (pat_enabled() || boot_cpu_data.x86 > 3)
 		/*
 		 * ioremap() and ioremap_nocache() defaults to UC MINUS for now.
 		 * To avoid attribute conflicts, request UC MINUS here
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
index 852aa4c92da027..27062303c88135 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -208,6 +208,7 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
 
 static int intel_mid_pci_irq_enable(struct pci_dev *dev)
 {
+	struct irq_alloc_info info;
 	int polarity;
 
 	if (dev->irq_managed && dev->irq > 0)
@@ -217,14 +218,13 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
 		polarity = 0; /* active high */
 	else
 		polarity = 1; /* active low */
+	ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity);
 
 	/*
 	 * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to
 	 * IOAPIC RTE entries, so we just enable RTE for the device.
 	 */
-	if (mp_set_gsi_attr(dev->irq, 1, polarity, dev_to_node(&dev->dev)))
-		return -EBUSY;
-	if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC) < 0)
+	if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC, &info) < 0)
 		return -EBUSY;
 
 	dev->irq_managed = 1;
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 5dc6ca5e174131..9bd11548474570 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -146,19 +146,20 @@ static void __init pirq_peer_trick(void)
 
 /*
  *  Code for querying and setting of IRQ routes on various interrupt routers.
+ *  PIC Edge/Level Control Registers (ELCR) 0x4d0 & 0x4d1.
  */
 
-void eisa_set_level_irq(unsigned int irq)
+void elcr_set_level_irq(unsigned int irq)
 {
 	unsigned char mask = 1 << (irq & 7);
 	unsigned int port = 0x4d0 + (irq >> 3);
 	unsigned char val;
-	static u16 eisa_irq_mask;
+	static u16 elcr_irq_mask;
 
-	if (irq >= 16 || (1 << irq) & eisa_irq_mask)
+	if (irq >= 16 || (1 << irq) & elcr_irq_mask)
 		return;
 
-	eisa_irq_mask |= (1 << irq);
+	elcr_irq_mask |= (1 << irq);
 	printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq);
 	val = inb(port);
 	if (!(val & mask)) {
@@ -965,11 +966,11 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
 	} else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
 	((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) {
 		msg = "found";
-		eisa_set_level_irq(irq);
+		elcr_set_level_irq(irq);
 	} else if (newirq && r->set &&
 		(dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
 		if (r->set(pirq_router_dev, dev, pirq, newirq)) {
-			eisa_set_level_irq(newirq);
+			elcr_set_level_irq(newirq);
 			msg = "assigned";
 			irq = newirq;
 		}
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
index a62e0be3a2f1b4..f1a6c8e86ddd92 100644
--- a/arch/x86/platform/Makefile
+++ b/arch/x86/platform/Makefile
@@ -1,4 +1,5 @@
 # Platform specific code goes here
+obj-y	+= atom/
 obj-y	+= ce4100/
 obj-y	+= efi/
 obj-y	+= geode/
diff --git a/arch/x86/platform/atom/Makefile b/arch/x86/platform/atom/Makefile
new file mode 100644
index 00000000000000..0a3a40cbc794ac
--- /dev/null
+++ b/arch/x86/platform/atom/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_PUNIT_ATOM_DEBUG) += punit_atom_debug.o
diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c
new file mode 100644
index 00000000000000..5ca8ead915795f
--- /dev/null
+++ b/arch/x86/platform/atom/punit_atom_debug.c
@@ -0,0 +1,183 @@
+/*
+ * Intel SOC Punit device state debug driver
+ * Punit controls power management for North Complex devices (Graphics
+ * blocks, Image Signal Processing, video processing, display, DSP etc.)
+ *
+ * Copyright (c) 2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/io.h>
+#include <asm/cpu_device_id.h>
+#include <asm/iosf_mbi.h>
+
+/* Side band Interface port */
+#define PUNIT_PORT		0x04
+/* Power gate status reg */
+#define PWRGT_STATUS		0x61
+/* Subsystem config/status Video processor */
+#define VED_SS_PM0		0x32
+/* Subsystem config/status ISP (Image Signal Processor) */
+#define ISP_SS_PM0		0x39
+/* Subsystem config/status Input/output controller */
+#define MIO_SS_PM		0x3B
+/* Shift bits for getting status for video, isp and i/o */
+#define SSS_SHIFT		24
+/* Shift bits for getting status for graphics rendering */
+#define RENDER_POS		0
+/* Shift bits for getting status for media control */
+#define MEDIA_POS		2
+/* Shift bits for getting status for Valley View/Baytrail display */
+#define VLV_DISPLAY_POS		6
+/* Subsystem config/status display for Cherry Trail SOC */
+#define CHT_DSP_SSS		0x36
+/* Shift bits for getting status for display */
+#define CHT_DSP_SSS_POS		16
+
+struct punit_device {
+	char *name;
+	int reg;
+	int sss_pos;
+};
+
+static const struct punit_device punit_device_byt[] = {
+	{ "GFX RENDER",	PWRGT_STATUS,	RENDER_POS },
+	{ "GFX MEDIA",	PWRGT_STATUS,	MEDIA_POS },
+	{ "DISPLAY",	PWRGT_STATUS,	VLV_DISPLAY_POS },
+	{ "VED",	VED_SS_PM0,	SSS_SHIFT },
+	{ "ISP",	ISP_SS_PM0,	SSS_SHIFT },
+	{ "MIO",	MIO_SS_PM,	SSS_SHIFT },
+	{ NULL }
+};
+
+static const struct punit_device punit_device_cht[] = {
+	{ "GFX RENDER",	PWRGT_STATUS,	RENDER_POS },
+	{ "GFX MEDIA",	PWRGT_STATUS,	MEDIA_POS },
+	{ "DISPLAY",	CHT_DSP_SSS,	CHT_DSP_SSS_POS },
+	{ "VED",	VED_SS_PM0,	SSS_SHIFT },
+	{ "ISP",	ISP_SS_PM0,	SSS_SHIFT },
+	{ "MIO",	MIO_SS_PM,	SSS_SHIFT },
+	{ NULL }
+};
+
+static const char * const dstates[] = {"D0", "D0i1", "D0i2", "D0i3"};
+
+static int punit_dev_state_show(struct seq_file *seq_file, void *unused)
+{
+	u32 punit_pwr_status;
+	struct punit_device *punit_devp = seq_file->private;
+	int index;
+	int status;
+
+	seq_puts(seq_file, "\n\nPUNIT NORTH COMPLEX DEVICES :\n");
+	while (punit_devp->name) {
+		status = iosf_mbi_read(PUNIT_PORT, BT_MBI_PMC_READ,
+				       punit_devp->reg,
+				       &punit_pwr_status);
+		if (status) {
+			seq_printf(seq_file, "%9s : Read Failed\n",
+				   punit_devp->name);
+		} else  {
+			index = (punit_pwr_status >> punit_devp->sss_pos) & 3;
+			seq_printf(seq_file, "%9s : %s\n", punit_devp->name,
+				   dstates[index]);
+		}
+		punit_devp++;
+	}
+
+	return 0;
+}
+
+static int punit_dev_state_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, punit_dev_state_show, inode->i_private);
+}
+
+static const struct file_operations punit_dev_state_ops = {
+	.open		= punit_dev_state_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static struct dentry *punit_dbg_file;
+
+static int punit_dbgfs_register(struct punit_device *punit_device)
+{
+	static struct dentry *dev_state;
+
+	punit_dbg_file = debugfs_create_dir("punit_atom", NULL);
+	if (!punit_dbg_file)
+		return -ENXIO;
+
+	dev_state = debugfs_create_file("dev_power_state", S_IFREG | S_IRUGO,
+					punit_dbg_file, punit_device,
+					&punit_dev_state_ops);
+	if (!dev_state) {
+		pr_err("punit_dev_state register failed\n");
+		debugfs_remove(punit_dbg_file);
+		return -ENXIO;
+	}
+
+	return 0;
+}
+
+static void punit_dbgfs_unregister(void)
+{
+	debugfs_remove_recursive(punit_dbg_file);
+}
+
+#define ICPU(model, drv_data) \
+	{ X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT,\
+	  (kernel_ulong_t)&drv_data }
+
+static const struct x86_cpu_id intel_punit_cpu_ids[] = {
+	ICPU(55, punit_device_byt), /* Valleyview, Bay Trail */
+	ICPU(76, punit_device_cht), /* Braswell, Cherry Trail */
+	{}
+};
+
+MODULE_DEVICE_TABLE(x86cpu, intel_punit_cpu_ids);
+
+static int __init punit_atom_debug_init(void)
+{
+	const struct x86_cpu_id *id;
+	int ret;
+
+	id = x86_match_cpu(intel_punit_cpu_ids);
+	if (!id)
+		return -ENODEV;
+
+	ret = punit_dbgfs_register((struct punit_device *)id->driver_data);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+static void __exit punit_atom_debug_exit(void)
+{
+	punit_dbgfs_unregister();
+}
+
+module_init(punit_atom_debug_init);
+module_exit(punit_atom_debug_exit);
+
+MODULE_AUTHOR("Kumar P, Mahesh <mahesh.kumar.p@intel.com>");
+MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>");
+MODULE_DESCRIPTION("Driver for Punit devices states debugging");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 02744df576d525..3b984c3aa1b0b5 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -501,6 +501,8 @@ void __init efi_init(void)
 
 	if (efi_enabled(EFI_DBG))
 		print_efi_memmap();
+
+	efi_esrt_init();
 }
 
 void __init efi_late_init(void)
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c
index 0b283d4d0ad770..de734134bc8d2e 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c
@@ -27,6 +27,7 @@ static struct platform_device wdt_dev = {
 static int tangier_probe(struct platform_device *pdev)
 {
 	int gsi;
+	struct irq_alloc_info info;
 	struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data;
 
 	if (!pdata)
@@ -34,8 +35,8 @@ static int tangier_probe(struct platform_device *pdev)
 
 	/* IOAPIC builds identity mapping between GSI and IRQ on MID */
 	gsi = pdata->irq;
-	if (mp_set_gsi_attr(gsi, 1, 0, cpu_to_node(0)) ||
-	    mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC) <= 0) {
+	ioapic_set_alloc_attr(&info, cpu_to_node(0), 1, 0);
+	if (mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info) <= 0) {
 		dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n",
 			 gsi);
 		return -EINVAL;
diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c
index 3005f0c89f2ecf..01d54ea766c165 100644
--- a/arch/x86/platform/intel-mid/intel-mid.c
+++ b/arch/x86/platform/intel-mid/intel-mid.c
@@ -81,26 +81,34 @@ static unsigned long __init intel_mid_calibrate_tsc(void)
 	return 0;
 }
 
+static void __init intel_mid_setup_bp_timer(void)
+{
+	apbt_time_init();
+	setup_boot_APIC_clock();
+}
+
 static void __init intel_mid_time_init(void)
 {
 	sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
+
 	switch (intel_mid_timer_options) {
 	case INTEL_MID_TIMER_APBT_ONLY:
 		break;
 	case INTEL_MID_TIMER_LAPIC_APBT:
-		x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
+		/* Use apbt and local apic */
+		x86_init.timers.setup_percpu_clockev = intel_mid_setup_bp_timer;
 		x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
-		break;
+		return;
 	default:
 		if (!boot_cpu_has(X86_FEATURE_ARAT))
 			break;
+		/* Lapic only, no apbt */
 		x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
 		x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
 		return;
 	}
-	/* we need at least one APB timer */
-	pre_init_apic_IRQ0();
-	apbt_time_init();
+
+	x86_init.timers.setup_percpu_clockev = apbt_time_init;
 }
 
 static void intel_mid_arch_setup(void)
diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c
index c14ad34776c466..ce992e8cc06526 100644
--- a/arch/x86/platform/intel-mid/sfi.c
+++ b/arch/x86/platform/intel-mid/sfi.c
@@ -95,18 +95,16 @@ int __init sfi_parse_mtmr(struct sfi_table_header *table)
 		pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz, irq = %d\n",
 			totallen, (u32)pentry->phys_addr,
 			pentry->freq_hz, pentry->irq);
-			if (!pentry->irq)
-				continue;
-			mp_irq.type = MP_INTSRC;
-			mp_irq.irqtype = mp_INT;
-/* triggering mode edge bit 2-3, active high polarity bit 0-1 */
-			mp_irq.irqflag = 5;
-			mp_irq.srcbus = MP_BUS_ISA;
-			mp_irq.srcbusirq = pentry->irq;	/* IRQ */
-			mp_irq.dstapic = MP_APIC_ALL;
-			mp_irq.dstirq = pentry->irq;
-			mp_save_irq(&mp_irq);
-			mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC);
+		mp_irq.type = MP_INTSRC;
+		mp_irq.irqtype = mp_INT;
+		/* triggering mode edge bit 2-3, active high polarity bit 0-1 */
+		mp_irq.irqflag = 5;
+		mp_irq.srcbus = MP_BUS_ISA;
+		mp_irq.srcbusirq = pentry->irq;	/* IRQ */
+		mp_irq.dstapic = MP_APIC_ALL;
+		mp_irq.dstirq = pentry->irq;
+		mp_save_irq(&mp_irq);
+		mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC, NULL);
 	}
 
 	return 0;
@@ -177,7 +175,7 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
 		mp_irq.dstapic = MP_APIC_ALL;
 		mp_irq.dstirq = pentry->irq;
 		mp_save_irq(&mp_irq);
-		mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC);
+		mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC, NULL);
 	}
 	return 0;
 }
@@ -436,6 +434,7 @@ static int __init sfi_parse_devs(struct sfi_table_header *table)
 	struct devs_id *dev = NULL;
 	int num, i, ret;
 	int polarity;
+	struct irq_alloc_info info;
 
 	sb = (struct sfi_table_simple *)table;
 	num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry);
@@ -469,9 +468,8 @@ static int __init sfi_parse_devs(struct sfi_table_header *table)
 				polarity = 1;
 			}
 
-			ret = mp_set_gsi_attr(irq, 1, polarity, NUMA_NO_NODE);
-			if (ret == 0)
-				ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC);
+			ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, polarity);
+			ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC, &info);
 			WARN_ON(ret < 0);
 		}
 
diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c
index 2a8a74f3bd76c1..6c7111bbd1e92f 100644
--- a/arch/x86/platform/sfi/sfi.c
+++ b/arch/x86/platform/sfi/sfi.c
@@ -25,8 +25,8 @@
 #include <linux/init.h>
 #include <linux/sfi.h>
 #include <linux/io.h>
-#include <linux/irqdomain.h>
 
+#include <asm/irqdomain.h>
 #include <asm/io_apic.h>
 #include <asm/mpspec.h>
 #include <asm/setup.h>
@@ -71,9 +71,6 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table)
 #endif /* CONFIG_X86_LOCAL_APIC */
 
 #ifdef CONFIG_X86_IO_APIC
-static struct irq_domain_ops sfi_ioapic_irqdomain_ops = {
-	.map = mp_irqdomain_map,
-};
 
 static int __init sfi_parse_ioapic(struct sfi_table_header *table)
 {
@@ -82,7 +79,7 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table)
 	int i, num;
 	struct ioapic_domain_cfg cfg = {
 		.type = IOAPIC_DOMAIN_STRICT,
-		.ops = &sfi_ioapic_irqdomain_ops,
+		.ops = &mp_ioapic_irqdomain_ops,
 	};
 
 	sb = (struct sfi_table_simple *)table;
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 0ce67364543242..8570abe68be1fe 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -13,22 +13,37 @@
 #include <linux/slab.h>
 #include <linux/irq.h>
 
+#include <asm/irqdomain.h>
 #include <asm/apic.h>
 #include <asm/uv/uv_irq.h>
 #include <asm/uv/uv_hub.h>
 
 /* MMR offset and pnode of hub sourcing interrupts for a given irq */
-struct uv_irq_2_mmr_pnode{
-	struct rb_node		list;
+struct uv_irq_2_mmr_pnode {
 	unsigned long		offset;
 	int			pnode;
-	int			irq;
 };
 
-static DEFINE_SPINLOCK(uv_irq_lock);
-static struct rb_root		uv_irq_root;
+static void uv_program_mmr(struct irq_cfg *cfg, struct uv_irq_2_mmr_pnode *info)
+{
+	unsigned long mmr_value;
+	struct uv_IO_APIC_route_entry *entry;
+
+	BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
+		     sizeof(unsigned long));
+
+	mmr_value = 0;
+	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
+	entry->vector		= cfg->vector;
+	entry->delivery_mode	= apic->irq_delivery_mode;
+	entry->dest_mode	= apic->irq_dest_mode;
+	entry->polarity		= 0;
+	entry->trigger		= 0;
+	entry->mask		= 0;
+	entry->dest		= cfg->dest_apicid;
 
-static int uv_set_irq_affinity(struct irq_data *, const struct cpumask *, bool);
+	uv_write_global_mmr64(info->pnode, info->offset, mmr_value);
+}
 
 static void uv_noop(struct irq_data *data) { }
 
@@ -37,6 +52,23 @@ static void uv_ack_apic(struct irq_data *data)
 	ack_APIC_irq();
 }
 
+static int
+uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,
+		    bool force)
+{
+	struct irq_data *parent = data->parent_data;
+	struct irq_cfg *cfg = irqd_cfg(data);
+	int ret;
+
+	ret = parent->chip->irq_set_affinity(parent, mask, force);
+	if (ret >= 0) {
+		uv_program_mmr(cfg, data->chip_data);
+		send_cleanup_vector(cfg);
+	}
+
+	return ret;
+}
+
 static struct irq_chip uv_irq_chip = {
 	.name			= "UV-CORE",
 	.irq_mask		= uv_noop,
@@ -45,189 +77,99 @@ static struct irq_chip uv_irq_chip = {
 	.irq_set_affinity	= uv_set_irq_affinity,
 };
 
-/*
- * Add offset and pnode information of the hub sourcing interrupts to the
- * rb tree for a specific irq.
- */
-static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade)
+static int uv_domain_alloc(struct irq_domain *domain, unsigned int virq,
+			   unsigned int nr_irqs, void *arg)
 {
-	struct rb_node **link = &uv_irq_root.rb_node;
-	struct rb_node *parent = NULL;
-	struct uv_irq_2_mmr_pnode *n;
-	struct uv_irq_2_mmr_pnode *e;
-	unsigned long irqflags;
-
-	n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL,
-				uv_blade_to_memory_nid(blade));
-	if (!n)
+	struct uv_irq_2_mmr_pnode *chip_data;
+	struct irq_alloc_info *info = arg;
+	struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
+	int ret;
+
+	if (nr_irqs > 1 || !info || info->type != X86_IRQ_ALLOC_TYPE_UV)
+		return -EINVAL;
+
+	chip_data = kmalloc_node(sizeof(*chip_data), GFP_KERNEL,
+				 irq_data->node);
+	if (!chip_data)
 		return -ENOMEM;
 
-	n->irq = irq;
-	n->offset = offset;
-	n->pnode = uv_blade_to_pnode(blade);
-	spin_lock_irqsave(&uv_irq_lock, irqflags);
-	/* Find the right place in the rbtree: */
-	while (*link) {
-		parent = *link;
-		e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list);
-
-		if (unlikely(irq == e->irq)) {
-			/* irq entry exists */
-			e->pnode = uv_blade_to_pnode(blade);
-			e->offset = offset;
-			spin_unlock_irqrestore(&uv_irq_lock, irqflags);
-			kfree(n);
-			return 0;
-		}
-
-		if (irq < e->irq)
-			link = &(*link)->rb_left;
+	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+	if (ret >= 0) {
+		if (info->uv_limit == UV_AFFINITY_CPU)
+			irq_set_status_flags(virq, IRQ_NO_BALANCING);
 		else
-			link = &(*link)->rb_right;
+			irq_set_status_flags(virq, IRQ_MOVE_PCNTXT);
+
+		chip_data->pnode = uv_blade_to_pnode(info->uv_blade);
+		chip_data->offset = info->uv_offset;
+		irq_domain_set_info(domain, virq, virq, &uv_irq_chip, chip_data,
+				    handle_percpu_irq, NULL, info->uv_name);
+	} else {
+		kfree(chip_data);
 	}
 
-	/* Insert the node into the rbtree. */
-	rb_link_node(&n->list, parent, link);
-	rb_insert_color(&n->list, &uv_irq_root);
-
-	spin_unlock_irqrestore(&uv_irq_lock, irqflags);
-	return 0;
+	return ret;
 }
 
-/* Retrieve offset and pnode information from the rb tree for a specific irq */
-int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
+static void uv_domain_free(struct irq_domain *domain, unsigned int virq,
+			   unsigned int nr_irqs)
 {
-	struct uv_irq_2_mmr_pnode *e;
-	struct rb_node *n;
-	unsigned long irqflags;
-
-	spin_lock_irqsave(&uv_irq_lock, irqflags);
-	n = uv_irq_root.rb_node;
-	while (n) {
-		e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
-
-		if (e->irq == irq) {
-			*offset = e->offset;
-			*pnode = e->pnode;
-			spin_unlock_irqrestore(&uv_irq_lock, irqflags);
-			return 0;
-		}
-
-		if (irq < e->irq)
-			n = n->rb_left;
-		else
-			n = n->rb_right;
-	}
-	spin_unlock_irqrestore(&uv_irq_lock, irqflags);
-	return -1;
+	struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
+
+	BUG_ON(nr_irqs != 1);
+	kfree(irq_data->chip_data);
+	irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT);
+	irq_clear_status_flags(virq, IRQ_NO_BALANCING);
+	irq_domain_free_irqs_top(domain, virq, nr_irqs);
 }
 
 /*
  * Re-target the irq to the specified CPU and enable the specified MMR located
  * on the specified blade to allow the sending of MSIs to the specified CPU.
  */
-static int
-arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
-		       unsigned long mmr_offset, int limit)
+static void uv_domain_activate(struct irq_domain *domain,
+			       struct irq_data *irq_data)
 {
-	const struct cpumask *eligible_cpu = cpumask_of(cpu);
-	struct irq_cfg *cfg = irq_cfg(irq);
-	unsigned long mmr_value;
-	struct uv_IO_APIC_route_entry *entry;
-	int mmr_pnode, err;
-	unsigned int dest;
-
-	BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
-			sizeof(unsigned long));
-
-	err = assign_irq_vector(irq, cfg, eligible_cpu);
-	if (err != 0)
-		return err;
-
-	err = apic->cpu_mask_to_apicid_and(eligible_cpu, eligible_cpu, &dest);
-	if (err != 0)
-		return err;
-
-	if (limit == UV_AFFINITY_CPU)
-		irq_set_status_flags(irq, IRQ_NO_BALANCING);
-	else
-		irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
-
-	irq_set_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
-				      irq_name);
-
-	mmr_value = 0;
-	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
-	entry->vector		= cfg->vector;
-	entry->delivery_mode	= apic->irq_delivery_mode;
-	entry->dest_mode	= apic->irq_dest_mode;
-	entry->polarity		= 0;
-	entry->trigger		= 0;
-	entry->mask		= 0;
-	entry->dest		= dest;
-
-	mmr_pnode = uv_blade_to_pnode(mmr_blade);
-	uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
-
-	if (cfg->move_in_progress)
-		send_cleanup_vector(cfg);
-
-	return irq;
+	uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data);
 }
 
 /*
  * Disable the specified MMR located on the specified blade so that MSIs are
  * longer allowed to be sent.
  */
-static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset)
+static void uv_domain_deactivate(struct irq_domain *domain,
+				 struct irq_data *irq_data)
 {
 	unsigned long mmr_value;
 	struct uv_IO_APIC_route_entry *entry;
 
-	BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
-			sizeof(unsigned long));
-
 	mmr_value = 0;
 	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
 	entry->mask = 1;
-
-	uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+	uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data);
 }
 
-static int
-uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,
-		    bool force)
-{
-	struct irq_cfg *cfg = irqd_cfg(data);
-	unsigned int dest;
-	unsigned long mmr_value, mmr_offset;
-	struct uv_IO_APIC_route_entry *entry;
-	int mmr_pnode;
-
-	if (apic_set_affinity(data, mask, &dest))
-		return -1;
-
-	mmr_value = 0;
-	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
-
-	entry->vector		= cfg->vector;
-	entry->delivery_mode	= apic->irq_delivery_mode;
-	entry->dest_mode	= apic->irq_dest_mode;
-	entry->polarity		= 0;
-	entry->trigger		= 0;
-	entry->mask		= 0;
-	entry->dest		= dest;
-
-	/* Get previously stored MMR and pnode of hub sourcing interrupts */
-	if (uv_irq_2_mmr_info(data->irq, &mmr_offset, &mmr_pnode))
-		return -1;
-
-	uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+static const struct irq_domain_ops uv_domain_ops = {
+	.alloc		= uv_domain_alloc,
+	.free		= uv_domain_free,
+	.activate	= uv_domain_activate,
+	.deactivate	= uv_domain_deactivate,
+};
 
-	if (cfg->move_in_progress)
-		send_cleanup_vector(cfg);
+static struct irq_domain *uv_get_irq_domain(void)
+{
+	static struct irq_domain *uv_domain;
+	static DEFINE_MUTEX(uv_lock);
+
+	mutex_lock(&uv_lock);
+	if (uv_domain == NULL) {
+		uv_domain = irq_domain_add_tree(NULL, &uv_domain_ops, NULL);
+		if (uv_domain)
+			uv_domain->parent = x86_vector_domain;
+	}
+	mutex_unlock(&uv_lock);
 
-	return IRQ_SET_MASK_OK_NOCOPY;
+	return uv_domain;
 }
 
 /*
@@ -238,19 +180,21 @@ uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,
 int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
 		 unsigned long mmr_offset, int limit)
 {
-	int ret, irq = irq_alloc_hwirq(uv_blade_to_memory_nid(mmr_blade));
+	struct irq_alloc_info info;
+	struct irq_domain *domain = uv_get_irq_domain();
 
-	if (!irq)
-		return -EBUSY;
+	if (!domain)
+		return -ENOMEM;
 
-	ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
-		limit);
-	if (ret == irq)
-		uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
-	else
-		irq_free_hwirq(irq);
+	init_irq_alloc_info(&info, cpumask_of(cpu));
+	info.type = X86_IRQ_ALLOC_TYPE_UV;
+	info.uv_limit = limit;
+	info.uv_blade = mmr_blade;
+	info.uv_offset = mmr_offset;
+	info.uv_name = irq_name;
 
-	return ret;
+	return irq_domain_alloc_irqs(domain, 1,
+				     uv_blade_to_memory_nid(mmr_blade), &info);
 }
 EXPORT_SYMBOL_GPL(uv_setup_irq);
 
@@ -263,26 +207,6 @@ EXPORT_SYMBOL_GPL(uv_setup_irq);
  */
 void uv_teardown_irq(unsigned int irq)
 {
-	struct uv_irq_2_mmr_pnode *e;
-	struct rb_node *n;
-	unsigned long irqflags;
-
-	spin_lock_irqsave(&uv_irq_lock, irqflags);
-	n = uv_irq_root.rb_node;
-	while (n) {
-		e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
-		if (e->irq == irq) {
-			arch_disable_uv_irq(e->pnode, e->offset);
-			rb_erase(n, &uv_irq_root);
-			kfree(e);
-			break;
-		}
-		if (irq < e->irq)
-			n = n->rb_left;
-		else
-			n = n->rb_right;
-	}
-	spin_unlock_irqrestore(&uv_irq_lock, irqflags);
-	irq_free_hwirq(irq);
+	irq_domain_free_irqs(irq, 1);
 }
 EXPORT_SYMBOL_GPL(uv_teardown_irq);
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 757678fb26e1a0..0d7dd1f5ac36fa 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -18,10 +18,9 @@
 #include <asm/mtrr.h>
 #include <asm/page.h>
 #include <asm/mce.h>
-#include <asm/xcr.h>
 #include <asm/suspend.h>
+#include <asm/fpu/internal.h>
 #include <asm/debugreg.h>
-#include <asm/fpu-internal.h> /* pcntxt_mask */
 #include <asm/cpu.h>
 
 #ifdef CONFIG_X86_32
@@ -155,6 +154,8 @@ static void fix_processor_context(void)
 #endif
 	load_TR_desc();				/* This does ltr */
 	load_LDT(&current->active_mm->context);	/* This does lldt */
+
+	fpu__resume_cpu();
 }
 
 /**
@@ -221,12 +222,6 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
 	wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
 #endif
 
-	/*
-	 * restore XCR0 for xsave capable cpu's.
-	 */
-	if (cpu_has_xsave)
-		xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
-
 	fix_processor_context();
 
 	do_fpu_end();
diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S
index 3c4469a7a929c7..e2386cb4e0c315 100644
--- a/arch/x86/power/hibernate_asm_64.S
+++ b/arch/x86/power/hibernate_asm_64.S
@@ -78,9 +78,9 @@ ENTRY(restore_image)
 
 	/* code below has been relocated to a safe page */
 ENTRY(core_restore_code)
-loop:
+.Lloop:
 	testq	%rdx, %rdx
-	jz	done
+	jz	.Ldone
 
 	/* get addresses from the pbe and copy the page */
 	movq	pbe_address(%rdx), %rsi
@@ -91,8 +91,8 @@ loop:
 
 	/* progress to the next pbe */
 	movq	pbe_next(%rdx), %rdx
-	jmp	loop
-done:
+	jmp	.Lloop
+.Ldone:
 	/* jump to the restore_registers address from the image header */
 	jmpq	*%rax
 	/*
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 7e8a1a65043561..b9531d34313491 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -39,7 +39,8 @@
 #define smp_mb()	barrier()
 #define smp_rmb()	barrier()
 #define smp_wmb()	barrier()
-#define set_mb(var, value) do { var = value; barrier(); } while (0)
+
+#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0)
 
 #define read_barrier_depends()		do { } while (0)
 #define smp_read_barrier_depends()	do { } while (0)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 46957ead3060ee..37058679d14838 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1181,10 +1181,11 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
 	.read_tscp = native_read_tscp,
 
 	.iret = xen_iret,
-	.irq_enable_sysexit = xen_sysexit,
 #ifdef CONFIG_X86_64
 	.usergs_sysret32 = xen_sysret32,
 	.usergs_sysret64 = xen_sysret64,
+#else
+	.irq_enable_sysexit = xen_sysexit,
 #endif
 
 	.load_tr_desc = paravirt_nop,
@@ -1423,7 +1424,7 @@ static void xen_pvh_set_cr_flags(int cpu)
 		return;
 	/*
 	 * For BSP, PSE PGE are set in probe_page_size_mask(), for APs
-	 * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu_init.
+	 * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu__init_cpu().
 	*/
 	if (cpu_has_pse)
 		cr4_set_bits_and_update_boot(X86_CR4_PSE);
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 956374c1edbc31..9e2ba5c6e1dd7b 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -17,6 +17,56 @@
 #include "xen-ops.h"
 #include "debugfs.h"
 
+static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
+static DEFINE_PER_CPU(char *, irq_name);
+static bool xen_pvspin = true;
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+
+#include <asm/qspinlock.h>
+
+static void xen_qlock_kick(int cpu)
+{
+	xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
+}
+
+/*
+ * Halt the current CPU & release it back to the host
+ */
+static void xen_qlock_wait(u8 *byte, u8 val)
+{
+	int irq = __this_cpu_read(lock_kicker_irq);
+
+	/* If kicker interrupts not initialized yet, just spin */
+	if (irq == -1)
+		return;
+
+	/* clear pending */
+	xen_clear_irq_pending(irq);
+	barrier();
+
+	/*
+	 * We check the byte value after clearing pending IRQ to make sure
+	 * that we won't miss a wakeup event because of the clearing.
+	 *
+	 * The sync_clear_bit() call in xen_clear_irq_pending() is atomic.
+	 * So it is effectively a memory barrier for x86.
+	 */
+	if (READ_ONCE(*byte) != val)
+		return;
+
+	/*
+	 * If an interrupt happens here, it will leave the wakeup irq
+	 * pending, which will cause xen_poll_irq() to return
+	 * immediately.
+	 */
+
+	/* Block until irq becomes pending (or perhaps a spurious wakeup) */
+	xen_poll_irq(irq);
+}
+
+#else /* CONFIG_QUEUED_SPINLOCKS */
+
 enum xen_contention_stat {
 	TAKEN_SLOW,
 	TAKEN_SLOW_PICKUP,
@@ -100,12 +150,9 @@ struct xen_lock_waiting {
 	__ticket_t want;
 };
 
-static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
-static DEFINE_PER_CPU(char *, irq_name);
 static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting);
 static cpumask_t waiting_cpus;
 
-static bool xen_pvspin = true;
 __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
 {
 	int irq = __this_cpu_read(lock_kicker_irq);
@@ -217,6 +264,7 @@ static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next)
 		}
 	}
 }
+#endif /* CONFIG_QUEUED_SPINLOCKS */
 
 static irqreturn_t dummy_handler(int irq, void *dev_id)
 {
@@ -280,8 +328,16 @@ void __init xen_init_spinlocks(void)
 		return;
 	}
 	printk(KERN_DEBUG "xen: PV spinlocks enabled\n");
+#ifdef CONFIG_QUEUED_SPINLOCKS
+	__pv_init_lock_hash();
+	pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+	pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
+	pv_lock_ops.wait = xen_qlock_wait;
+	pv_lock_ops.kick = xen_qlock_kick;
+#else
 	pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning);
 	pv_lock_ops.unlock_kick = xen_unlock_kick;
+#endif
 }
 
 /*
@@ -310,7 +366,7 @@ static __init int xen_parse_nopvspin(char *arg)
 }
 early_param("xen_nopvspin", xen_parse_nopvspin);
 
-#ifdef CONFIG_XEN_DEBUG_FS
+#if defined(CONFIG_XEN_DEBUG_FS) && !defined(CONFIG_QUEUED_SPINLOCKS)
 
 static struct dentry *d_spin_debug;
 
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 985fc3ee0973c8..04529e620559fa 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -15,6 +15,8 @@
 #include <asm/percpu.h>
 #include <asm/processor-flags.h>
 #include <asm/segment.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
 
 #include <xen/interface/xen.h>
 
@@ -47,29 +49,13 @@ ENTRY(xen_iret)
 ENDPATCH(xen_iret)
 RELOC(xen_iret, 1b+1)
 
-/*
- * sysexit is not used for 64-bit processes, so it's only ever used to
- * return to 32-bit compat userspace.
- */
-ENTRY(xen_sysexit)
-	pushq $__USER32_DS
-	pushq %rcx
-	pushq $X86_EFLAGS_IF
-	pushq $__USER32_CS
-	pushq %rdx
-
-	pushq $0
-1:	jmp hypercall_iret
-ENDPATCH(xen_sysexit)
-RELOC(xen_sysexit, 1b+1)
-
 ENTRY(xen_sysret64)
 	/*
 	 * We're already on the usermode stack at this point, but
 	 * still with the kernel gs, so we can easily switch back
 	 */
 	movq %rsp, PER_CPU_VAR(rsp_scratch)
-	movq PER_CPU_VAR(kernel_stack), %rsp
+	movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 	pushq $__USER_DS
 	pushq PER_CPU_VAR(rsp_scratch)
@@ -88,7 +74,7 @@ ENTRY(xen_sysret32)
 	 * still with the kernel gs, so we can easily switch back
 	 */
 	movq %rsp, PER_CPU_VAR(rsp_scratch)
-	movq PER_CPU_VAR(kernel_stack), %rsp
+	movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 	pushq $__USER32_DS
 	pushq PER_CPU_VAR(rsp_scratch)
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9e195c683549dc..c20fe29e65f48b 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -134,7 +134,9 @@ DECL_ASM(void, xen_restore_fl_direct, unsigned long);
 
 /* These are not functions, and cannot be called normally */
 __visible void xen_iret(void);
+#ifdef CONFIG_X86_32
 __visible void xen_sysexit(void);
+#endif
 __visible void xen_sysret32(void);
 __visible void xen_sysret64(void);
 __visible void xen_adjust_exception_frame(void);
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
index 9e3571a6535c3b..83a44a33cfa112 100644
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -15,10 +15,10 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/hardirq.h>
+#include <linux/uaccess.h>
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
 #include <asm/hardirq.h>
-#include <asm/uaccess.h>
 #include <asm/pgalloc.h>
 
 DEFINE_PER_CPU(unsigned long, asid_cache) = ASID_USER_FIRST;
@@ -57,7 +57,7 @@ void do_page_fault(struct pt_regs *regs)
 	/* If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm) {
+	if (faulthandler_disabled() || !mm) {
 		bad_page_fault(regs, address, SIGSEGV);
 		return;
 	}
diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c
index 8cfb71ec093736..184ceadccc1a3a 100644
--- a/arch/xtensa/mm/highmem.c
+++ b/arch/xtensa/mm/highmem.c
@@ -42,6 +42,7 @@ void *kmap_atomic(struct page *page)
 	enum fixed_addresses idx;
 	unsigned long vaddr;
 
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -79,6 +80,7 @@ void __kunmap_atomic(void *kvaddr)
 	}
 
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
 
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 5f13f4d0bcceda..1e28ddb656b891 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -24,7 +24,7 @@ static int get_first_sibling(unsigned int cpu)
 {
 	unsigned int ret;
 
-	ret = cpumask_first(topology_thread_cpumask(cpu));
+	ret = cpumask_first(topology_sibling_cpumask(cpu));
 	if (ret < nr_cpu_ids)
 		return ret;
 
diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c
index 6bc9cbc01ad6a3..00b39802d7ecf5 100644
--- a/drivers/acpi/acpi_pad.c
+++ b/drivers/acpi/acpi_pad.c
@@ -105,7 +105,7 @@ static void round_robin_cpu(unsigned int tsk_index)
 	mutex_lock(&round_robin_lock);
 	cpumask_clear(tmp);
 	for_each_cpu(cpu, pad_busy_cpus)
-		cpumask_or(tmp, tmp, topology_thread_cpumask(cpu));
+		cpumask_or(tmp, tmp, topology_sibling_cpumask(cpu));
 	cpumask_andnot(tmp, cpu_online_mask, tmp);
 	/* avoid HT sibilings if possible */
 	if (cpumask_empty(tmp))
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index 6491f45200a786..8b7d7f8e585184 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -61,7 +61,7 @@ static DEVICE_ATTR_RO(physical_package_id);
 define_id_show_func(core_id);
 static DEVICE_ATTR_RO(core_id);
 
-define_siblings_show_func(thread_siblings, thread_cpumask);
+define_siblings_show_func(thread_siblings, sibling_cpumask);
 static DEVICE_ATTR_RO(thread_siblings);
 static DEVICE_ATTR_RO(thread_siblings_list);
 
diff --git a/drivers/char/hw_random/via-rng.c b/drivers/char/hw_random/via-rng.c
index a3bebef255ad3a..0c98a9d51a2494 100644
--- a/drivers/char/hw_random/via-rng.c
+++ b/drivers/char/hw_random/via-rng.c
@@ -33,7 +33,7 @@
 #include <asm/io.h>
 #include <asm/msr.h>
 #include <asm/cpufeature.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 
 
 
diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
index d965cb4f690b8f..60d7d3d204d327 100644
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -106,6 +106,16 @@ config CLKSRC_EFM32
 	  Support to use the timers of EFM32 SoCs as clock source and clock
 	  event device.
 
+config CLKSRC_LPC32XX
+	bool
+	select CLKSRC_MMIO
+	select CLKSRC_OF
+
+config CLKSRC_STM32
+	bool "Clocksource for STM32 SoCs" if COMPILE_TEST
+	depends on OF
+	select CLKSRC_MMIO
+
 config ARM_ARCH_TIMER
 	bool
 	select CLKSRC_OF if OF
@@ -145,6 +155,13 @@ config CLKSRC_ARM_GLOBAL_TIMER_SCHED_CLOCK
 	help
 	 Use ARM global timer clock source as sched_clock
 
+config ARMV7M_SYSTICK
+	bool
+	select CLKSRC_OF if OF
+	select CLKSRC_MMIO
+	help
+	  This options enables support for the ARMv7M system timer unit
+
 config ATMEL_PIT
 	select CLKSRC_OF if OF
 	def_bool SOC_AT91SAM9 || SOC_SAMA5
diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile
index 12ec793f7fff7d..f1ae0e78653ae7 100644
--- a/drivers/clocksource/Makefile
+++ b/drivers/clocksource/Makefile
@@ -36,7 +36,9 @@ obj-$(CONFIG_ARCH_NSPIRE)	+= zevio-timer.o
 obj-$(CONFIG_ARCH_BCM_MOBILE)	+= bcm_kona_timer.o
 obj-$(CONFIG_CADENCE_TTC_TIMER)	+= cadence_ttc_timer.o
 obj-$(CONFIG_CLKSRC_EFM32)	+= time-efm32.o
+obj-$(CONFIG_CLKSRC_STM32)	+= timer-stm32.o
 obj-$(CONFIG_CLKSRC_EXYNOS_MCT)	+= exynos_mct.o
+obj-$(CONFIG_CLKSRC_LPC32XX)	+= time-lpc32xx.o
 obj-$(CONFIG_CLKSRC_SAMSUNG_PWM)	+= samsung_pwm_timer.o
 obj-$(CONFIG_FSL_FTM_TIMER)	+= fsl_ftm_timer.o
 obj-$(CONFIG_VF_PIT_TIMER)	+= vf_pit_timer.o
@@ -46,6 +48,7 @@ obj-$(CONFIG_MTK_TIMER)		+= mtk_timer.o
 obj-$(CONFIG_ARM_ARCH_TIMER)		+= arm_arch_timer.o
 obj-$(CONFIG_ARM_GLOBAL_TIMER)		+= arm_global_timer.o
 obj-$(CONFIG_ARM_TIMER_SP804)		+= timer-sp804.o
+obj-$(CONFIG_ARMV7M_SYSTICK)		+= armv7m_systick.o
 obj-$(CONFIG_CLKSRC_METAG_GENERIC)	+= metag_generic.o
 obj-$(CONFIG_ARCH_HAS_TICK_BROADCAST)	+= dummy_timer.o
 obj-$(CONFIG_ARCH_KEYSTONE)		+= timer-keystone.o
diff --git a/drivers/clocksource/armv7m_systick.c b/drivers/clocksource/armv7m_systick.c
new file mode 100644
index 00000000000000..addfd2c64f54ef
--- /dev/null
+++ b/drivers/clocksource/armv7m_systick.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) Maxime Coquelin 2015
+ * Author:  Maxime Coquelin <mcoquelin.stm32@gmail.com>
+ * License terms:  GNU General Public License (GPL), version 2
+ */
+
+#include <linux/kernel.h>
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/clk.h>
+#include <linux/bitops.h>
+
+#define SYST_CSR	0x00
+#define SYST_RVR	0x04
+#define SYST_CVR	0x08
+#define SYST_CALIB	0x0c
+
+#define SYST_CSR_ENABLE BIT(0)
+
+#define SYSTICK_LOAD_RELOAD_MASK 0x00FFFFFF
+
+static void __init system_timer_of_register(struct device_node *np)
+{
+	struct clk *clk = NULL;
+	void __iomem *base;
+	u32 rate;
+	int ret;
+
+	base = of_iomap(np, 0);
+	if (!base) {
+		pr_warn("system-timer: invalid base address\n");
+		return;
+	}
+
+	ret = of_property_read_u32(np, "clock-frequency", &rate);
+	if (ret) {
+		clk = of_clk_get(np, 0);
+		if (IS_ERR(clk))
+			goto out_unmap;
+
+		ret = clk_prepare_enable(clk);
+		if (ret)
+			goto out_clk_put;
+
+		rate = clk_get_rate(clk);
+		if (!rate)
+			goto out_clk_disable;
+	}
+
+	writel_relaxed(SYSTICK_LOAD_RELOAD_MASK, base + SYST_RVR);
+	writel_relaxed(SYST_CSR_ENABLE, base + SYST_CSR);
+
+	ret = clocksource_mmio_init(base + SYST_CVR, "arm_system_timer", rate,
+			200, 24, clocksource_mmio_readl_down);
+	if (ret) {
+		pr_err("failed to init clocksource (%d)\n", ret);
+		if (clk)
+			goto out_clk_disable;
+		else
+			goto out_unmap;
+	}
+
+	pr_info("ARM System timer initialized as clocksource\n");
+
+	return;
+
+out_clk_disable:
+	clk_disable_unprepare(clk);
+out_clk_put:
+	clk_put(clk);
+out_unmap:
+	iounmap(base);
+	pr_warn("ARM System timer register failed (%d)\n", ret);
+}
+
+CLOCKSOURCE_OF_DECLARE(arm_systick, "arm,armv7m-systick",
+			system_timer_of_register);
diff --git a/drivers/clocksource/asm9260_timer.c b/drivers/clocksource/asm9260_timer.c
index 2c9c993727c85a..4c2ba59897e84b 100644
--- a/drivers/clocksource/asm9260_timer.c
+++ b/drivers/clocksource/asm9260_timer.c
@@ -178,7 +178,7 @@ static void __init asm9260_timer_init(struct device_node *np)
 	unsigned long rate;
 
 	priv.base = of_io_request_and_map(np, 0, np->name);
-	if (!priv.base)
+	if (IS_ERR(priv.base))
 		panic("%s: unable to map resource", np->name);
 
 	clk = of_clk_get(np, 0);
diff --git a/drivers/clocksource/exynos_mct.c b/drivers/clocksource/exynos_mct.c
index 83564c9cfdbe3b..935b05936dbdd9 100644
--- a/drivers/clocksource/exynos_mct.c
+++ b/drivers/clocksource/exynos_mct.c
@@ -209,7 +209,7 @@ static void exynos4_frc_resume(struct clocksource *cs)
 	exynos4_mct_frc_start();
 }
 
-struct clocksource mct_frc = {
+static struct clocksource mct_frc = {
 	.name		= "mct-frc",
 	.rating		= 400,
 	.read		= exynos4_frc_read,
@@ -413,7 +413,7 @@ static inline void exynos4_tick_set_mode(enum clock_event_mode mode,
 	}
 }
 
-static int exynos4_mct_tick_clear(struct mct_clock_event_device *mevt)
+static void exynos4_mct_tick_clear(struct mct_clock_event_device *mevt)
 {
 	struct clock_event_device *evt = &mevt->evt;
 
@@ -426,12 +426,8 @@ static int exynos4_mct_tick_clear(struct mct_clock_event_device *mevt)
 		exynos4_mct_tick_stop(mevt);
 
 	/* Clear the MCT tick interrupt */
-	if (readl_relaxed(reg_base + mevt->base + MCT_L_INT_CSTAT_OFFSET) & 1) {
+	if (readl_relaxed(reg_base + mevt->base + MCT_L_INT_CSTAT_OFFSET) & 1)
 		exynos4_mct_write(0x1, mevt->base + MCT_L_INT_CSTAT_OFFSET);
-		return 1;
-	} else {
-		return 0;
-	}
 }
 
 static irqreturn_t exynos4_mct_tick_isr(int irq, void *dev_id)
@@ -564,18 +560,6 @@ out_irq:
 	free_percpu_irq(mct_irqs[MCT_L0_IRQ], &percpu_mct_tick);
 }
 
-void __init mct_init(void __iomem *base, int irq_g0, int irq_l0, int irq_l1)
-{
-	mct_irqs[MCT_G0_IRQ] = irq_g0;
-	mct_irqs[MCT_L0_IRQ] = irq_l0;
-	mct_irqs[MCT_L1_IRQ] = irq_l1;
-	mct_int_type = MCT_INT_SPI;
-
-	exynos4_timer_resources(NULL, base);
-	exynos4_clocksource_init();
-	exynos4_clockevent_init();
-}
-
 static void __init mct_init_dt(struct device_node *np, unsigned int int_type)
 {
 	u32 nr_irqs, i;
diff --git a/drivers/clocksource/qcom-timer.c b/drivers/clocksource/qcom-timer.c
index 098c542e5c537d..cba2d015564c2c 100644
--- a/drivers/clocksource/qcom-timer.c
+++ b/drivers/clocksource/qcom-timer.c
@@ -40,8 +40,6 @@
 
 #define GPT_HZ 32768
 
-#define MSM_DGT_SHIFT 5
-
 static void __iomem *event_base;
 static void __iomem *sts_base;
 
@@ -232,7 +230,6 @@ err:
 	register_current_timer_delay(&msm_delay_timer);
 }
 
-#ifdef CONFIG_ARCH_QCOM
 static void __init msm_dt_timer_init(struct device_node *np)
 {
 	u32 freq;
@@ -285,59 +282,3 @@ static void __init msm_dt_timer_init(struct device_node *np)
 }
 CLOCKSOURCE_OF_DECLARE(kpss_timer, "qcom,kpss-timer", msm_dt_timer_init);
 CLOCKSOURCE_OF_DECLARE(scss_timer, "qcom,scss-timer", msm_dt_timer_init);
-#else
-
-static int __init msm_timer_map(phys_addr_t addr, u32 event, u32 source,
-				u32 sts)
-{
-	void __iomem *base;
-
-	base = ioremap(addr, SZ_256);
-	if (!base) {
-		pr_err("Failed to map timer base\n");
-		return -ENOMEM;
-	}
-	event_base = base + event;
-	source_base = base + source;
-	if (sts)
-		sts_base = base + sts;
-
-	return 0;
-}
-
-static notrace cycle_t msm_read_timer_count_shift(struct clocksource *cs)
-{
-	/*
-	 * Shift timer count down by a constant due to unreliable lower bits
-	 * on some targets.
-	 */
-	return msm_read_timer_count(cs) >> MSM_DGT_SHIFT;
-}
-
-void __init msm7x01_timer_init(void)
-{
-	struct clocksource *cs = &msm_clocksource;
-
-	if (msm_timer_map(0xc0100000, 0x0, 0x10, 0x0))
-		return;
-	cs->read = msm_read_timer_count_shift;
-	cs->mask = CLOCKSOURCE_MASK((32 - MSM_DGT_SHIFT));
-	/* 600 KHz */
-	msm_timer_init(19200000 >> MSM_DGT_SHIFT, 32 - MSM_DGT_SHIFT, 7,
-			false);
-}
-
-void __init msm7x30_timer_init(void)
-{
-	if (msm_timer_map(0xc0100000, 0x4, 0x24, 0x80))
-		return;
-	msm_timer_init(24576000 / 4, 32, 1, false);
-}
-
-void __init qsd8x50_timer_init(void)
-{
-	if (msm_timer_map(0xAC100000, 0x0, 0x10, 0x34))
-		return;
-	msm_timer_init(19200000 / 4, 32, 7, false);
-}
-#endif
diff --git a/drivers/clocksource/time-lpc32xx.c b/drivers/clocksource/time-lpc32xx.c
new file mode 100644
index 00000000000000..a1c06a2bc77c55
--- /dev/null
+++ b/drivers/clocksource/time-lpc32xx.c
@@ -0,0 +1,272 @@
+/*
+ * Clocksource driver for NXP LPC32xx/18xx/43xx timer
+ *
+ * Copyright (C) 2015 Joachim Eastwood <manabian@gmail.com>
+ *
+ * Based on:
+ * time-efm32 Copyright (C) 2013 Pengutronix
+ * mach-lpc32xx/timer.c Copyright (C) 2009 - 2010 NXP Semiconductors
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ *
+ */
+
+#define pr_fmt(fmt) "%s: " fmt, __func__
+
+#include <linux/clk.h>
+#include <linux/clockchips.h>
+#include <linux/clocksource.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/sched_clock.h>
+
+#define LPC32XX_TIMER_IR		0x000
+#define  LPC32XX_TIMER_IR_MR0INT	BIT(0)
+#define LPC32XX_TIMER_TCR		0x004
+#define  LPC32XX_TIMER_TCR_CEN		BIT(0)
+#define  LPC32XX_TIMER_TCR_CRST		BIT(1)
+#define LPC32XX_TIMER_TC		0x008
+#define LPC32XX_TIMER_PR		0x00c
+#define LPC32XX_TIMER_MCR		0x014
+#define  LPC32XX_TIMER_MCR_MR0I		BIT(0)
+#define  LPC32XX_TIMER_MCR_MR0R		BIT(1)
+#define  LPC32XX_TIMER_MCR_MR0S		BIT(2)
+#define LPC32XX_TIMER_MR0		0x018
+#define LPC32XX_TIMER_CTCR		0x070
+
+struct lpc32xx_clock_event_ddata {
+	struct clock_event_device evtdev;
+	void __iomem *base;
+};
+
+/* Needed for the sched clock */
+static void __iomem *clocksource_timer_counter;
+
+static u64 notrace lpc32xx_read_sched_clock(void)
+{
+	return readl(clocksource_timer_counter);
+}
+
+static int lpc32xx_clkevt_next_event(unsigned long delta,
+				     struct clock_event_device *evtdev)
+{
+	struct lpc32xx_clock_event_ddata *ddata =
+		container_of(evtdev, struct lpc32xx_clock_event_ddata, evtdev);
+
+	/*
+	 * Place timer in reset and program the delta in the prescale
+	 * register (PR). When the prescale counter matches the value
+	 * in PR the counter register is incremented and the compare
+	 * match will trigger. After setup the timer is released from
+	 * reset and enabled.
+	 */
+	writel_relaxed(LPC32XX_TIMER_TCR_CRST, ddata->base + LPC32XX_TIMER_TCR);
+	writel_relaxed(delta, ddata->base + LPC32XX_TIMER_PR);
+	writel_relaxed(LPC32XX_TIMER_TCR_CEN, ddata->base + LPC32XX_TIMER_TCR);
+
+	return 0;
+}
+
+static int lpc32xx_clkevt_shutdown(struct clock_event_device *evtdev)
+{
+	struct lpc32xx_clock_event_ddata *ddata =
+		container_of(evtdev, struct lpc32xx_clock_event_ddata, evtdev);
+
+	/* Disable the timer */
+	writel_relaxed(0, ddata->base + LPC32XX_TIMER_TCR);
+
+	return 0;
+}
+
+static int lpc32xx_clkevt_oneshot(struct clock_event_device *evtdev)
+{
+	/*
+	 * When using oneshot, we must also disable the timer
+	 * to wait for the first call to set_next_event().
+	 */
+	return lpc32xx_clkevt_shutdown(evtdev);
+}
+
+static irqreturn_t lpc32xx_clock_event_handler(int irq, void *dev_id)
+{
+	struct lpc32xx_clock_event_ddata *ddata = dev_id;
+
+	/* Clear match on channel 0 */
+	writel_relaxed(LPC32XX_TIMER_IR_MR0INT, ddata->base + LPC32XX_TIMER_IR);
+
+	ddata->evtdev.event_handler(&ddata->evtdev);
+
+	return IRQ_HANDLED;
+}
+
+static struct lpc32xx_clock_event_ddata lpc32xx_clk_event_ddata = {
+	.evtdev = {
+		.name			= "lpc3220 clockevent",
+		.features		= CLOCK_EVT_FEAT_ONESHOT,
+		.rating			= 300,
+		.set_next_event		= lpc32xx_clkevt_next_event,
+		.set_state_shutdown	= lpc32xx_clkevt_shutdown,
+		.set_state_oneshot	= lpc32xx_clkevt_oneshot,
+	},
+};
+
+static int __init lpc32xx_clocksource_init(struct device_node *np)
+{
+	void __iomem *base;
+	unsigned long rate;
+	struct clk *clk;
+	int ret;
+
+	clk = of_clk_get_by_name(np, "timerclk");
+	if (IS_ERR(clk)) {
+		pr_err("clock get failed (%lu)\n", PTR_ERR(clk));
+		return PTR_ERR(clk);
+	}
+
+	ret = clk_prepare_enable(clk);
+	if (ret) {
+		pr_err("clock enable failed (%d)\n", ret);
+		goto err_clk_enable;
+	}
+
+	base = of_iomap(np, 0);
+	if (!base) {
+		pr_err("unable to map registers\n");
+		ret = -EADDRNOTAVAIL;
+		goto err_iomap;
+	}
+
+	/*
+	 * Disable and reset timer then set it to free running timer
+	 * mode (CTCR) with no prescaler (PR) or match operations (MCR).
+	 * After setup the timer is released from reset and enabled.
+	 */
+	writel_relaxed(LPC32XX_TIMER_TCR_CRST, base + LPC32XX_TIMER_TCR);
+	writel_relaxed(0, base + LPC32XX_TIMER_PR);
+	writel_relaxed(0, base + LPC32XX_TIMER_MCR);
+	writel_relaxed(0, base + LPC32XX_TIMER_CTCR);
+	writel_relaxed(LPC32XX_TIMER_TCR_CEN, base + LPC32XX_TIMER_TCR);
+
+	rate = clk_get_rate(clk);
+	ret = clocksource_mmio_init(base + LPC32XX_TIMER_TC, "lpc3220 timer",
+				    rate, 300, 32, clocksource_mmio_readl_up);
+	if (ret) {
+		pr_err("failed to init clocksource (%d)\n", ret);
+		goto err_clocksource_init;
+	}
+
+	clocksource_timer_counter = base + LPC32XX_TIMER_TC;
+	sched_clock_register(lpc32xx_read_sched_clock, 32, rate);
+
+	return 0;
+
+err_clocksource_init:
+	iounmap(base);
+err_iomap:
+	clk_disable_unprepare(clk);
+err_clk_enable:
+	clk_put(clk);
+	return ret;
+}
+
+static int __init lpc32xx_clockevent_init(struct device_node *np)
+{
+	void __iomem *base;
+	unsigned long rate;
+	struct clk *clk;
+	int ret, irq;
+
+	clk = of_clk_get_by_name(np, "timerclk");
+	if (IS_ERR(clk)) {
+		pr_err("clock get failed (%lu)\n", PTR_ERR(clk));
+		return PTR_ERR(clk);
+	}
+
+	ret = clk_prepare_enable(clk);
+	if (ret) {
+		pr_err("clock enable failed (%d)\n", ret);
+		goto err_clk_enable;
+	}
+
+	base = of_iomap(np, 0);
+	if (!base) {
+		pr_err("unable to map registers\n");
+		ret = -EADDRNOTAVAIL;
+		goto err_iomap;
+	}
+
+	irq = irq_of_parse_and_map(np, 0);
+	if (!irq) {
+		pr_err("get irq failed\n");
+		ret = -ENOENT;
+		goto err_irq;
+	}
+
+	/*
+	 * Disable timer and clear any pending interrupt (IR) on match
+	 * channel 0 (MR0). Configure a compare match value of 1 on MR0
+	 * and enable interrupt, reset on match and stop on match (MCR).
+	 */
+	writel_relaxed(0, base + LPC32XX_TIMER_TCR);
+	writel_relaxed(0, base + LPC32XX_TIMER_CTCR);
+	writel_relaxed(LPC32XX_TIMER_IR_MR0INT, base + LPC32XX_TIMER_IR);
+	writel_relaxed(1, base + LPC32XX_TIMER_MR0);
+	writel_relaxed(LPC32XX_TIMER_MCR_MR0I | LPC32XX_TIMER_MCR_MR0R |
+		       LPC32XX_TIMER_MCR_MR0S, base + LPC32XX_TIMER_MCR);
+
+	rate = clk_get_rate(clk);
+	lpc32xx_clk_event_ddata.base = base;
+	clockevents_config_and_register(&lpc32xx_clk_event_ddata.evtdev,
+					rate, 1, -1);
+
+	ret = request_irq(irq, lpc32xx_clock_event_handler,
+			  IRQF_TIMER | IRQF_IRQPOLL, "lpc3220 clockevent",
+			  &lpc32xx_clk_event_ddata);
+	if (ret) {
+		pr_err("request irq failed\n");
+		goto err_irq;
+	}
+
+	return 0;
+
+err_irq:
+	iounmap(base);
+err_iomap:
+	clk_disable_unprepare(clk);
+err_clk_enable:
+	clk_put(clk);
+	return ret;
+}
+
+/*
+ * This function asserts that we have exactly one clocksource and one
+ * clock_event_device in the end.
+ */
+static void __init lpc32xx_timer_init(struct device_node *np)
+{
+	static int has_clocksource, has_clockevent;
+	int ret;
+
+	if (!has_clocksource) {
+		ret = lpc32xx_clocksource_init(np);
+		if (!ret) {
+			has_clocksource = 1;
+			return;
+		}
+	}
+
+	if (!has_clockevent) {
+		ret = lpc32xx_clockevent_init(np);
+		if (!ret) {
+			has_clockevent = 1;
+			return;
+		}
+	}
+}
+CLOCKSOURCE_OF_DECLARE(lpc32xx_timer, "nxp,lpc3220-timer", lpc32xx_timer_init);
diff --git a/drivers/clocksource/timer-integrator-ap.c b/drivers/clocksource/timer-integrator-ap.c
index d7d21e4dcef019..a68866e0ecd4d6 100644
--- a/drivers/clocksource/timer-integrator-ap.c
+++ b/drivers/clocksource/timer-integrator-ap.c
@@ -167,7 +167,7 @@ static void __init integrator_ap_timer_init_of(struct device_node *node)
 	struct device_node *sec_node;
 
 	base = of_io_request_and_map(node, 0, "integrator-timer");
-	if (!base)
+	if (IS_ERR(base))
 		return;
 
 	clk = of_clk_get(node, 0);
diff --git a/drivers/clocksource/timer-stm32.c b/drivers/clocksource/timer-stm32.c
new file mode 100644
index 00000000000000..a97e8b50701cf9
--- /dev/null
+++ b/drivers/clocksource/timer-stm32.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (C) Maxime Coquelin 2015
+ * Author:  Maxime Coquelin <mcoquelin.stm32@gmail.com>
+ * License terms:  GNU General Public License (GPL), version 2
+ *
+ * Inspired by time-efm32.c from Uwe Kleine-Koenig
+ */
+
+#include <linux/kernel.h>
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/clk.h>
+#include <linux/reset.h>
+
+#define TIM_CR1		0x00
+#define TIM_DIER	0x0c
+#define TIM_SR		0x10
+#define TIM_EGR		0x14
+#define TIM_PSC		0x28
+#define TIM_ARR		0x2c
+
+#define TIM_CR1_CEN	BIT(0)
+#define TIM_CR1_OPM	BIT(3)
+#define TIM_CR1_ARPE	BIT(7)
+
+#define TIM_DIER_UIE	BIT(0)
+
+#define TIM_SR_UIF	BIT(0)
+
+#define TIM_EGR_UG	BIT(0)
+
+struct stm32_clock_event_ddata {
+	struct clock_event_device evtdev;
+	unsigned periodic_top;
+	void __iomem *base;
+};
+
+static void stm32_clock_event_set_mode(enum clock_event_mode mode,
+				       struct clock_event_device *evtdev)
+{
+	struct stm32_clock_event_ddata *data =
+		container_of(evtdev, struct stm32_clock_event_ddata, evtdev);
+	void *base = data->base;
+
+	switch (mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+		writel_relaxed(data->periodic_top, base + TIM_ARR);
+		writel_relaxed(TIM_CR1_ARPE | TIM_CR1_CEN, base + TIM_CR1);
+		break;
+
+	case CLOCK_EVT_MODE_ONESHOT:
+	default:
+		writel_relaxed(0, base + TIM_CR1);
+		break;
+	}
+}
+
+static int stm32_clock_event_set_next_event(unsigned long evt,
+					    struct clock_event_device *evtdev)
+{
+	struct stm32_clock_event_ddata *data =
+		container_of(evtdev, struct stm32_clock_event_ddata, evtdev);
+
+	writel_relaxed(evt, data->base + TIM_ARR);
+	writel_relaxed(TIM_CR1_ARPE | TIM_CR1_OPM | TIM_CR1_CEN,
+		       data->base + TIM_CR1);
+
+	return 0;
+}
+
+static irqreturn_t stm32_clock_event_handler(int irq, void *dev_id)
+{
+	struct stm32_clock_event_ddata *data = dev_id;
+
+	writel_relaxed(0, data->base + TIM_SR);
+
+	data->evtdev.event_handler(&data->evtdev);
+
+	return IRQ_HANDLED;
+}
+
+static struct stm32_clock_event_ddata clock_event_ddata = {
+	.evtdev = {
+		.name = "stm32 clockevent",
+		.features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_PERIODIC,
+		.set_mode = stm32_clock_event_set_mode,
+		.set_next_event = stm32_clock_event_set_next_event,
+		.rating = 200,
+	},
+};
+
+static void __init stm32_clockevent_init(struct device_node *np)
+{
+	struct stm32_clock_event_ddata *data = &clock_event_ddata;
+	struct clk *clk;
+	struct reset_control *rstc;
+	unsigned long rate, max_delta;
+	int irq, ret, bits, prescaler = 1;
+
+	clk = of_clk_get(np, 0);
+	if (IS_ERR(clk)) {
+		ret = PTR_ERR(clk);
+		pr_err("failed to get clock for clockevent (%d)\n", ret);
+		goto err_clk_get;
+	}
+
+	ret = clk_prepare_enable(clk);
+	if (ret) {
+		pr_err("failed to enable timer clock for clockevent (%d)\n",
+		       ret);
+		goto err_clk_enable;
+	}
+
+	rate = clk_get_rate(clk);
+
+	rstc = of_reset_control_get(np, NULL);
+	if (!IS_ERR(rstc)) {
+		reset_control_assert(rstc);
+		reset_control_deassert(rstc);
+	}
+
+	data->base = of_iomap(np, 0);
+	if (!data->base) {
+		pr_err("failed to map registers for clockevent\n");
+		goto err_iomap;
+	}
+
+	irq = irq_of_parse_and_map(np, 0);
+	if (!irq) {
+		pr_err("%s: failed to get irq.\n", np->full_name);
+		goto err_get_irq;
+	}
+
+	/* Detect whether the timer is 16 or 32 bits */
+	writel_relaxed(~0U, data->base + TIM_ARR);
+	max_delta = readl_relaxed(data->base + TIM_ARR);
+	if (max_delta == ~0U) {
+		prescaler = 1;
+		bits = 32;
+	} else {
+		prescaler = 1024;
+		bits = 16;
+	}
+	writel_relaxed(0, data->base + TIM_ARR);
+
+	writel_relaxed(prescaler - 1, data->base + TIM_PSC);
+	writel_relaxed(TIM_EGR_UG, data->base + TIM_EGR);
+	writel_relaxed(TIM_DIER_UIE, data->base + TIM_DIER);
+	writel_relaxed(0, data->base + TIM_SR);
+
+	data->periodic_top = DIV_ROUND_CLOSEST(rate, prescaler * HZ);
+
+	clockevents_config_and_register(&data->evtdev,
+					DIV_ROUND_CLOSEST(rate, prescaler),
+					0x1, max_delta);
+
+	ret = request_irq(irq, stm32_clock_event_handler, IRQF_TIMER,
+			"stm32 clockevent", data);
+	if (ret) {
+		pr_err("%s: failed to request irq.\n", np->full_name);
+		goto err_get_irq;
+	}
+
+	pr_info("%s: STM32 clockevent driver initialized (%d bits)\n",
+			np->full_name, bits);
+
+	return;
+
+err_get_irq:
+	iounmap(data->base);
+err_iomap:
+	clk_disable_unprepare(clk);
+err_clk_enable:
+	clk_put(clk);
+err_clk_get:
+	return;
+}
+
+CLOCKSOURCE_OF_DECLARE(stm32, "st,stm32-timer", stm32_clockevent_init);
diff --git a/drivers/clocksource/timer-sun5i.c b/drivers/clocksource/timer-sun5i.c
index 28aa4b7bb6020c..0ffb4ea7c9253e 100644
--- a/drivers/clocksource/timer-sun5i.c
+++ b/drivers/clocksource/timer-sun5i.c
@@ -324,7 +324,7 @@ static void __init sun5i_timer_init(struct device_node *node)
 	int irq;
 
 	timer_base = of_io_request_and_map(node, 0, of_node_full_name(node));
-	if (!timer_base)
+	if (IS_ERR(timer_base))
 		panic("Can't map registers");
 
 	irq = irq_of_parse_and_map(node, 0);
diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index b0c18ed8d83f70..0136dfcdabf0ba 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -699,13 +699,14 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	dmi_check_system(sw_any_bug_dmi_table);
 	if (bios_with_sw_any_bug && !policy_is_shared(policy)) {
 		policy->shared_type = CPUFREQ_SHARED_TYPE_ALL;
-		cpumask_copy(policy->cpus, cpu_core_mask(cpu));
+		cpumask_copy(policy->cpus, topology_core_cpumask(cpu));
 	}
 
 	if (check_amd_hwpstate_cpu(cpu) && !acpi_pstate_strict) {
 		cpumask_clear(policy->cpus);
 		cpumask_set_cpu(cpu, policy->cpus);
-		cpumask_copy(data->freqdomain_cpus, cpu_sibling_mask(cpu));
+		cpumask_copy(data->freqdomain_cpus,
+			     topology_sibling_cpumask(cpu));
 		policy->shared_type = CPUFREQ_SHARED_TYPE_HW;
 		pr_info_once(PFX "overriding BIOS provided _PSD data\n");
 	}
diff --git a/drivers/cpufreq/p4-clockmod.c b/drivers/cpufreq/p4-clockmod.c
index 529cfd92158fa6..5dd95dab580d1c 100644
--- a/drivers/cpufreq/p4-clockmod.c
+++ b/drivers/cpufreq/p4-clockmod.c
@@ -172,7 +172,7 @@ static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
 	unsigned int i;
 
 #ifdef CONFIG_SMP
-	cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
+	cpumask_copy(policy->cpus, topology_sibling_cpumask(policy->cpu));
 #endif
 
 	/* Errata workaround */
diff --git a/drivers/cpufreq/powernow-k8.c b/drivers/cpufreq/powernow-k8.c
index f9ce7e4bf0feae..5c035d04d82710 100644
--- a/drivers/cpufreq/powernow-k8.c
+++ b/drivers/cpufreq/powernow-k8.c
@@ -57,13 +57,6 @@ static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
 
 static struct cpufreq_driver cpufreq_amd64_driver;
 
-#ifndef CONFIG_SMP
-static inline const struct cpumask *cpu_core_mask(int cpu)
-{
-	return cpumask_of(0);
-}
-#endif
-
 /* Return a frequency in MHz, given an input fid */
 static u32 find_freq_from_fid(u32 fid)
 {
@@ -620,7 +613,7 @@ static int fill_powernow_table(struct powernow_k8_data *data,
 
 	pr_debug("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid);
 	data->powernow_table = powernow_table;
-	if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
+	if (cpumask_first(topology_core_cpumask(data->cpu)) == data->cpu)
 		print_basics(data);
 
 	for (j = 0; j < data->numps; j++)
@@ -784,7 +777,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
 		CPUFREQ_TABLE_END;
 	data->powernow_table = powernow_table;
 
-	if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
+	if (cpumask_first(topology_core_cpumask(data->cpu)) == data->cpu)
 		print_basics(data);
 
 	/* notify BIOS that we exist */
@@ -1090,7 +1083,7 @@ static int powernowk8_cpu_init(struct cpufreq_policy *pol)
 	if (rc != 0)
 		goto err_out_exit_acpi;
 
-	cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu));
+	cpumask_copy(pol->cpus, topology_core_cpumask(pol->cpu));
 	data->available_cores = pol->cpus;
 
 	/* min/max the cpu is capable of */
diff --git a/drivers/cpufreq/speedstep-ich.c b/drivers/cpufreq/speedstep-ich.c
index e56d632a8b2107..37555c6b86a7cf 100644
--- a/drivers/cpufreq/speedstep-ich.c
+++ b/drivers/cpufreq/speedstep-ich.c
@@ -292,7 +292,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
 
 	/* only run on CPU to be set, or on its sibling */
 #ifdef CONFIG_SMP
-	cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
+	cpumask_copy(policy->cpus, topology_sibling_cpumask(policy->cpu));
 #endif
 	policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
 
diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c
index c178ed8c3908d3..da2d6777bd092f 100644
--- a/drivers/crypto/padlock-aes.c
+++ b/drivers/crypto/padlock-aes.c
@@ -22,7 +22,7 @@
 #include <asm/cpu_device_id.h>
 #include <asm/byteorder.h>
 #include <asm/processor.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 
 /*
  * Number of data blocks actually fetched for each xcrypt insn.
diff --git a/drivers/crypto/padlock-sha.c b/drivers/crypto/padlock-sha.c
index 95f7d27ce491f0..4e154c9b92064b 100644
--- a/drivers/crypto/padlock-sha.c
+++ b/drivers/crypto/padlock-sha.c
@@ -23,7 +23,7 @@
 #include <linux/kernel.h>
 #include <linux/scatterlist.h>
 #include <asm/cpu_device_id.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 
 struct padlock_sha_desc {
 	struct shash_desc fallback;
diff --git a/drivers/crypto/vmx/aes.c b/drivers/crypto/vmx/aes.c
index ab300ea19434e3..a9064e36e7b548 100644
--- a/drivers/crypto/vmx/aes.c
+++ b/drivers/crypto/vmx/aes.c
@@ -78,12 +78,14 @@ static int p8_aes_setkey(struct crypto_tfm *tfm, const u8 *key,
     int ret;
     struct p8_aes_ctx *ctx = crypto_tfm_ctx(tfm);
 
+    preempt_disable();
     pagefault_disable();
     enable_kernel_altivec();
     ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
     ret += aes_p8_set_decrypt_key(key, keylen * 8, &ctx->dec_key);
     pagefault_enable();
-    
+    preempt_enable();
+
     ret += crypto_cipher_setkey(ctx->fallback, key, keylen);
     return ret;
 }
@@ -95,10 +97,12 @@ static void p8_aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
     if (in_interrupt()) {
         crypto_cipher_encrypt_one(ctx->fallback, dst, src);
     } else {
+	preempt_disable();
         pagefault_disable();
         enable_kernel_altivec();
         aes_p8_encrypt(src, dst, &ctx->enc_key);
         pagefault_enable();
+	preempt_enable();
     }
 }
 
@@ -109,10 +113,12 @@ static void p8_aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
     if (in_interrupt()) {
         crypto_cipher_decrypt_one(ctx->fallback, dst, src);
     } else {
+	preempt_disable();
         pagefault_disable();
         enable_kernel_altivec();
         aes_p8_decrypt(src, dst, &ctx->dec_key);
         pagefault_enable();
+	preempt_enable();
     }
 }
 
diff --git a/drivers/crypto/vmx/aes_cbc.c b/drivers/crypto/vmx/aes_cbc.c
index 1a559b7dddb5f2..477284abdd11dc 100644
--- a/drivers/crypto/vmx/aes_cbc.c
+++ b/drivers/crypto/vmx/aes_cbc.c
@@ -79,11 +79,13 @@ static int p8_aes_cbc_setkey(struct crypto_tfm *tfm, const u8 *key,
     int ret;
     struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
 
+    preempt_disable();
     pagefault_disable();
     enable_kernel_altivec();
     ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
     ret += aes_p8_set_decrypt_key(key, keylen * 8, &ctx->dec_key);
     pagefault_enable();
+    preempt_enable();
 
     ret += crypto_blkcipher_setkey(ctx->fallback, key, keylen);
     return ret;
@@ -106,6 +108,7 @@ static int p8_aes_cbc_encrypt(struct blkcipher_desc *desc,
     if (in_interrupt()) {
         ret = crypto_blkcipher_encrypt(&fallback_desc, dst, src, nbytes);
     } else {
+	preempt_disable();
         pagefault_disable();
         enable_kernel_altivec();
 
@@ -119,6 +122,7 @@ static int p8_aes_cbc_encrypt(struct blkcipher_desc *desc,
 	}
 
         pagefault_enable();
+	preempt_enable();
     }
 
     return ret;
@@ -141,6 +145,7 @@ static int p8_aes_cbc_decrypt(struct blkcipher_desc *desc,
     if (in_interrupt()) {
         ret = crypto_blkcipher_decrypt(&fallback_desc, dst, src, nbytes);
     } else {
+	preempt_disable();
         pagefault_disable();
         enable_kernel_altivec();
 
@@ -154,6 +159,7 @@ static int p8_aes_cbc_decrypt(struct blkcipher_desc *desc,
 		}
 
         pagefault_enable();
+	preempt_enable();
     }
 
     return ret;
diff --git a/drivers/crypto/vmx/ghash.c b/drivers/crypto/vmx/ghash.c
index d0ffe277af5ca5..f255ec4a04d48d 100644
--- a/drivers/crypto/vmx/ghash.c
+++ b/drivers/crypto/vmx/ghash.c
@@ -114,11 +114,13 @@ static int p8_ghash_setkey(struct crypto_shash *tfm, const u8 *key,
     if (keylen != GHASH_KEY_LEN)
         return -EINVAL;
 
+    preempt_disable();
     pagefault_disable();
     enable_kernel_altivec();
     enable_kernel_fp();
     gcm_init_p8(ctx->htable, (const u64 *) key);
     pagefault_enable();
+    preempt_enable();
     return crypto_shash_setkey(ctx->fallback, key, keylen);
 }
 
@@ -140,23 +142,27 @@ static int p8_ghash_update(struct shash_desc *desc,
             }
             memcpy(dctx->buffer + dctx->bytes, src,
                     GHASH_DIGEST_SIZE - dctx->bytes);
+	    preempt_disable();
             pagefault_disable();
             enable_kernel_altivec();
             enable_kernel_fp();
             gcm_ghash_p8(dctx->shash, ctx->htable, dctx->buffer,
                     GHASH_DIGEST_SIZE);
             pagefault_enable();
+	    preempt_enable();
             src += GHASH_DIGEST_SIZE - dctx->bytes;
             srclen -= GHASH_DIGEST_SIZE - dctx->bytes;
             dctx->bytes = 0;
         }
         len = srclen & ~(GHASH_DIGEST_SIZE - 1);
         if (len) {
+	    preempt_disable();
             pagefault_disable();
             enable_kernel_altivec();
             enable_kernel_fp();
             gcm_ghash_p8(dctx->shash, ctx->htable, src, len);
             pagefault_enable();
+	    preempt_enable();
             src += len;
             srclen -= len;
         }
@@ -180,12 +186,14 @@ static int p8_ghash_final(struct shash_desc *desc, u8 *out)
         if (dctx->bytes) {
             for (i = dctx->bytes; i < GHASH_DIGEST_SIZE; i++)
                 dctx->buffer[i] = 0;
+	    preempt_disable();
             pagefault_disable();
             enable_kernel_altivec();
             enable_kernel_fp();
             gcm_ghash_p8(dctx->shash, ctx->htable, dctx->buffer,
                     GHASH_DIGEST_SIZE);
             pagefault_enable();
+	    preempt_enable();
             dctx->bytes = 0;
         }
         memcpy(out, dctx->shash, GHASH_DIGEST_SIZE);
diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile
index d8be608a9f3be7..26eabbc553413a 100644
--- a/drivers/firmware/efi/Makefile
+++ b/drivers/firmware/efi/Makefile
@@ -1,7 +1,7 @@
 #
 # Makefile for linux kernel
 #
-obj-$(CONFIG_EFI)			+= efi.o vars.o reboot.o
+obj-$(CONFIG_EFI)			+= efi.o esrt.o vars.o reboot.o
 obj-$(CONFIG_EFI_VARS)			+= efivars.o
 obj-$(CONFIG_EFI_VARS_PSTORE)		+= efi-pstore.o
 obj-$(CONFIG_UEFI_CPER)			+= cper.o
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 3061bb8629dc3f..ca617f40574ac2 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -39,6 +39,7 @@ struct efi __read_mostly efi = {
 	.fw_vendor  = EFI_INVALID_TABLE_ADDR,
 	.runtime    = EFI_INVALID_TABLE_ADDR,
 	.config_table  = EFI_INVALID_TABLE_ADDR,
+	.esrt       = EFI_INVALID_TABLE_ADDR,
 };
 EXPORT_SYMBOL(efi);
 
@@ -64,7 +65,7 @@ static int __init parse_efi_cmdline(char *str)
 }
 early_param("efi", parse_efi_cmdline);
 
-static struct kobject *efi_kobj;
+struct kobject *efi_kobj;
 static struct kobject *efivars_kobj;
 
 /*
@@ -85,10 +86,15 @@ static ssize_t systab_show(struct kobject *kobj,
 		str += sprintf(str, "ACPI20=0x%lx\n", efi.acpi20);
 	if (efi.acpi != EFI_INVALID_TABLE_ADDR)
 		str += sprintf(str, "ACPI=0x%lx\n", efi.acpi);
-	if (efi.smbios != EFI_INVALID_TABLE_ADDR)
-		str += sprintf(str, "SMBIOS=0x%lx\n", efi.smbios);
+	/*
+	 * If both SMBIOS and SMBIOS3 entry points are implemented, the
+	 * SMBIOS3 entry point shall be preferred, so we list it first to
+	 * let applications stop parsing after the first match.
+	 */
 	if (efi.smbios3 != EFI_INVALID_TABLE_ADDR)
 		str += sprintf(str, "SMBIOS3=0x%lx\n", efi.smbios3);
+	if (efi.smbios != EFI_INVALID_TABLE_ADDR)
+		str += sprintf(str, "SMBIOS=0x%lx\n", efi.smbios);
 	if (efi.hcdp != EFI_INVALID_TABLE_ADDR)
 		str += sprintf(str, "HCDP=0x%lx\n", efi.hcdp);
 	if (efi.boot_info != EFI_INVALID_TABLE_ADDR)
@@ -232,6 +238,84 @@ err_put:
 
 subsys_initcall(efisubsys_init);
 
+/*
+ * Find the efi memory descriptor for a given physical address.  Given a
+ * physicall address, determine if it exists within an EFI Memory Map entry,
+ * and if so, populate the supplied memory descriptor with the appropriate
+ * data.
+ */
+int __init efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md)
+{
+	struct efi_memory_map *map = efi.memmap;
+	void *p, *e;
+
+	if (!efi_enabled(EFI_MEMMAP)) {
+		pr_err_once("EFI_MEMMAP is not enabled.\n");
+		return -EINVAL;
+	}
+
+	if (!map) {
+		pr_err_once("efi.memmap is not set.\n");
+		return -EINVAL;
+	}
+	if (!out_md) {
+		pr_err_once("out_md is null.\n");
+		return -EINVAL;
+        }
+	if (WARN_ON_ONCE(!map->phys_map))
+		return -EINVAL;
+	if (WARN_ON_ONCE(map->nr_map == 0) || WARN_ON_ONCE(map->desc_size == 0))
+		return -EINVAL;
+
+	e = map->phys_map + map->nr_map * map->desc_size;
+	for (p = map->phys_map; p < e; p += map->desc_size) {
+		efi_memory_desc_t *md;
+		u64 size;
+		u64 end;
+
+		/*
+		 * If a driver calls this after efi_free_boot_services,
+		 * ->map will be NULL, and the target may also not be mapped.
+		 * So just always get our own virtual map on the CPU.
+		 *
+		 */
+		md = early_memremap((phys_addr_t)p, sizeof (*md));
+		if (!md) {
+			pr_err_once("early_memremap(%p, %zu) failed.\n",
+				    p, sizeof (*md));
+			return -ENOMEM;
+		}
+
+		if (!(md->attribute & EFI_MEMORY_RUNTIME) &&
+		    md->type != EFI_BOOT_SERVICES_DATA &&
+		    md->type != EFI_RUNTIME_SERVICES_DATA) {
+			early_memunmap(md, sizeof (*md));
+			continue;
+		}
+
+		size = md->num_pages << EFI_PAGE_SHIFT;
+		end = md->phys_addr + size;
+		if (phys_addr >= md->phys_addr && phys_addr < end) {
+			memcpy(out_md, md, sizeof(*out_md));
+			early_memunmap(md, sizeof (*md));
+			return 0;
+		}
+
+		early_memunmap(md, sizeof (*md));
+	}
+	pr_err_once("requested map not found.\n");
+	return -ENOENT;
+}
+
+/*
+ * Calculate the highest address of an efi memory descriptor.
+ */
+u64 __init efi_mem_desc_end(efi_memory_desc_t *md)
+{
+	u64 size = md->num_pages << EFI_PAGE_SHIFT;
+	u64 end = md->phys_addr + size;
+	return end;
+}
 
 /*
  * We can't ioremap data in EFI boot services RAM, because we've already mapped
@@ -274,6 +358,7 @@ static __initdata efi_config_table_type_t common_tables[] = {
 	{SMBIOS_TABLE_GUID, "SMBIOS", &efi.smbios},
 	{SMBIOS3_TABLE_GUID, "SMBIOS 3.0", &efi.smbios3},
 	{UGA_IO_PROTOCOL_GUID, "UGA", &efi.uga},
+	{EFI_SYSTEM_RESOURCE_TABLE_GUID, "ESRT", &efi.esrt},
 	{NULL_GUID, NULL, NULL},
 };
 
diff --git a/drivers/firmware/efi/efivars.c b/drivers/firmware/efi/efivars.c
index 7b2e0496e0c084..756eca8c4cf8f2 100644
--- a/drivers/firmware/efi/efivars.c
+++ b/drivers/firmware/efi/efivars.c
@@ -535,7 +535,7 @@ static ssize_t efivar_delete(struct file *filp, struct kobject *kobj,
  * efivar_create_sysfs_entry - create a new entry in sysfs
  * @new_var: efivar entry to create
  *
- * Returns 1 on failure, 0 on success
+ * Returns 0 on success, negative error code on failure
  */
 static int
 efivar_create_sysfs_entry(struct efivar_entry *new_var)
@@ -544,6 +544,7 @@ efivar_create_sysfs_entry(struct efivar_entry *new_var)
 	char *short_name;
 	unsigned long variable_name_size;
 	efi_char16_t *variable_name;
+	int ret;
 
 	variable_name = new_var->var.VariableName;
 	variable_name_size = ucs2_strlen(variable_name) * sizeof(efi_char16_t);
@@ -558,7 +559,7 @@ efivar_create_sysfs_entry(struct efivar_entry *new_var)
 	short_name = kzalloc(short_name_size, GFP_KERNEL);
 
 	if (!short_name)
-		return 1;
+		return -ENOMEM;
 
 	/* Convert Unicode to normal chars (assume top bits are 0),
 	   ala UTF-8 */
@@ -574,11 +575,11 @@ efivar_create_sysfs_entry(struct efivar_entry *new_var)
 
 	new_var->kobj.kset = efivars_kset;
 
-	i = kobject_init_and_add(&new_var->kobj, &efivar_ktype,
+	ret = kobject_init_and_add(&new_var->kobj, &efivar_ktype,
 				   NULL, "%s", short_name);
 	kfree(short_name);
-	if (i)
-		return 1;
+	if (ret)
+		return ret;
 
 	kobject_uevent(&new_var->kobj, KOBJ_ADD);
 	efivar_entry_add(new_var, &efivar_sysfs_list);
diff --git a/drivers/firmware/efi/esrt.c b/drivers/firmware/efi/esrt.c
new file mode 100644
index 00000000000000..a5b95d61ae71ef
--- /dev/null
+++ b/drivers/firmware/efi/esrt.c
@@ -0,0 +1,471 @@
+/*
+ * esrt.c
+ *
+ * This module exports EFI System Resource Table (ESRT) entries into userspace
+ * through the sysfs file system. The ESRT provides a read-only catalog of
+ * system components for which the system accepts firmware upgrades via UEFI's
+ * "Capsule Update" feature. This module allows userland utilities to evaluate
+ * what firmware updates can be applied to this system, and potentially arrange
+ * for those updates to occur.
+ *
+ * Data is currently found below /sys/firmware/efi/esrt/...
+ */
+#define pr_fmt(fmt) "esrt: " fmt
+
+#include <linux/capability.h>
+#include <linux/device.h>
+#include <linux/efi.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/memblock.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include <asm/io.h>
+#include <asm/early_ioremap.h>
+
+struct efi_system_resource_entry_v1 {
+	efi_guid_t	fw_class;
+	u32		fw_type;
+	u32		fw_version;
+	u32		lowest_supported_fw_version;
+	u32		capsule_flags;
+	u32		last_attempt_version;
+	u32		last_attempt_status;
+};
+
+/*
+ * _count and _version are what they seem like.  _max is actually just
+ * accounting info for the firmware when creating the table; it should never
+ * have been exposed to us.  To wit, the spec says:
+ * The maximum number of resource array entries that can be within the
+ * table without reallocating the table, must not be zero.
+ * Since there's no guidance about what that means in terms of memory layout,
+ * it means nothing to us.
+ */
+struct efi_system_resource_table {
+	u32	fw_resource_count;
+	u32	fw_resource_count_max;
+	u64	fw_resource_version;
+	u8	entries[];
+};
+
+static phys_addr_t esrt_data;
+static size_t esrt_data_size;
+
+static struct efi_system_resource_table *esrt;
+
+struct esre_entry {
+	union {
+		struct efi_system_resource_entry_v1 *esre1;
+	} esre;
+
+	struct kobject kobj;
+	struct list_head list;
+};
+
+/* global list of esre_entry. */
+static LIST_HEAD(entry_list);
+
+/* entry attribute */
+struct esre_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct esre_entry *entry, char *buf);
+	ssize_t (*store)(struct esre_entry *entry,
+			 const char *buf, size_t count);
+};
+
+static struct esre_entry *to_entry(struct kobject *kobj)
+{
+	return container_of(kobj, struct esre_entry, kobj);
+}
+
+static struct esre_attribute *to_attr(struct attribute *attr)
+{
+	return container_of(attr, struct esre_attribute, attr);
+}
+
+static ssize_t esre_attr_show(struct kobject *kobj,
+			      struct attribute *_attr, char *buf)
+{
+	struct esre_entry *entry = to_entry(kobj);
+	struct esre_attribute *attr = to_attr(_attr);
+
+	/* Don't tell normal users what firmware versions we've got... */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	return attr->show(entry, buf);
+}
+
+static const struct sysfs_ops esre_attr_ops = {
+	.show = esre_attr_show,
+};
+
+/* Generic ESRT Entry ("ESRE") support. */
+static ssize_t esre_fw_class_show(struct esre_entry *entry, char *buf)
+{
+	char *str = buf;
+
+	efi_guid_to_str(&entry->esre.esre1->fw_class, str);
+	str += strlen(str);
+	str += sprintf(str, "\n");
+
+	return str - buf;
+}
+
+static struct esre_attribute esre_fw_class = __ATTR(fw_class, 0400,
+	esre_fw_class_show, NULL);
+
+#define esre_attr_decl(name, size, fmt) \
+static ssize_t esre_##name##_show(struct esre_entry *entry, char *buf) \
+{ \
+	return sprintf(buf, fmt "\n", \
+		       le##size##_to_cpu(entry->esre.esre1->name)); \
+} \
+\
+static struct esre_attribute esre_##name = __ATTR(name, 0400, \
+	esre_##name##_show, NULL)
+
+esre_attr_decl(fw_type, 32, "%u");
+esre_attr_decl(fw_version, 32, "%u");
+esre_attr_decl(lowest_supported_fw_version, 32, "%u");
+esre_attr_decl(capsule_flags, 32, "0x%x");
+esre_attr_decl(last_attempt_version, 32, "%u");
+esre_attr_decl(last_attempt_status, 32, "%u");
+
+static struct attribute *esre1_attrs[] = {
+	&esre_fw_class.attr,
+	&esre_fw_type.attr,
+	&esre_fw_version.attr,
+	&esre_lowest_supported_fw_version.attr,
+	&esre_capsule_flags.attr,
+	&esre_last_attempt_version.attr,
+	&esre_last_attempt_status.attr,
+	NULL
+};
+static void esre_release(struct kobject *kobj)
+{
+	struct esre_entry *entry = to_entry(kobj);
+
+	list_del(&entry->list);
+	kfree(entry);
+}
+
+static struct kobj_type esre1_ktype = {
+	.release = esre_release,
+	.sysfs_ops = &esre_attr_ops,
+	.default_attrs = esre1_attrs,
+};
+
+
+static struct kobject *esrt_kobj;
+static struct kset *esrt_kset;
+
+static int esre_create_sysfs_entry(void *esre, int entry_num)
+{
+	struct esre_entry *entry;
+	char name[20];
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	sprintf(name, "entry%d", entry_num);
+
+	entry->kobj.kset = esrt_kset;
+
+	if (esrt->fw_resource_version == 1) {
+		int rc = 0;
+
+		entry->esre.esre1 = esre;
+		rc = kobject_init_and_add(&entry->kobj, &esre1_ktype, NULL,
+					  "%s", name);
+		if (rc) {
+			kfree(entry);
+			return rc;
+		}
+	}
+
+	list_add_tail(&entry->list, &entry_list);
+	return 0;
+}
+
+/* support for displaying ESRT fields at the top level */
+#define esrt_attr_decl(name, size, fmt) \
+static ssize_t esrt_##name##_show(struct kobject *kobj, \
+				  struct kobj_attribute *attr, char *buf)\
+{ \
+	return sprintf(buf, fmt "\n", le##size##_to_cpu(esrt->name)); \
+} \
+\
+static struct kobj_attribute esrt_##name = __ATTR(name, 0400, \
+	esrt_##name##_show, NULL)
+
+esrt_attr_decl(fw_resource_count, 32, "%u");
+esrt_attr_decl(fw_resource_count_max, 32, "%u");
+esrt_attr_decl(fw_resource_version, 64, "%llu");
+
+static struct attribute *esrt_attrs[] = {
+	&esrt_fw_resource_count.attr,
+	&esrt_fw_resource_count_max.attr,
+	&esrt_fw_resource_version.attr,
+	NULL,
+};
+
+static inline int esrt_table_exists(void)
+{
+	if (!efi_enabled(EFI_CONFIG_TABLES))
+		return 0;
+	if (efi.esrt == EFI_INVALID_TABLE_ADDR)
+		return 0;
+	return 1;
+}
+
+static umode_t esrt_attr_is_visible(struct kobject *kobj,
+				    struct attribute *attr, int n)
+{
+	if (!esrt_table_exists())
+		return 0;
+	return attr->mode;
+}
+
+static struct attribute_group esrt_attr_group = {
+	.attrs = esrt_attrs,
+	.is_visible = esrt_attr_is_visible,
+};
+
+/*
+ * remap the table, copy it to kmalloced pages, and unmap it.
+ */
+void __init efi_esrt_init(void)
+{
+	void *va;
+	struct efi_system_resource_table tmpesrt;
+	struct efi_system_resource_entry_v1 *v1_entries;
+	size_t size, max, entry_size, entries_size;
+	efi_memory_desc_t md;
+	int rc;
+	phys_addr_t end;
+
+	pr_debug("esrt-init: loading.\n");
+	if (!esrt_table_exists())
+		return;
+
+	rc = efi_mem_desc_lookup(efi.esrt, &md);
+	if (rc < 0) {
+		pr_err("ESRT header is not in the memory map.\n");
+		return;
+	}
+
+	max = efi_mem_desc_end(&md);
+	if (max < efi.esrt) {
+		pr_err("EFI memory descriptor is invalid. (esrt: %p max: %p)\n",
+		       (void *)efi.esrt, (void *)max);
+		return;
+	}
+
+	size = sizeof(*esrt);
+	max -= efi.esrt;
+
+	if (max < size) {
+		pr_err("ESRT header doen't fit on single memory map entry. (size: %zu max: %zu)\n",
+		       size, max);
+		return;
+	}
+
+	va = early_memremap(efi.esrt, size);
+	if (!va) {
+		pr_err("early_memremap(%p, %zu) failed.\n", (void *)efi.esrt,
+		       size);
+		return;
+	}
+
+	memcpy(&tmpesrt, va, sizeof(tmpesrt));
+
+	if (tmpesrt.fw_resource_version == 1) {
+		entry_size = sizeof (*v1_entries);
+	} else {
+		pr_err("Unsupported ESRT version %lld.\n",
+		       tmpesrt.fw_resource_version);
+		return;
+	}
+
+	if (tmpesrt.fw_resource_count > 0 && max - size < entry_size) {
+		pr_err("ESRT memory map entry can only hold the header. (max: %zu size: %zu)\n",
+		       max - size, entry_size);
+		goto err_memunmap;
+	}
+
+	/*
+	 * The format doesn't really give us any boundary to test here,
+	 * so I'm making up 128 as the max number of individually updatable
+	 * components we support.
+	 * 128 should be pretty excessive, but there's still some chance
+	 * somebody will do that someday and we'll need to raise this.
+	 */
+	if (tmpesrt.fw_resource_count > 128) {
+		pr_err("ESRT says fw_resource_count has very large value %d.\n",
+		       tmpesrt.fw_resource_count);
+		goto err_memunmap;
+	}
+
+	/*
+	 * We know it can't be larger than N * sizeof() here, and N is limited
+	 * by the previous test to a small number, so there's no overflow.
+	 */
+	entries_size = tmpesrt.fw_resource_count * entry_size;
+	if (max < size + entries_size) {
+		pr_err("ESRT does not fit on single memory map entry (size: %zu max: %zu)\n",
+		       size, max);
+		goto err_memunmap;
+	}
+
+	/* remap it with our (plausible) new pages */
+	early_memunmap(va, size);
+	size += entries_size;
+	va = early_memremap(efi.esrt, size);
+	if (!va) {
+		pr_err("early_memremap(%p, %zu) failed.\n", (void *)efi.esrt,
+		       size);
+		return;
+	}
+
+	esrt_data = (phys_addr_t)efi.esrt;
+	esrt_data_size = size;
+
+	end = esrt_data + size;
+	pr_info("Reserving ESRT space from %pa to %pa.\n", &esrt_data, &end);
+	memblock_reserve(esrt_data, esrt_data_size);
+
+	pr_debug("esrt-init: loaded.\n");
+err_memunmap:
+	early_memunmap(va, size);
+}
+
+static int __init register_entries(void)
+{
+	struct efi_system_resource_entry_v1 *v1_entries = (void *)esrt->entries;
+	int i, rc;
+
+	if (!esrt_table_exists())
+		return 0;
+
+	for (i = 0; i < le32_to_cpu(esrt->fw_resource_count); i++) {
+		void *esre = NULL;
+		if (esrt->fw_resource_version == 1) {
+			esre = &v1_entries[i];
+		} else {
+			pr_err("Unsupported ESRT version %lld.\n",
+			       esrt->fw_resource_version);
+			return -EINVAL;
+		}
+
+		rc = esre_create_sysfs_entry(esre, i);
+		if (rc < 0) {
+			pr_err("ESRT entry creation failed with error %d.\n",
+			       rc);
+			return rc;
+		}
+	}
+	return 0;
+}
+
+static void cleanup_entry_list(void)
+{
+	struct esre_entry *entry, *next;
+
+	list_for_each_entry_safe(entry, next, &entry_list, list) {
+		kobject_put(&entry->kobj);
+	}
+}
+
+static int __init esrt_sysfs_init(void)
+{
+	int error;
+	struct efi_system_resource_table __iomem *ioesrt;
+
+	pr_debug("esrt-sysfs: loading.\n");
+	if (!esrt_data || !esrt_data_size)
+		return -ENOSYS;
+
+	ioesrt = ioremap(esrt_data, esrt_data_size);
+	if (!ioesrt) {
+		pr_err("ioremap(%pa, %zu) failed.\n", &esrt_data,
+		       esrt_data_size);
+		return -ENOMEM;
+	}
+
+	esrt = kmalloc(esrt_data_size, GFP_KERNEL);
+	if (!esrt) {
+		pr_err("kmalloc failed. (wanted %zu bytes)\n", esrt_data_size);
+		iounmap(ioesrt);
+		return -ENOMEM;
+	}
+
+	memcpy_fromio(esrt, ioesrt, esrt_data_size);
+
+	esrt_kobj = kobject_create_and_add("esrt", efi_kobj);
+	if (!esrt_kobj) {
+		pr_err("Firmware table registration failed.\n");
+		error = -ENOMEM;
+		goto err;
+	}
+
+	error = sysfs_create_group(esrt_kobj, &esrt_attr_group);
+	if (error) {
+		pr_err("Sysfs attribute export failed with error %d.\n",
+		       error);
+		goto err_remove_esrt;
+	}
+
+	esrt_kset = kset_create_and_add("entries", NULL, esrt_kobj);
+	if (!esrt_kset) {
+		pr_err("kset creation failed.\n");
+		error = -ENOMEM;
+		goto err_remove_group;
+	}
+
+	error = register_entries();
+	if (error)
+		goto err_cleanup_list;
+
+	memblock_remove(esrt_data, esrt_data_size);
+
+	pr_debug("esrt-sysfs: loaded.\n");
+
+	return 0;
+err_cleanup_list:
+	cleanup_entry_list();
+	kset_unregister(esrt_kset);
+err_remove_group:
+	sysfs_remove_group(esrt_kobj, &esrt_attr_group);
+err_remove_esrt:
+	kobject_put(esrt_kobj);
+err:
+	kfree(esrt);
+	esrt = NULL;
+	return error;
+}
+
+static void __exit esrt_sysfs_exit(void)
+{
+	pr_debug("esrt-sysfs: unloading.\n");
+	cleanup_entry_list();
+	kset_unregister(esrt_kset);
+	sysfs_remove_group(esrt_kobj, &esrt_attr_group);
+	kfree(esrt);
+	esrt = NULL;
+	kobject_del(esrt_kobj);
+	kobject_put(esrt_kobj);
+}
+
+module_init(esrt_sysfs_init);
+module_exit(esrt_sysfs_exit);
+
+MODULE_AUTHOR("Peter Jones <pjones@redhat.com>");
+MODULE_DESCRIPTION("EFI System Resource Table support");
+MODULE_LICENSE("GPL");
diff --git a/drivers/gpu/drm/drm_ioctl.c b/drivers/gpu/drm/drm_ioctl.c
index 9bac1b7479af59..165a2a6d35fe73 100644
--- a/drivers/gpu/drm/drm_ioctl.c
+++ b/drivers/gpu/drm/drm_ioctl.c
@@ -36,9 +36,6 @@
 
 #include <linux/pci.h>
 #include <linux/export.h>
-#ifdef CONFIG_X86
-#include <asm/mtrr.h>
-#endif
 
 static int drm_version(struct drm_device *dev, void *data,
 		       struct drm_file *file_priv);
@@ -197,16 +194,7 @@ static int drm_getmap(struct drm_device *dev, void *data,
 	map->type = r_list->map->type;
 	map->flags = r_list->map->flags;
 	map->handle = (void *)(unsigned long) r_list->user_token;
-
-#ifdef CONFIG_X86
-	/*
-	 * There appears to be exactly one user of the mtrr index: dritest.
-	 * It's easy enough to keep it working on non-PAT systems.
-	 */
-	map->mtrr = phys_wc_to_mtrr_index(r_list->map->mtrr);
-#else
-	map->mtrr = -1;
-#endif
+	map->mtrr = arch_phys_wc_index(r_list->map->mtrr);
 
 	mutex_unlock(&dev->struct_mutex);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index bd0e4bda2c649a..a7fa14516cda5b 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -32,6 +32,7 @@
 #include "i915_trace.h"
 #include "intel_drv.h"
 #include <linux/dma_remapping.h>
+#include <linux/uaccess.h>
 
 #define  __EXEC_OBJECT_HAS_PIN (1<<31)
 #define  __EXEC_OBJECT_HAS_FENCE (1<<30)
@@ -458,7 +459,7 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 	}
 
 	/* We can't wait for rendering with pagefaults disabled */
-	if (obj->active && in_atomic())
+	if (obj->active && pagefault_disabled())
 		return -EFAULT;
 
 	if (use_cpu_reloc(obj))
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index ed303ba3a59393..3e03379e7c5d92 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -63,7 +63,8 @@ MODULE_PARM_DESC(tjmax, "TjMax value in degrees Celsius");
 #define TO_ATTR_NO(cpu)		(TO_CORE_ID(cpu) + BASE_SYSFS_ATTR_NO)
 
 #ifdef CONFIG_SMP
-#define for_each_sibling(i, cpu)	for_each_cpu(i, cpu_sibling_mask(cpu))
+#define for_each_sibling(i, cpu) \
+	for_each_cpu(i, topology_sibling_cpumask(cpu))
 #else
 #define for_each_sibling(i, cpu)	for (i = 0; false; )
 #endif
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 53d5417bb291b9..2024db148cb645 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -34,6 +34,7 @@
 #include <linux/irq.h>
 #include <linux/msi.h>
 #include <linux/dma-contiguous.h>
+#include <linux/irqdomain.h>
 #include <asm/irq_remapping.h>
 #include <asm/io_apic.h>
 #include <asm/apic.h>
@@ -3854,6 +3855,21 @@ union irte {
 	} fields;
 };
 
+struct irq_2_irte {
+	u16 devid; /* Device ID for IRTE table */
+	u16 index; /* Index into IRTE table*/
+};
+
+struct amd_ir_data {
+	struct irq_2_irte			irq_2_irte;
+	union irte				irte_entry;
+	union {
+		struct msi_msg			msi_entry;
+	};
+};
+
+static struct irq_chip amd_ir_chip;
+
 #define DTE_IRQ_PHYS_ADDR_MASK	(((1ULL << 45)-1) << 6)
 #define DTE_IRQ_REMAP_INTCTL    (2ULL << 60)
 #define DTE_IRQ_TABLE_LEN       (8ULL << 1)
@@ -3947,7 +3963,7 @@ out_unlock:
 	return table;
 }
 
-static int alloc_irq_index(struct irq_cfg *cfg, u16 devid, int count)
+static int alloc_irq_index(u16 devid, int count)
 {
 	struct irq_remap_table *table;
 	unsigned long flags;
@@ -3969,18 +3985,10 @@ static int alloc_irq_index(struct irq_cfg *cfg, u16 devid, int count)
 			c = 0;
 
 		if (c == count)	{
-			struct irq_2_irte *irte_info;
-
 			for (; c != 0; --c)
 				table->table[index - c + 1] = IRTE_ALLOCATED;
 
 			index -= count - 1;
-
-			cfg->remapped	      = 1;
-			irte_info             = &cfg->irq_2_irte;
-			irte_info->devid      = devid;
-			irte_info->index      = index;
-
 			goto out;
 		}
 	}
@@ -3993,22 +4001,6 @@ out:
 	return index;
 }
 
-static int get_irte(u16 devid, int index, union irte *irte)
-{
-	struct irq_remap_table *table;
-	unsigned long flags;
-
-	table = get_irq_table(devid, false);
-	if (!table)
-		return -ENOMEM;
-
-	spin_lock_irqsave(&table->lock, flags);
-	irte->val = table->table[index];
-	spin_unlock_irqrestore(&table->lock, flags);
-
-	return 0;
-}
-
 static int modify_irte(u16 devid, int index, union irte irte)
 {
 	struct irq_remap_table *table;
@@ -4055,243 +4047,316 @@ static void free_irte(u16 devid, int index)
 	iommu_completion_wait(iommu);
 }
 
-static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
-			      unsigned int destination, int vector,
-			      struct io_apic_irq_attr *attr)
+static int get_devid(struct irq_alloc_info *info)
 {
-	struct irq_remap_table *table;
-	struct irq_2_irte *irte_info;
-	struct irq_cfg *cfg;
-	union irte irte;
-	int ioapic_id;
-	int index;
-	int devid;
-	int ret;
-
-	cfg = irq_cfg(irq);
-	if (!cfg)
-		return -EINVAL;
-
-	irte_info = &cfg->irq_2_irte;
-	ioapic_id = mpc_ioapic_id(attr->ioapic);
-	devid     = get_ioapic_devid(ioapic_id);
-
-	if (devid < 0)
-		return devid;
-
-	table = get_irq_table(devid, true);
-	if (table == NULL)
-		return -ENOMEM;
-
-	index = attr->ioapic_pin;
+	int devid = -1;
 
-	/* Setup IRQ remapping info */
-	cfg->remapped	      = 1;
-	irte_info->devid      = devid;
-	irte_info->index      = index;
+	switch (info->type) {
+	case X86_IRQ_ALLOC_TYPE_IOAPIC:
+		devid     = get_ioapic_devid(info->ioapic_id);
+		break;
+	case X86_IRQ_ALLOC_TYPE_HPET:
+		devid     = get_hpet_devid(info->hpet_id);
+		break;
+	case X86_IRQ_ALLOC_TYPE_MSI:
+	case X86_IRQ_ALLOC_TYPE_MSIX:
+		devid = get_device_id(&info->msi_dev->dev);
+		break;
+	default:
+		BUG_ON(1);
+		break;
+	}
 
-	/* Setup IRTE for IOMMU */
-	irte.val		= 0;
-	irte.fields.vector      = vector;
-	irte.fields.int_type    = apic->irq_delivery_mode;
-	irte.fields.destination = destination;
-	irte.fields.dm          = apic->irq_dest_mode;
-	irte.fields.valid       = 1;
-
-	ret = modify_irte(devid, index, irte);
-	if (ret)
-		return ret;
+	return devid;
+}
 
-	/* Setup IOAPIC entry */
-	memset(entry, 0, sizeof(*entry));
+static struct irq_domain *get_ir_irq_domain(struct irq_alloc_info *info)
+{
+	struct amd_iommu *iommu;
+	int devid;
 
-	entry->vector        = index;
-	entry->mask          = 0;
-	entry->trigger       = attr->trigger;
-	entry->polarity      = attr->polarity;
+	if (!info)
+		return NULL;
 
-	/*
-	 * Mask level triggered irqs.
-	 */
-	if (attr->trigger)
-		entry->mask = 1;
+	devid = get_devid(info);
+	if (devid >= 0) {
+		iommu = amd_iommu_rlookup_table[devid];
+		if (iommu)
+			return iommu->ir_domain;
+	}
 
-	return 0;
+	return NULL;
 }
 
-static int set_affinity(struct irq_data *data, const struct cpumask *mask,
-			bool force)
+static struct irq_domain *get_irq_domain(struct irq_alloc_info *info)
 {
-	struct irq_2_irte *irte_info;
-	unsigned int dest, irq;
-	struct irq_cfg *cfg;
-	union irte irte;
-	int err;
-
-	if (!config_enabled(CONFIG_SMP))
-		return -1;
-
-	cfg       = irqd_cfg(data);
-	irq       = data->irq;
-	irte_info = &cfg->irq_2_irte;
+	struct amd_iommu *iommu;
+	int devid;
 
-	if (!cpumask_intersects(mask, cpu_online_mask))
-		return -EINVAL;
+	if (!info)
+		return NULL;
 
-	if (get_irte(irte_info->devid, irte_info->index, &irte))
-		return -EBUSY;
+	switch (info->type) {
+	case X86_IRQ_ALLOC_TYPE_MSI:
+	case X86_IRQ_ALLOC_TYPE_MSIX:
+		devid = get_device_id(&info->msi_dev->dev);
+		if (devid >= 0) {
+			iommu = amd_iommu_rlookup_table[devid];
+			if (iommu)
+				return iommu->msi_domain;
+		}
+		break;
+	default:
+		break;
+	}
 
-	if (assign_irq_vector(irq, cfg, mask))
-		return -EBUSY;
+	return NULL;
+}
 
-	err = apic->cpu_mask_to_apicid_and(cfg->domain, mask, &dest);
-	if (err) {
-		if (assign_irq_vector(irq, cfg, data->affinity))
-			pr_err("AMD-Vi: Failed to recover vector for irq %d\n", irq);
-		return err;
-	}
+struct irq_remap_ops amd_iommu_irq_ops = {
+	.prepare		= amd_iommu_prepare,
+	.enable			= amd_iommu_enable,
+	.disable		= amd_iommu_disable,
+	.reenable		= amd_iommu_reenable,
+	.enable_faulting	= amd_iommu_enable_faulting,
+	.get_ir_irq_domain	= get_ir_irq_domain,
+	.get_irq_domain		= get_irq_domain,
+};
 
-	irte.fields.vector      = cfg->vector;
-	irte.fields.destination = dest;
+static void irq_remapping_prepare_irte(struct amd_ir_data *data,
+				       struct irq_cfg *irq_cfg,
+				       struct irq_alloc_info *info,
+				       int devid, int index, int sub_handle)
+{
+	struct irq_2_irte *irte_info = &data->irq_2_irte;
+	struct msi_msg *msg = &data->msi_entry;
+	union irte *irte = &data->irte_entry;
+	struct IO_APIC_route_entry *entry;
 
-	modify_irte(irte_info->devid, irte_info->index, irte);
+	data->irq_2_irte.devid = devid;
+	data->irq_2_irte.index = index + sub_handle;
 
-	if (cfg->move_in_progress)
-		send_cleanup_vector(cfg);
+	/* Setup IRTE for IOMMU */
+	irte->val = 0;
+	irte->fields.vector      = irq_cfg->vector;
+	irte->fields.int_type    = apic->irq_delivery_mode;
+	irte->fields.destination = irq_cfg->dest_apicid;
+	irte->fields.dm          = apic->irq_dest_mode;
+	irte->fields.valid       = 1;
+
+	switch (info->type) {
+	case X86_IRQ_ALLOC_TYPE_IOAPIC:
+		/* Setup IOAPIC entry */
+		entry = info->ioapic_entry;
+		info->ioapic_entry = NULL;
+		memset(entry, 0, sizeof(*entry));
+		entry->vector        = index;
+		entry->mask          = 0;
+		entry->trigger       = info->ioapic_trigger;
+		entry->polarity      = info->ioapic_polarity;
+		/* Mask level triggered irqs. */
+		if (info->ioapic_trigger)
+			entry->mask = 1;
+		break;
 
-	cpumask_copy(data->affinity, mask);
+	case X86_IRQ_ALLOC_TYPE_HPET:
+	case X86_IRQ_ALLOC_TYPE_MSI:
+	case X86_IRQ_ALLOC_TYPE_MSIX:
+		msg->address_hi = MSI_ADDR_BASE_HI;
+		msg->address_lo = MSI_ADDR_BASE_LO;
+		msg->data = irte_info->index;
+		break;
 
-	return 0;
+	default:
+		BUG_ON(1);
+		break;
+	}
 }
 
-static int free_irq(int irq)
+static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
+			       unsigned int nr_irqs, void *arg)
 {
-	struct irq_2_irte *irte_info;
+	struct irq_alloc_info *info = arg;
+	struct irq_data *irq_data;
+	struct amd_ir_data *data;
 	struct irq_cfg *cfg;
+	int i, ret, devid;
+	int index = -1;
 
-	cfg = irq_cfg(irq);
-	if (!cfg)
+	if (!info)
+		return -EINVAL;
+	if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_MSI &&
+	    info->type != X86_IRQ_ALLOC_TYPE_MSIX)
 		return -EINVAL;
 
-	irte_info = &cfg->irq_2_irte;
-
-	free_irte(irte_info->devid, irte_info->index);
+	/*
+	 * With IRQ remapping enabled, don't need contiguous CPU vectors
+	 * to support multiple MSI interrupts.
+	 */
+	if (info->type == X86_IRQ_ALLOC_TYPE_MSI)
+		info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
 
-	return 0;
-}
+	devid = get_devid(info);
+	if (devid < 0)
+		return -EINVAL;
 
-static void compose_msi_msg(struct pci_dev *pdev,
-			    unsigned int irq, unsigned int dest,
-			    struct msi_msg *msg, u8 hpet_id)
-{
-	struct irq_2_irte *irte_info;
-	struct irq_cfg *cfg;
-	union irte irte;
+	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+	if (ret < 0)
+		return ret;
 
-	cfg = irq_cfg(irq);
-	if (!cfg)
-		return;
+	ret = -ENOMEM;
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		goto out_free_parent;
 
-	irte_info = &cfg->irq_2_irte;
+	if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) {
+		if (get_irq_table(devid, true))
+			index = info->ioapic_pin;
+		else
+			ret = -ENOMEM;
+	} else {
+		index = alloc_irq_index(devid, nr_irqs);
+	}
+	if (index < 0) {
+		pr_warn("Failed to allocate IRTE\n");
+		kfree(data);
+		goto out_free_parent;
+	}
 
-	irte.val		= 0;
-	irte.fields.vector	= cfg->vector;
-	irte.fields.int_type    = apic->irq_delivery_mode;
-	irte.fields.destination	= dest;
-	irte.fields.dm		= apic->irq_dest_mode;
-	irte.fields.valid	= 1;
+	for (i = 0; i < nr_irqs; i++) {
+		irq_data = irq_domain_get_irq_data(domain, virq + i);
+		cfg = irqd_cfg(irq_data);
+		if (!irq_data || !cfg) {
+			ret = -EINVAL;
+			goto out_free_data;
+		}
 
-	modify_irte(irte_info->devid, irte_info->index, irte);
+		if (i > 0) {
+			data = kzalloc(sizeof(*data), GFP_KERNEL);
+			if (!data)
+				goto out_free_data;
+		}
+		irq_data->hwirq = (devid << 16) + i;
+		irq_data->chip_data = data;
+		irq_data->chip = &amd_ir_chip;
+		irq_remapping_prepare_irte(data, cfg, info, devid, index, i);
+		irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
+	}
+	return 0;
 
-	msg->address_hi = MSI_ADDR_BASE_HI;
-	msg->address_lo = MSI_ADDR_BASE_LO;
-	msg->data       = irte_info->index;
+out_free_data:
+	for (i--; i >= 0; i--) {
+		irq_data = irq_domain_get_irq_data(domain, virq + i);
+		if (irq_data)
+			kfree(irq_data->chip_data);
+	}
+	for (i = 0; i < nr_irqs; i++)
+		free_irte(devid, index + i);
+out_free_parent:
+	irq_domain_free_irqs_common(domain, virq, nr_irqs);
+	return ret;
 }
 
-static int msi_alloc_irq(struct pci_dev *pdev, int irq, int nvec)
+static void irq_remapping_free(struct irq_domain *domain, unsigned int virq,
+			       unsigned int nr_irqs)
 {
-	struct irq_cfg *cfg;
-	int index;
-	u16 devid;
-
-	if (!pdev)
-		return -EINVAL;
+	struct irq_2_irte *irte_info;
+	struct irq_data *irq_data;
+	struct amd_ir_data *data;
+	int i;
 
-	cfg = irq_cfg(irq);
-	if (!cfg)
-		return -EINVAL;
+	for (i = 0; i < nr_irqs; i++) {
+		irq_data = irq_domain_get_irq_data(domain, virq  + i);
+		if (irq_data && irq_data->chip_data) {
+			data = irq_data->chip_data;
+			irte_info = &data->irq_2_irte;
+			free_irte(irte_info->devid, irte_info->index);
+			kfree(data);
+		}
+	}
+	irq_domain_free_irqs_common(domain, virq, nr_irqs);
+}
 
-	devid = get_device_id(&pdev->dev);
-	index = alloc_irq_index(cfg, devid, nvec);
+static void irq_remapping_activate(struct irq_domain *domain,
+				   struct irq_data *irq_data)
+{
+	struct amd_ir_data *data = irq_data->chip_data;
+	struct irq_2_irte *irte_info = &data->irq_2_irte;
 
-	return index < 0 ? MAX_IRQS_PER_TABLE : index;
+	modify_irte(irte_info->devid, irte_info->index, data->irte_entry);
 }
 
-static int msi_setup_irq(struct pci_dev *pdev, unsigned int irq,
-			 int index, int offset)
+static void irq_remapping_deactivate(struct irq_domain *domain,
+				     struct irq_data *irq_data)
 {
-	struct irq_2_irte *irte_info;
-	struct irq_cfg *cfg;
-	u16 devid;
+	struct amd_ir_data *data = irq_data->chip_data;
+	struct irq_2_irte *irte_info = &data->irq_2_irte;
+	union irte entry;
 
-	if (!pdev)
-		return -EINVAL;
+	entry.val = 0;
+	modify_irte(irte_info->devid, irte_info->index, data->irte_entry);
+}
 
-	cfg = irq_cfg(irq);
-	if (!cfg)
-		return -EINVAL;
+static struct irq_domain_ops amd_ir_domain_ops = {
+	.alloc = irq_remapping_alloc,
+	.free = irq_remapping_free,
+	.activate = irq_remapping_activate,
+	.deactivate = irq_remapping_deactivate,
+};
 
-	if (index >= MAX_IRQS_PER_TABLE)
-		return 0;
+static int amd_ir_set_affinity(struct irq_data *data,
+			       const struct cpumask *mask, bool force)
+{
+	struct amd_ir_data *ir_data = data->chip_data;
+	struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
+	struct irq_cfg *cfg = irqd_cfg(data);
+	struct irq_data *parent = data->parent_data;
+	int ret;
 
-	devid		= get_device_id(&pdev->dev);
-	irte_info	= &cfg->irq_2_irte;
+	ret = parent->chip->irq_set_affinity(parent, mask, force);
+	if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+		return ret;
 
-	cfg->remapped	      = 1;
-	irte_info->devid      = devid;
-	irte_info->index      = index + offset;
+	/*
+	 * Atomically updates the IRTE with the new destination, vector
+	 * and flushes the interrupt entry cache.
+	 */
+	ir_data->irte_entry.fields.vector = cfg->vector;
+	ir_data->irte_entry.fields.destination = cfg->dest_apicid;
+	modify_irte(irte_info->devid, irte_info->index, ir_data->irte_entry);
 
-	return 0;
+	/*
+	 * After this point, all the interrupts will start arriving
+	 * at the new destination. So, time to cleanup the previous
+	 * vector allocation.
+	 */
+	send_cleanup_vector(cfg);
+
+	return IRQ_SET_MASK_OK_DONE;
 }
 
-static int alloc_hpet_msi(unsigned int irq, unsigned int id)
+static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
 {
-	struct irq_2_irte *irte_info;
-	struct irq_cfg *cfg;
-	int index, devid;
+	struct amd_ir_data *ir_data = irq_data->chip_data;
 
-	cfg = irq_cfg(irq);
-	if (!cfg)
-		return -EINVAL;
+	*msg = ir_data->msi_entry;
+}
 
-	irte_info = &cfg->irq_2_irte;
-	devid     = get_hpet_devid(id);
-	if (devid < 0)
-		return devid;
+static struct irq_chip amd_ir_chip = {
+	.irq_ack = ir_ack_apic_edge,
+	.irq_set_affinity = amd_ir_set_affinity,
+	.irq_compose_msi_msg = ir_compose_msi_msg,
+};
 
-	index = alloc_irq_index(cfg, devid, 1);
-	if (index < 0)
-		return index;
+int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
+{
+	iommu->ir_domain = irq_domain_add_tree(NULL, &amd_ir_domain_ops, iommu);
+	if (!iommu->ir_domain)
+		return -ENOMEM;
 
-	cfg->remapped	      = 1;
-	irte_info->devid      = devid;
-	irte_info->index      = index;
+	iommu->ir_domain->parent = arch_get_ir_parent_domain();
+	iommu->msi_domain = arch_create_msi_irq_domain(iommu->ir_domain);
 
 	return 0;
 }
-
-struct irq_remap_ops amd_iommu_irq_ops = {
-	.prepare		= amd_iommu_prepare,
-	.enable			= amd_iommu_enable,
-	.disable		= amd_iommu_disable,
-	.reenable		= amd_iommu_reenable,
-	.enable_faulting	= amd_iommu_enable_faulting,
-	.setup_ioapic_entry	= setup_ioapic_entry,
-	.set_affinity		= set_affinity,
-	.free_irq		= free_irq,
-	.compose_msi_msg	= compose_msi_msg,
-	.msi_alloc_irq		= msi_alloc_irq,
-	.msi_setup_irq		= msi_setup_irq,
-	.alloc_hpet_msi		= alloc_hpet_msi,
-};
 #endif
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 450ef5001a65ab..c17df04d7a7fff 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -1124,6 +1124,10 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
 	if (ret)
 		return ret;
 
+	ret = amd_iommu_create_irq_domain(iommu);
+	if (ret)
+		return ret;
+
 	/*
 	 * Make sure IOMMU is not considered to translate itself. The IVRS
 	 * table tells us so, but this is a lie!
diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h
index 72b0fd455e2444..0a21142d3639d0 100644
--- a/drivers/iommu/amd_iommu_proto.h
+++ b/drivers/iommu/amd_iommu_proto.h
@@ -62,6 +62,15 @@ extern u8 amd_iommu_pc_get_max_counters(u16 devid);
 extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, u8 fxn,
 				    u64 *value, bool is_write);
 
+#ifdef CONFIG_IRQ_REMAP
+extern int amd_iommu_create_irq_domain(struct amd_iommu *iommu);
+#else
+static inline int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
+{
+	return 0;
+}
+#endif
+
 #define PPR_SUCCESS			0x0
 #define PPR_INVALID			0x1
 #define PPR_FAILURE			0xf
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index 05030e523771a6..6533e874c9d779 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -398,6 +398,7 @@ struct amd_iommu_fault {
 
 
 struct iommu_domain;
+struct irq_domain;
 
 /*
  * This structure contains generic data for  IOMMU protection domains
@@ -579,6 +580,10 @@ struct amd_iommu {
 	/* The maximum PC banks and counters/bank (PCSup=1) */
 	u8 max_banks;
 	u8 max_counters;
+#ifdef CONFIG_IRQ_REMAP
+	struct irq_domain *ir_domain;
+	struct irq_domain *msi_domain;
+#endif
 };
 
 struct devid_map {
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index 9847613085e157..536f2d8ea41abe 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -1087,8 +1087,8 @@ static void free_iommu(struct intel_iommu *iommu)
 
 	if (iommu->irq) {
 		free_irq(iommu->irq, iommu);
-		irq_set_handler_data(iommu->irq, NULL);
 		dmar_free_hwirq(iommu->irq);
+		iommu->irq = 0;
 	}
 
 	if (iommu->qi) {
@@ -1642,23 +1642,14 @@ int dmar_set_interrupt(struct intel_iommu *iommu)
 	if (iommu->irq)
 		return 0;
 
-	irq = dmar_alloc_hwirq();
-	if (irq <= 0) {
+	irq = dmar_alloc_hwirq(iommu->seq_id, iommu->node, iommu);
+	if (irq > 0) {
+		iommu->irq = irq;
+	} else {
 		pr_err("IOMMU: no free vectors\n");
 		return -EINVAL;
 	}
 
-	irq_set_handler_data(irq, iommu);
-	iommu->irq = irq;
-
-	ret = arch_setup_dmar_msi(irq);
-	if (ret) {
-		irq_set_handler_data(irq, NULL);
-		iommu->irq = 0;
-		dmar_free_hwirq(irq);
-		return ret;
-	}
-
 	ret = request_irq(irq, dmar_fault, IRQF_NO_THREAD, iommu->name, iommu);
 	if (ret)
 		pr_err("IOMMU: can't request irq\n");
diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c
index 46d80adf7afdd1..24f7a3572410d9 100644
--- a/drivers/iommu/intel_irq_remapping.c
+++ b/drivers/iommu/intel_irq_remapping.c
@@ -8,6 +8,7 @@
 #include <linux/irq.h>
 #include <linux/intel-iommu.h>
 #include <linux/acpi.h>
+#include <linux/irqdomain.h>
 #include <asm/io_apic.h>
 #include <asm/smp.h>
 #include <asm/cpu.h>
@@ -35,6 +36,21 @@ struct hpet_scope {
 	unsigned int devfn;
 };
 
+struct irq_2_iommu {
+	struct intel_iommu *iommu;
+	u16 irte_index;
+	u16 sub_handle;
+	u8  irte_mask;
+};
+
+struct intel_ir_data {
+	struct irq_2_iommu			irq_2_iommu;
+	struct irte				irte_entry;
+	union {
+		struct msi_msg			msi_entry;
+	};
+};
+
 #define IR_X2APIC_MODE(mode) (mode ? (1 << 11) : 0)
 #define IRTE_DEST(dest) ((eim_mode) ? dest : dest << 8)
 
@@ -54,43 +70,14 @@ static struct hpet_scope ir_hpet[MAX_HPET_TBS];
  * the dmar_global_lock.
  */
 static DEFINE_RAW_SPINLOCK(irq_2_ir_lock);
+static struct irq_domain_ops intel_ir_domain_ops;
 
 static int __init parse_ioapics_under_ir(void);
 
-static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
-{
-	struct irq_cfg *cfg = irq_cfg(irq);
-	return cfg ? &cfg->irq_2_iommu : NULL;
-}
-
-static int get_irte(int irq, struct irte *entry)
-{
-	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
-	unsigned long flags;
-	int index;
-
-	if (!entry || !irq_iommu)
-		return -1;
-
-	raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
-
-	if (unlikely(!irq_iommu->iommu)) {
-		raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-		return -1;
-	}
-
-	index = irq_iommu->irte_index + irq_iommu->sub_handle;
-	*entry = *(irq_iommu->iommu->ir_table->base + index);
-
-	raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-	return 0;
-}
-
-static int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
+static int alloc_irte(struct intel_iommu *iommu, int irq,
+		      struct irq_2_iommu *irq_iommu, u16 count)
 {
 	struct ir_table *table = iommu->ir_table;
-	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
-	struct irq_cfg *cfg = irq_cfg(irq);
 	unsigned int mask = 0;
 	unsigned long flags;
 	int index;
@@ -117,7 +104,6 @@ static int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
 	if (index < 0) {
 		pr_warn("IR%d: can't allocate an IRTE\n", iommu->seq_id);
 	} else {
-		cfg->remapped = 1;
 		irq_iommu->iommu = iommu;
 		irq_iommu->irte_index =  index;
 		irq_iommu->sub_handle = 0;
@@ -139,47 +125,9 @@ static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask)
 	return qi_submit_sync(&desc, iommu);
 }
 
-static int map_irq_to_irte_handle(int irq, u16 *sub_handle)
+static int modify_irte(struct irq_2_iommu *irq_iommu,
+		       struct irte *irte_modified)
 {
-	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
-	unsigned long flags;
-	int index;
-
-	if (!irq_iommu)
-		return -1;
-
-	raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
-	*sub_handle = irq_iommu->sub_handle;
-	index = irq_iommu->irte_index;
-	raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-	return index;
-}
-
-static int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
-{
-	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
-	struct irq_cfg *cfg = irq_cfg(irq);
-	unsigned long flags;
-
-	if (!irq_iommu)
-		return -1;
-
-	raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
-
-	cfg->remapped = 1;
-	irq_iommu->iommu = iommu;
-	irq_iommu->irte_index = index;
-	irq_iommu->sub_handle = subhandle;
-	irq_iommu->irte_mask = 0;
-
-	raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-
-	return 0;
-}
-
-static int modify_irte(int irq, struct irte *irte_modified)
-{
-	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
 	struct intel_iommu *iommu;
 	unsigned long flags;
 	struct irte *irte;
@@ -250,7 +198,7 @@ static int clear_entries(struct irq_2_iommu *irq_iommu)
 		return 0;
 
 	iommu = irq_iommu->iommu;
-	index = irq_iommu->irte_index + irq_iommu->sub_handle;
+	index = irq_iommu->irte_index;
 
 	start = iommu->ir_table->base + index;
 	end = start + (1 << irq_iommu->irte_mask);
@@ -268,29 +216,6 @@ static int clear_entries(struct irq_2_iommu *irq_iommu)
 	return qi_flush_iec(iommu, index, irq_iommu->irte_mask);
 }
 
-static int free_irte(int irq)
-{
-	struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
-	unsigned long flags;
-	int rc;
-
-	if (!irq_iommu)
-		return -1;
-
-	raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
-
-	rc = clear_entries(irq_iommu);
-
-	irq_iommu->iommu = NULL;
-	irq_iommu->irte_index = 0;
-	irq_iommu->sub_handle = 0;
-	irq_iommu->irte_mask = 0;
-
-	raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-
-	return rc;
-}
-
 /*
  * source validation type
  */
@@ -499,7 +424,6 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu)
 
 	pages = alloc_pages_node(iommu->node, GFP_KERNEL | __GFP_ZERO,
 				 INTR_REMAP_PAGE_ORDER);
-
 	if (!pages) {
 		pr_err("IR%d: failed to allocate pages of order %d\n",
 		       iommu->seq_id, INTR_REMAP_PAGE_ORDER);
@@ -513,11 +437,23 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu)
 		goto out_free_pages;
 	}
 
+	iommu->ir_domain = irq_domain_add_hierarchy(arch_get_ir_parent_domain(),
+						    0, INTR_REMAP_TABLE_ENTRIES,
+						    NULL, &intel_ir_domain_ops,
+						    iommu);
+	if (!iommu->ir_domain) {
+		pr_err("IR%d: failed to allocate irqdomain\n", iommu->seq_id);
+		goto out_free_bitmap;
+	}
+	iommu->ir_msi_domain = arch_create_msi_irq_domain(iommu->ir_domain);
+
 	ir_table->base = page_address(pages);
 	ir_table->bitmap = bitmap;
 	iommu->ir_table = ir_table;
 	return 0;
 
+out_free_bitmap:
+	kfree(bitmap);
 out_free_pages:
 	__free_pages(pages, INTR_REMAP_PAGE_ORDER);
 out_free_table:
@@ -528,6 +464,14 @@ out_free_table:
 static void intel_teardown_irq_remapping(struct intel_iommu *iommu)
 {
 	if (iommu && iommu->ir_table) {
+		if (iommu->ir_msi_domain) {
+			irq_domain_remove(iommu->ir_msi_domain);
+			iommu->ir_msi_domain = NULL;
+		}
+		if (iommu->ir_domain) {
+			irq_domain_remove(iommu->ir_domain);
+			iommu->ir_domain = NULL;
+		}
 		free_pages((unsigned long)iommu->ir_table->base,
 			   INTR_REMAP_PAGE_ORDER);
 		kfree(iommu->ir_table->bitmap);
@@ -726,13 +670,6 @@ static int __init intel_enable_irq_remapping(void)
 
 	irq_remapping_enabled = 1;
 
-	/*
-	 * VT-d has a different layout for IO-APIC entries when
-	 * interrupt remapping is enabled. So it needs a special routine
-	 * to print IO-APIC entries for debugging purposes too.
-	 */
-	x86_io_apic_ops.print_entries = intel_ir_io_apic_print_entries;
-
 	pr_info("Enabled IRQ remapping in %s mode\n", eim ? "x2apic" : "xapic");
 
 	return eim ? IRQ_REMAP_X2APIC_MODE : IRQ_REMAP_XAPIC_MODE;
@@ -969,8 +906,7 @@ error:
 	return -1;
 }
 
-static void prepare_irte(struct irte *irte, int vector,
-			 unsigned int dest)
+static void prepare_irte(struct irte *irte, int vector, unsigned int dest)
 {
 	memset(irte, 0, sizeof(*irte));
 
@@ -990,76 +926,63 @@ static void prepare_irte(struct irte *irte, int vector,
 	irte->redir_hint = 1;
 }
 
-static int intel_setup_ioapic_entry(int irq,
-				    struct IO_APIC_route_entry *route_entry,
-				    unsigned int destination, int vector,
-				    struct io_apic_irq_attr *attr)
+static struct irq_domain *intel_get_ir_irq_domain(struct irq_alloc_info *info)
 {
-	int ioapic_id = mpc_ioapic_id(attr->ioapic);
-	struct intel_iommu *iommu;
-	struct IR_IO_APIC_route_entry *entry;
-	struct irte irte;
-	int index;
-
-	down_read(&dmar_global_lock);
-	iommu = map_ioapic_to_ir(ioapic_id);
-	if (!iommu) {
-		pr_warn("No mapping iommu for ioapic %d\n", ioapic_id);
-		index = -ENODEV;
-	} else {
-		index = alloc_irte(iommu, irq, 1);
-		if (index < 0) {
-			pr_warn("Failed to allocate IRTE for ioapic %d\n",
-				ioapic_id);
-			index = -ENOMEM;
-		}
-	}
-	up_read(&dmar_global_lock);
-	if (index < 0)
-		return index;
-
-	prepare_irte(&irte, vector, destination);
+	struct intel_iommu *iommu = NULL;
 
-	/* Set source-id of interrupt request */
-	set_ioapic_sid(&irte, ioapic_id);
+	if (!info)
+		return NULL;
 
-	modify_irte(irq, &irte);
+	switch (info->type) {
+	case X86_IRQ_ALLOC_TYPE_IOAPIC:
+		iommu = map_ioapic_to_ir(info->ioapic_id);
+		break;
+	case X86_IRQ_ALLOC_TYPE_HPET:
+		iommu = map_hpet_to_ir(info->hpet_id);
+		break;
+	case X86_IRQ_ALLOC_TYPE_MSI:
+	case X86_IRQ_ALLOC_TYPE_MSIX:
+		iommu = map_dev_to_ir(info->msi_dev);
+		break;
+	default:
+		BUG_ON(1);
+		break;
+	}
 
-	apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: "
-		"Set IRTE entry (P:%d FPD:%d Dst_Mode:%d "
-		"Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X "
-		"Avail:%X Vector:%02X Dest:%08X "
-		"SID:%04X SQ:%X SVT:%X)\n",
-		attr->ioapic, irte.present, irte.fpd, irte.dst_mode,
-		irte.redir_hint, irte.trigger_mode, irte.dlvry_mode,
-		irte.avail, irte.vector, irte.dest_id,
-		irte.sid, irte.sq, irte.svt);
+	return iommu ? iommu->ir_domain : NULL;
+}
 
-	entry = (struct IR_IO_APIC_route_entry *)route_entry;
-	memset(entry, 0, sizeof(*entry));
+static struct irq_domain *intel_get_irq_domain(struct irq_alloc_info *info)
+{
+	struct intel_iommu *iommu;
 
-	entry->index2	= (index >> 15) & 0x1;
-	entry->zero	= 0;
-	entry->format	= 1;
-	entry->index	= (index & 0x7fff);
-	/*
-	 * IO-APIC RTE will be configured with virtual vector.
-	 * irq handler will do the explicit EOI to the io-apic.
-	 */
-	entry->vector	= attr->ioapic_pin;
-	entry->mask	= 0;			/* enable IRQ */
-	entry->trigger	= attr->trigger;
-	entry->polarity	= attr->polarity;
+	if (!info)
+		return NULL;
 
-	/* Mask level triggered irqs.
-	 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
-	 */
-	if (attr->trigger)
-		entry->mask = 1;
+	switch (info->type) {
+	case X86_IRQ_ALLOC_TYPE_MSI:
+	case X86_IRQ_ALLOC_TYPE_MSIX:
+		iommu = map_dev_to_ir(info->msi_dev);
+		if (iommu)
+			return iommu->ir_msi_domain;
+		break;
+	default:
+		break;
+	}
 
-	return 0;
+	return NULL;
 }
 
+struct irq_remap_ops intel_irq_remap_ops = {
+	.prepare		= intel_prepare_irq_remapping,
+	.enable			= intel_enable_irq_remapping,
+	.disable		= disable_irq_remapping,
+	.reenable		= reenable_irq_remapping,
+	.enable_faulting	= enable_drhd_fault_handling,
+	.get_ir_irq_domain	= intel_get_ir_irq_domain,
+	.get_irq_domain		= intel_get_irq_domain,
+};
+
 /*
  * Migrate the IO-APIC irq in the presence of intr-remapping.
  *
@@ -1075,170 +998,242 @@ static int intel_setup_ioapic_entry(int irq,
  * is used to migrate MSI irq's in the presence of interrupt-remapping.
  */
 static int
-intel_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
-			  bool force)
+intel_ir_set_affinity(struct irq_data *data, const struct cpumask *mask,
+		      bool force)
 {
+	struct intel_ir_data *ir_data = data->chip_data;
+	struct irte *irte = &ir_data->irte_entry;
 	struct irq_cfg *cfg = irqd_cfg(data);
-	unsigned int dest, irq = data->irq;
-	struct irte irte;
-	int err;
-
-	if (!config_enabled(CONFIG_SMP))
-		return -EINVAL;
-
-	if (!cpumask_intersects(mask, cpu_online_mask))
-		return -EINVAL;
-
-	if (get_irte(irq, &irte))
-		return -EBUSY;
-
-	err = assign_irq_vector(irq, cfg, mask);
-	if (err)
-		return err;
-
-	err = apic->cpu_mask_to_apicid_and(cfg->domain, mask, &dest);
-	if (err) {
-		if (assign_irq_vector(irq, cfg, data->affinity))
-			pr_err("Failed to recover vector for irq %d\n", irq);
-		return err;
-	}
+	struct irq_data *parent = data->parent_data;
+	int ret;
 
-	irte.vector = cfg->vector;
-	irte.dest_id = IRTE_DEST(dest);
+	ret = parent->chip->irq_set_affinity(parent, mask, force);
+	if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+		return ret;
 
 	/*
 	 * Atomically updates the IRTE with the new destination, vector
 	 * and flushes the interrupt entry cache.
 	 */
-	modify_irte(irq, &irte);
+	irte->vector = cfg->vector;
+	irte->dest_id = IRTE_DEST(cfg->dest_apicid);
+	modify_irte(&ir_data->irq_2_iommu, irte);
 
 	/*
 	 * After this point, all the interrupts will start arriving
 	 * at the new destination. So, time to cleanup the previous
 	 * vector allocation.
 	 */
-	if (cfg->move_in_progress)
-		send_cleanup_vector(cfg);
+	send_cleanup_vector(cfg);
 
-	cpumask_copy(data->affinity, mask);
-	return 0;
+	return IRQ_SET_MASK_OK_DONE;
 }
 
-static void intel_compose_msi_msg(struct pci_dev *pdev,
-				  unsigned int irq, unsigned int dest,
-				  struct msi_msg *msg, u8 hpet_id)
+static void intel_ir_compose_msi_msg(struct irq_data *irq_data,
+				     struct msi_msg *msg)
 {
-	struct irq_cfg *cfg;
-	struct irte irte;
-	u16 sub_handle = 0;
-	int ir_index;
-
-	cfg = irq_cfg(irq);
+	struct intel_ir_data *ir_data = irq_data->chip_data;
 
-	ir_index = map_irq_to_irte_handle(irq, &sub_handle);
-	BUG_ON(ir_index == -1);
-
-	prepare_irte(&irte, cfg->vector, dest);
-
-	/* Set source-id of interrupt request */
-	if (pdev)
-		set_msi_sid(&irte, pdev);
-	else
-		set_hpet_sid(&irte, hpet_id);
+	*msg = ir_data->msi_entry;
+}
 
-	modify_irte(irq, &irte);
+static struct irq_chip intel_ir_chip = {
+	.irq_ack = ir_ack_apic_edge,
+	.irq_set_affinity = intel_ir_set_affinity,
+	.irq_compose_msi_msg = intel_ir_compose_msi_msg,
+};
 
-	msg->address_hi = MSI_ADDR_BASE_HI;
-	msg->data = sub_handle;
-	msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
-			  MSI_ADDR_IR_SHV |
-			  MSI_ADDR_IR_INDEX1(ir_index) |
-			  MSI_ADDR_IR_INDEX2(ir_index);
+static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
+					     struct irq_cfg *irq_cfg,
+					     struct irq_alloc_info *info,
+					     int index, int sub_handle)
+{
+	struct IR_IO_APIC_route_entry *entry;
+	struct irte *irte = &data->irte_entry;
+	struct msi_msg *msg = &data->msi_entry;
+
+	prepare_irte(irte, irq_cfg->vector, irq_cfg->dest_apicid);
+	switch (info->type) {
+	case X86_IRQ_ALLOC_TYPE_IOAPIC:
+		/* Set source-id of interrupt request */
+		set_ioapic_sid(irte, info->ioapic_id);
+		apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n",
+			info->ioapic_id, irte->present, irte->fpd,
+			irte->dst_mode, irte->redir_hint,
+			irte->trigger_mode, irte->dlvry_mode,
+			irte->avail, irte->vector, irte->dest_id,
+			irte->sid, irte->sq, irte->svt);
+
+		entry = (struct IR_IO_APIC_route_entry *)info->ioapic_entry;
+		info->ioapic_entry = NULL;
+		memset(entry, 0, sizeof(*entry));
+		entry->index2	= (index >> 15) & 0x1;
+		entry->zero	= 0;
+		entry->format	= 1;
+		entry->index	= (index & 0x7fff);
+		/*
+		 * IO-APIC RTE will be configured with virtual vector.
+		 * irq handler will do the explicit EOI to the io-apic.
+		 */
+		entry->vector	= info->ioapic_pin;
+		entry->mask	= 0;			/* enable IRQ */
+		entry->trigger	= info->ioapic_trigger;
+		entry->polarity	= info->ioapic_polarity;
+		if (info->ioapic_trigger)
+			entry->mask = 1; /* Mask level triggered irqs. */
+		break;
+
+	case X86_IRQ_ALLOC_TYPE_HPET:
+	case X86_IRQ_ALLOC_TYPE_MSI:
+	case X86_IRQ_ALLOC_TYPE_MSIX:
+		if (info->type == X86_IRQ_ALLOC_TYPE_HPET)
+			set_hpet_sid(irte, info->hpet_id);
+		else
+			set_msi_sid(irte, info->msi_dev);
+
+		msg->address_hi = MSI_ADDR_BASE_HI;
+		msg->data = sub_handle;
+		msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
+				  MSI_ADDR_IR_SHV |
+				  MSI_ADDR_IR_INDEX1(index) |
+				  MSI_ADDR_IR_INDEX2(index);
+		break;
+
+	default:
+		BUG_ON(1);
+		break;
+	}
 }
 
-/*
- * Map the PCI dev to the corresponding remapping hardware unit
- * and allocate 'nvec' consecutive interrupt-remapping table entries
- * in it.
- */
-static int intel_msi_alloc_irq(struct pci_dev *dev, int irq, int nvec)
+static void intel_free_irq_resources(struct irq_domain *domain,
+				     unsigned int virq, unsigned int nr_irqs)
 {
-	struct intel_iommu *iommu;
-	int index;
+	struct irq_data *irq_data;
+	struct intel_ir_data *data;
+	struct irq_2_iommu *irq_iommu;
+	unsigned long flags;
+	int i;
 
-	down_read(&dmar_global_lock);
-	iommu = map_dev_to_ir(dev);
-	if (!iommu) {
-		printk(KERN_ERR
-		       "Unable to map PCI %s to iommu\n", pci_name(dev));
-		index = -ENOENT;
-	} else {
-		index = alloc_irte(iommu, irq, nvec);
-		if (index < 0) {
-			printk(KERN_ERR
-			       "Unable to allocate %d IRTE for PCI %s\n",
-			       nvec, pci_name(dev));
-			index = -ENOSPC;
+	for (i = 0; i < nr_irqs; i++) {
+		irq_data = irq_domain_get_irq_data(domain, virq  + i);
+		if (irq_data && irq_data->chip_data) {
+			data = irq_data->chip_data;
+			irq_iommu = &data->irq_2_iommu;
+			raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
+			clear_entries(irq_iommu);
+			raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+			irq_domain_reset_irq_data(irq_data);
+			kfree(data);
 		}
 	}
-	up_read(&dmar_global_lock);
-
-	return index;
 }
 
-static int intel_msi_setup_irq(struct pci_dev *pdev, unsigned int irq,
-			       int index, int sub_handle)
+static int intel_irq_remapping_alloc(struct irq_domain *domain,
+				     unsigned int virq, unsigned int nr_irqs,
+				     void *arg)
 {
-	struct intel_iommu *iommu;
-	int ret = -ENOENT;
+	struct intel_iommu *iommu = domain->host_data;
+	struct irq_alloc_info *info = arg;
+	struct intel_ir_data *data, *ird;
+	struct irq_data *irq_data;
+	struct irq_cfg *irq_cfg;
+	int i, ret, index;
+
+	if (!info || !iommu)
+		return -EINVAL;
+	if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_MSI &&
+	    info->type != X86_IRQ_ALLOC_TYPE_MSIX)
+		return -EINVAL;
+
+	/*
+	 * With IRQ remapping enabled, don't need contiguous CPU vectors
+	 * to support multiple MSI interrupts.
+	 */
+	if (info->type == X86_IRQ_ALLOC_TYPE_MSI)
+		info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
+
+	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+	if (ret < 0)
+		return ret;
+
+	ret = -ENOMEM;
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		goto out_free_parent;
 
 	down_read(&dmar_global_lock);
-	iommu = map_dev_to_ir(pdev);
-	if (iommu) {
-		/*
-		 * setup the mapping between the irq and the IRTE
-		 * base index, the sub_handle pointing to the
-		 * appropriate interrupt remap table entry.
-		 */
-		set_irte_irq(irq, iommu, index, sub_handle);
-		ret = 0;
-	}
+	index = alloc_irte(iommu, virq, &data->irq_2_iommu, nr_irqs);
 	up_read(&dmar_global_lock);
+	if (index < 0) {
+		pr_warn("Failed to allocate IRTE\n");
+		kfree(data);
+		goto out_free_parent;
+	}
 
+	for (i = 0; i < nr_irqs; i++) {
+		irq_data = irq_domain_get_irq_data(domain, virq + i);
+		irq_cfg = irqd_cfg(irq_data);
+		if (!irq_data || !irq_cfg) {
+			ret = -EINVAL;
+			goto out_free_data;
+		}
+
+		if (i > 0) {
+			ird = kzalloc(sizeof(*ird), GFP_KERNEL);
+			if (!ird)
+				goto out_free_data;
+			/* Initialize the common data */
+			ird->irq_2_iommu = data->irq_2_iommu;
+			ird->irq_2_iommu.sub_handle = i;
+		} else {
+			ird = data;
+		}
+
+		irq_data->hwirq = (index << 16) + i;
+		irq_data->chip_data = ird;
+		irq_data->chip = &intel_ir_chip;
+		intel_irq_remapping_prepare_irte(ird, irq_cfg, info, index, i);
+		irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
+	}
+	return 0;
+
+out_free_data:
+	intel_free_irq_resources(domain, virq, i);
+out_free_parent:
+	irq_domain_free_irqs_common(domain, virq, nr_irqs);
 	return ret;
 }
 
-static int intel_alloc_hpet_msi(unsigned int irq, unsigned int id)
+static void intel_irq_remapping_free(struct irq_domain *domain,
+				     unsigned int virq, unsigned int nr_irqs)
 {
-	int ret = -1;
-	struct intel_iommu *iommu;
-	int index;
+	intel_free_irq_resources(domain, virq, nr_irqs);
+	irq_domain_free_irqs_common(domain, virq, nr_irqs);
+}
 
-	down_read(&dmar_global_lock);
-	iommu = map_hpet_to_ir(id);
-	if (iommu) {
-		index = alloc_irte(iommu, irq, 1);
-		if (index >= 0)
-			ret = 0;
-	}
-	up_read(&dmar_global_lock);
+static void intel_irq_remapping_activate(struct irq_domain *domain,
+					 struct irq_data *irq_data)
+{
+	struct intel_ir_data *data = irq_data->chip_data;
 
-	return ret;
+	modify_irte(&data->irq_2_iommu, &data->irte_entry);
 }
 
-struct irq_remap_ops intel_irq_remap_ops = {
-	.prepare		= intel_prepare_irq_remapping,
-	.enable			= intel_enable_irq_remapping,
-	.disable		= disable_irq_remapping,
-	.reenable		= reenable_irq_remapping,
-	.enable_faulting	= enable_drhd_fault_handling,
-	.setup_ioapic_entry	= intel_setup_ioapic_entry,
-	.set_affinity		= intel_ioapic_set_affinity,
-	.free_irq		= free_irte,
-	.compose_msi_msg	= intel_compose_msi_msg,
-	.msi_alloc_irq		= intel_msi_alloc_irq,
-	.msi_setup_irq		= intel_msi_setup_irq,
-	.alloc_hpet_msi		= intel_alloc_hpet_msi,
+static void intel_irq_remapping_deactivate(struct irq_domain *domain,
+					   struct irq_data *irq_data)
+{
+	struct intel_ir_data *data = irq_data->chip_data;
+	struct irte entry;
+
+	memset(&entry, 0, sizeof(entry));
+	modify_irte(&data->irq_2_iommu, &entry);
+}
+
+static struct irq_domain_ops intel_ir_domain_ops = {
+	.alloc = intel_irq_remapping_alloc,
+	.free = intel_irq_remapping_free,
+	.activate = intel_irq_remapping_activate,
+	.deactivate = intel_irq_remapping_deactivate,
 };
 
 /*
diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
index 390079ee135077..fc78b0d41f71c2 100644
--- a/drivers/iommu/irq_remapping.c
+++ b/drivers/iommu/irq_remapping.c
@@ -6,6 +6,7 @@
 #include <linux/msi.h>
 #include <linux/irq.h>
 #include <linux/pci.h>
+#include <linux/irqdomain.h>
 
 #include <asm/hw_irq.h>
 #include <asm/irq_remapping.h>
@@ -24,18 +25,6 @@ int no_x2apic_optout;
 static int disable_irq_remap;
 static struct irq_remap_ops *remap_ops;
 
-static int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec);
-static int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq,
-				  int index, int sub_handle);
-static int set_remapped_irq_affinity(struct irq_data *data,
-				     const struct cpumask *mask,
-				     bool force);
-
-static bool irq_remapped(struct irq_cfg *cfg)
-{
-	return (cfg->remapped == 1);
-}
-
 static void irq_remapping_disable_io_apic(void)
 {
 	/*
@@ -49,117 +38,9 @@ static void irq_remapping_disable_io_apic(void)
 		disconnect_bsp_APIC(0);
 }
 
-static int do_setup_msi_irqs(struct pci_dev *dev, int nvec)
-{
-	int ret, sub_handle, nvec_pow2, index = 0;
-	unsigned int irq;
-	struct msi_desc *msidesc;
-
-	msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
-
-	irq = irq_alloc_hwirqs(nvec, dev_to_node(&dev->dev));
-	if (irq == 0)
-		return -ENOSPC;
-
-	nvec_pow2 = __roundup_pow_of_two(nvec);
-	for (sub_handle = 0; sub_handle < nvec; sub_handle++) {
-		if (!sub_handle) {
-			index = msi_alloc_remapped_irq(dev, irq, nvec_pow2);
-			if (index < 0) {
-				ret = index;
-				goto error;
-			}
-		} else {
-			ret = msi_setup_remapped_irq(dev, irq + sub_handle,
-						     index, sub_handle);
-			if (ret < 0)
-				goto error;
-		}
-		ret = setup_msi_irq(dev, msidesc, irq, sub_handle);
-		if (ret < 0)
-			goto error;
-	}
-	return 0;
-
-error:
-	irq_free_hwirqs(irq, nvec);
-
-	/*
-	 * Restore altered MSI descriptor fields and prevent just destroyed
-	 * IRQs from tearing down again in default_teardown_msi_irqs()
-	 */
-	msidesc->irq = 0;
-
-	return ret;
-}
-
-static int do_setup_msix_irqs(struct pci_dev *dev, int nvec)
-{
-	int node, ret, sub_handle, index = 0;
-	struct msi_desc *msidesc;
-	unsigned int irq;
-
-	node		= dev_to_node(&dev->dev);
-	sub_handle	= 0;
-
-	list_for_each_entry(msidesc, &dev->msi_list, list) {
-
-		irq = irq_alloc_hwirq(node);
-		if (irq == 0)
-			return -1;
-
-		if (sub_handle == 0)
-			ret = index = msi_alloc_remapped_irq(dev, irq, nvec);
-		else
-			ret = msi_setup_remapped_irq(dev, irq, index, sub_handle);
-
-		if (ret < 0)
-			goto error;
-
-		ret = setup_msi_irq(dev, msidesc, irq, 0);
-		if (ret < 0)
-			goto error;
-
-		sub_handle += 1;
-		irq        += 1;
-	}
-
-	return 0;
-
-error:
-	irq_free_hwirq(irq);
-	return ret;
-}
-
-static int irq_remapping_setup_msi_irqs(struct pci_dev *dev,
-					int nvec, int type)
-{
-	if (type == PCI_CAP_ID_MSI)
-		return do_setup_msi_irqs(dev, nvec);
-	else
-		return do_setup_msix_irqs(dev, nvec);
-}
-
-static void eoi_ioapic_pin_remapped(int apic, int pin, int vector)
-{
-	/*
-	 * Intr-remapping uses pin number as the virtual vector
-	 * in the RTE. Actual vector is programmed in
-	 * intr-remapping table entry. Hence for the io-apic
-	 * EOI we use the pin number.
-	 */
-	io_apic_eoi(apic, pin);
-}
-
 static void __init irq_remapping_modify_x86_ops(void)
 {
 	x86_io_apic_ops.disable		= irq_remapping_disable_io_apic;
-	x86_io_apic_ops.set_affinity	= set_remapped_irq_affinity;
-	x86_io_apic_ops.setup_entry	= setup_ioapic_remapped_entry;
-	x86_io_apic_ops.eoi_ioapic_pin	= eoi_ioapic_pin_remapped;
-	x86_msi.setup_msi_irqs		= irq_remapping_setup_msi_irqs;
-	x86_msi.setup_hpet_msi		= setup_hpet_msi_remapped;
-	x86_msi.compose_msi_msg		= compose_remapped_msi_msg;
 }
 
 static __init int setup_nointremap(char *str)
@@ -254,113 +135,48 @@ int __init irq_remap_enable_fault_handling(void)
 	return remap_ops->enable_faulting();
 }
 
-int setup_ioapic_remapped_entry(int irq,
-				struct IO_APIC_route_entry *entry,
-				unsigned int destination, int vector,
-				struct io_apic_irq_attr *attr)
-{
-	if (!remap_ops->setup_ioapic_entry)
-		return -ENODEV;
-
-	return remap_ops->setup_ioapic_entry(irq, entry, destination,
-					     vector, attr);
-}
-
-static int set_remapped_irq_affinity(struct irq_data *data,
-				     const struct cpumask *mask, bool force)
-{
-	if (!config_enabled(CONFIG_SMP) || !remap_ops->set_affinity)
-		return 0;
-
-	return remap_ops->set_affinity(data, mask, force);
-}
-
-void free_remapped_irq(int irq)
-{
-	struct irq_cfg *cfg = irq_cfg(irq);
-
-	if (irq_remapped(cfg) && remap_ops->free_irq)
-		remap_ops->free_irq(irq);
-}
-
-void compose_remapped_msi_msg(struct pci_dev *pdev,
-			      unsigned int irq, unsigned int dest,
-			      struct msi_msg *msg, u8 hpet_id)
-{
-	struct irq_cfg *cfg = irq_cfg(irq);
-
-	if (!irq_remapped(cfg))
-		native_compose_msi_msg(pdev, irq, dest, msg, hpet_id);
-	else if (remap_ops->compose_msi_msg)
-		remap_ops->compose_msi_msg(pdev, irq, dest, msg, hpet_id);
-}
-
-static int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec)
-{
-	if (!remap_ops->msi_alloc_irq)
-		return -ENODEV;
-
-	return remap_ops->msi_alloc_irq(pdev, irq, nvec);
-}
-
-static int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq,
-				  int index, int sub_handle)
-{
-	if (!remap_ops->msi_setup_irq)
-		return -ENODEV;
-
-	return remap_ops->msi_setup_irq(pdev, irq, index, sub_handle);
-}
-
-int setup_hpet_msi_remapped(unsigned int irq, unsigned int id)
-{
-	int ret;
-
-	if (!remap_ops->alloc_hpet_msi)
-		return -ENODEV;
-
-	ret = remap_ops->alloc_hpet_msi(irq, id);
-	if (ret)
-		return -EINVAL;
-
-	return default_setup_hpet_msi(irq, id);
-}
-
 void panic_if_irq_remap(const char *msg)
 {
 	if (irq_remapping_enabled)
 		panic(msg);
 }
 
-static void ir_ack_apic_edge(struct irq_data *data)
+void ir_ack_apic_edge(struct irq_data *data)
 {
 	ack_APIC_irq();
 }
 
-static void ir_ack_apic_level(struct irq_data *data)
+/**
+ * irq_remapping_get_ir_irq_domain - Get the irqdomain associated with the IOMMU
+ *				     device serving request @info
+ * @info: interrupt allocation information, used to identify the IOMMU device
+ *
+ * It's used to get parent irqdomain for HPET and IOAPIC irqdomains.
+ * Returns pointer to IRQ domain, or NULL on failure.
+ */
+struct irq_domain *
+irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info)
 {
-	ack_APIC_irq();
-	eoi_ioapic_irq(data->irq, irqd_cfg(data));
-}
+	if (!remap_ops || !remap_ops->get_ir_irq_domain)
+		return NULL;
 
-static void ir_print_prefix(struct irq_data *data, struct seq_file *p)
-{
-	seq_printf(p, " IR-%s", data->chip->name);
+	return remap_ops->get_ir_irq_domain(info);
 }
 
-void irq_remap_modify_chip_defaults(struct irq_chip *chip)
+/**
+ * irq_remapping_get_irq_domain - Get the irqdomain serving the request @info
+ * @info: interrupt allocation information, used to identify the IOMMU device
+ *
+ * There will be one PCI MSI/MSIX irqdomain associated with each interrupt
+ * remapping device, so this interface is used to retrieve the PCI MSI/MSIX
+ * irqdomain serving request @info.
+ * Returns pointer to IRQ domain, or NULL on failure.
+ */
+struct irq_domain *
+irq_remapping_get_irq_domain(struct irq_alloc_info *info)
 {
-	chip->irq_print_chip = ir_print_prefix;
-	chip->irq_ack = ir_ack_apic_edge;
-	chip->irq_eoi = ir_ack_apic_level;
-	chip->irq_set_affinity = x86_io_apic_ops.set_affinity;
-}
+	if (!remap_ops || !remap_ops->get_irq_domain)
+		return NULL;
 
-bool setup_remapped_irq(int irq, struct irq_cfg *cfg, struct irq_chip *chip)
-{
-	if (!irq_remapped(cfg))
-		return false;
-	irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
-	irq_remap_modify_chip_defaults(chip);
-	return true;
+	return remap_ops->get_irq_domain(info);
 }
diff --git a/drivers/iommu/irq_remapping.h b/drivers/iommu/irq_remapping.h
index 7c70cc29ffe6b4..91d5a119956aae 100644
--- a/drivers/iommu/irq_remapping.h
+++ b/drivers/iommu/irq_remapping.h
@@ -24,12 +24,10 @@
 
 #ifdef CONFIG_IRQ_REMAP
 
-struct IO_APIC_route_entry;
-struct io_apic_irq_attr;
 struct irq_data;
-struct cpumask;
-struct pci_dev;
 struct msi_msg;
+struct irq_domain;
+struct irq_alloc_info;
 
 extern int irq_remap_broken;
 extern int disable_sourceid_checking;
@@ -52,36 +50,18 @@ struct irq_remap_ops {
 	/* Enable fault handling */
 	int  (*enable_faulting)(void);
 
-	/* IO-APIC setup routine */
-	int (*setup_ioapic_entry)(int irq, struct IO_APIC_route_entry *,
-				  unsigned int, int,
-				  struct io_apic_irq_attr *);
+	/* Get the irqdomain associated the IOMMU device */
+	struct irq_domain *(*get_ir_irq_domain)(struct irq_alloc_info *);
 
-	/* Set the CPU affinity of a remapped interrupt */
-	int (*set_affinity)(struct irq_data *data, const struct cpumask *mask,
-			    bool force);
-
-	/* Free an IRQ */
-	int (*free_irq)(int);
-
-	/* Create MSI msg to use for interrupt remapping */
-	void (*compose_msi_msg)(struct pci_dev *,
-				unsigned int, unsigned int,
-				struct msi_msg *, u8);
-
-	/* Allocate remapping resources for MSI */
-	int (*msi_alloc_irq)(struct pci_dev *, int, int);
-
-	/* Setup the remapped MSI irq */
-	int (*msi_setup_irq)(struct pci_dev *, unsigned int, int, int);
-
-	/* Setup interrupt remapping for an HPET MSI */
-	int (*alloc_hpet_msi)(unsigned int, unsigned int);
+	/* Get the MSI irqdomain associated with the IOMMU device */
+	struct irq_domain *(*get_irq_domain)(struct irq_alloc_info *);
 };
 
 extern struct irq_remap_ops intel_irq_remap_ops;
 extern struct irq_remap_ops amd_iommu_irq_ops;
 
+extern void ir_ack_apic_edge(struct irq_data *data);
+
 #else  /* CONFIG_IRQ_REMAP */
 
 #define irq_remapping_enabled 0
diff --git a/drivers/irqchip/exynos-combiner.c b/drivers/irqchip/exynos-combiner.c
index 5945223b73fa2c..a57a3a1f339fa2 100644
--- a/drivers/irqchip/exynos-combiner.c
+++ b/drivers/irqchip/exynos-combiner.c
@@ -164,7 +164,7 @@ static int combiner_irq_domain_map(struct irq_domain *d, unsigned int irq,
 	return 0;
 }
 
-static struct irq_domain_ops combiner_irq_domain_ops = {
+static const struct irq_domain_ops combiner_irq_domain_ops = {
 	.xlate	= combiner_irq_domain_xlate,
 	.map	= combiner_irq_domain_map,
 };
diff --git a/drivers/irqchip/irq-armada-370-xp.c b/drivers/irqchip/irq-armada-370-xp.c
index daccc8bdbb423f..0d3b0fe2f175e8 100644
--- a/drivers/irqchip/irq-armada-370-xp.c
+++ b/drivers/irqchip/irq-armada-370-xp.c
@@ -409,7 +409,7 @@ static struct notifier_block mpic_cascaded_cpu_notifier = {
 };
 #endif /* CONFIG_SMP */
 
-static struct irq_domain_ops armada_370_xp_mpic_irq_ops = {
+static const struct irq_domain_ops armada_370_xp_mpic_irq_ops = {
 	.map = armada_370_xp_mpic_irq_map,
 	.xlate = irq_domain_xlate_onecell,
 };
diff --git a/drivers/irqchip/irq-bcm2835.c b/drivers/irqchip/irq-bcm2835.c
index 5916d6cdafa1c9..e68c3b60a681ba 100644
--- a/drivers/irqchip/irq-bcm2835.c
+++ b/drivers/irqchip/irq-bcm2835.c
@@ -135,7 +135,7 @@ static int armctrl_xlate(struct irq_domain *d, struct device_node *ctrlr,
 	return 0;
 }
 
-static struct irq_domain_ops armctrl_ops = {
+static const struct irq_domain_ops armctrl_ops = {
 	.xlate = armctrl_xlate
 };
 
diff --git a/drivers/irqchip/irq-keystone.c b/drivers/irqchip/irq-keystone.c
index 78e8b3ce5252cc..5dc61655055a3c 100644
--- a/drivers/irqchip/irq-keystone.c
+++ b/drivers/irqchip/irq-keystone.c
@@ -131,7 +131,7 @@ static int keystone_irq_map(struct irq_domain *h, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops keystone_irq_ops = {
+static const struct irq_domain_ops keystone_irq_ops = {
 	.map	= keystone_irq_map,
 	.xlate	= irq_domain_xlate_onecell,
 };
diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c
index 57f09cb544644b..44d2d106e818f2 100644
--- a/drivers/irqchip/irq-mips-gic.c
+++ b/drivers/irqchip/irq-mips-gic.c
@@ -739,7 +739,7 @@ static int gic_irq_domain_xlate(struct irq_domain *d, struct device_node *ctrlr,
 	return 0;
 }
 
-static struct irq_domain_ops gic_irq_domain_ops = {
+static const struct irq_domain_ops gic_irq_domain_ops = {
 	.map = gic_irq_domain_map,
 	.xlate = gic_irq_domain_xlate,
 };
diff --git a/drivers/irqchip/irq-mtk-sysirq.c b/drivers/irqchip/irq-mtk-sysirq.c
index eaf0a710e98aa9..15c13039bba211 100644
--- a/drivers/irqchip/irq-mtk-sysirq.c
+++ b/drivers/irqchip/irq-mtk-sysirq.c
@@ -111,7 +111,7 @@ static int mtk_sysirq_domain_alloc(struct irq_domain *domain, unsigned int virq,
 	return irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, &gic_data);
 }
 
-static struct irq_domain_ops sysirq_domain_ops = {
+static const struct irq_domain_ops sysirq_domain_ops = {
 	.xlate = mtk_sysirq_domain_xlate,
 	.alloc = mtk_sysirq_domain_alloc,
 	.free = irq_domain_free_irqs_common,
@@ -144,7 +144,7 @@ static int __init mtk_sysirq_of_init(struct device_node *node,
 	chip_data->intpol_base = ioremap(res.start, size);
 	if (!chip_data->intpol_base) {
 		pr_err("mtk_sysirq: unable to map sysirq register\n");
-		ret = PTR_ERR(chip_data->intpol_base);
+		ret = -ENXIO;
 		goto out_free;
 	}
 
diff --git a/drivers/irqchip/irq-mxs.c b/drivers/irqchip/irq-mxs.c
index e4acf1e3f8e3b0..04bf97b289cf4a 100644
--- a/drivers/irqchip/irq-mxs.c
+++ b/drivers/irqchip/irq-mxs.c
@@ -90,7 +90,7 @@ static int icoll_irq_domain_map(struct irq_domain *d, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops icoll_irq_domain_ops = {
+static const struct irq_domain_ops icoll_irq_domain_ops = {
 	.map = icoll_irq_domain_map,
 	.xlate = irq_domain_xlate_onecell,
 };
diff --git a/drivers/irqchip/irq-renesas-intc-irqpin.c b/drivers/irqchip/irq-renesas-intc-irqpin.c
index 9a0767b9c89da6..0670ab4e3897bf 100644
--- a/drivers/irqchip/irq-renesas-intc-irqpin.c
+++ b/drivers/irqchip/irq-renesas-intc-irqpin.c
@@ -347,7 +347,7 @@ static int intc_irqpin_irq_domain_map(struct irq_domain *h, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops intc_irqpin_irq_domain_ops = {
+static const struct irq_domain_ops intc_irqpin_irq_domain_ops = {
 	.map	= intc_irqpin_irq_domain_map,
 	.xlate  = irq_domain_xlate_twocell,
 };
diff --git a/drivers/irqchip/irq-renesas-irqc.c b/drivers/irqchip/irq-renesas-irqc.c
index cdf80b7794cd73..778bd076aeea75 100644
--- a/drivers/irqchip/irq-renesas-irqc.c
+++ b/drivers/irqchip/irq-renesas-irqc.c
@@ -29,7 +29,6 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/module.h>
-#include <linux/platform_data/irq-renesas-irqc.h>
 #include <linux/pm_runtime.h>
 
 #define IRQC_IRQ_MAX	32	/* maximum 32 interrupts per driver instance */
@@ -62,7 +61,6 @@ struct irqc_priv {
 	void __iomem *iomem;
 	void __iomem *cpu_int_base;
 	struct irqc_irq irq[IRQC_IRQ_MAX];
-	struct renesas_irqc_config config;
 	unsigned int number_of_irqs;
 	struct platform_device *pdev;
 	struct irq_chip irq_chip;
@@ -168,14 +166,13 @@ static int irqc_irq_domain_map(struct irq_domain *h, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops irqc_irq_domain_ops = {
+static const struct irq_domain_ops irqc_irq_domain_ops = {
 	.map	= irqc_irq_domain_map,
 	.xlate  = irq_domain_xlate_twocell,
 };
 
 static int irqc_probe(struct platform_device *pdev)
 {
-	struct renesas_irqc_config *pdata = pdev->dev.platform_data;
 	struct irqc_priv *p;
 	struct resource *io;
 	struct resource *irq;
@@ -191,10 +188,6 @@ static int irqc_probe(struct platform_device *pdev)
 		goto err0;
 	}
 
-	/* deal with driver instance configuration */
-	if (pdata)
-		memcpy(&p->config, pdata, sizeof(*pdata));
-
 	p->pdev = pdev;
 	platform_set_drvdata(pdev, p);
 
@@ -251,8 +244,7 @@ static int irqc_probe(struct platform_device *pdev)
 	irq_chip->flags	= IRQCHIP_MASK_ON_SUSPEND;
 
 	p->irq_domain = irq_domain_add_simple(pdev->dev.of_node,
-					      p->number_of_irqs,
-					      p->config.irq_base,
+					      p->number_of_irqs, 0,
 					      &irqc_irq_domain_ops, p);
 	if (!p->irq_domain) {
 		ret = -ENXIO;
@@ -272,13 +264,6 @@ static int irqc_probe(struct platform_device *pdev)
 
 	dev_info(&pdev->dev, "driving %d irqs\n", p->number_of_irqs);
 
-	/* warn in case of mismatch if irq base is specified */
-	if (p->config.irq_base) {
-		if (p->config.irq_base != p->irq[0].domain_irq)
-			dev_warn(&pdev->dev, "irq base mismatch (%d/%d)\n",
-				 p->config.irq_base, p->irq[0].domain_irq);
-	}
-
 	return 0;
 err3:
 	while (--k >= 0)
diff --git a/drivers/irqchip/irq-s3c24xx.c b/drivers/irqchip/irq-s3c24xx.c
index c8d373fcd823bd..e96717f45ea151 100644
--- a/drivers/irqchip/irq-s3c24xx.c
+++ b/drivers/irqchip/irq-s3c24xx.c
@@ -502,7 +502,7 @@ err:
 	return -EINVAL;
 }
 
-static struct irq_domain_ops s3c24xx_irq_ops = {
+static const struct irq_domain_ops s3c24xx_irq_ops = {
 	.map = s3c24xx_irq_map,
 	.xlate = irq_domain_xlate_twocell,
 };
@@ -1228,7 +1228,7 @@ static int s3c24xx_irq_xlate_of(struct irq_domain *d, struct device_node *n,
 	return 0;
 }
 
-static struct irq_domain_ops s3c24xx_irq_ops_of = {
+static const struct irq_domain_ops s3c24xx_irq_ops_of = {
 	.map = s3c24xx_irq_map_of,
 	.xlate = s3c24xx_irq_xlate_of,
 };
diff --git a/drivers/irqchip/irq-sun4i.c b/drivers/irqchip/irq-sun4i.c
index 64155b686081b8..83d6aa6464ee0b 100644
--- a/drivers/irqchip/irq-sun4i.c
+++ b/drivers/irqchip/irq-sun4i.c
@@ -89,7 +89,7 @@ static int sun4i_irq_map(struct irq_domain *d, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops sun4i_irq_ops = {
+static const struct irq_domain_ops sun4i_irq_ops = {
 	.map = sun4i_irq_map,
 	.xlate = irq_domain_xlate_onecell,
 };
diff --git a/drivers/irqchip/irq-versatile-fpga.c b/drivers/irqchip/irq-versatile-fpga.c
index 1ab451729a5c5c..888111b76ea0dd 100644
--- a/drivers/irqchip/irq-versatile-fpga.c
+++ b/drivers/irqchip/irq-versatile-fpga.c
@@ -132,7 +132,7 @@ static int fpga_irqdomain_map(struct irq_domain *d, unsigned int irq,
 	return 0;
 }
 
-static struct irq_domain_ops fpga_irqdomain_ops = {
+static const struct irq_domain_ops fpga_irqdomain_ops = {
 	.map = fpga_irqdomain_map,
 	.xlate = irq_domain_xlate_onetwocell,
 };
diff --git a/drivers/irqchip/irq-vf610-mscm-ir.c b/drivers/irqchip/irq-vf610-mscm-ir.c
index b932ecb7b730f3..f5c01cbcc73ac1 100644
--- a/drivers/irqchip/irq-vf610-mscm-ir.c
+++ b/drivers/irqchip/irq-vf610-mscm-ir.c
@@ -182,10 +182,9 @@ static int __init vf610_mscm_ir_of_init(struct device_node *node,
 		return -ENOMEM;
 
 	mscm_ir_data->mscm_ir_base = of_io_request_and_map(node, 0, "mscm-ir");
-
-	if (!mscm_ir_data->mscm_ir_base) {
+	if (IS_ERR(mscm_ir_data->mscm_ir_base)) {
 		pr_err("vf610_mscm_ir: unable to map mscm register\n");
-		ret = -ENOMEM;
+		ret = PTR_ERR(mscm_ir_data->mscm_ir_base);
 		goto out_free;
 	}
 
diff --git a/drivers/irqchip/irq-vic.c b/drivers/irqchip/irq-vic.c
index 54089debf2dca8..d4ce331ea4a08e 100644
--- a/drivers/irqchip/irq-vic.c
+++ b/drivers/irqchip/irq-vic.c
@@ -256,7 +256,7 @@ static void __exception_irq_entry vic_handle_irq(struct pt_regs *regs)
 	} while (handled);
 }
 
-static struct irq_domain_ops vic_irqdomain_ops = {
+static const struct irq_domain_ops vic_irqdomain_ops = {
 	.map = vic_irqdomain_map,
 	.xlate = irq_domain_xlate_onetwocell,
 };
diff --git a/drivers/irqchip/irq-vt8500.c b/drivers/irqchip/irq-vt8500.c
index b7af816f276933..0b297009b85662 100644
--- a/drivers/irqchip/irq-vt8500.c
+++ b/drivers/irqchip/irq-vt8500.c
@@ -173,7 +173,7 @@ static int vt8500_irq_map(struct irq_domain *h, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops vt8500_irq_domain_ops = {
+static const struct irq_domain_ops vt8500_irq_domain_ops = {
 	.map = vt8500_irq_map,
 	.xlate = irq_domain_xlate_onecell,
 };
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 5e7559be222adc..eb934b0242e0e1 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -20,7 +20,7 @@
 #include "lg.h"
 
 /* Allow Guests to use a non-128 (ie. non-Linux) syscall trap. */
-static unsigned int syscall_vector = SYSCALL_VECTOR;
+static unsigned int syscall_vector = IA32_SYSCALL_VECTOR;
 module_param(syscall_vector, uint, 0444);
 
 /* The address of the interrupt handler is split into two bits: */
@@ -333,8 +333,8 @@ void set_interrupt(struct lg_cpu *cpu, unsigned int irq)
  */
 static bool could_be_syscall(unsigned int num)
 {
-	/* Normal Linux SYSCALL_VECTOR or reserved vector? */
-	return num == SYSCALL_VECTOR || num == syscall_vector;
+	/* Normal Linux IA32_SYSCALL_VECTOR or reserved vector? */
+	return num == IA32_SYSCALL_VECTOR || num == syscall_vector;
 }
 
 /* The syscall vector it wants must be unused by Host. */
@@ -351,7 +351,7 @@ bool check_syscall_vector(struct lguest *lg)
 int init_interrupts(void)
 {
 	/* If they want some strange system call vector, reserve it now */
-	if (syscall_vector != SYSCALL_VECTOR) {
+	if (syscall_vector != IA32_SYSCALL_VECTOR) {
 		if (test_bit(syscall_vector, used_vectors) ||
 		    vector_used_by_percpu_irq(syscall_vector)) {
 			printk(KERN_ERR "lg: couldn't reserve syscall %u\n",
@@ -366,7 +366,7 @@ int init_interrupts(void)
 
 void free_interrupts(void)
 {
-	if (syscall_vector != SYSCALL_VECTOR)
+	if (syscall_vector != IA32_SYSCALL_VECTOR)
 		clear_bit(syscall_vector, used_vectors);
 }
 
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 30f2aef69d787d..6a4cd771a2be62 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -46,7 +46,7 @@
 #include <asm/setup.h>
 #include <asm/lguest.h>
 #include <asm/uaccess.h>
-#include <asm/i387.h>
+#include <asm/fpu/internal.h>
 #include <asm/tlbflush.h>
 #include "../lg.h"
 
@@ -251,7 +251,7 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
 	 * we set it now, so we can trap and pass that trap to the Guest if it
 	 * uses the FPU.
 	 */
-	if (cpu->ts && user_has_fpu())
+	if (cpu->ts && fpregs_active())
 		stts();
 
 	/*
@@ -283,7 +283,7 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
 		wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
 
 	/* Clear the host TS bit if it was set above. */
-	if (cpu->ts && user_has_fpu())
+	if (cpu->ts && fpregs_active())
 		clts();
 
 	/*
@@ -297,12 +297,12 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
 	/*
 	 * Similarly, if we took a trap because the Guest used the FPU,
 	 * we have to restore the FPU it expects to see.
-	 * math_state_restore() may sleep and we may even move off to
+	 * fpu__restore() may sleep and we may even move off to
 	 * a different CPU. So all the critical stuff should be done
 	 * before this.
 	 */
-	else if (cpu->regs->trapnum == 7 && !user_has_fpu())
-		math_state_restore();
+	else if (cpu->regs->trapnum == 7 && !fpregs_active())
+		fpu__restore(&current->thread.fpu);
 }
 
 /*H:130
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 0c42ed9c9e4c56..3d93e72ddd53f5 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -1332,7 +1332,7 @@ static unsigned int efx_wanted_parallelism(struct efx_nic *efx)
 			if (!cpumask_test_cpu(cpu, thread_mask)) {
 				++count;
 				cpumask_or(thread_mask, thread_mask,
-					   topology_thread_cpumask(cpu));
+					   topology_sibling_cpumask(cpu));
 			}
 		}
 
diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c
index a94dd2c4183a0d..7eb4109a3df4eb 100644
--- a/drivers/pci/htirq.c
+++ b/drivers/pci/htirq.c
@@ -23,20 +23,11 @@
  */
 static DEFINE_SPINLOCK(ht_irq_lock);
 
-struct ht_irq_cfg {
-	struct pci_dev *dev;
-	 /* Update callback used to cope with buggy hardware */
-	ht_irq_update_t *update;
-	unsigned pos;
-	unsigned idx;
-	struct ht_irq_msg msg;
-};
-
-
 void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg)
 {
 	struct ht_irq_cfg *cfg = irq_get_handler_data(irq);
 	unsigned long flags;
+
 	spin_lock_irqsave(&ht_irq_lock, flags);
 	if (cfg->msg.address_lo != msg->address_lo) {
 		pci_write_config_byte(cfg->dev, cfg->pos + 2, cfg->idx);
@@ -55,6 +46,7 @@ void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg)
 void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg)
 {
 	struct ht_irq_cfg *cfg = irq_get_handler_data(irq);
+
 	*msg = cfg->msg;
 }
 
@@ -86,7 +78,6 @@ void unmask_ht_irq(struct irq_data *data)
  */
 int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update)
 {
-	struct ht_irq_cfg *cfg;
 	int max_irq, pos, irq;
 	unsigned long flags;
 	u32 data;
@@ -105,29 +96,9 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update)
 	if (idx > max_irq)
 		return -EINVAL;
 
-	cfg = kmalloc(sizeof(*cfg), GFP_KERNEL);
-	if (!cfg)
-		return -ENOMEM;
-
-	cfg->dev = dev;
-	cfg->update = update;
-	cfg->pos = pos;
-	cfg->idx = 0x10 + (idx * 2);
-	/* Initialize msg to a value that will never match the first write. */
-	cfg->msg.address_lo = 0xffffffff;
-	cfg->msg.address_hi = 0xffffffff;
-
-	irq = irq_alloc_hwirq(dev_to_node(&dev->dev));
-	if (!irq) {
-		kfree(cfg);
-		return -EBUSY;
-	}
-	irq_set_handler_data(irq, cfg);
-
-	if (arch_setup_ht_irq(irq, dev) < 0) {
-		ht_destroy_irq(irq);
-		return -EBUSY;
-	}
+	irq = arch_setup_ht_irq(idx, pos, dev, update);
+	if (irq > 0)
+		dev_dbg(&dev->dev, "irq %d for HT\n", irq);
 
 	return irq;
 }
@@ -158,13 +129,6 @@ EXPORT_SYMBOL(ht_create_irq);
  */
 void ht_destroy_irq(unsigned int irq)
 {
-	struct ht_irq_cfg *cfg;
-
-	cfg = irq_get_handler_data(irq);
-	irq_set_chip(irq, NULL);
-	irq_set_handler_data(irq, NULL);
-	irq_free_hwirq(irq);
-
-	kfree(cfg);
+	arch_teardown_ht_irq(irq);
 }
 EXPORT_SYMBOL(ht_destroy_irq);
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index c77dfe5aaed6d6..e9fd0e90fa3b5c 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -819,13 +819,6 @@ static void quirk_amd_ioapic(struct pci_dev *dev)
 	}
 }
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD,	PCI_DEVICE_ID_AMD_VIPER_7410,	quirk_amd_ioapic);
-
-static void quirk_ioapic_rmw(struct pci_dev *dev)
-{
-	if (dev->devfn == 0 && dev->bus->number == 0)
-		sis_apic_bug = 1;
-}
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_SI,	PCI_ANY_ID,			quirk_ioapic_rmw);
 #endif /* CONFIG_X86_IO_APIC */
 
 /*
diff --git a/drivers/power/reset/ltc2952-poweroff.c b/drivers/power/reset/ltc2952-poweroff.c
index 62c91acd6584e9..15fed9d8f871a7 100644
--- a/drivers/power/reset/ltc2952-poweroff.c
+++ b/drivers/power/reset/ltc2952-poweroff.c
@@ -158,7 +158,6 @@ static irqreturn_t ltc2952_poweroff_handler(int irq, void *dev_id)
 			      HRTIMER_MODE_REL);
 	} else {
 		hrtimer_cancel(&data->timer_trigger);
-		/* omitting return value check, timer should have been valid */
 	}
 	return IRQ_HANDLED;
 }
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
index cc3ab351943e19..f9262243f9359a 100644
--- a/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
@@ -87,7 +87,7 @@ static void cfs_cpu_core_siblings(int cpu, cpumask_t *mask)
 /* return cpumask of HTs in the same core */
 static void cfs_cpu_ht_siblings(int cpu, cpumask_t *mask)
 {
-	cpumask_copy(mask, topology_thread_cpumask(cpu));
+	cpumask_copy(mask, topology_sibling_cpumask(cpu));
 }
 
 static void cfs_node_to_cpumask(int node, cpumask_t *mask)
diff --git a/drivers/staging/lustre/lustre/ptlrpc/service.c b/drivers/staging/lustre/lustre/ptlrpc/service.c
index 8e61421515cb68..344189ac5698a5 100644
--- a/drivers/staging/lustre/lustre/ptlrpc/service.c
+++ b/drivers/staging/lustre/lustre/ptlrpc/service.c
@@ -557,7 +557,7 @@ ptlrpc_server_nthreads_check(struct ptlrpc_service *svc,
 		 * there are.
 		 */
 		/* weight is # of HTs */
-		if (cpumask_weight(topology_thread_cpumask(0)) > 1) {
+		if (cpumask_weight(topology_sibling_cpumask(0)) > 1) {
 			/* depress thread factor for hyper-thread */
 			factor = factor - (factor >> 1) + (factor >> 3);
 		}
@@ -2768,7 +2768,7 @@ int ptlrpc_hr_init(void)
 
 	init_waitqueue_head(&ptlrpc_hr.hr_waitq);
 
-	weight = cpumask_weight(topology_thread_cpumask(0));
+	weight = cpumask_weight(topology_sibling_cpumask(0));
 
 	cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
 		hrp->hrp_cpt = i;
diff --git a/fs/select.c b/fs/select.c
index f684c750e08a54..015547330e8869 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -189,7 +189,7 @@ static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
 	 * doesn't imply write barrier and the users expect write
 	 * barrier semantics on wakeup functions.  The following
 	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
-	 * and is paired with set_mb() in poll_schedule_timeout.
+	 * and is paired with smp_store_mb() in poll_schedule_timeout.
 	 */
 	smp_wmb();
 	pwq->triggered = 1;
@@ -244,7 +244,7 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
 	/*
 	 * Prepare for the next iteration.
 	 *
-	 * The following set_mb() serves two purposes.  First, it's
+	 * The following smp_store_mb() serves two purposes.  First, it's
 	 * the counterpart rmb of the wmb in pollwake() such that data
 	 * written before wake up is always visible after wake up.
 	 * Second, the full barrier guarantees that triggered clearing
@@ -252,7 +252,7 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
 	 * this problem doesn't exist for the first iteration as
 	 * add_wait_queue() has full barrier semantics.
 	 */
-	set_mb(pwq->triggered, 0);
+	smp_store_mb(pwq->triggered, 0);
 
 	return rc;
 }
diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index f5c40b0fadc2a5..e6a83d712ef677 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -66,8 +66,8 @@
 #define smp_read_barrier_depends()	do { } while (0)
 #endif
 
-#ifndef set_mb
-#define set_mb(var, value)  do { (var) = (value); mb(); } while (0)
+#ifndef smp_store_mb
+#define smp_store_mb(var, value)  do { WRITE_ONCE(var, value); mb(); } while (0)
 #endif
 
 #ifndef smp_mb__before_atomic
diff --git a/include/asm-generic/cmpxchg.h b/include/asm-generic/cmpxchg.h
index 811fb1e9b06131..3766ab34aa45af 100644
--- a/include/asm-generic/cmpxchg.h
+++ b/include/asm-generic/cmpxchg.h
@@ -86,9 +86,6 @@ unsigned long __xchg(unsigned long x, volatile void *ptr, int size)
 
 /*
  * Atomic compare and exchange.
- *
- * Do not define __HAVE_ARCH_CMPXCHG because we want to use it to check whether
- * a cmpxchg primitive faster than repeated local irq save/restore exists.
  */
 #include <asm-generic/cmpxchg-local.h>
 
diff --git a/include/asm-generic/futex.h b/include/asm-generic/futex.h
index b59b5a52637ec7..e56272c919b5a6 100644
--- a/include/asm-generic/futex.h
+++ b/include/asm-generic/futex.h
@@ -8,8 +8,7 @@
 #ifndef CONFIG_SMP
 /*
  * The following implementation only for uniprocessor machines.
- * For UP, it's relies on the fact that pagefault_disable() also disables
- * preemption to ensure mutual exclusion.
+ * It relies on preempt_disable() ensuring mutual exclusion.
  *
  */
 
@@ -38,6 +37,7 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
 		oparg = 1 << oparg;
 
+	preempt_disable();
 	pagefault_disable();
 
 	ret = -EFAULT;
@@ -72,6 +72,7 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 
 out_pagefault_enable:
 	pagefault_enable();
+	preempt_enable();
 
 	if (ret == 0) {
 		switch (cmp) {
@@ -106,6 +107,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 {
 	u32 val;
 
+	preempt_disable();
 	if (unlikely(get_user(val, uaddr) != 0))
 		return -EFAULT;
 
@@ -113,6 +115,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 		return -EFAULT;
 
 	*uval = val;
+	preempt_enable();
 
 	return 0;
 }
diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index 9db042304df37d..90ccba7f9f9a5b 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -769,6 +769,14 @@ static inline void __iomem *ioremap_nocache(phys_addr_t offset, size_t size)
 }
 #endif
 
+#ifndef ioremap_uc
+#define ioremap_uc ioremap_uc
+static inline void __iomem *ioremap_uc(phys_addr_t offset, size_t size)
+{
+	return ioremap_nocache(offset, size);
+}
+#endif
+
 #ifndef ioremap_wc
 #define ioremap_wc ioremap_wc
 static inline void __iomem *ioremap_wc(phys_addr_t offset, size_t size)
diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
new file mode 100644
index 00000000000000..83bfb87f5bf18e
--- /dev/null
+++ b/include/asm-generic/qspinlock.h
@@ -0,0 +1,139 @@
+/*
+ * Queued spinlock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
+ *
+ * Authors: Waiman Long <waiman.long@hp.com>
+ */
+#ifndef __ASM_GENERIC_QSPINLOCK_H
+#define __ASM_GENERIC_QSPINLOCK_H
+
+#include <asm-generic/qspinlock_types.h>
+
+/**
+ * queued_spin_is_locked - is the spinlock locked?
+ * @lock: Pointer to queued spinlock structure
+ * Return: 1 if it is locked, 0 otherwise
+ */
+static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
+{
+	return atomic_read(&lock->val);
+}
+
+/**
+ * queued_spin_value_unlocked - is the spinlock structure unlocked?
+ * @lock: queued spinlock structure
+ * Return: 1 if it is unlocked, 0 otherwise
+ *
+ * N.B. Whenever there are tasks waiting for the lock, it is considered
+ *      locked wrt the lockref code to avoid lock stealing by the lockref
+ *      code and change things underneath the lock. This also allows some
+ *      optimizations to be applied without conflict with lockref.
+ */
+static __always_inline int queued_spin_value_unlocked(struct qspinlock lock)
+{
+	return !atomic_read(&lock.val);
+}
+
+/**
+ * queued_spin_is_contended - check if the lock is contended
+ * @lock : Pointer to queued spinlock structure
+ * Return: 1 if lock contended, 0 otherwise
+ */
+static __always_inline int queued_spin_is_contended(struct qspinlock *lock)
+{
+	return atomic_read(&lock->val) & ~_Q_LOCKED_MASK;
+}
+/**
+ * queued_spin_trylock - try to acquire the queued spinlock
+ * @lock : Pointer to queued spinlock structure
+ * Return: 1 if lock acquired, 0 if failed
+ */
+static __always_inline int queued_spin_trylock(struct qspinlock *lock)
+{
+	if (!atomic_read(&lock->val) &&
+	   (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) == 0))
+		return 1;
+	return 0;
+}
+
+extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+
+/**
+ * queued_spin_lock - acquire a queued spinlock
+ * @lock: Pointer to queued spinlock structure
+ */
+static __always_inline void queued_spin_lock(struct qspinlock *lock)
+{
+	u32 val;
+
+	val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL);
+	if (likely(val == 0))
+		return;
+	queued_spin_lock_slowpath(lock, val);
+}
+
+#ifndef queued_spin_unlock
+/**
+ * queued_spin_unlock - release a queued spinlock
+ * @lock : Pointer to queued spinlock structure
+ */
+static __always_inline void queued_spin_unlock(struct qspinlock *lock)
+{
+	/*
+	 * smp_mb__before_atomic() in order to guarantee release semantics
+	 */
+	smp_mb__before_atomic_dec();
+	atomic_sub(_Q_LOCKED_VAL, &lock->val);
+}
+#endif
+
+/**
+ * queued_spin_unlock_wait - wait until current lock holder releases the lock
+ * @lock : Pointer to queued spinlock structure
+ *
+ * There is a very slight possibility of live-lock if the lockers keep coming
+ * and the waiter is just unfortunate enough to not see any unlock state.
+ */
+static inline void queued_spin_unlock_wait(struct qspinlock *lock)
+{
+	while (atomic_read(&lock->val) & _Q_LOCKED_MASK)
+		cpu_relax();
+}
+
+#ifndef virt_queued_spin_lock
+static __always_inline bool virt_queued_spin_lock(struct qspinlock *lock)
+{
+	return false;
+}
+#endif
+
+/*
+ * Initializier
+ */
+#define	__ARCH_SPIN_LOCK_UNLOCKED	{ ATOMIC_INIT(0) }
+
+/*
+ * Remapping spinlock architecture specific functions to the corresponding
+ * queued spinlock functions.
+ */
+#define arch_spin_is_locked(l)		queued_spin_is_locked(l)
+#define arch_spin_is_contended(l)	queued_spin_is_contended(l)
+#define arch_spin_value_unlocked(l)	queued_spin_value_unlocked(l)
+#define arch_spin_lock(l)		queued_spin_lock(l)
+#define arch_spin_trylock(l)		queued_spin_trylock(l)
+#define arch_spin_unlock(l)		queued_spin_unlock(l)
+#define arch_spin_lock_flags(l, f)	queued_spin_lock(l)
+#define arch_spin_unlock_wait(l)	queued_spin_unlock_wait(l)
+
+#endif /* __ASM_GENERIC_QSPINLOCK_H */
diff --git a/include/asm-generic/qspinlock_types.h b/include/asm-generic/qspinlock_types.h
new file mode 100644
index 00000000000000..85f888e86761e1
--- /dev/null
+++ b/include/asm-generic/qspinlock_types.h
@@ -0,0 +1,79 @@
+/*
+ * Queued spinlock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
+ *
+ * Authors: Waiman Long <waiman.long@hp.com>
+ */
+#ifndef __ASM_GENERIC_QSPINLOCK_TYPES_H
+#define __ASM_GENERIC_QSPINLOCK_TYPES_H
+
+/*
+ * Including atomic.h with PARAVIRT on will cause compilation errors because
+ * of recursive header file incluson via paravirt_types.h. So don't include
+ * it if PARAVIRT is on.
+ */
+#ifndef CONFIG_PARAVIRT
+#include <linux/types.h>
+#include <linux/atomic.h>
+#endif
+
+typedef struct qspinlock {
+	atomic_t	val;
+} arch_spinlock_t;
+
+/*
+ * Bitfields in the atomic value:
+ *
+ * When NR_CPUS < 16K
+ *  0- 7: locked byte
+ *     8: pending
+ *  9-15: not used
+ * 16-17: tail index
+ * 18-31: tail cpu (+1)
+ *
+ * When NR_CPUS >= 16K
+ *  0- 7: locked byte
+ *     8: pending
+ *  9-10: tail index
+ * 11-31: tail cpu (+1)
+ */
+#define	_Q_SET_MASK(type)	(((1U << _Q_ ## type ## _BITS) - 1)\
+				      << _Q_ ## type ## _OFFSET)
+#define _Q_LOCKED_OFFSET	0
+#define _Q_LOCKED_BITS		8
+#define _Q_LOCKED_MASK		_Q_SET_MASK(LOCKED)
+
+#define _Q_PENDING_OFFSET	(_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
+#if CONFIG_NR_CPUS < (1U << 14)
+#define _Q_PENDING_BITS		8
+#else
+#define _Q_PENDING_BITS		1
+#endif
+#define _Q_PENDING_MASK		_Q_SET_MASK(PENDING)
+
+#define _Q_TAIL_IDX_OFFSET	(_Q_PENDING_OFFSET + _Q_PENDING_BITS)
+#define _Q_TAIL_IDX_BITS	2
+#define _Q_TAIL_IDX_MASK	_Q_SET_MASK(TAIL_IDX)
+
+#define _Q_TAIL_CPU_OFFSET	(_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)
+#define _Q_TAIL_CPU_BITS	(32 - _Q_TAIL_CPU_OFFSET)
+#define _Q_TAIL_CPU_MASK	_Q_SET_MASK(TAIL_CPU)
+
+#define _Q_TAIL_OFFSET		_Q_TAIL_IDX_OFFSET
+#define _Q_TAIL_MASK		(_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)
+
+#define _Q_LOCKED_VAL		(1U << _Q_LOCKED_OFFSET)
+#define _Q_PENDING_VAL		(1U << _Q_PENDING_OFFSET)
+
+#endif /* __ASM_GENERIC_QSPINLOCK_TYPES_H */
diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h
index a899402a5a0e63..52f3b7da4f2d4d 100644
--- a/include/linux/alarmtimer.h
+++ b/include/linux/alarmtimer.h
@@ -43,8 +43,8 @@ struct alarm {
 
 void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
 		enum alarmtimer_restart (*function)(struct alarm *, ktime_t));
-int alarm_start(struct alarm *alarm, ktime_t start);
-int alarm_start_relative(struct alarm *alarm, ktime_t start);
+void alarm_start(struct alarm *alarm, ktime_t start);
+void alarm_start_relative(struct alarm *alarm, ktime_t start);
 void alarm_restart(struct alarm *alarm);
 int alarm_try_to_cancel(struct alarm *alarm);
 int alarm_cancel(struct alarm *alarm);
diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
index 86c12c93e3cf6c..8fdcb783197d72 100644
--- a/include/linux/bottom_half.h
+++ b/include/linux/bottom_half.h
@@ -2,7 +2,6 @@
 #define _LINUX_BH_H
 
 #include <linux/preempt.h>
-#include <linux/preempt_mask.h>
 
 #ifdef CONFIG_TRACE_IRQFLAGS
 extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 96c280b2c26347..597a1e836f2237 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -37,12 +37,15 @@ enum clock_event_mode {
  *		reached from DETACHED or SHUTDOWN.
  * ONESHOT:	Device is programmed to generate event only once. Can be reached
  *		from DETACHED or SHUTDOWN.
+ * ONESHOT_STOPPED: Device was programmed in ONESHOT mode and is temporarily
+ *		    stopped.
  */
 enum clock_event_state {
 	CLOCK_EVT_STATE_DETACHED,
 	CLOCK_EVT_STATE_SHUTDOWN,
 	CLOCK_EVT_STATE_PERIODIC,
 	CLOCK_EVT_STATE_ONESHOT,
+	CLOCK_EVT_STATE_ONESHOT_STOPPED,
 };
 
 /*
@@ -84,12 +87,13 @@ enum clock_event_state {
  * @mult:		nanosecond to cycles multiplier
  * @shift:		nanoseconds to cycles divisor (power of two)
  * @mode:		operating mode, relevant only to ->set_mode(), OBSOLETE
- * @state:		current state of the device, assigned by the core code
+ * @state_use_accessors:current state of the device, assigned by the core code
  * @features:		features
  * @retries:		number of forced programming retries
  * @set_mode:		legacy set mode function, only for modes <= CLOCK_EVT_MODE_RESUME.
  * @set_state_periodic:	switch state to periodic, if !set_mode
  * @set_state_oneshot:	switch state to oneshot, if !set_mode
+ * @set_state_oneshot_stopped: switch state to oneshot_stopped, if !set_mode
  * @set_state_shutdown:	switch state to shutdown, if !set_mode
  * @tick_resume:	resume clkevt device, if !set_mode
  * @broadcast:		function to broadcast events
@@ -113,7 +117,7 @@ struct clock_event_device {
 	u32			mult;
 	u32			shift;
 	enum clock_event_mode	mode;
-	enum clock_event_state	state;
+	enum clock_event_state	state_use_accessors;
 	unsigned int		features;
 	unsigned long		retries;
 
@@ -121,11 +125,12 @@ struct clock_event_device {
 	 * State transition callback(s): Only one of the two groups should be
 	 * defined:
 	 * - set_mode(), only for modes <= CLOCK_EVT_MODE_RESUME.
-	 * - set_state_{shutdown|periodic|oneshot}(), tick_resume().
+	 * - set_state_{shutdown|periodic|oneshot|oneshot_stopped}(), tick_resume().
 	 */
 	void			(*set_mode)(enum clock_event_mode mode, struct clock_event_device *);
 	int			(*set_state_periodic)(struct clock_event_device *);
 	int			(*set_state_oneshot)(struct clock_event_device *);
+	int			(*set_state_oneshot_stopped)(struct clock_event_device *);
 	int			(*set_state_shutdown)(struct clock_event_device *);
 	int			(*tick_resume)(struct clock_event_device *);
 
@@ -144,6 +149,32 @@ struct clock_event_device {
 	struct module		*owner;
 } ____cacheline_aligned;
 
+/* Helpers to verify state of a clockevent device */
+static inline bool clockevent_state_detached(struct clock_event_device *dev)
+{
+	return dev->state_use_accessors == CLOCK_EVT_STATE_DETACHED;
+}
+
+static inline bool clockevent_state_shutdown(struct clock_event_device *dev)
+{
+	return dev->state_use_accessors == CLOCK_EVT_STATE_SHUTDOWN;
+}
+
+static inline bool clockevent_state_periodic(struct clock_event_device *dev)
+{
+	return dev->state_use_accessors == CLOCK_EVT_STATE_PERIODIC;
+}
+
+static inline bool clockevent_state_oneshot(struct clock_event_device *dev)
+{
+	return dev->state_use_accessors == CLOCK_EVT_STATE_ONESHOT;
+}
+
+static inline bool clockevent_state_oneshot_stopped(struct clock_event_device *dev)
+{
+	return dev->state_use_accessors == CLOCK_EVT_STATE_ONESHOT_STOPPED;
+}
+
 /*
  * Calculate a multiplication factor for scaled math, which is used to convert
  * nanoseconds based values to clock ticks:
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index d27d0152271f9e..278dd279a7a803 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -181,7 +181,6 @@ static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift)
 
 extern int clocksource_unregister(struct clocksource*);
 extern void clocksource_touch_watchdog(void);
-extern struct clocksource* clocksource_get_next(void);
 extern void clocksource_change_rating(struct clocksource *cs, int rating);
 extern void clocksource_suspend(void);
 extern void clocksource_resume(void);
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index eae42c21d5fde9..f7e7235f3a91b8 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -250,7 +250,23 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 	({ union { typeof(x) __val; char __c[1]; } __u; __read_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })
 
 #define WRITE_ONCE(x, val) \
-	({ typeof(x) __val = (val); __write_once_size(&(x), &__val, sizeof(__val)); __val; })
+	({ union { typeof(x) __val; char __c[1]; } __u = { .__val = (val) }; __write_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })
+
+/**
+ * READ_ONCE_CTRL - Read a value heading a control dependency
+ * @x: The value to be read, heading the control dependency
+ *
+ * Control dependencies are tricky.  See Documentation/memory-barriers.txt
+ * for important information on how to use them.  Note that in many cases,
+ * use of smp_load_acquire() will be much simpler.  Control dependencies
+ * should be avoided except on the hottest of hotpaths.
+ */
+#define READ_ONCE_CTRL(x) \
+({ \
+	typeof(x) __val = READ_ONCE(x); \
+	smp_read_barrier_depends(); /* Enforce control dependency. */ \
+	__val; \
+})
 
 #endif /* __KERNEL__ */
 
@@ -450,7 +466,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
  * with an explicit memory barrier or atomic instruction that provides the
  * required ordering.
  *
- * If possible use READ_ONCE/ASSIGN_ONCE instead.
+ * If possible use READ_ONCE()/WRITE_ONCE() instead.
  */
 #define __ACCESS_ONCE(x) ({ \
 	 __maybe_unused typeof(x) __var = (__force typeof(x)) 0; \
@@ -467,7 +483,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
  */
 #define lockless_dereference(p) \
 ({ \
-	typeof(p) _________p1 = ACCESS_ONCE(p); \
+	typeof(p) _________p1 = READ_ONCE(p); \
 	smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
 	(_________p1); \
 })
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 2821838256b4f7..b96bd299966f04 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -14,8 +14,6 @@ extern void context_tracking_enter(enum ctx_state state);
 extern void context_tracking_exit(enum ctx_state state);
 extern void context_tracking_user_enter(void);
 extern void context_tracking_user_exit(void);
-extern void __context_tracking_task_switch(struct task_struct *prev,
-					   struct task_struct *next);
 
 static inline void user_enter(void)
 {
@@ -51,19 +49,11 @@ static inline void exception_exit(enum ctx_state prev_ctx)
 	}
 }
 
-static inline void context_tracking_task_switch(struct task_struct *prev,
-						struct task_struct *next)
-{
-	if (context_tracking_is_enabled())
-		__context_tracking_task_switch(prev, next);
-}
 #else
 static inline void user_enter(void) { }
 static inline void user_exit(void) { }
 static inline enum ctx_state exception_enter(void) { return 0; }
 static inline void exception_exit(enum ctx_state prev_ctx) { }
-static inline void context_tracking_task_switch(struct task_struct *prev,
-						struct task_struct *next) { }
 #endif /* !CONFIG_CONTEXT_TRACKING */
 
 
diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 6b7b96a32b753a..678ecdf90cf660 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -12,6 +12,7 @@ struct context_tracking {
 	 * may be further optimized using static keys.
 	 */
 	bool active;
+	int recursion;
 	enum ctx_state {
 		CONTEXT_KERNEL = 0,
 		CONTEXT_USER,
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index 30624954dec5a9..84737565c1fd95 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -227,6 +227,7 @@ extern void dmar_msi_read(int irq, struct msi_msg *msg);
 extern void dmar_msi_write(int irq, struct msi_msg *msg);
 extern int dmar_set_interrupt(struct intel_iommu *iommu);
 extern irqreturn_t dmar_fault(int irq, void *dev_id);
-extern int arch_setup_dmar_msi(unsigned int irq);
+extern int dmar_alloc_hwirq(int id, int node, void *arg);
+extern void dmar_free_hwirq(int irq);
 
 #endif /* __DMAR_H__ */
diff --git a/include/linux/efi.h b/include/linux/efi.h
index af5be0368dec26..024c27e7c0faa9 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -583,6 +583,9 @@ void efi_native_runtime_setup(void);
 #define EFI_FILE_INFO_ID \
     EFI_GUID(  0x9576e92, 0x6d3f, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b )
 
+#define EFI_SYSTEM_RESOURCE_TABLE_GUID \
+    EFI_GUID(  0xb122a263, 0x3661, 0x4f68, 0x99, 0x29, 0x78, 0xf8, 0xb0, 0xd6, 0x21, 0x80 )
+
 #define EFI_FILE_SYSTEM_GUID \
     EFI_GUID(  0x964e5b22, 0x6459, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b )
 
@@ -823,6 +826,7 @@ extern struct efi {
 	unsigned long fw_vendor;	/* fw_vendor */
 	unsigned long runtime;		/* runtime table */
 	unsigned long config_table;	/* config tables */
+	unsigned long esrt;		/* ESRT table */
 	efi_get_time_t *get_time;
 	efi_set_time_t *set_time;
 	efi_get_wakeup_time_t *get_wakeup_time;
@@ -875,6 +879,7 @@ static inline efi_status_t efi_query_variable_store(u32 attributes, unsigned lon
 #endif
 extern void __iomem *efi_lookup_mapped_addr(u64 phys_addr);
 extern int efi_config_init(efi_config_table_type_t *arch_tables);
+extern void __init efi_esrt_init(void);
 extern int efi_config_parse_tables(void *config_tables, int count, int sz,
 				   efi_config_table_type_t *arch_tables);
 extern u64 efi_get_iobase (void);
@@ -882,12 +887,15 @@ extern u32 efi_mem_type (unsigned long phys_addr);
 extern u64 efi_mem_attributes (unsigned long phys_addr);
 extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size);
 extern int __init efi_uart_console_only (void);
+extern u64 efi_mem_desc_end(efi_memory_desc_t *md);
+extern int efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md);
 extern void efi_initialize_iomem_resources(struct resource *code_resource,
 		struct resource *data_resource, struct resource *bss_resource);
 extern void efi_get_time(struct timespec *now);
 extern void efi_reserve_boot_services(void);
 extern int efi_get_fdt_params(struct efi_fdt_params *params, int verbose);
 extern struct efi_memory_map memmap;
+extern struct kobject *efi_kobj;
 
 extern int efi_reboot_quirk_mode;
 extern bool efi_poweroff_required(void);
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index f4af03404b9789..dfd59d6bc6f0f7 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -1,7 +1,7 @@
 #ifndef LINUX_HARDIRQ_H
 #define LINUX_HARDIRQ_H
 
-#include <linux/preempt_mask.h>
+#include <linux/preempt.h>
 #include <linux/lockdep.h>
 #include <linux/ftrace_irq.h>
 #include <linux/vtime.h>
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 9286a46b7d69b5..6aefcd0031a6bd 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -65,6 +65,7 @@ static inline void kunmap(struct page *page)
 
 static inline void *kmap_atomic(struct page *page)
 {
+	preempt_disable();
 	pagefault_disable();
 	return page_address(page);
 }
@@ -73,6 +74,7 @@ static inline void *kmap_atomic(struct page *page)
 static inline void __kunmap_atomic(void *addr)
 {
 	pagefault_enable();
+	preempt_enable();
 }
 
 #define kmap_atomic_pfn(pfn)	kmap_atomic(pfn_to_page(pfn))
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 05f6df1fdf5bbf..470d876c2edafd 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -130,6 +130,12 @@ struct hrtimer_sleeper {
 	struct task_struct *task;
 };
 
+#ifdef CONFIG_64BIT
+# define HRTIMER_CLOCK_BASE_ALIGN	64
+#else
+# define HRTIMER_CLOCK_BASE_ALIGN	32
+#endif
+
 /**
  * struct hrtimer_clock_base - the timer base for a specific clock
  * @cpu_base:		per cpu clock base
@@ -137,9 +143,7 @@ struct hrtimer_sleeper {
  *			timer to a base on another cpu.
  * @clockid:		clock id for per_cpu support
  * @active:		red black tree root node for the active timers
- * @resolution:		the resolution of the clock, in nanoseconds
  * @get_time:		function to retrieve the current time of the clock
- * @softirq_time:	the time when running the hrtimer queue in the softirq
  * @offset:		offset of this clock to the monotonic base
  */
 struct hrtimer_clock_base {
@@ -147,11 +151,9 @@ struct hrtimer_clock_base {
 	int			index;
 	clockid_t		clockid;
 	struct timerqueue_head	active;
-	ktime_t			resolution;
 	ktime_t			(*get_time)(void);
-	ktime_t			softirq_time;
 	ktime_t			offset;
-};
+} __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
 
 enum  hrtimer_base_type {
 	HRTIMER_BASE_MONOTONIC,
@@ -167,9 +169,10 @@ enum  hrtimer_base_type {
  *			and timers
  * @cpu:		cpu number
  * @active_bases:	Bitfield to mark bases with active timers
- * @clock_was_set:	Indicates that clock was set from irq context.
+ * @clock_was_set_seq:	Sequence counter of clock was set events
  * @expires_next:	absolute time of the next event which was scheduled
  *			via clock_set_next_event()
+ * @next_timer:		Pointer to the first expiring timer
  * @in_hrtirq:		hrtimer_interrupt() is currently executing
  * @hres_active:	State of high resolution mode
  * @hang_detected:	The last hrtimer interrupt detected a hang
@@ -178,27 +181,34 @@ enum  hrtimer_base_type {
  * @nr_hangs:		Total number of hrtimer interrupt hangs
  * @max_hang_time:	Maximum time spent in hrtimer_interrupt
  * @clock_base:		array of clock bases for this cpu
+ *
+ * Note: next_timer is just an optimization for __remove_hrtimer().
+ *	 Do not dereference the pointer because it is not reliable on
+ *	 cross cpu removals.
  */
 struct hrtimer_cpu_base {
 	raw_spinlock_t			lock;
 	unsigned int			cpu;
 	unsigned int			active_bases;
-	unsigned int			clock_was_set;
+	unsigned int			clock_was_set_seq;
 #ifdef CONFIG_HIGH_RES_TIMERS
+	unsigned int			in_hrtirq	: 1,
+					hres_active	: 1,
+					hang_detected	: 1;
 	ktime_t				expires_next;
-	int				in_hrtirq;
-	int				hres_active;
-	int				hang_detected;
-	unsigned long			nr_events;
-	unsigned long			nr_retries;
-	unsigned long			nr_hangs;
-	ktime_t				max_hang_time;
+	struct hrtimer			*next_timer;
+	unsigned int			nr_events;
+	unsigned int			nr_retries;
+	unsigned int			nr_hangs;
+	unsigned int			max_hang_time;
 #endif
 	struct hrtimer_clock_base	clock_base[HRTIMER_MAX_CLOCK_BASES];
-};
+} ____cacheline_aligned;
 
 static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
 {
+	BUILD_BUG_ON(sizeof(struct hrtimer_clock_base) > HRTIMER_CLOCK_BASE_ALIGN);
+
 	timer->node.expires = time;
 	timer->_softexpires = time;
 }
@@ -262,19 +272,16 @@ static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer)
 	return ktime_sub(timer->node.expires, timer->base->get_time());
 }
 
-#ifdef CONFIG_HIGH_RES_TIMERS
-struct clock_event_device;
-
-extern void hrtimer_interrupt(struct clock_event_device *dev);
-
-/*
- * In high resolution mode the time reference must be read accurate
- */
 static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer)
 {
 	return timer->base->get_time();
 }
 
+#ifdef CONFIG_HIGH_RES_TIMERS
+struct clock_event_device;
+
+extern void hrtimer_interrupt(struct clock_event_device *dev);
+
 static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 {
 	return timer->base->cpu_base->hres_active;
@@ -295,21 +302,16 @@ extern void hrtimer_peek_ahead_timers(void);
 
 extern void clock_was_set_delayed(void);
 
+extern unsigned int hrtimer_resolution;
+
 #else
 
 # define MONOTONIC_RES_NSEC	LOW_RES_NSEC
 # define KTIME_MONOTONIC_RES	KTIME_LOW_RES
 
-static inline void hrtimer_peek_ahead_timers(void) { }
+#define hrtimer_resolution	LOW_RES_NSEC
 
-/*
- * In non high resolution mode the time reference is taken from
- * the base softirq time variable.
- */
-static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer)
-{
-	return timer->base->softirq_time;
-}
+static inline void hrtimer_peek_ahead_timers(void) { }
 
 static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 {
@@ -353,39 +355,45 @@ static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { }
 #endif
 
 /* Basic timer operations: */
-extern int hrtimer_start(struct hrtimer *timer, ktime_t tim,
-			 const enum hrtimer_mode mode);
-extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 			unsigned long range_ns, const enum hrtimer_mode mode);
-extern int
-__hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
-			 unsigned long delta_ns,
-			 const enum hrtimer_mode mode, int wakeup);
+
+/**
+ * hrtimer_start - (re)start an hrtimer on the current CPU
+ * @timer:	the timer to be added
+ * @tim:	expiry time
+ * @mode:	expiry mode: absolute (HRTIMER_MODE_ABS) or
+ *		relative (HRTIMER_MODE_REL)
+ */
+static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim,
+				 const enum hrtimer_mode mode)
+{
+	hrtimer_start_range_ns(timer, tim, 0, mode);
+}
 
 extern int hrtimer_cancel(struct hrtimer *timer);
 extern int hrtimer_try_to_cancel(struct hrtimer *timer);
 
-static inline int hrtimer_start_expires(struct hrtimer *timer,
-						enum hrtimer_mode mode)
+static inline void hrtimer_start_expires(struct hrtimer *timer,
+					 enum hrtimer_mode mode)
 {
 	unsigned long delta;
 	ktime_t soft, hard;
 	soft = hrtimer_get_softexpires(timer);
 	hard = hrtimer_get_expires(timer);
 	delta = ktime_to_ns(ktime_sub(hard, soft));
-	return hrtimer_start_range_ns(timer, soft, delta, mode);
+	hrtimer_start_range_ns(timer, soft, delta, mode);
 }
 
-static inline int hrtimer_restart(struct hrtimer *timer)
+static inline void hrtimer_restart(struct hrtimer *timer)
 {
-	return hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+	hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
 }
 
 /* Query timers: */
 extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer);
-extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp);
 
-extern ktime_t hrtimer_get_next_event(void);
+extern u64 hrtimer_get_next_event(void);
 
 /*
  * A timer is active, when it is enqueued into the rbtree or the
@@ -418,7 +426,22 @@ static inline int hrtimer_callback_running(struct hrtimer *timer)
 extern u64
 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval);
 
-/* Forward a hrtimer so it expires after the hrtimer's current now */
+/**
+ * hrtimer_forward_now - forward the timer expiry so it expires after now
+ * @timer:	hrtimer to forward
+ * @interval:	the interval to forward
+ *
+ * Forward the timer expiry so it will expire after the current time
+ * of the hrtimer clock base. Returns the number of overruns.
+ *
+ * Can be safely called from the callback function of @timer. If
+ * called from other contexts @timer must neither be enqueued nor
+ * running the callback and the caller needs to take care of
+ * serialization.
+ *
+ * Note: This only updates the timer expiry value and does not requeue
+ * the timer.
+ */
 static inline u64 hrtimer_forward_now(struct hrtimer *timer,
 				      ktime_t interval)
 {
@@ -443,7 +466,6 @@ extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode);
 
 /* Soft interrupt function to run the hrtimer queues: */
 extern void hrtimer_run_queues(void);
-extern void hrtimer_run_pending(void);
 
 /* Bootup initialization: */
 extern void __init hrtimers_init(void);
diff --git a/include/linux/htirq.h b/include/linux/htirq.h
index 70a1dbbf209350..d4a527e58434df 100644
--- a/include/linux/htirq.h
+++ b/include/linux/htirq.h
@@ -1,24 +1,38 @@
 #ifndef LINUX_HTIRQ_H
 #define LINUX_HTIRQ_H
 
+struct pci_dev;
+struct irq_data;
+
 struct ht_irq_msg {
 	u32	address_lo;	/* low 32 bits of the ht irq message */
 	u32	address_hi;	/* high 32 bits of the it irq message */
 };
 
+typedef void (ht_irq_update_t)(struct pci_dev *dev, int irq,
+			       struct ht_irq_msg *msg);
+
+struct ht_irq_cfg {
+	struct pci_dev *dev;
+	 /* Update callback used to cope with buggy hardware */
+	ht_irq_update_t *update;
+	unsigned pos;
+	unsigned idx;
+	struct ht_irq_msg msg;
+};
+
 /* Helper functions.. */
 void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg);
 void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg);
-struct irq_data;
 void mask_ht_irq(struct irq_data *data);
 void unmask_ht_irq(struct irq_data *data);
 
 /* The arch hook for getting things started */
-int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev);
+int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev,
+		      ht_irq_update_t *update);
+void arch_teardown_ht_irq(unsigned int irq);
 
 /* For drivers of buggy hardware */
-typedef void (ht_irq_update_t)(struct pci_dev *dev, int irq,
-			       struct ht_irq_msg *msg);
 int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update);
 
 #endif /* LINUX_HTIRQ_H */
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 696d22312b3199..bb9b075f0eb022 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -50,9 +50,8 @@ extern struct fs_struct init_fs;
 	.cpu_timers	= INIT_CPU_TIMERS(sig.cpu_timers),		\
 	.rlim		= INIT_RLIMITS,					\
 	.cputimer	= { 						\
-		.cputime = INIT_CPUTIME,				\
-		.running = 0,						\
-		.lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock),	\
+		.cputime_atomic	= INIT_CPUTIME_ATOMIC,			\
+		.running	= 0,					\
 	},								\
 	.cred_guard_mutex =						\
 		 __MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index a81ceeeaa3d118..928028fcb350e2 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -298,6 +298,8 @@ struct q_inval {
 
 #define INTR_REMAP_TABLE_ENTRIES	65536
 
+struct irq_domain;
+
 struct ir_table {
 	struct irte *base;
 	unsigned long *bitmap;
@@ -355,6 +357,8 @@ struct intel_iommu {
 
 #ifdef CONFIG_IRQ_REMAP
 	struct ir_table *ir_table;	/* Interrupt remapping info */
+	struct irq_domain *ir_domain;
+	struct irq_domain *ir_msi_domain;
 
 	/* whether interrupt remapping is enabled prior to OS*/
 	u8		pre_enabled_ir;
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 950ae450182602..be7e75c945e97b 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -413,7 +413,8 @@ enum
 	BLOCK_IOPOLL_SOFTIRQ,
 	TASKLET_SOFTIRQ,
 	SCHED_SOFTIRQ,
-	HRTIMER_SOFTIRQ,
+	HRTIMER_SOFTIRQ, /* Unused, but kept as tools rely on the
+			    numbering. Sigh! */
 	RCU_SOFTIRQ,    /* Preferable RCU should always be the last softirq */
 
 	NR_SOFTIRQS
@@ -592,10 +593,10 @@ tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
 		     clockid_t which_clock, enum hrtimer_mode mode);
 
 static inline
-int tasklet_hrtimer_start(struct tasklet_hrtimer *ttimer, ktime_t time,
-			  const enum hrtimer_mode mode)
+void tasklet_hrtimer_start(struct tasklet_hrtimer *ttimer, ktime_t time,
+			   const enum hrtimer_mode mode)
 {
-	return hrtimer_start(&ttimer->timer, time, mode);
+	hrtimer_start(&ttimer->timer, time, mode);
 }
 
 static inline
diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
index 657fab4efab351..c27dde7215b5b2 100644
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -141,6 +141,7 @@ static inline void __iomem *
 io_mapping_map_atomic_wc(struct io_mapping *mapping,
 			 unsigned long offset)
 {
+	preempt_disable();
 	pagefault_disable();
 	return ((char __force __iomem *) mapping) + offset;
 }
@@ -149,6 +150,7 @@ static inline void
 io_mapping_unmap_atomic(void __iomem *vaddr)
 {
 	pagefault_enable();
+	preempt_enable();
 }
 
 /* Non-atomic map/unmap */
diff --git a/include/linux/io.h b/include/linux/io.h
index 986f2bffea1edc..04cce4da368541 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -111,6 +111,13 @@ static inline void arch_phys_wc_del(int handle)
 }
 
 #define arch_phys_wc_add arch_phys_wc_add
+#ifndef arch_phys_wc_index
+static inline int arch_phys_wc_index(int handle)
+{
+	return -1;
+}
+#define arch_phys_wc_index arch_phys_wc_index
+#endif
 #endif
 
 #endif /* _LINUX_IO_H */
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 2633061364b1d1..de3213d271ffd5 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -327,6 +327,7 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
  * @irq_write_msi_msg:	optional to write message content for MSI
  * @irq_get_irqchip_state:	return the internal state of an interrupt
  * @irq_set_irqchip_state:	set the internal state of a interrupt
+ * @irq_set_vcpu_affinity:	optional to target a vCPU in a virtual machine
  * @flags:		chip specific flags
  */
 struct irq_chip {
@@ -369,6 +370,8 @@ struct irq_chip {
 	int		(*irq_get_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool *state);
 	int		(*irq_set_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool state);
 
+	int		(*irq_set_vcpu_affinity)(struct irq_data *data, void *vcpu_info);
+
 	unsigned long	flags;
 };
 
@@ -422,6 +425,7 @@ extern void irq_cpu_online(void);
 extern void irq_cpu_offline(void);
 extern int irq_set_affinity_locked(struct irq_data *data,
 				   const struct cpumask *cpumask, bool force);
+extern int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info);
 
 #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ)
 void irq_move_irq(struct irq_data *data);
@@ -469,6 +473,8 @@ extern int irq_chip_set_affinity_parent(struct irq_data *data,
 					const struct cpumask *dest,
 					bool force);
 extern int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on);
+extern int irq_chip_set_vcpu_affinity_parent(struct irq_data *data,
+					     void *vcpu_info);
 #endif
 
 /* Handling of unhandled and spurious interrupts: */
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index c367cbdf73ab1a..3bde5eb8568bbb 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -7,6 +7,7 @@
 #include <linux/time.h>
 #include <linux/timex.h>
 #include <asm/param.h>			/* for HZ */
+#include <generated/timeconst.h>
 
 /*
  * The following defines establish the engineering parameters of the PLL
@@ -288,7 +289,79 @@ static inline u64 jiffies_to_nsecs(const unsigned long j)
 	return (u64)jiffies_to_usecs(j) * NSEC_PER_USEC;
 }
 
-extern unsigned long msecs_to_jiffies(const unsigned int m);
+extern unsigned long __msecs_to_jiffies(const unsigned int m);
+#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
+/*
+ * HZ is equal to or smaller than 1000, and 1000 is a nice round
+ * multiple of HZ, divide with the factor between them, but round
+ * upwards:
+ */
+static inline unsigned long _msecs_to_jiffies(const unsigned int m)
+{
+	return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
+}
+#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
+/*
+ * HZ is larger than 1000, and HZ is a nice round multiple of 1000 -
+ * simply multiply with the factor between them.
+ *
+ * But first make sure the multiplication result cannot overflow:
+ */
+static inline unsigned long _msecs_to_jiffies(const unsigned int m)
+{
+	if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
+		return MAX_JIFFY_OFFSET;
+	return m * (HZ / MSEC_PER_SEC);
+}
+#else
+/*
+ * Generic case - multiply, round and divide. But first check that if
+ * we are doing a net multiplication, that we wouldn't overflow:
+ */
+static inline unsigned long _msecs_to_jiffies(const unsigned int m)
+{
+	if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
+		return MAX_JIFFY_OFFSET;
+
+	return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) >> MSEC_TO_HZ_SHR32;
+}
+#endif
+/**
+ * msecs_to_jiffies: - convert milliseconds to jiffies
+ * @m:	time in milliseconds
+ *
+ * conversion is done as follows:
+ *
+ * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
+ *
+ * - 'too large' values [that would result in larger than
+ *   MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
+ *
+ * - all other values are converted to jiffies by either multiplying
+ *   the input value by a factor or dividing it with a factor and
+ *   handling any 32-bit overflows.
+ *   for the details see __msecs_to_jiffies()
+ *
+ * msecs_to_jiffies() checks for the passed in value being a constant
+ * via __builtin_constant_p() allowing gcc to eliminate most of the
+ * code, __msecs_to_jiffies() is called if the value passed does not
+ * allow constant folding and the actual conversion must be done at
+ * runtime.
+ * the HZ range specific helpers _msecs_to_jiffies() are called both
+ * directly here and from __msecs_to_jiffies() in the case where
+ * constant folding is not possible.
+ */
+static inline unsigned long msecs_to_jiffies(const unsigned int m)
+{
+	if (__builtin_constant_p(m)) {
+		if ((int)m < 0)
+			return MAX_JIFFY_OFFSET;
+		return _msecs_to_jiffies(m);
+	} else {
+		return __msecs_to_jiffies(m);
+	}
+}
+
 extern unsigned long usecs_to_jiffies(const unsigned int u);
 extern unsigned long timespec_to_jiffies(const struct timespec *value);
 extern void jiffies_to_timespec(const unsigned long jiffies,
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index cd54b357c934d0..493465414b2dd9 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -244,7 +244,8 @@ static inline u32 reciprocal_scale(u32 val, u32 ep_ro)
 
 #if defined(CONFIG_MMU) && \
 	(defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP))
-void might_fault(void);
+#define might_fault() __might_fault(__FILE__, __LINE__)
+void __might_fault(const char *file, int line);
 #else
 static inline void might_fault(void) { }
 #endif
diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h
index 3a6490e81b2856..703ea5c30a33f8 100644
--- a/include/linux/osq_lock.h
+++ b/include/linux/osq_lock.h
@@ -32,4 +32,9 @@ static inline void osq_lock_init(struct optimistic_spin_queue *lock)
 extern bool osq_lock(struct optimistic_spin_queue *lock);
 extern void osq_unlock(struct optimistic_spin_queue *lock);
 
+static inline bool osq_is_locked(struct optimistic_spin_queue *lock)
+{
+	return atomic_read(&lock->tail) != OSQ_UNLOCKED_VAL;
+}
+
 #endif
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 67c719cc91aa12..c436d5a2f6f48f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -92,8 +92,6 @@ struct hw_perf_event_extra {
 	int		idx;	/* index in shared_regs->regs[] */
 };
 
-struct event_constraint;
-
 /**
  * struct hw_perf_event - performance event hardware details:
  */
@@ -112,8 +110,6 @@ struct hw_perf_event {
 
 			struct hw_perf_event_extra extra_reg;
 			struct hw_perf_event_extra branch_reg;
-
-			struct event_constraint *constraint;
 		};
 		struct { /* software */
 			struct hrtimer	hrtimer;
@@ -124,7 +120,7 @@ struct hw_perf_event {
 		};
 		struct { /* intel_cqm */
 			int			cqm_state;
-			int			cqm_rmid;
+			u32			cqm_rmid;
 			struct list_head	cqm_events_entry;
 			struct list_head	cqm_groups_entry;
 			struct list_head	cqm_group_entry;
@@ -571,8 +567,12 @@ struct perf_cpu_context {
 	struct perf_event_context	*task_ctx;
 	int				active_oncpu;
 	int				exclusive;
+
+	raw_spinlock_t			hrtimer_lock;
 	struct hrtimer			hrtimer;
 	ktime_t				hrtimer_interval;
+	unsigned int			hrtimer_active;
+
 	struct pmu			*unique_pmu;
 	struct perf_cgroup		*cgrp;
 };
@@ -803,11 +803,33 @@ perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
 
 extern struct static_key_deferred perf_sched_events;
 
+static __always_inline bool
+perf_sw_migrate_enabled(void)
+{
+	if (static_key_false(&perf_swevent_enabled[PERF_COUNT_SW_CPU_MIGRATIONS]))
+		return true;
+	return false;
+}
+
+static inline void perf_event_task_migrate(struct task_struct *task)
+{
+	if (perf_sw_migrate_enabled())
+		task->sched_migrated = 1;
+}
+
 static inline void perf_event_task_sched_in(struct task_struct *prev,
 					    struct task_struct *task)
 {
 	if (static_key_false(&perf_sched_events.key))
 		__perf_event_task_sched_in(prev, task);
+
+	if (perf_sw_migrate_enabled() && task->sched_migrated) {
+		struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);
+
+		perf_fetch_caller_regs(regs);
+		___perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, regs, 0);
+		task->sched_migrated = 0;
+	}
 }
 
 static inline void perf_event_task_sched_out(struct task_struct *prev,
@@ -930,6 +952,8 @@ perf_aux_output_skip(struct perf_output_handle *handle,
 static inline void *
 perf_get_aux(struct perf_output_handle *handle)				{ return NULL; }
 static inline void
+perf_event_task_migrate(struct task_struct *task)			{ }
+static inline void
 perf_event_task_sched_in(struct task_struct *prev,
 			 struct task_struct *task)			{ }
 static inline void
diff --git a/include/linux/platform_data/irq-renesas-irqc.h b/include/linux/platform_data/irq-renesas-irqc.h
deleted file mode 100644
index 3ae17b3e00ed65..00000000000000
--- a/include/linux/platform_data/irq-renesas-irqc.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Renesas IRQC Driver
- *
- *  Copyright (C) 2013 Magnus Damm
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-
-#ifndef __IRQ_RENESAS_IRQC_H__
-#define __IRQ_RENESAS_IRQC_H__
-
-struct renesas_irqc_config {
-	unsigned int irq_base;
-};
-
-#endif /* __IRQ_RENESAS_IRQC_H__ */
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index de83b4eb164287..a1a00e14c14ff1 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -10,13 +10,117 @@
 #include <linux/list.h>
 
 /*
- * We use the MSB mostly because its available; see <linux/preempt_mask.h> for
- * the other bits -- can't include that header due to inclusion hell.
+ * We put the hardirq and softirq counter into the preemption
+ * counter. The bitmask has the following meaning:
+ *
+ * - bits 0-7 are the preemption count (max preemption depth: 256)
+ * - bits 8-15 are the softirq count (max # of softirqs: 256)
+ *
+ * The hardirq count could in theory be the same as the number of
+ * interrupts in the system, but we run all interrupt handlers with
+ * interrupts disabled, so we cannot have nesting interrupts. Though
+ * there are a few palaeontologic drivers which reenable interrupts in
+ * the handler, so we need more than one bit here.
+ *
+ *         PREEMPT_MASK:	0x000000ff
+ *         SOFTIRQ_MASK:	0x0000ff00
+ *         HARDIRQ_MASK:	0x000f0000
+ *             NMI_MASK:	0x00100000
+ *       PREEMPT_ACTIVE:	0x00200000
+ * PREEMPT_NEED_RESCHED:	0x80000000
  */
+#define PREEMPT_BITS	8
+#define SOFTIRQ_BITS	8
+#define HARDIRQ_BITS	4
+#define NMI_BITS	1
+
+#define PREEMPT_SHIFT	0
+#define SOFTIRQ_SHIFT	(PREEMPT_SHIFT + PREEMPT_BITS)
+#define HARDIRQ_SHIFT	(SOFTIRQ_SHIFT + SOFTIRQ_BITS)
+#define NMI_SHIFT	(HARDIRQ_SHIFT + HARDIRQ_BITS)
+
+#define __IRQ_MASK(x)	((1UL << (x))-1)
+
+#define PREEMPT_MASK	(__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
+#define SOFTIRQ_MASK	(__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
+#define HARDIRQ_MASK	(__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
+#define NMI_MASK	(__IRQ_MASK(NMI_BITS)     << NMI_SHIFT)
+
+#define PREEMPT_OFFSET	(1UL << PREEMPT_SHIFT)
+#define SOFTIRQ_OFFSET	(1UL << SOFTIRQ_SHIFT)
+#define HARDIRQ_OFFSET	(1UL << HARDIRQ_SHIFT)
+#define NMI_OFFSET	(1UL << NMI_SHIFT)
+
+#define SOFTIRQ_DISABLE_OFFSET	(2 * SOFTIRQ_OFFSET)
+
+#define PREEMPT_ACTIVE_BITS	1
+#define PREEMPT_ACTIVE_SHIFT	(NMI_SHIFT + NMI_BITS)
+#define PREEMPT_ACTIVE	(__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
+
+/* We use the MSB mostly because its available */
 #define PREEMPT_NEED_RESCHED	0x80000000
 
+/* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */
 #include <asm/preempt.h>
 
+#define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
+#define softirq_count()	(preempt_count() & SOFTIRQ_MASK)
+#define irq_count()	(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
+				 | NMI_MASK))
+
+/*
+ * Are we doing bottom half or hardware interrupt processing?
+ * Are we in a softirq context? Interrupt context?
+ * in_softirq - Are we currently processing softirq or have bh disabled?
+ * in_serving_softirq - Are we currently processing softirq?
+ */
+#define in_irq()		(hardirq_count())
+#define in_softirq()		(softirq_count())
+#define in_interrupt()		(irq_count())
+#define in_serving_softirq()	(softirq_count() & SOFTIRQ_OFFSET)
+
+/*
+ * Are we in NMI context?
+ */
+#define in_nmi()	(preempt_count() & NMI_MASK)
+
+#if defined(CONFIG_PREEMPT_COUNT)
+# define PREEMPT_DISABLE_OFFSET 1
+#else
+# define PREEMPT_DISABLE_OFFSET 0
+#endif
+
+/*
+ * The preempt_count offset needed for things like:
+ *
+ *  spin_lock_bh()
+ *
+ * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and
+ * softirqs, such that unlock sequences of:
+ *
+ *  spin_unlock();
+ *  local_bh_enable();
+ *
+ * Work as expected.
+ */
+#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_DISABLE_OFFSET)
+
+/*
+ * Are we running in atomic context?  WARNING: this macro cannot
+ * always detect atomic context; in particular, it cannot know about
+ * held spinlocks in non-preemptible kernels.  Thus it should not be
+ * used in the general case to determine whether sleeping is possible.
+ * Do not use in_atomic() in driver code.
+ */
+#define in_atomic()	(preempt_count() != 0)
+
+/*
+ * Check whether we were atomic before we did preempt_disable():
+ * (used by the scheduler)
+ */
+#define in_atomic_preempt_off() \
+		((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_DISABLE_OFFSET)
+
 #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
 extern void preempt_count_add(int val);
 extern void preempt_count_sub(int val);
@@ -33,6 +137,18 @@ extern void preempt_count_sub(int val);
 #define preempt_count_inc() preempt_count_add(1)
 #define preempt_count_dec() preempt_count_sub(1)
 
+#define preempt_active_enter() \
+do { \
+	preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
+	barrier(); \
+} while (0)
+
+#define preempt_active_exit() \
+do { \
+	barrier(); \
+	preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
+} while (0)
+
 #ifdef CONFIG_PREEMPT_COUNT
 
 #define preempt_disable() \
@@ -49,6 +165,8 @@ do { \
 
 #define preempt_enable_no_resched() sched_preempt_enable_no_resched()
 
+#define preemptible()	(preempt_count() == 0 && !irqs_disabled())
+
 #ifdef CONFIG_PREEMPT
 #define preempt_enable() \
 do { \
@@ -121,6 +239,7 @@ do { \
 #define preempt_disable_notrace()		barrier()
 #define preempt_enable_no_resched_notrace()	barrier()
 #define preempt_enable_notrace()		barrier()
+#define preemptible()				0
 
 #endif /* CONFIG_PREEMPT_COUNT */
 
diff --git a/include/linux/preempt_mask.h b/include/linux/preempt_mask.h
deleted file mode 100644
index dbeec4d4a3beaf..00000000000000
--- a/include/linux/preempt_mask.h
+++ /dev/null
@@ -1,117 +0,0 @@
-#ifndef LINUX_PREEMPT_MASK_H
-#define LINUX_PREEMPT_MASK_H
-
-#include <linux/preempt.h>
-
-/*
- * We put the hardirq and softirq counter into the preemption
- * counter. The bitmask has the following meaning:
- *
- * - bits 0-7 are the preemption count (max preemption depth: 256)
- * - bits 8-15 are the softirq count (max # of softirqs: 256)
- *
- * The hardirq count could in theory be the same as the number of
- * interrupts in the system, but we run all interrupt handlers with
- * interrupts disabled, so we cannot have nesting interrupts. Though
- * there are a few palaeontologic drivers which reenable interrupts in
- * the handler, so we need more than one bit here.
- *
- * PREEMPT_MASK:	0x000000ff
- * SOFTIRQ_MASK:	0x0000ff00
- * HARDIRQ_MASK:	0x000f0000
- *     NMI_MASK:	0x00100000
- * PREEMPT_ACTIVE:	0x00200000
- */
-#define PREEMPT_BITS	8
-#define SOFTIRQ_BITS	8
-#define HARDIRQ_BITS	4
-#define NMI_BITS	1
-
-#define PREEMPT_SHIFT	0
-#define SOFTIRQ_SHIFT	(PREEMPT_SHIFT + PREEMPT_BITS)
-#define HARDIRQ_SHIFT	(SOFTIRQ_SHIFT + SOFTIRQ_BITS)
-#define NMI_SHIFT	(HARDIRQ_SHIFT + HARDIRQ_BITS)
-
-#define __IRQ_MASK(x)	((1UL << (x))-1)
-
-#define PREEMPT_MASK	(__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
-#define SOFTIRQ_MASK	(__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
-#define HARDIRQ_MASK	(__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
-#define NMI_MASK	(__IRQ_MASK(NMI_BITS)     << NMI_SHIFT)
-
-#define PREEMPT_OFFSET	(1UL << PREEMPT_SHIFT)
-#define SOFTIRQ_OFFSET	(1UL << SOFTIRQ_SHIFT)
-#define HARDIRQ_OFFSET	(1UL << HARDIRQ_SHIFT)
-#define NMI_OFFSET	(1UL << NMI_SHIFT)
-
-#define SOFTIRQ_DISABLE_OFFSET	(2 * SOFTIRQ_OFFSET)
-
-#define PREEMPT_ACTIVE_BITS	1
-#define PREEMPT_ACTIVE_SHIFT	(NMI_SHIFT + NMI_BITS)
-#define PREEMPT_ACTIVE	(__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
-
-#define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
-#define softirq_count()	(preempt_count() & SOFTIRQ_MASK)
-#define irq_count()	(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
-				 | NMI_MASK))
-
-/*
- * Are we doing bottom half or hardware interrupt processing?
- * Are we in a softirq context? Interrupt context?
- * in_softirq - Are we currently processing softirq or have bh disabled?
- * in_serving_softirq - Are we currently processing softirq?
- */
-#define in_irq()		(hardirq_count())
-#define in_softirq()		(softirq_count())
-#define in_interrupt()		(irq_count())
-#define in_serving_softirq()	(softirq_count() & SOFTIRQ_OFFSET)
-
-/*
- * Are we in NMI context?
- */
-#define in_nmi()	(preempt_count() & NMI_MASK)
-
-#if defined(CONFIG_PREEMPT_COUNT)
-# define PREEMPT_CHECK_OFFSET 1
-#else
-# define PREEMPT_CHECK_OFFSET 0
-#endif
-
-/*
- * The preempt_count offset needed for things like:
- *
- *  spin_lock_bh()
- *
- * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and
- * softirqs, such that unlock sequences of:
- *
- *  spin_unlock();
- *  local_bh_enable();
- *
- * Work as expected.
- */
-#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_CHECK_OFFSET)
-
-/*
- * Are we running in atomic context?  WARNING: this macro cannot
- * always detect atomic context; in particular, it cannot know about
- * held spinlocks in non-preemptible kernels.  Thus it should not be
- * used in the general case to determine whether sleeping is possible.
- * Do not use in_atomic() in driver code.
- */
-#define in_atomic()	((preempt_count() & ~PREEMPT_ACTIVE) != 0)
-
-/*
- * Check whether we were atomic before we did preempt_disable():
- * (used by the scheduler, *after* releasing the kernel lock)
- */
-#define in_atomic_preempt_off() \
-		((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
-
-#ifdef CONFIG_PREEMPT_COUNT
-# define preemptible()	(preempt_count() == 0 && !irqs_disabled())
-#else
-# define preemptible()	0
-#endif
-
-#endif /* LINUX_PREEMPT_MASK_H */
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index a18b16f1dc0e44..17c6b1f84a77d3 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -29,8 +29,8 @@
  */
 static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
 {
-	ACCESS_ONCE(list->next) = list;
-	ACCESS_ONCE(list->prev) = list;
+	WRITE_ONCE(list->next, list);
+	WRITE_ONCE(list->prev, list);
 }
 
 /*
@@ -288,7 +288,7 @@ static inline void list_splice_init_rcu(struct list_head *list,
 #define list_first_or_null_rcu(ptr, type, member) \
 ({ \
 	struct list_head *__ptr = (ptr); \
-	struct list_head *__next = ACCESS_ONCE(__ptr->next); \
+	struct list_head *__next = READ_ONCE(__ptr->next); \
 	likely(__ptr != __next) ? list_entry_rcu(__next, type, member) : NULL; \
 })
 
@@ -549,8 +549,8 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n,
  */
 #define hlist_for_each_entry_from_rcu(pos, member)			\
 	for (; pos;							\
-	     pos = hlist_entry_safe(rcu_dereference((pos)->member.next),\
-			typeof(*(pos)), member))
+	     pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(	\
+			&(pos)->member)), typeof(*(pos)), member))
 
 #endif	/* __KERNEL__ */
 #endif
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 0356ad954ea568..4cf5f51b4c9c43 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -44,6 +44,8 @@
 #include <linux/debugobjects.h>
 #include <linux/bug.h>
 #include <linux/compiler.h>
+#include <linux/ktime.h>
+
 #include <asm/barrier.h>
 
 extern int rcu_expedited; /* for sysctl */
@@ -292,10 +294,6 @@ void rcu_sched_qs(void);
 void rcu_bh_qs(void);
 void rcu_check_callbacks(int user);
 struct notifier_block;
-void rcu_idle_enter(void);
-void rcu_idle_exit(void);
-void rcu_irq_enter(void);
-void rcu_irq_exit(void);
 int rcu_cpu_notify(struct notifier_block *self,
 		   unsigned long action, void *hcpu);
 
@@ -364,8 +362,8 @@ extern struct srcu_struct tasks_rcu_exit_srcu;
 #define rcu_note_voluntary_context_switch(t) \
 	do { \
 		rcu_all_qs(); \
-		if (ACCESS_ONCE((t)->rcu_tasks_holdout)) \
-			ACCESS_ONCE((t)->rcu_tasks_holdout) = false; \
+		if (READ_ONCE((t)->rcu_tasks_holdout)) \
+			WRITE_ONCE((t)->rcu_tasks_holdout, false); \
 	} while (0)
 #else /* #ifdef CONFIG_TASKS_RCU */
 #define TASKS_RCU(x) do { } while (0)
@@ -609,7 +607,7 @@ static inline void rcu_preempt_sleep_check(void)
 
 #define __rcu_access_pointer(p, space) \
 ({ \
-	typeof(*p) *_________p1 = (typeof(*p) *__force)ACCESS_ONCE(p); \
+	typeof(*p) *_________p1 = (typeof(*p) *__force)READ_ONCE(p); \
 	rcu_dereference_sparse(p, space); \
 	((typeof(*p) __force __kernel *)(_________p1)); \
 })
@@ -628,21 +626,6 @@ static inline void rcu_preempt_sleep_check(void)
 	((typeof(*p) __force __kernel *)(p)); \
 })
 
-#define __rcu_access_index(p, space) \
-({ \
-	typeof(p) _________p1 = ACCESS_ONCE(p); \
-	rcu_dereference_sparse(p, space); \
-	(_________p1); \
-})
-#define __rcu_dereference_index_check(p, c) \
-({ \
-	/* Dependency order vs. p above. */ \
-	typeof(p) _________p1 = lockless_dereference(p); \
-	rcu_lockdep_assert(c, \
-			   "suspicious rcu_dereference_index_check() usage"); \
-	(_________p1); \
-})
-
 /**
  * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
  * @v: The value to statically initialize with.
@@ -687,7 +670,7 @@ static inline void rcu_preempt_sleep_check(void)
  * @p: The pointer to read
  *
  * Return the value of the specified RCU-protected pointer, but omit the
- * smp_read_barrier_depends() and keep the ACCESS_ONCE().  This is useful
+ * smp_read_barrier_depends() and keep the READ_ONCE().  This is useful
  * when the value of this pointer is accessed, but the pointer is not
  * dereferenced, for example, when testing an RCU-protected pointer against
  * NULL.  Although rcu_access_pointer() may also be used in cases where
@@ -772,47 +755,12 @@ static inline void rcu_preempt_sleep_check(void)
 #define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu)
 
 /**
- * rcu_access_index() - fetch RCU index with no dereferencing
- * @p: The index to read
- *
- * Return the value of the specified RCU-protected index, but omit the
- * smp_read_barrier_depends() and keep the ACCESS_ONCE().  This is useful
- * when the value of this index is accessed, but the index is not
- * dereferenced, for example, when testing an RCU-protected index against
- * -1.  Although rcu_access_index() may also be used in cases where
- * update-side locks prevent the value of the index from changing, you
- * should instead use rcu_dereference_index_protected() for this use case.
- */
-#define rcu_access_index(p) __rcu_access_index((p), __rcu)
-
-/**
- * rcu_dereference_index_check() - rcu_dereference for indices with debug checking
- * @p: The pointer to read, prior to dereferencing
- * @c: The conditions under which the dereference will take place
- *
- * Similar to rcu_dereference_check(), but omits the sparse checking.
- * This allows rcu_dereference_index_check() to be used on integers,
- * which can then be used as array indices.  Attempting to use
- * rcu_dereference_check() on an integer will give compiler warnings
- * because the sparse address-space mechanism relies on dereferencing
- * the RCU-protected pointer.  Dereferencing integers is not something
- * that even gcc will put up with.
- *
- * Note that this function does not implicitly check for RCU read-side
- * critical sections.  If this function gains lots of uses, it might
- * make sense to provide versions for each flavor of RCU, but it does
- * not make sense as of early 2010.
- */
-#define rcu_dereference_index_check(p, c) \
-	__rcu_dereference_index_check((p), (c))
-
-/**
  * rcu_dereference_protected() - fetch RCU pointer when updates prevented
  * @p: The pointer to read, prior to dereferencing
  * @c: The conditions under which the dereference will take place
  *
  * Return the value of the specified RCU-protected pointer, but omit
- * both the smp_read_barrier_depends() and the ACCESS_ONCE().  This
+ * both the smp_read_barrier_depends() and the READ_ONCE().  This
  * is useful in cases where update-side locks prevent the value of the
  * pointer from changing.  Please note that this primitive does -not-
  * prevent the compiler from repeating this reference or combining it
@@ -1138,13 +1086,13 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
 #define kfree_rcu(ptr, rcu_head)					\
 	__kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head))
 
-#if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL)
-static inline int rcu_needs_cpu(unsigned long *delta_jiffies)
+#ifdef CONFIG_TINY_RCU
+static inline int rcu_needs_cpu(u64 basemono, u64 *nextevt)
 {
-	*delta_jiffies = ULONG_MAX;
+	*nextevt = KTIME_MAX;
 	return 0;
 }
-#endif /* #if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL) */
+#endif /* #ifdef CONFIG_TINY_RCU */
 
 #if defined(CONFIG_RCU_NOCB_CPU_ALL)
 static inline bool rcu_is_nocb_cpu(int cpu) { return true; }
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 937edaeb150deb..3df6c1ec4e2550 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -159,6 +159,22 @@ static inline void rcu_cpu_stall_reset(void)
 {
 }
 
+static inline void rcu_idle_enter(void)
+{
+}
+
+static inline void rcu_idle_exit(void)
+{
+}
+
+static inline void rcu_irq_enter(void)
+{
+}
+
+static inline void rcu_irq_exit(void)
+{
+}
+
 static inline void exit_rcu(void)
 {
 }
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index d2e583a6aacacf..456879143f89f9 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -31,9 +31,7 @@
 #define __LINUX_RCUTREE_H
 
 void rcu_note_context_switch(void);
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(unsigned long *delta_jiffies);
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
+int rcu_needs_cpu(u64 basem, u64 *nextevt);
 void rcu_cpu_stall_reset(void);
 
 /*
@@ -93,6 +91,11 @@ void rcu_force_quiescent_state(void);
 void rcu_bh_force_quiescent_state(void);
 void rcu_sched_force_quiescent_state(void);
 
+void rcu_idle_enter(void);
+void rcu_idle_exit(void);
+void rcu_irq_enter(void);
+void rcu_irq_exit(void);
+
 void exit_rcu(void);
 
 void rcu_scheduler_starting(void);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0729ca3c36f615..b945669880c6ad 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -25,7 +25,7 @@ struct sched_param {
 #include <linux/errno.h>
 #include <linux/nodemask.h>
 #include <linux/mm_types.h>
-#include <linux/preempt_mask.h>
+#include <linux/preempt.h>
 
 #include <asm/page.h>
 #include <asm/ptrace.h>
@@ -174,7 +174,12 @@ extern unsigned long nr_iowait_cpu(int cpu);
 extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
 
 extern void calc_global_load(unsigned long ticks);
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 extern void update_cpu_load_nohz(void);
+#else
+static inline void update_cpu_load_nohz(void) { }
+#endif
 
 extern unsigned long get_parent_ip(unsigned long addr);
 
@@ -214,9 +219,10 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
 #define TASK_WAKEKILL		128
 #define TASK_WAKING		256
 #define TASK_PARKED		512
-#define TASK_STATE_MAX		1024
+#define TASK_NOLOAD		1024
+#define TASK_STATE_MAX		2048
 
-#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWP"
+#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN"
 
 extern char ___assert_task_state[1 - 2*!!(
 		sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
@@ -226,6 +232,8 @@ extern char ___assert_task_state[1 - 2*!!(
 #define TASK_STOPPED		(TASK_WAKEKILL | __TASK_STOPPED)
 #define TASK_TRACED		(TASK_WAKEKILL | __TASK_TRACED)
 
+#define TASK_IDLE		(TASK_UNINTERRUPTIBLE | TASK_NOLOAD)
+
 /* Convenience macros for the sake of wake_up */
 #define TASK_NORMAL		(TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
 #define TASK_ALL		(TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
@@ -241,7 +249,8 @@ extern char ___assert_task_state[1 - 2*!!(
 			((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
 #define task_contributes_to_load(task)	\
 				((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
-				 (task->flags & PF_FROZEN) == 0)
+				 (task->flags & PF_FROZEN) == 0 && \
+				 (task->state & TASK_NOLOAD) == 0)
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 
@@ -253,7 +262,7 @@ extern char ___assert_task_state[1 - 2*!!(
 #define set_task_state(tsk, state_value)			\
 	do {							\
 		(tsk)->task_state_change = _THIS_IP_;		\
-		set_mb((tsk)->state, (state_value));		\
+		smp_store_mb((tsk)->state, (state_value));		\
 	} while (0)
 
 /*
@@ -275,7 +284,7 @@ extern char ___assert_task_state[1 - 2*!!(
 #define set_current_state(state_value)				\
 	do {							\
 		current->task_state_change = _THIS_IP_;		\
-		set_mb(current->state, (state_value));		\
+		smp_store_mb(current->state, (state_value));		\
 	} while (0)
 
 #else
@@ -283,7 +292,7 @@ extern char ___assert_task_state[1 - 2*!!(
 #define __set_task_state(tsk, state_value)		\
 	do { (tsk)->state = (state_value); } while (0)
 #define set_task_state(tsk, state_value)		\
-	set_mb((tsk)->state, (state_value))
+	smp_store_mb((tsk)->state, (state_value))
 
 /*
  * set_current_state() includes a barrier so that the write of current->state
@@ -299,7 +308,7 @@ extern char ___assert_task_state[1 - 2*!!(
 #define __set_current_state(state_value)		\
 	do { current->state = (state_value); } while (0)
 #define set_current_state(state_value)			\
-	set_mb(current->state, (state_value))
+	smp_store_mb(current->state, (state_value))
 
 #endif
 
@@ -568,6 +577,23 @@ struct task_cputime {
 		.sum_exec_runtime = 0,				\
 	}
 
+/*
+ * This is the atomic variant of task_cputime, which can be used for
+ * storing and updating task_cputime statistics without locking.
+ */
+struct task_cputime_atomic {
+	atomic64_t utime;
+	atomic64_t stime;
+	atomic64_t sum_exec_runtime;
+};
+
+#define INIT_CPUTIME_ATOMIC \
+	(struct task_cputime_atomic) {				\
+		.utime = ATOMIC64_INIT(0),			\
+		.stime = ATOMIC64_INIT(0),			\
+		.sum_exec_runtime = ATOMIC64_INIT(0),		\
+	}
+
 #ifdef CONFIG_PREEMPT_COUNT
 #define PREEMPT_DISABLED	(1 + PREEMPT_ENABLED)
 #else
@@ -585,18 +611,16 @@ struct task_cputime {
 
 /**
  * struct thread_group_cputimer - thread group interval timer counts
- * @cputime:		thread group interval timers.
+ * @cputime_atomic:	atomic thread group interval timers.
  * @running:		non-zero when there are timers running and
  * 			@cputime receives updates.
- * @lock:		lock for fields in this struct.
  *
  * This structure contains the version of task_cputime, above, that is
  * used for thread group CPU timer calculations.
  */
 struct thread_group_cputimer {
-	struct task_cputime cputime;
+	struct task_cputime_atomic cputime_atomic;
 	int running;
-	raw_spinlock_t lock;
 };
 
 #include <linux/rwsem.h>
@@ -901,6 +925,50 @@ enum cpu_idle_type {
 #define SCHED_CAPACITY_SCALE	(1L << SCHED_CAPACITY_SHIFT)
 
 /*
+ * Wake-queues are lists of tasks with a pending wakeup, whose
+ * callers have already marked the task as woken internally,
+ * and can thus carry on. A common use case is being able to
+ * do the wakeups once the corresponding user lock as been
+ * released.
+ *
+ * We hold reference to each task in the list across the wakeup,
+ * thus guaranteeing that the memory is still valid by the time
+ * the actual wakeups are performed in wake_up_q().
+ *
+ * One per task suffices, because there's never a need for a task to be
+ * in two wake queues simultaneously; it is forbidden to abandon a task
+ * in a wake queue (a call to wake_up_q() _must_ follow), so if a task is
+ * already in a wake queue, the wakeup will happen soon and the second
+ * waker can just skip it.
+ *
+ * The WAKE_Q macro declares and initializes the list head.
+ * wake_up_q() does NOT reinitialize the list; it's expected to be
+ * called near the end of a function, where the fact that the queue is
+ * not used again will be easy to see by inspection.
+ *
+ * Note that this can cause spurious wakeups. schedule() callers
+ * must ensure the call is done inside a loop, confirming that the
+ * wakeup condition has in fact occurred.
+ */
+struct wake_q_node {
+	struct wake_q_node *next;
+};
+
+struct wake_q_head {
+	struct wake_q_node *first;
+	struct wake_q_node **lastp;
+};
+
+#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)
+
+#define WAKE_Q(name)					\
+	struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
+
+extern void wake_q_add(struct wake_q_head *head,
+		       struct task_struct *task);
+extern void wake_up_q(struct wake_q_head *head);
+
+/*
  * sched-domains (multiprocessor balancing) declarations:
  */
 #ifdef CONFIG_SMP
@@ -1335,8 +1403,6 @@ struct task_struct {
 	int rcu_read_lock_nesting;
 	union rcu_special rcu_read_unlock_special;
 	struct list_head rcu_node_entry;
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
-#ifdef CONFIG_PREEMPT_RCU
 	struct rcu_node *rcu_blocked_node;
 #endif /* #ifdef CONFIG_PREEMPT_RCU */
 #ifdef CONFIG_TASKS_RCU
@@ -1357,9 +1423,6 @@ struct task_struct {
 #endif
 
 	struct mm_struct *mm, *active_mm;
-#ifdef CONFIG_COMPAT_BRK
-	unsigned brk_randomized:1;
-#endif
 	/* per-thread vma caching */
 	u32 vmacache_seqnum;
 	struct vm_area_struct *vmacache[VMACACHE_SIZE];
@@ -1370,7 +1433,7 @@ struct task_struct {
 	int exit_state;
 	int exit_code, exit_signal;
 	int pdeath_signal;  /*  The signal sent when the parent dies  */
-	unsigned int jobctl;	/* JOBCTL_*, siglock protected */
+	unsigned long jobctl;	/* JOBCTL_*, siglock protected */
 
 	/* Used for emulating ABI behavior of previous Linux versions */
 	unsigned int personality;
@@ -1382,10 +1445,14 @@ struct task_struct {
 	/* Revert to default priority/policy when forking */
 	unsigned sched_reset_on_fork:1;
 	unsigned sched_contributes_to_load:1;
+	unsigned sched_migrated:1;
 
 #ifdef CONFIG_MEMCG_KMEM
 	unsigned memcg_kmem_skip_account:1;
 #endif
+#ifdef CONFIG_COMPAT_BRK
+	unsigned brk_randomized:1;
+#endif
 
 	unsigned long atomic_flags; /* Flags needing atomic access. */
 
@@ -1512,6 +1579,8 @@ struct task_struct {
 	/* Protection of the PI data structures: */
 	raw_spinlock_t pi_lock;
 
+	struct wake_q_node wake_q;
+
 #ifdef CONFIG_RT_MUTEXES
 	/* PI waiters blocked on a rt_mutex held by this task */
 	struct rb_root pi_waiters;
@@ -1725,6 +1794,7 @@ struct task_struct {
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 	unsigned long	task_state_change;
 #endif
+	int pagefault_disabled;
 };
 
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
@@ -2078,22 +2148,22 @@ TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)
 #define JOBCTL_TRAPPING_BIT	21	/* switching to TRACED */
 #define JOBCTL_LISTENING_BIT	22	/* ptracer is listening for events */
 
-#define JOBCTL_STOP_DEQUEUED	(1 << JOBCTL_STOP_DEQUEUED_BIT)
-#define JOBCTL_STOP_PENDING	(1 << JOBCTL_STOP_PENDING_BIT)
-#define JOBCTL_STOP_CONSUME	(1 << JOBCTL_STOP_CONSUME_BIT)
-#define JOBCTL_TRAP_STOP	(1 << JOBCTL_TRAP_STOP_BIT)
-#define JOBCTL_TRAP_NOTIFY	(1 << JOBCTL_TRAP_NOTIFY_BIT)
-#define JOBCTL_TRAPPING		(1 << JOBCTL_TRAPPING_BIT)
-#define JOBCTL_LISTENING	(1 << JOBCTL_LISTENING_BIT)
+#define JOBCTL_STOP_DEQUEUED	(1UL << JOBCTL_STOP_DEQUEUED_BIT)
+#define JOBCTL_STOP_PENDING	(1UL << JOBCTL_STOP_PENDING_BIT)
+#define JOBCTL_STOP_CONSUME	(1UL << JOBCTL_STOP_CONSUME_BIT)
+#define JOBCTL_TRAP_STOP	(1UL << JOBCTL_TRAP_STOP_BIT)
+#define JOBCTL_TRAP_NOTIFY	(1UL << JOBCTL_TRAP_NOTIFY_BIT)
+#define JOBCTL_TRAPPING		(1UL << JOBCTL_TRAPPING_BIT)
+#define JOBCTL_LISTENING	(1UL << JOBCTL_LISTENING_BIT)
 
 #define JOBCTL_TRAP_MASK	(JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
 #define JOBCTL_PENDING_MASK	(JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
 
 extern bool task_set_jobctl_pending(struct task_struct *task,
-				    unsigned int mask);
+				    unsigned long mask);
 extern void task_clear_jobctl_trapping(struct task_struct *task);
 extern void task_clear_jobctl_pending(struct task_struct *task,
-				      unsigned int mask);
+				      unsigned long mask);
 
 static inline void rcu_copy_process(struct task_struct *p)
 {
@@ -2532,6 +2602,9 @@ static inline unsigned long wait_task_inactive(struct task_struct *p,
 }
 #endif
 
+#define tasklist_empty() \
+	list_empty(&init_task.tasks)
+
 #define next_task(p) \
 	list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
 
@@ -2962,11 +3035,6 @@ static __always_inline bool need_resched(void)
 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
 
-static inline void thread_group_cputime_init(struct signal_struct *sig)
-{
-	raw_spin_lock_init(&sig->cputimer.lock);
-}
-
 /*
  * Reevaluate whether the task has signals pending delivery.
  * Wake the task if so.
@@ -3080,13 +3148,13 @@ static inline void mm_update_next_owner(struct mm_struct *mm)
 static inline unsigned long task_rlimit(const struct task_struct *tsk,
 		unsigned int limit)
 {
-	return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur);
+	return READ_ONCE(tsk->signal->rlim[limit].rlim_cur);
 }
 
 static inline unsigned long task_rlimit_max(const struct task_struct *tsk,
 		unsigned int limit)
 {
-	return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max);
+	return READ_ONCE(tsk->signal->rlim[limit].rlim_max);
 }
 
 static inline unsigned long rlimit(unsigned int limit)
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 3e18379dfa6f34..0063b24b4f36df 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -120,7 +120,7 @@ do {								\
 /*
  * Despite its name it doesn't necessarily has to be a full barrier.
  * It should only guarantee that a STORE before the critical section
- * can not be reordered with a LOAD inside this section.
+ * can not be reordered with LOADs and STOREs inside this section.
  * spin_lock() is the one-way barrier, this LOAD can not escape out
  * of the region. So the default implementation simply ensures that
  * a STORE can not move into the critical section, smp_wmb() should
diff --git a/include/linux/tick.h b/include/linux/tick.h
index ec6e8bc992bf70..3741ba1a652c97 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -138,6 +138,12 @@ static inline bool tick_nohz_full_cpu(int cpu)
 	return cpumask_test_cpu(cpu, tick_nohz_full_mask);
 }
 
+static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask)
+{
+	if (tick_nohz_full_enabled())
+		cpumask_or(mask, mask, tick_nohz_full_mask);
+}
+
 extern void __tick_nohz_full_check(void);
 extern void tick_nohz_full_kick(void);
 extern void tick_nohz_full_kick_cpu(int cpu);
@@ -146,6 +152,7 @@ extern void __tick_nohz_task_switch(struct task_struct *tsk);
 #else
 static inline bool tick_nohz_full_enabled(void) { return false; }
 static inline bool tick_nohz_full_cpu(int cpu) { return false; }
+static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { }
 static inline void __tick_nohz_full_check(void) { }
 static inline void tick_nohz_full_kick_cpu(int cpu) { }
 static inline void tick_nohz_full_kick(void) { }
diff --git a/include/linux/time64.h b/include/linux/time64.h
index a3831478d9cf8d..12d4e82b02765e 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -2,6 +2,7 @@
 #define _LINUX_TIME64_H
 
 #include <uapi/linux/time.h>
+#include <linux/math64.h>
 
 typedef __s64 time64_t;
 
diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index fb86963859c772..e1f5a11365546a 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -49,6 +49,7 @@ struct tk_read_base {
  * @offs_boot:		Offset clock monotonic -> clock boottime
  * @offs_tai:		Offset clock monotonic -> clock tai
  * @tai_offset:		The current UTC to TAI offset in seconds
+ * @clock_was_set_seq:	The sequence number of clock was set events
  * @raw_time:		Monotonic raw base time in timespec64 format
  * @cycle_interval:	Number of clock cycles in one NTP interval
  * @xtime_interval:	Number of clock shifted nano seconds in one NTP
@@ -60,6 +61,9 @@ struct tk_read_base {
  *			shifted nano seconds.
  * @ntp_error_shift:	Shift conversion between clock shifted nano seconds and
  *			ntp shifted nano seconds.
+ * @last_warning:	Warning ratelimiter (DEBUG_TIMEKEEPING)
+ * @underflow_seen:	Underflow warning flag (DEBUG_TIMEKEEPING)
+ * @overflow_seen:	Overflow warning flag (DEBUG_TIMEKEEPING)
  *
  * Note: For timespec(64) based interfaces wall_to_monotonic is what
  * we need to add to xtime (or xtime corrected for sub jiffie times)
@@ -85,6 +89,7 @@ struct timekeeper {
 	ktime_t			offs_boot;
 	ktime_t			offs_tai;
 	s32			tai_offset;
+	unsigned int		clock_was_set_seq;
 	struct timespec64	raw_time;
 
 	/* The following members are for timekeeping internal use */
@@ -104,6 +109,18 @@ struct timekeeper {
 	s64			ntp_error;
 	u32			ntp_error_shift;
 	u32			ntp_err_mult;
+#ifdef CONFIG_DEBUG_TIMEKEEPING
+	long			last_warning;
+	/*
+	 * These simple flag variables are managed
+	 * without locks, which is racy, but they are
+	 * ok since we don't really care about being
+	 * super precise about how many events were
+	 * seen, just that a problem was observed.
+	 */
+	int			underflow_seen;
+	int			overflow_seen;
+#endif
 };
 
 #ifdef CONFIG_GENERIC_TIME_VSYSCALL
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 99176af216af44..3aa72e64865021 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -163,6 +163,7 @@ extern ktime_t ktime_get(void);
 extern ktime_t ktime_get_with_offset(enum tk_offsets offs);
 extern ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs);
 extern ktime_t ktime_get_raw(void);
+extern u32 ktime_get_resolution_ns(void);
 
 /**
  * ktime_get_real - get the real (wall-) time in ktime_t format
@@ -266,7 +267,6 @@ extern int persistent_clock_is_local;
 
 extern void read_persistent_clock(struct timespec *ts);
 extern void read_persistent_clock64(struct timespec64 *ts);
-extern void read_boot_clock(struct timespec *ts);
 extern void read_boot_clock64(struct timespec64 *ts);
 extern int update_persistent_clock(struct timespec now);
 extern int update_persistent_clock64(struct timespec64 now);
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 8c5a197e1587de..fbb80e0030bfa0 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -188,13 +188,6 @@ extern void set_timer_slack(struct timer_list *time, int slack_hz);
 #define NEXT_TIMER_MAX_DELTA	((1UL << 30) - 1)
 
 /*
- * Return when the next timer-wheel timeout occurs (in absolute jiffies),
- * locks the timer base and does the comparison against the given
- * jiffie.
- */
-extern unsigned long get_next_timer_interrupt(unsigned long now);
-
-/*
  * Timer-statistics info:
  */
 #ifdef CONFIG_TIMER_STATS
diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h
index a520fd70a59f37..7eec17ad7fa195 100644
--- a/include/linux/timerqueue.h
+++ b/include/linux/timerqueue.h
@@ -16,10 +16,10 @@ struct timerqueue_head {
 };
 
 
-extern void timerqueue_add(struct timerqueue_head *head,
-				struct timerqueue_node *node);
-extern void timerqueue_del(struct timerqueue_head *head,
-				struct timerqueue_node *node);
+extern bool timerqueue_add(struct timerqueue_head *head,
+			   struct timerqueue_node *node);
+extern bool timerqueue_del(struct timerqueue_head *head,
+			   struct timerqueue_node *node);
 extern struct timerqueue_node *timerqueue_iterate_next(
 						struct timerqueue_node *node);
 
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 909b6e43b6942c..73ddad1e0fa343 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -191,8 +191,8 @@ static inline int cpu_to_mem(int cpu)
 #ifndef topology_core_id
 #define topology_core_id(cpu)			((void)(cpu), 0)
 #endif
-#ifndef topology_thread_cpumask
-#define topology_thread_cpumask(cpu)		cpumask_of(cpu)
+#ifndef topology_sibling_cpumask
+#define topology_sibling_cpumask(cpu)		cpumask_of(cpu)
 #endif
 #ifndef topology_core_cpumask
 #define topology_core_cpumask(cpu)		cpumask_of(cpu)
@@ -201,7 +201,7 @@ static inline int cpu_to_mem(int cpu)
 #ifdef CONFIG_SCHED_SMT
 static inline const struct cpumask *cpu_smt_mask(int cpu)
 {
-	return topology_thread_cpumask(cpu);
+	return topology_sibling_cpumask(cpu);
 }
 #endif
 
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index ecd3319dac3314..ae572c1386073c 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -1,21 +1,30 @@
 #ifndef __LINUX_UACCESS_H__
 #define __LINUX_UACCESS_H__
 
-#include <linux/preempt.h>
+#include <linux/sched.h>
 #include <asm/uaccess.h>
 
+static __always_inline void pagefault_disabled_inc(void)
+{
+	current->pagefault_disabled++;
+}
+
+static __always_inline void pagefault_disabled_dec(void)
+{
+	current->pagefault_disabled--;
+	WARN_ON(current->pagefault_disabled < 0);
+}
+
 /*
- * These routines enable/disable the pagefault handler in that
- * it will not take any locks and go straight to the fixup table.
+ * These routines enable/disable the pagefault handler. If disabled, it will
+ * not take any locks and go straight to the fixup table.
  *
- * They have great resemblance to the preempt_disable/enable calls
- * and in fact they are identical; this is because currently there is
- * no other way to make the pagefault handlers do this. So we do
- * disable preemption but we don't necessarily care about that.
+ * User access methods will not sleep when called from a pagefault_disabled()
+ * environment.
  */
 static inline void pagefault_disable(void)
 {
-	preempt_count_inc();
+	pagefault_disabled_inc();
 	/*
 	 * make sure to have issued the store before a pagefault
 	 * can hit.
@@ -25,18 +34,31 @@ static inline void pagefault_disable(void)
 
 static inline void pagefault_enable(void)
 {
-#ifndef CONFIG_PREEMPT
 	/*
 	 * make sure to issue those last loads/stores before enabling
 	 * the pagefault handler again.
 	 */
 	barrier();
-	preempt_count_dec();
-#else
-	preempt_enable();
-#endif
+	pagefault_disabled_dec();
 }
 
+/*
+ * Is the pagefault handler disabled? If so, user access methods will not sleep.
+ */
+#define pagefault_disabled() (current->pagefault_disabled != 0)
+
+/*
+ * The pagefault handler is in general disabled by pagefault_disable() or
+ * when in irq context (via in_atomic()).
+ *
+ * This function should only be used by the fault handlers. Other users should
+ * stick to pagefault_disabled().
+ * Please NEVER use preempt_disable() to disable the fault handler. With
+ * !CONFIG_PREEMPT_COUNT, this is like a NOP. So the handler won't be disabled.
+ * in_atomic() will report different values based on !CONFIG_PREEMPT_COUNT.
+ */
+#define faulthandler_disabled() (pagefault_disabled() || in_atomic())
+
 #ifndef ARCH_HAS_NOCACHE_UACCESS
 
 static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
diff --git a/include/linux/wait.h b/include/linux/wait.h
index db78c720494785..1e1bf9f963a947 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -982,7 +982,7 @@ extern int bit_wait_io_timeout(struct wait_bit_key *);
  * on that signal.
  */
 static inline int
-wait_on_bit(void *word, int bit, unsigned mode)
+wait_on_bit(unsigned long *word, int bit, unsigned mode)
 {
 	might_sleep();
 	if (!test_bit(bit, word))
@@ -1007,7 +1007,7 @@ wait_on_bit(void *word, int bit, unsigned mode)
  * on that signal.
  */
 static inline int
-wait_on_bit_io(void *word, int bit, unsigned mode)
+wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
 {
 	might_sleep();
 	if (!test_bit(bit, word))
@@ -1033,7 +1033,8 @@ wait_on_bit_io(void *word, int bit, unsigned mode)
  * received a signal and the mode permitted wakeup on that signal.
  */
 static inline int
-wait_on_bit_timeout(void *word, int bit, unsigned mode, unsigned long timeout)
+wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
+		    unsigned long timeout)
 {
 	might_sleep();
 	if (!test_bit(bit, word))
@@ -1060,7 +1061,8 @@ wait_on_bit_timeout(void *word, int bit, unsigned mode, unsigned long timeout)
  * on that signal.
  */
 static inline int
-wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode)
+wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
+		   unsigned mode)
 {
 	might_sleep();
 	if (!test_bit(bit, word))
@@ -1088,7 +1090,7 @@ wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode
  * the @mode allows that signal to wake the process.
  */
 static inline int
-wait_on_bit_lock(void *word, int bit, unsigned mode)
+wait_on_bit_lock(unsigned long *word, int bit, unsigned mode)
 {
 	might_sleep();
 	if (!test_and_set_bit(bit, word))
@@ -1112,7 +1114,7 @@ wait_on_bit_lock(void *word, int bit, unsigned mode)
  * the @mode allows that signal to wake the process.
  */
 static inline int
-wait_on_bit_lock_io(void *word, int bit, unsigned mode)
+wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode)
 {
 	might_sleep();
 	if (!test_and_set_bit(bit, word))
@@ -1138,7 +1140,8 @@ wait_on_bit_lock_io(void *word, int bit, unsigned mode)
  * the @mode allows that signal to wake the process.
  */
 static inline int
-wait_on_bit_lock_action(void *word, int bit, wait_bit_action_f *action, unsigned mode)
+wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action,
+			unsigned mode)
 {
 	might_sleep();
 	if (!test_and_set_bit(bit, word))
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 30fedaf3e56a25..d57a575fe31fc5 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -147,7 +147,8 @@ TRACE_EVENT(sched_switch,
 		  __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|",
 				{ 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
 				{ 16, "Z" }, { 32, "X" }, { 64, "x" },
-				{ 128, "K" }, { 256, "W" }, { 512, "P" }) : "R",
+				{ 128, "K" }, { 256, "W" }, { 512, "P" },
+				{ 1024, "N" }) : "R",
 		__entry->prev_state & TASK_STATE_MAX ? "+" : "",
 		__entry->next_comm, __entry->next_pid, __entry->next_prio)
 );
diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h
index 68c2c2000f02bb..d7abef1fe6e010 100644
--- a/include/trace/events/timer.h
+++ b/include/trace/events/timer.h
@@ -43,15 +43,18 @@ DEFINE_EVENT(timer_class, timer_init,
  */
 TRACE_EVENT(timer_start,
 
-	TP_PROTO(struct timer_list *timer, unsigned long expires),
+	TP_PROTO(struct timer_list *timer,
+		unsigned long expires,
+		unsigned int deferrable),
 
-	TP_ARGS(timer, expires),
+	TP_ARGS(timer, expires, deferrable),
 
 	TP_STRUCT__entry(
 		__field( void *,	timer		)
 		__field( void *,	function	)
 		__field( unsigned long,	expires		)
 		__field( unsigned long,	now		)
+		__field( unsigned int,	deferrable	)
 	),
 
 	TP_fast_assign(
@@ -59,11 +62,13 @@ TRACE_EVENT(timer_start,
 		__entry->function	= timer->function;
 		__entry->expires	= expires;
 		__entry->now		= jiffies;
+		__entry->deferrable     = deferrable;
 	),
 
-	TP_printk("timer=%p function=%pf expires=%lu [timeout=%ld]",
+	TP_printk("timer=%p function=%pf expires=%lu [timeout=%ld] defer=%c",
 		  __entry->timer, __entry->function, __entry->expires,
-		  (long)__entry->expires - __entry->now)
+		  (long)__entry->expires - __entry->now,
+		  __entry->deferrable > 0 ? 'y':'n')
 );
 
 /**
diff --git a/init/Kconfig b/init/Kconfig
index f9cac0b037cc30..194bc7c1df93ca 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -465,13 +465,9 @@ endmenu # "CPU/Task time and stats accounting"
 
 menu "RCU Subsystem"
 
-choice
-	prompt "RCU Implementation"
-	default TREE_RCU
-
 config TREE_RCU
-	bool "Tree-based hierarchical RCU"
-	depends on !PREEMPT && SMP
+	bool
+	default y if !PREEMPT && SMP
 	help
 	  This option selects the RCU implementation that is
 	  designed for very large SMP system with hundreds or
@@ -479,8 +475,8 @@ config TREE_RCU
 	  smaller systems.
 
 config PREEMPT_RCU
-	bool "Preemptible tree-based hierarchical RCU"
-	depends on PREEMPT
+	bool
+	default y if PREEMPT
 	help
 	  This option selects the RCU implementation that is
 	  designed for very large SMP systems with hundreds or
@@ -491,15 +487,28 @@ config PREEMPT_RCU
 	  Select this option if you are unsure.
 
 config TINY_RCU
-	bool "UP-only small-memory-footprint RCU"
-	depends on !PREEMPT && !SMP
+	bool
+	default y if !PREEMPT && !SMP
 	help
 	  This option selects the RCU implementation that is
 	  designed for UP systems from which real-time response
 	  is not required.  This option greatly reduces the
 	  memory footprint of RCU.
 
-endchoice
+config RCU_EXPERT
+	bool "Make expert-level adjustments to RCU configuration"
+	default n
+	help
+	  This option needs to be enabled if you wish to make
+	  expert-level adjustments to RCU configuration.  By default,
+	  no such adjustments can be made, which has the often-beneficial
+	  side-effect of preventing "make oldconfig" from asking you all
+	  sorts of detailed questions about how you would like numerous
+	  obscure RCU options to be set up.
+
+	  Say Y if you need to make expert-level adjustments to RCU.
+
+	  Say N if you are unsure.
 
 config SRCU
 	bool
@@ -509,7 +518,7 @@ config SRCU
 	  sections.
 
 config TASKS_RCU
-	bool "Task_based RCU implementation using voluntary context switch"
+	bool
 	default n
 	select SRCU
 	help
@@ -517,8 +526,6 @@ config TASKS_RCU
 	  only voluntary context switch (not preemption!), idle, and
 	  user-mode execution as quiescent states.
 
-	  If unsure, say N.
-
 config RCU_STALL_COMMON
 	def_bool ( TREE_RCU || PREEMPT_RCU || RCU_TRACE )
 	help
@@ -531,9 +538,7 @@ config CONTEXT_TRACKING
        bool
 
 config RCU_USER_QS
-	bool "Consider userspace as in RCU extended quiescent state"
-	depends on HAVE_CONTEXT_TRACKING && SMP
-	select CONTEXT_TRACKING
+	bool
 	help
 	  This option sets hooks on kernel / userspace boundaries and
 	  puts RCU in extended quiescent state when the CPU runs in
@@ -541,12 +546,6 @@ config RCU_USER_QS
 	  excluded from the global RCU state machine and thus doesn't
 	  try to keep the timer tick on for RCU.
 
-	  Unless you want to hack and help the development of the full
-	  dynticks mode, you shouldn't enable this option.  It also
-	  adds unnecessary overhead.
-
-	  If unsure say N
-
 config CONTEXT_TRACKING_FORCE
 	bool "Force context tracking"
 	depends on CONTEXT_TRACKING
@@ -578,7 +577,7 @@ config RCU_FANOUT
 	int "Tree-based hierarchical RCU fanout value"
 	range 2 64 if 64BIT
 	range 2 32 if !64BIT
-	depends on TREE_RCU || PREEMPT_RCU
+	depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
 	default 64 if 64BIT
 	default 32 if !64BIT
 	help
@@ -596,9 +595,9 @@ config RCU_FANOUT
 
 config RCU_FANOUT_LEAF
 	int "Tree-based hierarchical RCU leaf-level fanout value"
-	range 2 RCU_FANOUT if 64BIT
-	range 2 RCU_FANOUT if !64BIT
-	depends on TREE_RCU || PREEMPT_RCU
+	range 2 64 if 64BIT
+	range 2 32 if !64BIT
+	depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
 	default 16
 	help
 	  This option controls the leaf-level fanout of hierarchical
@@ -621,23 +620,9 @@ config RCU_FANOUT_LEAF
 
 	  Take the default if unsure.
 
-config RCU_FANOUT_EXACT
-	bool "Disable tree-based hierarchical RCU auto-balancing"
-	depends on TREE_RCU || PREEMPT_RCU
-	default n
-	help
-	  This option forces use of the exact RCU_FANOUT value specified,
-	  regardless of imbalances in the hierarchy.  This is useful for
-	  testing RCU itself, and might one day be useful on systems with
-	  strong NUMA behavior.
-
-	  Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
-
-	  Say N if unsure.
-
 config RCU_FAST_NO_HZ
 	bool "Accelerate last non-dyntick-idle CPU's grace periods"
-	depends on NO_HZ_COMMON && SMP
+	depends on NO_HZ_COMMON && SMP && RCU_EXPERT
 	default n
 	help
 	  This option permits CPUs to enter dynticks-idle state even if
@@ -663,7 +648,7 @@ config TREE_RCU_TRACE
 
 config RCU_BOOST
 	bool "Enable RCU priority boosting"
-	depends on RT_MUTEXES && PREEMPT_RCU
+	depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
 	default n
 	help
 	  This option boosts the priority of preempted RCU readers that
@@ -680,6 +665,7 @@ config RCU_KTHREAD_PRIO
 	range 0 99 if !RCU_BOOST
 	default 1 if RCU_BOOST
 	default 0 if !RCU_BOOST
+	depends on RCU_EXPERT
 	help
 	  This option specifies the SCHED_FIFO priority value that will be
 	  assigned to the rcuc/n and rcub/n threads and is also the value
@@ -1713,7 +1699,7 @@ config PERF_EVENTS
 config DEBUG_PERF_USE_VMALLOC
 	default n
 	bool "Debug: use vmalloc to back perf mmap() buffers"
-	depends on PERF_EVENTS && DEBUG_KERNEL
+	depends on PERF_EVENTS && DEBUG_KERNEL && !PPC
 	select PERF_USE_VMALLOC
 	help
 	 Use vmalloc memory to back perf mmap() buffers.
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 3aaea7ffd077c8..a24ba9fe5bb889 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -47,8 +47,7 @@
 #define RECV		1
 
 #define STATE_NONE	0
-#define STATE_PENDING	1
-#define STATE_READY	2
+#define STATE_READY	1
 
 struct posix_msg_tree_node {
 	struct rb_node		rb_node;
@@ -571,15 +570,12 @@ static int wq_sleep(struct mqueue_inode_info *info, int sr,
 	wq_add(info, sr, ewp);
 
 	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
+		__set_current_state(TASK_INTERRUPTIBLE);
 
 		spin_unlock(&info->lock);
 		time = schedule_hrtimeout_range_clock(timeout, 0,
 			HRTIMER_MODE_ABS, CLOCK_REALTIME);
 
-		while (ewp->state == STATE_PENDING)
-			cpu_relax();
-
 		if (ewp->state == STATE_READY) {
 			retval = 0;
 			goto out;
@@ -907,11 +903,15 @@ out_name:
  * list of waiting receivers. A sender checks that list before adding the new
  * message into the message array. If there is a waiting receiver, then it
  * bypasses the message array and directly hands the message over to the
- * receiver.
- * The receiver accepts the message and returns without grabbing the queue
- * spinlock. Therefore an intermediate STATE_PENDING state and memory barriers
- * are necessary. The same algorithm is used for sysv semaphores, see
- * ipc/sem.c for more details.
+ * receiver. The receiver accepts the message and returns without grabbing the
+ * queue spinlock:
+ *
+ * - Set pointer to message.
+ * - Queue the receiver task for later wakeup (without the info->lock).
+ * - Update its state to STATE_READY. Now the receiver can continue.
+ * - Wake up the process after the lock is dropped. Should the process wake up
+ *   before this wakeup (due to a timeout or a signal) it will either see
+ *   STATE_READY and continue or acquire the lock to check the state again.
  *
  * The same algorithm is used for senders.
  */
@@ -919,21 +919,29 @@ out_name:
 /* pipelined_send() - send a message directly to the task waiting in
  * sys_mq_timedreceive() (without inserting message into a queue).
  */
-static inline void pipelined_send(struct mqueue_inode_info *info,
+static inline void pipelined_send(struct wake_q_head *wake_q,
+				  struct mqueue_inode_info *info,
 				  struct msg_msg *message,
 				  struct ext_wait_queue *receiver)
 {
 	receiver->msg = message;
 	list_del(&receiver->list);
-	receiver->state = STATE_PENDING;
-	wake_up_process(receiver->task);
-	smp_wmb();
+	wake_q_add(wake_q, receiver->task);
+	/*
+	 * Rely on the implicit cmpxchg barrier from wake_q_add such
+	 * that we can ensure that updating receiver->state is the last
+	 * write operation: As once set, the receiver can continue,
+	 * and if we don't have the reference count from the wake_q,
+	 * yet, at that point we can later have a use-after-free
+	 * condition and bogus wakeup.
+	 */
 	receiver->state = STATE_READY;
 }
 
 /* pipelined_receive() - if there is task waiting in sys_mq_timedsend()
  * gets its message and put to the queue (we have one free place for sure). */
-static inline void pipelined_receive(struct mqueue_inode_info *info)
+static inline void pipelined_receive(struct wake_q_head *wake_q,
+				     struct mqueue_inode_info *info)
 {
 	struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND);
 
@@ -944,10 +952,9 @@ static inline void pipelined_receive(struct mqueue_inode_info *info)
 	}
 	if (msg_insert(sender->msg, info))
 		return;
+
 	list_del(&sender->list);
-	sender->state = STATE_PENDING;
-	wake_up_process(sender->task);
-	smp_wmb();
+	wake_q_add(wake_q, sender->task);
 	sender->state = STATE_READY;
 }
 
@@ -965,6 +972,7 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
 	struct timespec ts;
 	struct posix_msg_tree_node *new_leaf = NULL;
 	int ret = 0;
+	WAKE_Q(wake_q);
 
 	if (u_abs_timeout) {
 		int res = prepare_timeout(u_abs_timeout, &expires, &ts);
@@ -1049,7 +1057,7 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
 	} else {
 		receiver = wq_get_first_waiter(info, RECV);
 		if (receiver) {
-			pipelined_send(info, msg_ptr, receiver);
+			pipelined_send(&wake_q, info, msg_ptr, receiver);
 		} else {
 			/* adds message to the queue */
 			ret = msg_insert(msg_ptr, info);
@@ -1062,6 +1070,7 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
 	}
 out_unlock:
 	spin_unlock(&info->lock);
+	wake_up_q(&wake_q);
 out_free:
 	if (ret)
 		free_msg(msg_ptr);
@@ -1149,14 +1158,17 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
 			msg_ptr = wait.msg;
 		}
 	} else {
+		WAKE_Q(wake_q);
+
 		msg_ptr = msg_get(info);
 
 		inode->i_atime = inode->i_mtime = inode->i_ctime =
 				CURRENT_TIME;
 
 		/* There is now free space in queue. */
-		pipelined_receive(info);
+		pipelined_receive(&wake_q, info);
 		spin_unlock(&info->lock);
+		wake_up_q(&wake_q);
 		ret = 0;
 	}
 	if (ret == 0) {
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 08561f1acd130b..ebdb0043203adb 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -235,9 +235,16 @@ config LOCK_SPIN_ON_OWNER
        def_bool y
        depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER
 
-config ARCH_USE_QUEUE_RWLOCK
+config ARCH_USE_QUEUED_SPINLOCKS
 	bool
 
-config QUEUE_RWLOCK
-	def_bool y if ARCH_USE_QUEUE_RWLOCK
+config QUEUED_SPINLOCKS
+	def_bool y if ARCH_USE_QUEUED_SPINLOCKS
+	depends on SMP
+
+config ARCH_USE_QUEUED_RWLOCKS
+	bool
+
+config QUEUED_RWLOCKS
+	def_bool y if ARCH_USE_QUEUED_RWLOCKS
 	depends on SMP
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 72d59a1a6eb66f..0a495ab35bc72b 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -30,12 +30,23 @@ EXPORT_SYMBOL_GPL(context_tracking_enabled);
 DEFINE_PER_CPU(struct context_tracking, context_tracking);
 EXPORT_SYMBOL_GPL(context_tracking);
 
-void context_tracking_cpu_set(int cpu)
+static bool context_tracking_recursion_enter(void)
 {
-	if (!per_cpu(context_tracking.active, cpu)) {
-		per_cpu(context_tracking.active, cpu) = true;
-		static_key_slow_inc(&context_tracking_enabled);
-	}
+	int recursion;
+
+	recursion = __this_cpu_inc_return(context_tracking.recursion);
+	if (recursion == 1)
+		return true;
+
+	WARN_ONCE((recursion < 1), "Invalid context tracking recursion value %d\n", recursion);
+	__this_cpu_dec(context_tracking.recursion);
+
+	return false;
+}
+
+static void context_tracking_recursion_exit(void)
+{
+	__this_cpu_dec(context_tracking.recursion);
 }
 
 /**
@@ -75,6 +86,9 @@ void context_tracking_enter(enum ctx_state state)
 	WARN_ON_ONCE(!current->mm);
 
 	local_irq_save(flags);
+	if (!context_tracking_recursion_enter())
+		goto out_irq_restore;
+
 	if ( __this_cpu_read(context_tracking.state) != state) {
 		if (__this_cpu_read(context_tracking.active)) {
 			/*
@@ -105,6 +119,8 @@ void context_tracking_enter(enum ctx_state state)
 		 */
 		__this_cpu_write(context_tracking.state, state);
 	}
+	context_tracking_recursion_exit();
+out_irq_restore:
 	local_irq_restore(flags);
 }
 NOKPROBE_SYMBOL(context_tracking_enter);
@@ -139,6 +155,9 @@ void context_tracking_exit(enum ctx_state state)
 		return;
 
 	local_irq_save(flags);
+	if (!context_tracking_recursion_enter())
+		goto out_irq_restore;
+
 	if (__this_cpu_read(context_tracking.state) == state) {
 		if (__this_cpu_read(context_tracking.active)) {
 			/*
@@ -153,6 +172,8 @@ void context_tracking_exit(enum ctx_state state)
 		}
 		__this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
 	}
+	context_tracking_recursion_exit();
+out_irq_restore:
 	local_irq_restore(flags);
 }
 NOKPROBE_SYMBOL(context_tracking_exit);
@@ -164,24 +185,26 @@ void context_tracking_user_exit(void)
 }
 NOKPROBE_SYMBOL(context_tracking_user_exit);
 
-/**
- * __context_tracking_task_switch - context switch the syscall callbacks
- * @prev: the task that is being switched out
- * @next: the task that is being switched in
- *
- * The context tracking uses the syscall slow path to implement its user-kernel
- * boundaries probes on syscalls. This way it doesn't impact the syscall fast
- * path on CPUs that don't do context tracking.
- *
- * But we need to clear the flag on the previous task because it may later
- * migrate to some CPU that doesn't do the context tracking. As such the TIF
- * flag may not be desired there.
- */
-void __context_tracking_task_switch(struct task_struct *prev,
-				    struct task_struct *next)
+void __init context_tracking_cpu_set(int cpu)
 {
-	clear_tsk_thread_flag(prev, TIF_NOHZ);
-	set_tsk_thread_flag(next, TIF_NOHZ);
+	static __initdata bool initialized = false;
+
+	if (!per_cpu(context_tracking.active, cpu)) {
+		per_cpu(context_tracking.active, cpu) = true;
+		static_key_slow_inc(&context_tracking_enabled);
+	}
+
+	if (initialized)
+		return;
+
+	/*
+	 * Set TIF_NOHZ to init/0 and let it propagate to all tasks through fork
+	 * This assumes that init is the only task at this early boot stage.
+	 */
+	set_tsk_thread_flag(&init_task, TIF_NOHZ);
+	WARN_ON_ONCE(!tasklist_empty());
+
+	initialized = true;
 }
 
 #ifdef CONFIG_CONTEXT_TRACKING_FORCE
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 94bbe4695232cd..9c9c9fab16cc36 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -398,7 +398,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 	if (err) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
-		smpboot_unpark_threads(cpu);
 		cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
 		goto out_release;
 	}
@@ -463,6 +462,7 @@ static int smpboot_thread_call(struct notifier_block *nfb,
 
 	switch (action & ~CPU_TASKS_FROZEN) {
 
+	case CPU_DOWN_FAILED:
 	case CPU_ONLINE:
 		smpboot_unpark_threads(cpu);
 		break;
@@ -479,7 +479,7 @@ static struct notifier_block smpboot_thread_notifier = {
 	.priority = CPU_PRI_SMPBOOT,
 };
 
-void __cpuinit smpboot_thread_init(void)
+void smpboot_thread_init(void)
 {
 	register_cpu_notifier(&smpboot_thread_notifier);
 }
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3f30995a757e95..fec85513d006b3 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -51,9 +51,11 @@
 
 static struct workqueue_struct *perf_wq;
 
+typedef int (*remote_function_f)(void *);
+
 struct remote_function_call {
 	struct task_struct	*p;
-	int			(*func)(void *info);
+	remote_function_f	func;
 	void			*info;
 	int			ret;
 };
@@ -86,7 +88,7 @@ static void remote_function(void *data)
  *	    -EAGAIN - when the process moved away
  */
 static int
-task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
+task_function_call(struct task_struct *p, remote_function_f func, void *info)
 {
 	struct remote_function_call data = {
 		.p	= p,
@@ -110,7 +112,7 @@ task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
  *
  * returns: @func return value or -ENXIO when the cpu is offline
  */
-static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
+static int cpu_function_call(int cpu, remote_function_f func, void *info)
 {
 	struct remote_function_call data = {
 		.p	= NULL,
@@ -747,62 +749,31 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 /*
  * function must be called with interrupts disbled
  */
-static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
+static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
 {
 	struct perf_cpu_context *cpuctx;
-	enum hrtimer_restart ret = HRTIMER_NORESTART;
 	int rotations = 0;
 
 	WARN_ON(!irqs_disabled());
 
 	cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
-
 	rotations = perf_rotate_context(cpuctx);
 
-	/*
-	 * arm timer if needed
-	 */
-	if (rotations) {
+	raw_spin_lock(&cpuctx->hrtimer_lock);
+	if (rotations)
 		hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
-		ret = HRTIMER_RESTART;
-	}
-
-	return ret;
-}
-
-/* CPU is going down */
-void perf_cpu_hrtimer_cancel(int cpu)
-{
-	struct perf_cpu_context *cpuctx;
-	struct pmu *pmu;
-	unsigned long flags;
-
-	if (WARN_ON(cpu != smp_processor_id()))
-		return;
-
-	local_irq_save(flags);
-
-	rcu_read_lock();
-
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
-		if (pmu->task_ctx_nr == perf_sw_context)
-			continue;
-
-		hrtimer_cancel(&cpuctx->hrtimer);
-	}
-
-	rcu_read_unlock();
+	else
+		cpuctx->hrtimer_active = 0;
+	raw_spin_unlock(&cpuctx->hrtimer_lock);
 
-	local_irq_restore(flags);
+	return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
 }
 
-static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
 {
-	struct hrtimer *hr = &cpuctx->hrtimer;
+	struct hrtimer *timer = &cpuctx->hrtimer;
 	struct pmu *pmu = cpuctx->ctx.pmu;
-	int timer;
+	u64 interval;
 
 	/* no multiplexing needed for SW PMU */
 	if (pmu->task_ctx_nr == perf_sw_context)
@@ -812,31 +783,36 @@ static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
 	 * check default is sane, if not set then force to
 	 * default interval (1/tick)
 	 */
-	timer = pmu->hrtimer_interval_ms;
-	if (timer < 1)
-		timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
+	interval = pmu->hrtimer_interval_ms;
+	if (interval < 1)
+		interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
 
-	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
 
-	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
-	hr->function = perf_cpu_hrtimer_handler;
+	raw_spin_lock_init(&cpuctx->hrtimer_lock);
+	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+	timer->function = perf_mux_hrtimer_handler;
 }
 
-static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
+static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
 {
-	struct hrtimer *hr = &cpuctx->hrtimer;
+	struct hrtimer *timer = &cpuctx->hrtimer;
 	struct pmu *pmu = cpuctx->ctx.pmu;
+	unsigned long flags;
 
 	/* not for SW PMU */
 	if (pmu->task_ctx_nr == perf_sw_context)
-		return;
+		return 0;
 
-	if (hrtimer_active(hr))
-		return;
+	raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
+	if (!cpuctx->hrtimer_active) {
+		cpuctx->hrtimer_active = 1;
+		hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
+		hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+	}
+	raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
 
-	if (!hrtimer_callback_running(hr))
-		__hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
-					 0, HRTIMER_MODE_REL_PINNED, 0);
+	return 0;
 }
 
 void perf_pmu_disable(struct pmu *pmu)
@@ -1941,7 +1917,7 @@ group_sched_in(struct perf_event *group_event,
 
 	if (event_sched_in(group_event, cpuctx, ctx)) {
 		pmu->cancel_txn(pmu);
-		perf_cpu_hrtimer_restart(cpuctx);
+		perf_mux_hrtimer_restart(cpuctx);
 		return -EAGAIN;
 	}
 
@@ -1988,7 +1964,7 @@ group_error:
 
 	pmu->cancel_txn(pmu);
 
-	perf_cpu_hrtimer_restart(cpuctx);
+	perf_mux_hrtimer_restart(cpuctx);
 
 	return -EAGAIN;
 }
@@ -2261,7 +2237,7 @@ static int __perf_event_enable(void *info)
 		 */
 		if (leader != event) {
 			group_sched_out(leader, cpuctx, ctx);
-			perf_cpu_hrtimer_restart(cpuctx);
+			perf_mux_hrtimer_restart(cpuctx);
 		}
 		if (leader->attr.pinned) {
 			update_group_times(leader);
@@ -3448,7 +3424,6 @@ static void free_event_rcu(struct rcu_head *head)
 	if (event->ns)
 		put_pid_ns(event->ns);
 	perf_event_free_filter(event);
-	perf_event_free_bpf_prog(event);
 	kfree(event);
 }
 
@@ -3579,6 +3554,8 @@ static void __free_event(struct perf_event *event)
 			put_callchain_buffers();
 	}
 
+	perf_event_free_bpf_prog(event);
+
 	if (event->destroy)
 		event->destroy(event);
 
@@ -6869,9 +6846,8 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
 	} else {
 		period = max_t(u64, 10000, hwc->sample_period);
 	}
-	__hrtimer_start_range_ns(&hwc->hrtimer,
-				ns_to_ktime(period), 0,
-				HRTIMER_MODE_REL_PINNED, 0);
+	hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
+		      HRTIMER_MODE_REL_PINNED);
 }
 
 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
@@ -7172,6 +7148,8 @@ perf_event_mux_interval_ms_show(struct device *dev,
 	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
 }
 
+static DEFINE_MUTEX(mux_interval_mutex);
+
 static ssize_t
 perf_event_mux_interval_ms_store(struct device *dev,
 				 struct device_attribute *attr,
@@ -7191,17 +7169,21 @@ perf_event_mux_interval_ms_store(struct device *dev,
 	if (timer == pmu->hrtimer_interval_ms)
 		return count;
 
+	mutex_lock(&mux_interval_mutex);
 	pmu->hrtimer_interval_ms = timer;
 
 	/* update all cpuctx for this PMU */
-	for_each_possible_cpu(cpu) {
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
 		struct perf_cpu_context *cpuctx;
 		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
 		cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
 
-		if (hrtimer_active(&cpuctx->hrtimer))
-			hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+		cpu_function_call(cpu,
+			(remote_function_f)perf_mux_hrtimer_restart, cpuctx);
 	}
+	put_online_cpus();
+	mutex_unlock(&mux_interval_mutex);
 
 	return count;
 }
@@ -7306,7 +7288,7 @@ skip_type:
 		lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
 		cpuctx->ctx.pmu = pmu;
 
-		__perf_cpu_hrtimer_init(cpuctx, cpu);
+		__perf_mux_hrtimer_init(cpuctx, cpu);
 
 		cpuctx->unique_pmu = pmu;
 	}
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 232f00f273cbe4..96472824a752f7 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -141,7 +141,7 @@ int perf_output_begin(struct perf_output_handle *handle,
 	perf_output_get_handle(handle);
 
 	do {
-		tail = ACCESS_ONCE(rb->user_page->data_tail);
+		tail = READ_ONCE_CTRL(rb->user_page->data_tail);
 		offset = head = local_read(&rb->head);
 		if (!rb->overwrite &&
 		    unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
@@ -493,6 +493,20 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
 			rb->aux_pages[rb->aux_nr_pages] = page_address(page++);
 	}
 
+	/*
+	 * In overwrite mode, PMUs that don't support SG may not handle more
+	 * than one contiguous allocation, since they rely on PMI to do double
+	 * buffering. In this case, the entire buffer has to be one contiguous
+	 * chunk.
+	 */
+	if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) &&
+	    overwrite) {
+		struct page *page = virt_to_page(rb->aux_pages[0]);
+
+		if (page_private(page) != max_order)
+			goto out;
+	}
+
 	rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages,
 					     overwrite);
 	if (!rb->aux_priv)
diff --git a/kernel/fork.c b/kernel/fork.c
index 03c1eaaa6ef56f..0bb88b55555058 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1091,10 +1091,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
 {
 	unsigned long cpu_limit;
 
-	/* Thread group counters. */
-	thread_group_cputime_init(sig);
-
-	cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+	cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
 	if (cpu_limit != RLIM_INFINITY) {
 		sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
 		sig->cputimer.running = 1;
@@ -1396,6 +1393,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->hardirq_context = 0;
 	p->softirq_context = 0;
 #endif
+
+	p->pagefault_disabled = 0;
+
 #ifdef CONFIG_LOCKDEP
 	p->lockdep_depth = 0; /* no locks held yet */
 	p->curr_chain_key = 0;
diff --git a/kernel/futex.c b/kernel/futex.c
index 2579e407ff67d0..ea6ca0bca52570 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1090,9 +1090,11 @@ static void __unqueue_futex(struct futex_q *q)
 
 /*
  * The hash bucket lock must be held when this is called.
- * Afterwards, the futex_q must not be accessed.
+ * Afterwards, the futex_q must not be accessed. Callers
+ * must ensure to later call wake_up_q() for the actual
+ * wakeups to occur.
  */
-static void wake_futex(struct futex_q *q)
+static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
 {
 	struct task_struct *p = q->task;
 
@@ -1100,14 +1102,10 @@ static void wake_futex(struct futex_q *q)
 		return;
 
 	/*
-	 * We set q->lock_ptr = NULL _before_ we wake up the task. If
-	 * a non-futex wake up happens on another CPU then the task
-	 * might exit and p would dereference a non-existing task
-	 * struct. Prevent this by holding a reference on p across the
-	 * wake up.
+	 * Queue the task for later wakeup for after we've released
+	 * the hb->lock. wake_q_add() grabs reference to p.
 	 */
-	get_task_struct(p);
-
+	wake_q_add(wake_q, p);
 	__unqueue_futex(q);
 	/*
 	 * The waiting task can free the futex_q as soon as
@@ -1117,9 +1115,6 @@ static void wake_futex(struct futex_q *q)
 	 */
 	smp_wmb();
 	q->lock_ptr = NULL;
-
-	wake_up_state(p, TASK_NORMAL);
-	put_task_struct(p);
 }
 
 static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@ -1217,6 +1212,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 	struct futex_q *this, *next;
 	union futex_key key = FUTEX_KEY_INIT;
 	int ret;
+	WAKE_Q(wake_q);
 
 	if (!bitset)
 		return -EINVAL;
@@ -1244,13 +1240,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 			if (!(this->bitset & bitset))
 				continue;
 
-			wake_futex(this);
+			mark_wake_futex(&wake_q, this);
 			if (++ret >= nr_wake)
 				break;
 		}
 	}
 
 	spin_unlock(&hb->lock);
+	wake_up_q(&wake_q);
 out_put_key:
 	put_futex_key(&key);
 out:
@@ -1269,6 +1266,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
 	struct futex_hash_bucket *hb1, *hb2;
 	struct futex_q *this, *next;
 	int ret, op_ret;
+	WAKE_Q(wake_q);
 
 retry:
 	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
@@ -1320,7 +1318,7 @@ retry_private:
 				ret = -EINVAL;
 				goto out_unlock;
 			}
-			wake_futex(this);
+			mark_wake_futex(&wake_q, this);
 			if (++ret >= nr_wake)
 				break;
 		}
@@ -1334,7 +1332,7 @@ retry_private:
 					ret = -EINVAL;
 					goto out_unlock;
 				}
-				wake_futex(this);
+				mark_wake_futex(&wake_q, this);
 				if (++op_ret >= nr_wake2)
 					break;
 			}
@@ -1344,6 +1342,7 @@ retry_private:
 
 out_unlock:
 	double_unlock_hb(hb1, hb2);
+	wake_up_q(&wake_q);
 out_put_keys:
 	put_futex_key(&key2);
 out_put_key1:
@@ -1503,6 +1502,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 	struct futex_pi_state *pi_state = NULL;
 	struct futex_hash_bucket *hb1, *hb2;
 	struct futex_q *this, *next;
+	WAKE_Q(wake_q);
 
 	if (requeue_pi) {
 		/*
@@ -1679,7 +1679,7 @@ retry_private:
 		 * woken by futex_unlock_pi().
 		 */
 		if (++task_count <= nr_wake && !requeue_pi) {
-			wake_futex(this);
+			mark_wake_futex(&wake_q, this);
 			continue;
 		}
 
@@ -1719,6 +1719,7 @@ retry_private:
 out_unlock:
 	free_pi_state(pi_state);
 	double_unlock_hb(hb1, hb2);
+	wake_up_q(&wake_q);
 	hb_waiters_dec(hb2);
 
 	/*
@@ -2055,7 +2056,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
 {
 	/*
 	 * The task state is guaranteed to be set before another task can
-	 * wake it. set_current_state() is implemented using set_mb() and
+	 * wake it. set_current_state() is implemented using smp_store_mb() and
 	 * queue_me() calls spin_unlock() upon completion, both serializing
 	 * access to the hash list and forcing another memory barrier.
 	 */
@@ -2063,11 +2064,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
 	queue_me(q, hb);
 
 	/* Arm the timer */
-	if (timeout) {
+	if (timeout)
 		hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
-		if (!hrtimer_active(&timeout->timer))
-			timeout->task = NULL;
-	}
 
 	/*
 	 * If we have been removed from the hash list, then another task
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 2456fe89719c5a..330fc797e632cb 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -978,6 +978,20 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data)
 }
 
 /**
+ * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt
+ * @data:	Pointer to interrupt specific data
+ * @dest:	The vcpu affinity information
+ */
+int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
+{
+	data = data->parent_data;
+	if (data->chip->irq_set_vcpu_affinity)
+		return data->chip->irq_set_vcpu_affinity(data, vcpu_info);
+
+	return -ENOSYS;
+}
+
+/**
  * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt
  * @data:	Pointer to interrupt specific data
  * @on:		Whether to set or reset the wake-up capability of this irq
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index d5d0f7345c5450..74d90a75426881 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -104,7 +104,7 @@ int devm_request_any_context_irq(struct device *dev, unsigned int irq,
 		return -ENOMEM;
 
 	rc = request_any_context_irq(irq, handler, irqflags, devname, dev_id);
-	if (rc) {
+	if (rc < 0) {
 		devres_free(dr);
 		return rc;
 	}
@@ -113,7 +113,7 @@ int devm_request_any_context_irq(struct device *dev, unsigned int irq,
 	dr->dev_id = dev_id;
 	devres_add(dev, dr);
 
-	return 0;
+	return rc;
 }
 EXPORT_SYMBOL(devm_request_any_context_irq);
 
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
index 2feb6feca0cc96..326a67f2410bf9 100644
--- a/kernel/irq/dummychip.c
+++ b/kernel/irq/dummychip.c
@@ -42,6 +42,7 @@ struct irq_chip no_irq_chip = {
 	.irq_enable	= noop,
 	.irq_disable	= noop,
 	.irq_ack	= ack_bad,
+	.flags		= IRQCHIP_SKIP_SET_WAKE,
 };
 
 /*
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 99793b9b6d23a5..73a76e2ee93603 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -619,7 +619,7 @@ unsigned int kstat_irqs(unsigned int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	int cpu;
-	int sum = 0;
+	unsigned int sum = 0;
 
 	if (!desc || !desc->kstat_irqs)
 		return 0;
@@ -639,7 +639,7 @@ unsigned int kstat_irqs(unsigned int irq)
  */
 unsigned int kstat_irqs_usr(unsigned int irq)
 {
-	int sum;
+	unsigned int sum;
 
 	irq_lock_sparse();
 	sum = kstat_irqs(irq);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e68932bb308e96..b1c7e8f46bfb2f 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -256,6 +256,37 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
 }
 EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
 
+/**
+ *	irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
+ *	@irq: interrupt number to set affinity
+ *	@vcpu_info: vCPU specific data
+ *
+ *	This function uses the vCPU specific data to set the vCPU
+ *	affinity for an irq. The vCPU specific data is passed from
+ *	outside, such as KVM. One example code path is as below:
+ *	KVM -> IOMMU -> irq_set_vcpu_affinity().
+ */
+int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
+{
+	unsigned long flags;
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
+	struct irq_data *data;
+	struct irq_chip *chip;
+	int ret = -ENOSYS;
+
+	if (!desc)
+		return -EINVAL;
+
+	data = irq_desc_get_irq_data(desc);
+	chip = irq_data_get_irq_chip(data);
+	if (chip && chip->irq_set_vcpu_affinity)
+		ret = chip->irq_set_vcpu_affinity(data, vcpu_info);
+	irq_put_desc_unlock(desc, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity);
+
 static void irq_affinity_notify(struct work_struct *work)
 {
 	struct irq_affinity_notify *notify =
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 474de5cb394d3c..7bf1f1bbb7fa56 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -124,7 +124,7 @@ static void msi_domain_free(struct irq_domain *domain, unsigned int virq,
 	irq_domain_free_irqs_top(domain, virq, nr_irqs);
 }
 
-static struct irq_domain_ops msi_domain_ops = {
+static const struct irq_domain_ops msi_domain_ops = {
 	.alloc		= msi_domain_alloc,
 	.free		= msi_domain_free,
 	.activate	= msi_domain_activate,
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 5204a6d1b9854f..d22786a6dbde92 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -123,6 +123,8 @@ void suspend_device_irqs(void)
 		unsigned long flags;
 		bool sync;
 
+		if (irq_settings_is_nested_thread(desc))
+			continue;
 		raw_spin_lock_irqsave(&desc->lock, flags);
 		sync = suspend_device_irq(desc, irq);
 		raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -163,6 +165,8 @@ static void resume_irqs(bool want_early)
 
 		if (!is_early && want_early)
 			continue;
+		if (irq_settings_is_nested_thread(desc))
+			continue;
 
 		raw_spin_lock_irqsave(&desc->lock, flags);
 		resume_irq(desc, irq);
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index de7a416cca2a79..7dd5c9918e4c24 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_SMP) += spinlock.o
 obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
 obj-$(CONFIG_SMP) += lglock.o
 obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
+obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
 obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
@@ -25,5 +26,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
 obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
 obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
 obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
-obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o
+obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
 obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index a0831e1b99f4aa..a61bb1d37a52c3 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -4066,8 +4066,7 @@ void __init lockdep_info(void)
 
 #ifdef CONFIG_DEBUG_LOCKDEP
 	if (lockdep_init_error) {
-		printk("WARNING: lockdep init error! lock-%s was acquired"
-			"before lockdep_init\n", lock_init_error);
+		printk("WARNING: lockdep init error: lock '%s' was acquired before lockdep_init().\n", lock_init_error);
 		printk("Call stack leading to lockdep invocation was:\n");
 		print_stack_trace(&lockdep_init_trace, 0);
 	}
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index ec8cce25977906..32244186f1f2ae 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -122,12 +122,12 @@ static int torture_lock_busted_write_lock(void)
 
 static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
 {
-	const unsigned long longdelay_us = 100;
+	const unsigned long longdelay_ms = 100;
 
 	/* We want a long delay occasionally to force massive contention.  */
 	if (!(torture_random(trsp) %
-	      (cxt.nrealwriters_stress * 2000 * longdelay_us)))
-		mdelay(longdelay_us);
+	      (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+		mdelay(longdelay_ms);
 #ifdef CONFIG_PREEMPT
 	if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
 		preempt_schedule();  /* Allow test to be preempted. */
@@ -160,14 +160,14 @@ static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock)
 static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
 {
 	const unsigned long shortdelay_us = 2;
-	const unsigned long longdelay_us = 100;
+	const unsigned long longdelay_ms = 100;
 
 	/* We want a short delay mostly to emulate likely code, and
 	 * we want a long delay occasionally to force massive contention.
 	 */
 	if (!(torture_random(trsp) %
-	      (cxt.nrealwriters_stress * 2000 * longdelay_us)))
-		mdelay(longdelay_us);
+	      (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+		mdelay(longdelay_ms);
 	if (!(torture_random(trsp) %
 	      (cxt.nrealwriters_stress * 2 * shortdelay_us)))
 		udelay(shortdelay_us);
@@ -309,7 +309,7 @@ static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock)
 static void torture_rwlock_read_unlock_irq(void)
 __releases(torture_rwlock)
 {
-	write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
+	read_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
 }
 
 static struct lock_torture_ops rw_lock_irq_ops = {
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 75e114bdf3f26f..fd91aaa4554c8b 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -17,6 +17,7 @@
 struct mcs_spinlock {
 	struct mcs_spinlock *next;
 	int locked; /* 1 if lock acquired */
+	int count;  /* nesting count, see qspinlock.c */
 };
 
 #ifndef arch_mcs_spin_lock_contended
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index f956ede7f90df0..00c12bb390b572 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -1,5 +1,5 @@
 /*
- * Queue read/write lock
+ * Queued read/write locks
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
new file mode 100644
index 00000000000000..38c49202d532b3
--- /dev/null
+++ b/kernel/locking/qspinlock.c
@@ -0,0 +1,473 @@
+/*
+ * Queued spinlock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
+ * (C) Copyright 2013-2014 Red Hat, Inc.
+ * (C) Copyright 2015 Intel Corp.
+ *
+ * Authors: Waiman Long <waiman.long@hp.com>
+ *          Peter Zijlstra <peterz@infradead.org>
+ */
+
+#ifndef _GEN_PV_LOCK_SLOWPATH
+
+#include <linux/smp.h>
+#include <linux/bug.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/mutex.h>
+#include <asm/byteorder.h>
+#include <asm/qspinlock.h>
+
+/*
+ * The basic principle of a queue-based spinlock can best be understood
+ * by studying a classic queue-based spinlock implementation called the
+ * MCS lock. The paper below provides a good description for this kind
+ * of lock.
+ *
+ * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
+ *
+ * This queued spinlock implementation is based on the MCS lock, however to make
+ * it fit the 4 bytes we assume spinlock_t to be, and preserve its existing
+ * API, we must modify it somehow.
+ *
+ * In particular; where the traditional MCS lock consists of a tail pointer
+ * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
+ * unlock the next pending (next->locked), we compress both these: {tail,
+ * next->locked} into a single u32 value.
+ *
+ * Since a spinlock disables recursion of its own context and there is a limit
+ * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there
+ * are at most 4 nesting levels, it can be encoded by a 2-bit number. Now
+ * we can encode the tail by combining the 2-bit nesting level with the cpu
+ * number. With one byte for the lock value and 3 bytes for the tail, only a
+ * 32-bit word is now needed. Even though we only need 1 bit for the lock,
+ * we extend it to a full byte to achieve better performance for architectures
+ * that support atomic byte write.
+ *
+ * We also change the first spinner to spin on the lock bit instead of its
+ * node; whereby avoiding the need to carry a node from lock to unlock, and
+ * preserving existing lock API. This also makes the unlock code simpler and
+ * faster.
+ *
+ * N.B. The current implementation only supports architectures that allow
+ *      atomic operations on smaller 8-bit and 16-bit data types.
+ *
+ */
+
+#include "mcs_spinlock.h"
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define MAX_NODES	8
+#else
+#define MAX_NODES	4
+#endif
+
+/*
+ * Per-CPU queue node structures; we can never have more than 4 nested
+ * contexts: task, softirq, hardirq, nmi.
+ *
+ * Exactly fits one 64-byte cacheline on a 64-bit architecture.
+ *
+ * PV doubles the storage and uses the second cacheline for PV state.
+ */
+static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
+
+/*
+ * We must be able to distinguish between no-tail and the tail at 0:0,
+ * therefore increment the cpu number by one.
+ */
+
+static inline u32 encode_tail(int cpu, int idx)
+{
+	u32 tail;
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+	BUG_ON(idx > 3);
+#endif
+	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
+	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
+
+	return tail;
+}
+
+static inline struct mcs_spinlock *decode_tail(u32 tail)
+{
+	int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
+	int idx = (tail &  _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
+
+	return per_cpu_ptr(&mcs_nodes[idx], cpu);
+}
+
+#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
+
+/*
+ * By using the whole 2nd least significant byte for the pending bit, we
+ * can allow better optimization of the lock acquisition for the pending
+ * bit holder.
+ *
+ * This internal structure is also used by the set_locked function which
+ * is not restricted to _Q_PENDING_BITS == 8.
+ */
+struct __qspinlock {
+	union {
+		atomic_t val;
+#ifdef __LITTLE_ENDIAN
+		struct {
+			u8	locked;
+			u8	pending;
+		};
+		struct {
+			u16	locked_pending;
+			u16	tail;
+		};
+#else
+		struct {
+			u16	tail;
+			u16	locked_pending;
+		};
+		struct {
+			u8	reserved[2];
+			u8	pending;
+			u8	locked;
+		};
+#endif
+	};
+};
+
+#if _Q_PENDING_BITS == 8
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ *
+ * Lock stealing is not allowed if this function is used.
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+
+	WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL);
+}
+
+/*
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+	struct __qspinlock *l = (void *)lock;
+
+	return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+}
+
+#else /* _Q_PENDING_BITS == 8 */
+
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+	atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
+}
+
+/**
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+	u32 old, new, val = atomic_read(&lock->val);
+
+	for (;;) {
+		new = (val & _Q_LOCKED_PENDING_MASK) | tail;
+		old = atomic_cmpxchg(&lock->val, val, new);
+		if (old == val)
+			break;
+
+		val = old;
+	}
+	return old;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+/**
+ * set_locked - Set the lock bit and own the lock
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,*,0 -> *,0,1
+ */
+static __always_inline void set_locked(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+
+	WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
+}
+
+
+/*
+ * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for
+ * all the PV callbacks.
+ */
+
+static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { }
+
+static __always_inline void __pv_wait_head(struct qspinlock *lock,
+					   struct mcs_spinlock *node) { }
+
+#define pv_enabled()		false
+
+#define pv_init_node		__pv_init_node
+#define pv_wait_node		__pv_wait_node
+#define pv_kick_node		__pv_kick_node
+#define pv_wait_head		__pv_wait_head
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
+#endif
+
+#endif /* _GEN_PV_LOCK_SLOWPATH */
+
+/**
+ * queued_spin_lock_slowpath - acquire the queued spinlock
+ * @lock: Pointer to queued spinlock structure
+ * @val: Current value of the queued spinlock 32-bit word
+ *
+ * (queue tail, pending bit, lock value)
+ *
+ *              fast     :    slow                                  :    unlock
+ *                       :                                          :
+ * uncontended  (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
+ *                       :       | ^--------.------.             /  :
+ *                       :       v           \      \            |  :
+ * pending               :    (0,1,1) +--> (0,1,0)   \           |  :
+ *                       :       | ^--'              |           |  :
+ *                       :       v                   |           |  :
+ * uncontended           :    (n,x,y) +--> (n,0,0) --'           |  :
+ *   queue               :       | ^--'                          |  :
+ *                       :       v                               |  :
+ * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
+ *   queue               :         ^--'                             :
+ */
+void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+	struct mcs_spinlock *prev, *next, *node;
+	u32 new, old, tail;
+	int idx;
+
+	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
+
+	if (pv_enabled())
+		goto queue;
+
+	if (virt_queued_spin_lock(lock))
+		return;
+
+	/*
+	 * wait for in-progress pending->locked hand-overs
+	 *
+	 * 0,1,0 -> 0,0,1
+	 */
+	if (val == _Q_PENDING_VAL) {
+		while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL)
+			cpu_relax();
+	}
+
+	/*
+	 * trylock || pending
+	 *
+	 * 0,0,0 -> 0,0,1 ; trylock
+	 * 0,0,1 -> 0,1,1 ; pending
+	 */
+	for (;;) {
+		/*
+		 * If we observe any contention; queue.
+		 */
+		if (val & ~_Q_LOCKED_MASK)
+			goto queue;
+
+		new = _Q_LOCKED_VAL;
+		if (val == new)
+			new |= _Q_PENDING_VAL;
+
+		old = atomic_cmpxchg(&lock->val, val, new);
+		if (old == val)
+			break;
+
+		val = old;
+	}
+
+	/*
+	 * we won the trylock
+	 */
+	if (new == _Q_LOCKED_VAL)
+		return;
+
+	/*
+	 * we're pending, wait for the owner to go away.
+	 *
+	 * *,1,1 -> *,1,0
+	 *
+	 * this wait loop must be a load-acquire such that we match the
+	 * store-release that clears the locked bit and create lock
+	 * sequentiality; this is because not all clear_pending_set_locked()
+	 * implementations imply full barriers.
+	 */
+	while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK)
+		cpu_relax();
+
+	/*
+	 * take ownership and clear the pending bit.
+	 *
+	 * *,1,0 -> *,0,1
+	 */
+	clear_pending_set_locked(lock);
+	return;
+
+	/*
+	 * End of pending bit optimistic spinning and beginning of MCS
+	 * queuing.
+	 */
+queue:
+	node = this_cpu_ptr(&mcs_nodes[0]);
+	idx = node->count++;
+	tail = encode_tail(smp_processor_id(), idx);
+
+	node += idx;
+	node->locked = 0;
+	node->next = NULL;
+	pv_init_node(node);
+
+	/*
+	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
+	 * attempt the trylock once more in the hope someone let go while we
+	 * weren't watching.
+	 */
+	if (queued_spin_trylock(lock))
+		goto release;
+
+	/*
+	 * We have already touched the queueing cacheline; don't bother with
+	 * pending stuff.
+	 *
+	 * p,*,* -> n,*,*
+	 */
+	old = xchg_tail(lock, tail);
+
+	/*
+	 * if there was a previous node; link it and wait until reaching the
+	 * head of the waitqueue.
+	 */
+	if (old & _Q_TAIL_MASK) {
+		prev = decode_tail(old);
+		WRITE_ONCE(prev->next, node);
+
+		pv_wait_node(node);
+		arch_mcs_spin_lock_contended(&node->locked);
+	}
+
+	/*
+	 * we're at the head of the waitqueue, wait for the owner & pending to
+	 * go away.
+	 *
+	 * *,x,y -> *,0,0
+	 *
+	 * this wait loop must use a load-acquire such that we match the
+	 * store-release that clears the locked bit and create lock
+	 * sequentiality; this is because the set_locked() function below
+	 * does not imply a full barrier.
+	 *
+	 */
+	pv_wait_head(lock, node);
+	while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
+		cpu_relax();
+
+	/*
+	 * claim the lock:
+	 *
+	 * n,0,0 -> 0,0,1 : lock, uncontended
+	 * *,0,0 -> *,0,1 : lock, contended
+	 *
+	 * If the queue head is the only one in the queue (lock value == tail),
+	 * clear the tail code and grab the lock. Otherwise, we only need
+	 * to grab the lock.
+	 */
+	for (;;) {
+		if (val != tail) {
+			set_locked(lock);
+			break;
+		}
+		old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
+		if (old == val)
+			goto release;	/* No contention */
+
+		val = old;
+	}
+
+	/*
+	 * contended path; wait for next, release.
+	 */
+	while (!(next = READ_ONCE(node->next)))
+		cpu_relax();
+
+	arch_mcs_spin_unlock_contended(&next->locked);
+	pv_kick_node(next);
+
+release:
+	/*
+	 * release the node
+	 */
+	this_cpu_dec(mcs_nodes[0].count);
+}
+EXPORT_SYMBOL(queued_spin_lock_slowpath);
+
+/*
+ * Generate the paravirt code for queued_spin_unlock_slowpath().
+ */
+#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#define _GEN_PV_LOCK_SLOWPATH
+
+#undef  pv_enabled
+#define pv_enabled()	true
+
+#undef pv_init_node
+#undef pv_wait_node
+#undef pv_kick_node
+#undef pv_wait_head
+
+#undef  queued_spin_lock_slowpath
+#define queued_spin_lock_slowpath	__pv_queued_spin_lock_slowpath
+
+#include "qspinlock_paravirt.h"
+#include "qspinlock.c"
+
+#endif
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
new file mode 100644
index 00000000000000..04ab18151cc8fa
--- /dev/null
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -0,0 +1,325 @@
+#ifndef _GEN_PV_LOCK_SLOWPATH
+#error "do not include this file"
+#endif
+
+#include <linux/hash.h>
+#include <linux/bootmem.h>
+
+/*
+ * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
+ * of spinning them.
+ *
+ * This relies on the architecture to provide two paravirt hypercalls:
+ *
+ *   pv_wait(u8 *ptr, u8 val) -- suspends the vcpu if *ptr == val
+ *   pv_kick(cpu)             -- wakes a suspended vcpu
+ *
+ * Using these we implement __pv_queued_spin_lock_slowpath() and
+ * __pv_queued_spin_unlock() to replace native_queued_spin_lock_slowpath() and
+ * native_queued_spin_unlock().
+ */
+
+#define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)
+
+enum vcpu_state {
+	vcpu_running = 0,
+	vcpu_halted,
+};
+
+struct pv_node {
+	struct mcs_spinlock	mcs;
+	struct mcs_spinlock	__res[3];
+
+	int			cpu;
+	u8			state;
+};
+
+/*
+ * Lock and MCS node addresses hash table for fast lookup
+ *
+ * Hashing is done on a per-cacheline basis to minimize the need to access
+ * more than one cacheline.
+ *
+ * Dynamically allocate a hash table big enough to hold at least 4X the
+ * number of possible cpus in the system. Allocation is done on page
+ * granularity. So the minimum number of hash buckets should be at least
+ * 256 (64-bit) or 512 (32-bit) to fully utilize a 4k page.
+ *
+ * Since we should not be holding locks from NMI context (very rare indeed) the
+ * max load factor is 0.75, which is around the point where open addressing
+ * breaks down.
+ *
+ */
+struct pv_hash_entry {
+	struct qspinlock *lock;
+	struct pv_node   *node;
+};
+
+#define PV_HE_PER_LINE	(SMP_CACHE_BYTES / sizeof(struct pv_hash_entry))
+#define PV_HE_MIN	(PAGE_SIZE / sizeof(struct pv_hash_entry))
+
+static struct pv_hash_entry *pv_lock_hash;
+static unsigned int pv_lock_hash_bits __read_mostly;
+
+/*
+ * Allocate memory for the PV qspinlock hash buckets
+ *
+ * This function should be called from the paravirt spinlock initialization
+ * routine.
+ */
+void __init __pv_init_lock_hash(void)
+{
+	int pv_hash_size = ALIGN(4 * num_possible_cpus(), PV_HE_PER_LINE);
+
+	if (pv_hash_size < PV_HE_MIN)
+		pv_hash_size = PV_HE_MIN;
+
+	/*
+	 * Allocate space from bootmem which should be page-size aligned
+	 * and hence cacheline aligned.
+	 */
+	pv_lock_hash = alloc_large_system_hash("PV qspinlock",
+					       sizeof(struct pv_hash_entry),
+					       pv_hash_size, 0, HASH_EARLY,
+					       &pv_lock_hash_bits, NULL,
+					       pv_hash_size, pv_hash_size);
+}
+
+#define for_each_hash_entry(he, offset, hash)						\
+	for (hash &= ~(PV_HE_PER_LINE - 1), he = &pv_lock_hash[hash], offset = 0;	\
+	     offset < (1 << pv_lock_hash_bits);						\
+	     offset++, he = &pv_lock_hash[(hash + offset) & ((1 << pv_lock_hash_bits) - 1)])
+
+static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
+{
+	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+	struct pv_hash_entry *he;
+
+	for_each_hash_entry(he, offset, hash) {
+		if (!cmpxchg(&he->lock, NULL, lock)) {
+			WRITE_ONCE(he->node, node);
+			return &he->lock;
+		}
+	}
+	/*
+	 * Hard assume there is a free entry for us.
+	 *
+	 * This is guaranteed by ensuring every blocked lock only ever consumes
+	 * a single entry, and since we only have 4 nesting levels per CPU
+	 * and allocated 4*nr_possible_cpus(), this must be so.
+	 *
+	 * The single entry is guaranteed by having the lock owner unhash
+	 * before it releases.
+	 */
+	BUG();
+}
+
+static struct pv_node *pv_unhash(struct qspinlock *lock)
+{
+	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
+	struct pv_hash_entry *he;
+	struct pv_node *node;
+
+	for_each_hash_entry(he, offset, hash) {
+		if (READ_ONCE(he->lock) == lock) {
+			node = READ_ONCE(he->node);
+			WRITE_ONCE(he->lock, NULL);
+			return node;
+		}
+	}
+	/*
+	 * Hard assume we'll find an entry.
+	 *
+	 * This guarantees a limited lookup time and is itself guaranteed by
+	 * having the lock owner do the unhash -- IFF the unlock sees the
+	 * SLOW flag, there MUST be a hash entry.
+	 */
+	BUG();
+}
+
+/*
+ * Initialize the PV part of the mcs_spinlock node.
+ */
+static void pv_init_node(struct mcs_spinlock *node)
+{
+	struct pv_node *pn = (struct pv_node *)node;
+
+	BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock));
+
+	pn->cpu = smp_processor_id();
+	pn->state = vcpu_running;
+}
+
+/*
+ * Wait for node->locked to become true, halt the vcpu after a short spin.
+ * pv_kick_node() is used to wake the vcpu again.
+ */
+static void pv_wait_node(struct mcs_spinlock *node)
+{
+	struct pv_node *pn = (struct pv_node *)node;
+	int loop;
+
+	for (;;) {
+		for (loop = SPIN_THRESHOLD; loop; loop--) {
+			if (READ_ONCE(node->locked))
+				return;
+			cpu_relax();
+		}
+
+		/*
+		 * Order pn->state vs pn->locked thusly:
+		 *
+		 * [S] pn->state = vcpu_halted	  [S] next->locked = 1
+		 *     MB			      MB
+		 * [L] pn->locked		[RmW] pn->state = vcpu_running
+		 *
+		 * Matches the xchg() from pv_kick_node().
+		 */
+		smp_store_mb(pn->state, vcpu_halted);
+
+		if (!READ_ONCE(node->locked))
+			pv_wait(&pn->state, vcpu_halted);
+
+		/*
+		 * Reset the vCPU state to avoid unncessary CPU kicking
+		 */
+		WRITE_ONCE(pn->state, vcpu_running);
+
+		/*
+		 * If the locked flag is still not set after wakeup, it is a
+		 * spurious wakeup and the vCPU should wait again. However,
+		 * there is a pretty high overhead for CPU halting and kicking.
+		 * So it is better to spin for a while in the hope that the
+		 * MCS lock will be released soon.
+		 */
+	}
+	/*
+	 * By now our node->locked should be 1 and our caller will not actually
+	 * spin-wait for it. We do however rely on our caller to do a
+	 * load-acquire for us.
+	 */
+}
+
+/*
+ * Called after setting next->locked = 1, used to wake those stuck in
+ * pv_wait_node().
+ */
+static void pv_kick_node(struct mcs_spinlock *node)
+{
+	struct pv_node *pn = (struct pv_node *)node;
+
+	/*
+	 * Note that because node->locked is already set, this actual
+	 * mcs_spinlock entry could be re-used already.
+	 *
+	 * This should be fine however, kicking people for no reason is
+	 * harmless.
+	 *
+	 * See the comment in pv_wait_node().
+	 */
+	if (xchg(&pn->state, vcpu_running) == vcpu_halted)
+		pv_kick(pn->cpu);
+}
+
+/*
+ * Wait for l->locked to become clear; halt the vcpu after a short spin.
+ * __pv_queued_spin_unlock() will wake us.
+ */
+static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
+{
+	struct pv_node *pn = (struct pv_node *)node;
+	struct __qspinlock *l = (void *)lock;
+	struct qspinlock **lp = NULL;
+	int loop;
+
+	for (;;) {
+		for (loop = SPIN_THRESHOLD; loop; loop--) {
+			if (!READ_ONCE(l->locked))
+				return;
+			cpu_relax();
+		}
+
+		WRITE_ONCE(pn->state, vcpu_halted);
+		if (!lp) { /* ONCE */
+			lp = pv_hash(lock, pn);
+			/*
+			 * lp must be set before setting _Q_SLOW_VAL
+			 *
+			 * [S] lp = lock                [RmW] l = l->locked = 0
+			 *     MB                             MB
+			 * [S] l->locked = _Q_SLOW_VAL  [L]   lp
+			 *
+			 * Matches the cmpxchg() in __pv_queued_spin_unlock().
+			 */
+			if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
+				/*
+				 * The lock is free and _Q_SLOW_VAL has never
+				 * been set. Therefore we need to unhash before
+				 * getting the lock.
+				 */
+				WRITE_ONCE(*lp, NULL);
+				return;
+			}
+		}
+		pv_wait(&l->locked, _Q_SLOW_VAL);
+
+		/*
+		 * The unlocker should have freed the lock before kicking the
+		 * CPU. So if the lock is still not free, it is a spurious
+		 * wakeup and so the vCPU should wait again after spinning for
+		 * a while.
+		 */
+	}
+
+	/*
+	 * Lock is unlocked now; the caller will acquire it without waiting.
+	 * As with pv_wait_node() we rely on the caller to do a load-acquire
+	 * for us.
+	 */
+}
+
+/*
+ * PV version of the unlock function to be used in stead of
+ * queued_spin_unlock().
+ */
+__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+	struct pv_node *node;
+
+	/*
+	 * We must not unlock if SLOW, because in that case we must first
+	 * unhash. Otherwise it would be possible to have multiple @lock
+	 * entries, which would be BAD.
+	 */
+	if (likely(cmpxchg(&l->locked, _Q_LOCKED_VAL, 0) == _Q_LOCKED_VAL))
+		return;
+
+	/*
+	 * Since the above failed to release, this must be the SLOW path.
+	 * Therefore start by looking up the blocked node and unhashing it.
+	 */
+	node = pv_unhash(lock);
+
+	/*
+	 * Now that we have a reference to the (likely) blocked pv_node,
+	 * release the lock.
+	 */
+	smp_store_release(&l->locked, 0);
+
+	/*
+	 * At this point the memory pointed at by lock can be freed/reused,
+	 * however we can still use the pv_node to kick the CPU.
+	 */
+	if (READ_ONCE(node->state) == vcpu_halted)
+		pv_kick(node->cpu);
+}
+/*
+ * Include the architecture specific callee-save thunk of the
+ * __pv_queued_spin_unlock(). This thunk is put together with
+ * __pv_queued_spin_unlock() near the top of the file to make sure
+ * that the callee-save thunk and the real unlock function are close
+ * to each other sharing consecutive instruction cachelines.
+ */
+#include <asm/qspinlock_paravirt.h>
+
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index b025295f496624..36573e96a47761 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -70,10 +70,10 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
 }
 
 /*
- * We can speed up the acquire/release, if the architecture
- * supports cmpxchg and if there's no debugging state to be set up
+ * We can speed up the acquire/release, if there's no debugging state to be
+ * set up.
  */
-#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
+#ifndef CONFIG_DEBUG_RT_MUTEXES
 # define rt_mutex_cmpxchg(l,c,n)	(cmpxchg(&l->owner, c, n) == c)
 static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
 {
@@ -1182,11 +1182,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 	set_current_state(state);
 
 	/* Setup the timer, when timeout != NULL */
-	if (unlikely(timeout)) {
+	if (unlikely(timeout))
 		hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
-		if (!hrtimer_active(&timeout->timer))
-			timeout->task = NULL;
-	}
 
 	ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
 
@@ -1443,10 +1440,17 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
  *
  * @lock:	the rt_mutex to be locked
  *
+ * This function can only be called in thread context. It's safe to
+ * call it from atomic regions, but not from hard interrupt or soft
+ * interrupt context.
+ *
  * Returns 1 on success and 0 on contention
  */
 int __sched rt_mutex_trylock(struct rt_mutex *lock)
 {
+	if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq()))
+		return 0;
+
 	return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
 }
 EXPORT_SYMBOL_GPL(rt_mutex_trylock);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 3417d0172a5d2e..0f189714e45701 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -409,11 +409,24 @@ done:
 	return taken;
 }
 
+/*
+ * Return true if the rwsem has active spinner
+ */
+static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
+{
+	return osq_is_locked(&sem->osq);
+}
+
 #else
 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
 {
 	return false;
 }
+
+static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
+{
+	return false;
+}
 #endif
 
 /*
@@ -496,7 +509,38 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
 {
 	unsigned long flags;
 
+	/*
+	 * If a spinner is present, it is not necessary to do the wakeup.
+	 * Try to do wakeup only if the trylock succeeds to minimize
+	 * spinlock contention which may introduce too much delay in the
+	 * unlock operation.
+	 *
+	 *    spinning writer		up_write/up_read caller
+	 *    ---------------		-----------------------
+	 * [S]   osq_unlock()		[L]   osq
+	 *	 MB			      RMB
+	 * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock)
+	 *
+	 * Here, it is important to make sure that there won't be a missed
+	 * wakeup while the rwsem is free and the only spinning writer goes
+	 * to sleep without taking the rwsem. Even when the spinning writer
+	 * is just going to break out of the waiting loop, it will still do
+	 * a trylock in rwsem_down_write_failed() before sleeping. IOW, if
+	 * rwsem_has_spinner() is true, it will guarantee at least one
+	 * trylock attempt on the rwsem later on.
+	 */
+	if (rwsem_has_spinner(sem)) {
+		/*
+		 * The smp_rmb() here is to make sure that the spinner
+		 * state is consulted before reading the wait_lock.
+		 */
+		smp_rmb();
+		if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags))
+			return sem;
+		goto locked;
+	}
 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
+locked:
 
 	/* do nothing if list empty */
 	if (!list_empty(&sem->wait_list))
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 8dbe27611ec399..59e32684c23b58 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -241,6 +241,7 @@ rcu_torture_free(struct rcu_torture *p)
 struct rcu_torture_ops {
 	int ttype;
 	void (*init)(void);
+	void (*cleanup)(void);
 	int (*readlock)(void);
 	void (*read_delay)(struct torture_random_state *rrsp);
 	void (*readunlock)(int idx);
@@ -477,10 +478,12 @@ static struct rcu_torture_ops rcu_busted_ops = {
  */
 
 DEFINE_STATIC_SRCU(srcu_ctl);
+static struct srcu_struct srcu_ctld;
+static struct srcu_struct *srcu_ctlp = &srcu_ctl;
 
-static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
+static int srcu_torture_read_lock(void) __acquires(srcu_ctlp)
 {
-	return srcu_read_lock(&srcu_ctl);
+	return srcu_read_lock(srcu_ctlp);
 }
 
 static void srcu_read_delay(struct torture_random_state *rrsp)
@@ -499,49 +502,49 @@ static void srcu_read_delay(struct torture_random_state *rrsp)
 		rcu_read_delay(rrsp);
 }
 
-static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
+static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp)
 {
-	srcu_read_unlock(&srcu_ctl, idx);
+	srcu_read_unlock(srcu_ctlp, idx);
 }
 
 static unsigned long srcu_torture_completed(void)
 {
-	return srcu_batches_completed(&srcu_ctl);
+	return srcu_batches_completed(srcu_ctlp);
 }
 
 static void srcu_torture_deferred_free(struct rcu_torture *rp)
 {
-	call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb);
+	call_srcu(srcu_ctlp, &rp->rtort_rcu, rcu_torture_cb);
 }
 
 static void srcu_torture_synchronize(void)
 {
-	synchronize_srcu(&srcu_ctl);
+	synchronize_srcu(srcu_ctlp);
 }
 
 static void srcu_torture_call(struct rcu_head *head,
 			      void (*func)(struct rcu_head *head))
 {
-	call_srcu(&srcu_ctl, head, func);
+	call_srcu(srcu_ctlp, head, func);
 }
 
 static void srcu_torture_barrier(void)
 {
-	srcu_barrier(&srcu_ctl);
+	srcu_barrier(srcu_ctlp);
 }
 
 static void srcu_torture_stats(void)
 {
 	int cpu;
-	int idx = srcu_ctl.completed & 0x1;
+	int idx = srcu_ctlp->completed & 0x1;
 
 	pr_alert("%s%s per-CPU(idx=%d):",
 		 torture_type, TORTURE_FLAG, idx);
 	for_each_possible_cpu(cpu) {
 		long c0, c1;
 
-		c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx];
-		c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx];
+		c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx];
+		c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx];
 		pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
 	}
 	pr_cont("\n");
@@ -549,7 +552,7 @@ static void srcu_torture_stats(void)
 
 static void srcu_torture_synchronize_expedited(void)
 {
-	synchronize_srcu_expedited(&srcu_ctl);
+	synchronize_srcu_expedited(srcu_ctlp);
 }
 
 static struct rcu_torture_ops srcu_ops = {
@@ -569,6 +572,38 @@ static struct rcu_torture_ops srcu_ops = {
 	.name		= "srcu"
 };
 
+static void srcu_torture_init(void)
+{
+	rcu_sync_torture_init();
+	WARN_ON(init_srcu_struct(&srcu_ctld));
+	srcu_ctlp = &srcu_ctld;
+}
+
+static void srcu_torture_cleanup(void)
+{
+	cleanup_srcu_struct(&srcu_ctld);
+	srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */
+}
+
+/* As above, but dynamically allocated. */
+static struct rcu_torture_ops srcud_ops = {
+	.ttype		= SRCU_FLAVOR,
+	.init		= srcu_torture_init,
+	.cleanup	= srcu_torture_cleanup,
+	.readlock	= srcu_torture_read_lock,
+	.read_delay	= srcu_read_delay,
+	.readunlock	= srcu_torture_read_unlock,
+	.started	= NULL,
+	.completed	= srcu_torture_completed,
+	.deferred_free	= srcu_torture_deferred_free,
+	.sync		= srcu_torture_synchronize,
+	.exp_sync	= srcu_torture_synchronize_expedited,
+	.call		= srcu_torture_call,
+	.cb_barrier	= srcu_torture_barrier,
+	.stats		= srcu_torture_stats,
+	.name		= "srcud"
+};
+
 /*
  * Definitions for sched torture testing.
  */
@@ -672,8 +707,8 @@ static void rcu_torture_boost_cb(struct rcu_head *head)
 	struct rcu_boost_inflight *rbip =
 		container_of(head, struct rcu_boost_inflight, rcu);
 
-	smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
-	rbip->inflight = 0;
+	/* Ensure RCU-core accesses precede clearing ->inflight */
+	smp_store_release(&rbip->inflight, 0);
 }
 
 static int rcu_torture_boost(void *arg)
@@ -710,9 +745,9 @@ static int rcu_torture_boost(void *arg)
 		call_rcu_time = jiffies;
 		while (ULONG_CMP_LT(jiffies, endtime)) {
 			/* If we don't have a callback in flight, post one. */
-			if (!rbi.inflight) {
-				smp_mb(); /* RCU core before ->inflight = 1. */
-				rbi.inflight = 1;
+			if (!smp_load_acquire(&rbi.inflight)) {
+				/* RCU core before ->inflight = 1. */
+				smp_store_release(&rbi.inflight, 1);
 				call_rcu(&rbi.rcu, rcu_torture_boost_cb);
 				if (jiffies - call_rcu_time >
 					 test_boost_duration * HZ - HZ / 2) {
@@ -751,11 +786,10 @@ checkwait:	stutter_wait("rcu_torture_boost");
 	} while (!torture_must_stop());
 
 	/* Clean up and exit. */
-	while (!kthread_should_stop() || rbi.inflight) {
+	while (!kthread_should_stop() || smp_load_acquire(&rbi.inflight)) {
 		torture_shutdown_absorb("rcu_torture_boost");
 		schedule_timeout_uninterruptible(1);
 	}
-	smp_mb(); /* order accesses to ->inflight before stack-frame death. */
 	destroy_rcu_head_on_stack(&rbi.rcu);
 	torture_kthread_stopping("rcu_torture_boost");
 	return 0;
@@ -1054,7 +1088,7 @@ static void rcu_torture_timer(unsigned long unused)
 	p = rcu_dereference_check(rcu_torture_current,
 				  rcu_read_lock_bh_held() ||
 				  rcu_read_lock_sched_held() ||
-				  srcu_read_lock_held(&srcu_ctl));
+				  srcu_read_lock_held(srcu_ctlp));
 	if (p == NULL) {
 		/* Leave because rcu_torture_writer is not yet underway */
 		cur_ops->readunlock(idx);
@@ -1128,7 +1162,7 @@ rcu_torture_reader(void *arg)
 		p = rcu_dereference_check(rcu_torture_current,
 					  rcu_read_lock_bh_held() ||
 					  rcu_read_lock_sched_held() ||
-					  srcu_read_lock_held(&srcu_ctl));
+					  srcu_read_lock_held(srcu_ctlp));
 		if (p == NULL) {
 			/* Wait for rcu_torture_writer to get underway */
 			cur_ops->readunlock(idx);
@@ -1413,12 +1447,15 @@ static int rcu_torture_barrier_cbs(void *arg)
 	do {
 		wait_event(barrier_cbs_wq[myid],
 			   (newphase =
-			    ACCESS_ONCE(barrier_phase)) != lastphase ||
+			    smp_load_acquire(&barrier_phase)) != lastphase ||
 			   torture_must_stop());
 		lastphase = newphase;
-		smp_mb(); /* ensure barrier_phase load before ->call(). */
 		if (torture_must_stop())
 			break;
+		/*
+		 * The above smp_load_acquire() ensures barrier_phase load
+		 * is ordered before the folloiwng ->call().
+		 */
 		cur_ops->call(&rcu, rcu_torture_barrier_cbf);
 		if (atomic_dec_and_test(&barrier_cbs_count))
 			wake_up(&barrier_wq);
@@ -1439,8 +1476,8 @@ static int rcu_torture_barrier(void *arg)
 	do {
 		atomic_set(&barrier_cbs_invoked, 0);
 		atomic_set(&barrier_cbs_count, n_barrier_cbs);
-		smp_mb(); /* Ensure barrier_phase after prior assignments. */
-		barrier_phase = !barrier_phase;
+		/* Ensure barrier_phase ordered after prior assignments. */
+		smp_store_release(&barrier_phase, !barrier_phase);
 		for (i = 0; i < n_barrier_cbs; i++)
 			wake_up(&barrier_cbs_wq[i]);
 		wait_event(barrier_wq,
@@ -1588,10 +1625,14 @@ rcu_torture_cleanup(void)
 			rcutorture_booster_cleanup(i);
 	}
 
-	/* Wait for all RCU callbacks to fire.  */
-
+	/*
+	 * Wait for all RCU callbacks to fire, then do flavor-specific
+	 * cleanup operations.
+	 */
 	if (cur_ops->cb_barrier != NULL)
 		cur_ops->cb_barrier();
+	if (cur_ops->cleanup != NULL)
+		cur_ops->cleanup();
 
 	rcu_torture_stats_print();  /* -After- the stats thread is stopped! */
 
@@ -1668,8 +1709,8 @@ rcu_torture_init(void)
 	int cpu;
 	int firsterr = 0;
 	static struct rcu_torture_ops *torture_ops[] = {
-		&rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
-		RCUTORTURE_TASKS_OPS
+		&rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops,
+		&sched_ops, RCUTORTURE_TASKS_OPS
 	};
 
 	if (!torture_init_begin(torture_type, verbose, &torture_runnable))
@@ -1701,7 +1742,7 @@ rcu_torture_init(void)
 	if (nreaders >= 0) {
 		nrealreaders = nreaders;
 	} else {
-		nrealreaders = num_online_cpus() - 1;
+		nrealreaders = num_online_cpus() - 2 - nreaders;
 		if (nrealreaders <= 0)
 			nrealreaders = 1;
 	}
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index cad76e76b4e7de..fb33d35ee0b7c0 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -151,7 +151,7 @@ static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
 	unsigned long t;
 
 	for_each_possible_cpu(cpu) {
-		t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
+		t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
 		sum += t;
 	}
 	return sum;
@@ -168,7 +168,7 @@ static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
 	unsigned long t;
 
 	for_each_possible_cpu(cpu) {
-		t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
+		t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
 		sum += t;
 	}
 	return sum;
@@ -265,8 +265,8 @@ static int srcu_readers_active(struct srcu_struct *sp)
 	unsigned long sum = 0;
 
 	for_each_possible_cpu(cpu) {
-		sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
-		sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
+		sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
+		sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
 	}
 	return sum;
 }
@@ -296,7 +296,7 @@ int __srcu_read_lock(struct srcu_struct *sp)
 {
 	int idx;
 
-	idx = ACCESS_ONCE(sp->completed) & 0x1;
+	idx = READ_ONCE(sp->completed) & 0x1;
 	preempt_disable();
 	__this_cpu_inc(sp->per_cpu_ref->c[idx]);
 	smp_mb(); /* B */  /* Avoid leaking the critical section. */
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 069742d61c6887..591af0cb7b9f4e 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -49,39 +49,6 @@ static void __call_rcu(struct rcu_head *head,
 
 #include "tiny_plugin.h"
 
-/*
- * Enter idle, which is an extended quiescent state if we have fully
- * entered that mode.
- */
-void rcu_idle_enter(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_idle_enter);
-
-/*
- * Exit an interrupt handler towards idle.
- */
-void rcu_irq_exit(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_irq_exit);
-
-/*
- * Exit idle, so that we are no longer in an extended quiescent state.
- */
-void rcu_idle_exit(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_idle_exit);
-
-/*
- * Enter an interrupt handler, moving away from idle.
- */
-void rcu_irq_enter(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_irq_enter);
-
 #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
 
 /*
@@ -170,6 +137,11 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 
 	/* Move the ready-to-invoke callbacks to a local list. */
 	local_irq_save(flags);
+	if (rcp->donetail == &rcp->rcucblist) {
+		/* No callbacks ready, so just leave. */
+		local_irq_restore(flags);
+		return;
+	}
 	RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));
 	list = rcp->rcucblist;
 	rcp->rcucblist = *rcp->donetail;
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index f94e209a10d615..e492a5253e0f10 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -144,16 +144,17 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
 		return;
 	rcp->ticks_this_gp++;
 	j = jiffies;
-	js = ACCESS_ONCE(rcp->jiffies_stall);
+	js = READ_ONCE(rcp->jiffies_stall);
 	if (rcp->rcucblist && ULONG_CMP_GE(j, js)) {
 		pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
 		       rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE,
 		       jiffies - rcp->gp_start, rcp->qlen);
 		dump_stack();
-		ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
-			3 * rcu_jiffies_till_stall_check() + 3;
+		WRITE_ONCE(rcp->jiffies_stall,
+			   jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
 	} else if (ULONG_CMP_GE(j, js)) {
-		ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
+		WRITE_ONCE(rcp->jiffies_stall,
+			   jiffies + rcu_jiffies_till_stall_check());
 	}
 }
 
@@ -161,7 +162,8 @@ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
 {
 	rcp->ticks_this_gp = 0;
 	rcp->gp_start = jiffies;
-	ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
+	WRITE_ONCE(rcp->jiffies_stall,
+		   jiffies + rcu_jiffies_till_stall_check());
 }
 
 static void check_cpu_stalls(void)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8cf7304b2867f5..add042926a6608 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -91,7 +91,7 @@ static const char *tp_##sname##_varname __used __tracepoint_string = sname##_var
 
 #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
 DEFINE_RCU_TPS(sname) \
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \
 struct rcu_state sname##_state = { \
 	.level = { &sname##_state.node[0] }, \
 	.rda = &sname##_data, \
@@ -110,11 +110,18 @@ struct rcu_state sname##_state = { \
 RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
 RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
 
-static struct rcu_state *rcu_state_p;
+static struct rcu_state *const rcu_state_p;
+static struct rcu_data __percpu *const rcu_data_p;
 LIST_HEAD(rcu_struct_flavors);
 
-/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
-static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
+/* Dump rcu_node combining tree at boot to verify correct setup. */
+static bool dump_tree;
+module_param(dump_tree, bool, 0444);
+/* Control rcu_node-tree auto-balancing at boot time. */
+static bool rcu_fanout_exact;
+module_param(rcu_fanout_exact, bool, 0444);
+/* Increase (but not decrease) the RCU_FANOUT_LEAF at boot time. */
+static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
 module_param(rcu_fanout_leaf, int, 0444);
 int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
 static int num_rcu_lvl[] = {  /* Number of rcu_nodes at specified level. */
@@ -159,17 +166,46 @@ static void invoke_rcu_core(void);
 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
 
 /* rcuc/rcub kthread realtime priority */
+#ifdef CONFIG_RCU_KTHREAD_PRIO
 static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
+#else /* #ifdef CONFIG_RCU_KTHREAD_PRIO */
+static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
+#endif /* #else #ifdef CONFIG_RCU_KTHREAD_PRIO */
 module_param(kthread_prio, int, 0644);
 
 /* Delay in jiffies for grace-period initialization delays, debug only. */
+
+#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT
+static int gp_preinit_delay = CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY;
+module_param(gp_preinit_delay, int, 0644);
+#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */
+static const int gp_preinit_delay;
+#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */
+
 #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT
 static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY;
 module_param(gp_init_delay, int, 0644);
 #else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
 static const int gp_init_delay;
 #endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
-#define PER_RCU_NODE_PERIOD 10	/* Number of grace periods between delays. */
+
+#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP
+static int gp_cleanup_delay = CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY;
+module_param(gp_cleanup_delay, int, 0644);
+#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */
+static const int gp_cleanup_delay;
+#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */
+
+/*
+ * Number of grace periods between delays, normalized by the duration of
+ * the delay.  The longer the the delay, the more the grace periods between
+ * each delay.  The reason for this normalization is that it means that,
+ * for non-zero delays, the overall slowdown of grace periods is constant
+ * regardless of the duration of the delay.  This arrangement balances
+ * the need for long delays to increase some race probabilities with the
+ * need for fast grace periods to increase other race probabilities.
+ */
+#define PER_RCU_NODE_PERIOD 3	/* Number of grace periods between delays. */
 
 /*
  * Track the rcutorture test sequence number and the update version
@@ -191,17 +227,17 @@ unsigned long rcutorture_vernum;
  */
 unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
 {
-	return ACCESS_ONCE(rnp->qsmaskinitnext);
+	return READ_ONCE(rnp->qsmaskinitnext);
 }
 
 /*
- * Return true if an RCU grace period is in progress.  The ACCESS_ONCE()s
+ * Return true if an RCU grace period is in progress.  The READ_ONCE()s
  * permit this function to be invoked without holding the root rcu_node
  * structure's ->lock, but of course results can be subject to change.
  */
 static int rcu_gp_in_progress(struct rcu_state *rsp)
 {
-	return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum);
+	return READ_ONCE(rsp->completed) != READ_ONCE(rsp->gpnum);
 }
 
 /*
@@ -278,8 +314,8 @@ static void rcu_momentary_dyntick_idle(void)
 		if (!(resched_mask & rsp->flavor_mask))
 			continue;
 		smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
-		if (ACCESS_ONCE(rdp->mynode->completed) !=
-		    ACCESS_ONCE(rdp->cond_resched_completed))
+		if (READ_ONCE(rdp->mynode->completed) !=
+		    READ_ONCE(rdp->cond_resched_completed))
 			continue;
 
 		/*
@@ -491,9 +527,9 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
 		break;
 	}
 	if (rsp != NULL) {
-		*flags = ACCESS_ONCE(rsp->gp_flags);
-		*gpnum = ACCESS_ONCE(rsp->gpnum);
-		*completed = ACCESS_ONCE(rsp->completed);
+		*flags = READ_ONCE(rsp->gp_flags);
+		*gpnum = READ_ONCE(rsp->gpnum);
+		*completed = READ_ONCE(rsp->completed);
 		return;
 	}
 	*flags = 0;
@@ -539,10 +575,10 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
 static int rcu_future_needs_gp(struct rcu_state *rsp)
 {
 	struct rcu_node *rnp = rcu_get_root(rsp);
-	int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1;
+	int idx = (READ_ONCE(rnp->completed) + 1) & 0x1;
 	int *fp = &rnp->need_future_gp[idx];
 
-	return ACCESS_ONCE(*fp);
+	return READ_ONCE(*fp);
 }
 
 /*
@@ -565,7 +601,7 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
 		return 1;  /* Yes, this CPU has newly registered callbacks. */
 	for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
 		if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
-		    ULONG_CMP_LT(ACCESS_ONCE(rsp->completed),
+		    ULONG_CMP_LT(READ_ONCE(rsp->completed),
 				 rdp->nxtcompleted[i]))
 			return 1;  /* Yes, CBs for future grace period. */
 	return 0; /* No grace period needed. */
@@ -585,7 +621,8 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
 	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
 
 	trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
-	if (!user && !is_idle_task(current)) {
+	if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+	    !user && !is_idle_task(current)) {
 		struct task_struct *idle __maybe_unused =
 			idle_task(smp_processor_id());
 
@@ -604,7 +641,8 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
 	smp_mb__before_atomic();  /* See above. */
 	atomic_inc(&rdtp->dynticks);
 	smp_mb__after_atomic();  /* Force ordering with next sojourn. */
-	WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+		     atomic_read(&rdtp->dynticks) & 0x1);
 	rcu_dynticks_task_enter();
 
 	/*
@@ -630,7 +668,8 @@ static void rcu_eqs_enter(bool user)
 
 	rdtp = this_cpu_ptr(&rcu_dynticks);
 	oldval = rdtp->dynticks_nesting;
-	WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+		     (oldval & DYNTICK_TASK_NEST_MASK) == 0);
 	if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
 		rdtp->dynticks_nesting = 0;
 		rcu_eqs_enter_common(oldval, user);
@@ -703,7 +742,8 @@ void rcu_irq_exit(void)
 	rdtp = this_cpu_ptr(&rcu_dynticks);
 	oldval = rdtp->dynticks_nesting;
 	rdtp->dynticks_nesting--;
-	WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+		     rdtp->dynticks_nesting < 0);
 	if (rdtp->dynticks_nesting)
 		trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
 	else
@@ -728,10 +768,12 @@ static void rcu_eqs_exit_common(long long oldval, int user)
 	atomic_inc(&rdtp->dynticks);
 	/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
 	smp_mb__after_atomic();  /* See above. */
-	WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+		     !(atomic_read(&rdtp->dynticks) & 0x1));
 	rcu_cleanup_after_idle();
 	trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
-	if (!user && !is_idle_task(current)) {
+	if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+	    !user && !is_idle_task(current)) {
 		struct task_struct *idle __maybe_unused =
 			idle_task(smp_processor_id());
 
@@ -755,7 +797,7 @@ static void rcu_eqs_exit(bool user)
 
 	rdtp = this_cpu_ptr(&rcu_dynticks);
 	oldval = rdtp->dynticks_nesting;
-	WARN_ON_ONCE(oldval < 0);
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
 	if (oldval & DYNTICK_TASK_NEST_MASK) {
 		rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
 	} else {
@@ -828,7 +870,8 @@ void rcu_irq_enter(void)
 	rdtp = this_cpu_ptr(&rcu_dynticks);
 	oldval = rdtp->dynticks_nesting;
 	rdtp->dynticks_nesting++;
-	WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
+		     rdtp->dynticks_nesting == 0);
 	if (oldval)
 		trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
 	else
@@ -1011,9 +1054,9 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
 		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
 		return 1;
 	} else {
-		if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4,
+		if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
 				 rdp->mynode->gpnum))
-			ACCESS_ONCE(rdp->gpwrap) = true;
+			WRITE_ONCE(rdp->gpwrap, true);
 		return 0;
 	}
 }
@@ -1093,12 +1136,12 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
 	if (ULONG_CMP_GE(jiffies,
 			 rdp->rsp->gp_start + jiffies_till_sched_qs) ||
 	    ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
-		if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
-			ACCESS_ONCE(rdp->cond_resched_completed) =
-				ACCESS_ONCE(rdp->mynode->completed);
+		if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
+			WRITE_ONCE(rdp->cond_resched_completed,
+				   READ_ONCE(rdp->mynode->completed));
 			smp_mb(); /* ->cond_resched_completed before *rcrmp. */
-			ACCESS_ONCE(*rcrmp) =
-				ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
+			WRITE_ONCE(*rcrmp,
+				   READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask);
 			resched_cpu(rdp->cpu);  /* Force CPU into scheduler. */
 			rdp->rsp->jiffies_resched += 5; /* Enable beating. */
 		} else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
@@ -1119,9 +1162,9 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
 	rsp->gp_start = j;
 	smp_wmb(); /* Record start time before stall time. */
 	j1 = rcu_jiffies_till_stall_check();
-	ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
+	WRITE_ONCE(rsp->jiffies_stall, j + j1);
 	rsp->jiffies_resched = j + j1 / 2;
-	rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs);
+	rsp->n_force_qs_gpstart = READ_ONCE(rsp->n_force_qs);
 }
 
 /*
@@ -1133,10 +1176,11 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
 	unsigned long j;
 
 	j = jiffies;
-	gpa = ACCESS_ONCE(rsp->gp_activity);
+	gpa = READ_ONCE(rsp->gp_activity);
 	if (j - gpa > 2 * HZ)
-		pr_err("%s kthread starved for %ld jiffies!\n",
-		       rsp->name, j - gpa);
+		pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x\n",
+		       rsp->name, j - gpa,
+		       rsp->gpnum, rsp->completed, rsp->gp_flags);
 }
 
 /*
@@ -1173,12 +1217,13 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
 	/* Only let one CPU complain about others per time interval. */
 
 	raw_spin_lock_irqsave(&rnp->lock, flags);
-	delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
+	delta = jiffies - READ_ONCE(rsp->jiffies_stall);
 	if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		return;
 	}
-	ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+	WRITE_ONCE(rsp->jiffies_stall,
+		   jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 
 	/*
@@ -1212,12 +1257,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
 	if (ndetected) {
 		rcu_dump_cpu_stacks(rsp);
 	} else {
-		if (ACCESS_ONCE(rsp->gpnum) != gpnum ||
-		    ACCESS_ONCE(rsp->completed) == gpnum) {
+		if (READ_ONCE(rsp->gpnum) != gpnum ||
+		    READ_ONCE(rsp->completed) == gpnum) {
 			pr_err("INFO: Stall ended before state dump start\n");
 		} else {
 			j = jiffies;
-			gpa = ACCESS_ONCE(rsp->gp_activity);
+			gpa = READ_ONCE(rsp->gp_activity);
 			pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
 			       rsp->name, j - gpa, j, gpa,
 			       jiffies_till_next_fqs,
@@ -1262,9 +1307,9 @@ static void print_cpu_stall(struct rcu_state *rsp)
 	rcu_dump_cpu_stacks(rsp);
 
 	raw_spin_lock_irqsave(&rnp->lock, flags);
-	if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall)))
-		ACCESS_ONCE(rsp->jiffies_stall) = jiffies +
-				     3 * rcu_jiffies_till_stall_check() + 3;
+	if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall)))
+		WRITE_ONCE(rsp->jiffies_stall,
+			   jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 
 	/*
@@ -1307,20 +1352,20 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
 	 * Given this check, comparisons of jiffies, rsp->jiffies_stall,
 	 * and rsp->gp_start suffice to forestall false positives.
 	 */
-	gpnum = ACCESS_ONCE(rsp->gpnum);
+	gpnum = READ_ONCE(rsp->gpnum);
 	smp_rmb(); /* Pick up ->gpnum first... */
-	js = ACCESS_ONCE(rsp->jiffies_stall);
+	js = READ_ONCE(rsp->jiffies_stall);
 	smp_rmb(); /* ...then ->jiffies_stall before the rest... */
-	gps = ACCESS_ONCE(rsp->gp_start);
+	gps = READ_ONCE(rsp->gp_start);
 	smp_rmb(); /* ...and finally ->gp_start before ->completed. */
-	completed = ACCESS_ONCE(rsp->completed);
+	completed = READ_ONCE(rsp->completed);
 	if (ULONG_CMP_GE(completed, gpnum) ||
 	    ULONG_CMP_LT(j, js) ||
 	    ULONG_CMP_GE(gps, js))
 		return; /* No stall or GP completed since entering function. */
 	rnp = rdp->mynode;
 	if (rcu_gp_in_progress(rsp) &&
-	    (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) {
+	    (READ_ONCE(rnp->qsmask) & rdp->grpmask)) {
 
 		/* We haven't checked in, so go dump stack. */
 		print_cpu_stall(rsp);
@@ -1347,7 +1392,7 @@ void rcu_cpu_stall_reset(void)
 	struct rcu_state *rsp;
 
 	for_each_rcu_flavor(rsp)
-		ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2;
+		WRITE_ONCE(rsp->jiffies_stall, jiffies + ULONG_MAX / 2);
 }
 
 /*
@@ -1457,7 +1502,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
 	 * doing some extra useless work.
 	 */
 	if (rnp->gpnum != rnp->completed ||
-	    ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) {
+	    READ_ONCE(rnp_root->gpnum) != READ_ONCE(rnp_root->completed)) {
 		rnp->need_future_gp[c & 0x1]++;
 		trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
 		goto out;
@@ -1542,7 +1587,7 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
 static void rcu_gp_kthread_wake(struct rcu_state *rsp)
 {
 	if (current == rsp->gp_kthread ||
-	    !ACCESS_ONCE(rsp->gp_flags) ||
+	    !READ_ONCE(rsp->gp_flags) ||
 	    !rsp->gp_kthread)
 		return;
 	wake_up(&rsp->gp_wq);
@@ -1677,7 +1722,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
 
 	/* Handle the ends of any preceding grace periods first. */
 	if (rdp->completed == rnp->completed &&
-	    !unlikely(ACCESS_ONCE(rdp->gpwrap))) {
+	    !unlikely(READ_ONCE(rdp->gpwrap))) {
 
 		/* No grace period end, so just accelerate recent callbacks. */
 		ret = rcu_accelerate_cbs(rsp, rnp, rdp);
@@ -1692,7 +1737,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
 		trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
 	}
 
-	if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) {
+	if (rdp->gpnum != rnp->gpnum || unlikely(READ_ONCE(rdp->gpwrap))) {
 		/*
 		 * If the current grace period is waiting for this CPU,
 		 * set up to detect a quiescent state, otherwise don't
@@ -1704,7 +1749,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
 		rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
 		rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
 		zero_cpu_stall_ticks(rdp);
-		ACCESS_ONCE(rdp->gpwrap) = false;
+		WRITE_ONCE(rdp->gpwrap, false);
 	}
 	return ret;
 }
@@ -1717,9 +1762,9 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
 
 	local_irq_save(flags);
 	rnp = rdp->mynode;
-	if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
-	     rdp->completed == ACCESS_ONCE(rnp->completed) &&
-	     !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */
+	if ((rdp->gpnum == READ_ONCE(rnp->gpnum) &&
+	     rdp->completed == READ_ONCE(rnp->completed) &&
+	     !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */
 	    !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
 		local_irq_restore(flags);
 		return;
@@ -1731,6 +1776,13 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
 		rcu_gp_kthread_wake(rsp);
 }
 
+static void rcu_gp_slow(struct rcu_state *rsp, int delay)
+{
+	if (delay > 0 &&
+	    !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
+		schedule_timeout_uninterruptible(delay);
+}
+
 /*
  * Initialize a new grace period.  Return 0 if no grace period required.
  */
@@ -1740,15 +1792,15 @@ static int rcu_gp_init(struct rcu_state *rsp)
 	struct rcu_data *rdp;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
-	ACCESS_ONCE(rsp->gp_activity) = jiffies;
+	WRITE_ONCE(rsp->gp_activity, jiffies);
 	raw_spin_lock_irq(&rnp->lock);
 	smp_mb__after_unlock_lock();
-	if (!ACCESS_ONCE(rsp->gp_flags)) {
+	if (!READ_ONCE(rsp->gp_flags)) {
 		/* Spurious wakeup, tell caller to go back to sleep.  */
 		raw_spin_unlock_irq(&rnp->lock);
 		return 0;
 	}
-	ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */
+	WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */
 
 	if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
 		/*
@@ -1773,6 +1825,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
 	 * will handle subsequent offline CPUs.
 	 */
 	rcu_for_each_leaf_node(rsp, rnp) {
+		rcu_gp_slow(rsp, gp_preinit_delay);
 		raw_spin_lock_irq(&rnp->lock);
 		smp_mb__after_unlock_lock();
 		if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
@@ -1829,14 +1882,15 @@ static int rcu_gp_init(struct rcu_state *rsp)
 	 * process finishes, because this kthread handles both.
 	 */
 	rcu_for_each_node_breadth_first(rsp, rnp) {
+		rcu_gp_slow(rsp, gp_init_delay);
 		raw_spin_lock_irq(&rnp->lock);
 		smp_mb__after_unlock_lock();
 		rdp = this_cpu_ptr(rsp->rda);
 		rcu_preempt_check_blocked_tasks(rnp);
 		rnp->qsmask = rnp->qsmaskinit;
-		ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
+		WRITE_ONCE(rnp->gpnum, rsp->gpnum);
 		if (WARN_ON_ONCE(rnp->completed != rsp->completed))
-			ACCESS_ONCE(rnp->completed) = rsp->completed;
+			WRITE_ONCE(rnp->completed, rsp->completed);
 		if (rnp == rdp->mynode)
 			(void)__note_gp_changes(rsp, rnp, rdp);
 		rcu_preempt_boost_start_gp(rnp);
@@ -1845,10 +1899,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
 					    rnp->grphi, rnp->qsmask);
 		raw_spin_unlock_irq(&rnp->lock);
 		cond_resched_rcu_qs();
-		ACCESS_ONCE(rsp->gp_activity) = jiffies;
-		if (gp_init_delay > 0 &&
-		    !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD)))
-			schedule_timeout_uninterruptible(gp_init_delay);
+		WRITE_ONCE(rsp->gp_activity, jiffies);
 	}
 
 	return 1;
@@ -1864,7 +1915,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
 	unsigned long maxj;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
-	ACCESS_ONCE(rsp->gp_activity) = jiffies;
+	WRITE_ONCE(rsp->gp_activity, jiffies);
 	rsp->n_force_qs++;
 	if (fqs_state == RCU_SAVE_DYNTICK) {
 		/* Collect dyntick-idle snapshots. */
@@ -1882,11 +1933,11 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
 		force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
 	}
 	/* Clear flag to prevent immediate re-entry. */
-	if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
+	if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
 		raw_spin_lock_irq(&rnp->lock);
 		smp_mb__after_unlock_lock();
-		ACCESS_ONCE(rsp->gp_flags) =
-			ACCESS_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS;
+		WRITE_ONCE(rsp->gp_flags,
+			   READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS);
 		raw_spin_unlock_irq(&rnp->lock);
 	}
 	return fqs_state;
@@ -1903,7 +1954,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 	struct rcu_data *rdp;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
-	ACCESS_ONCE(rsp->gp_activity) = jiffies;
+	WRITE_ONCE(rsp->gp_activity, jiffies);
 	raw_spin_lock_irq(&rnp->lock);
 	smp_mb__after_unlock_lock();
 	gp_duration = jiffies - rsp->gp_start;
@@ -1934,7 +1985,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 		smp_mb__after_unlock_lock();
 		WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
 		WARN_ON_ONCE(rnp->qsmask);
-		ACCESS_ONCE(rnp->completed) = rsp->gpnum;
+		WRITE_ONCE(rnp->completed, rsp->gpnum);
 		rdp = this_cpu_ptr(rsp->rda);
 		if (rnp == rdp->mynode)
 			needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
@@ -1942,7 +1993,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 		nocb += rcu_future_gp_cleanup(rsp, rnp);
 		raw_spin_unlock_irq(&rnp->lock);
 		cond_resched_rcu_qs();
-		ACCESS_ONCE(rsp->gp_activity) = jiffies;
+		WRITE_ONCE(rsp->gp_activity, jiffies);
+		rcu_gp_slow(rsp, gp_cleanup_delay);
 	}
 	rnp = rcu_get_root(rsp);
 	raw_spin_lock_irq(&rnp->lock);
@@ -1950,16 +2002,16 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 	rcu_nocb_gp_set(rnp, nocb);
 
 	/* Declare grace period done. */
-	ACCESS_ONCE(rsp->completed) = rsp->gpnum;
+	WRITE_ONCE(rsp->completed, rsp->gpnum);
 	trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
 	rsp->fqs_state = RCU_GP_IDLE;
 	rdp = this_cpu_ptr(rsp->rda);
 	/* Advance CBs to reduce false positives below. */
 	needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
 	if (needgp || cpu_needs_another_gp(rsp, rdp)) {
-		ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
+		WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT);
 		trace_rcu_grace_period(rsp->name,
-				       ACCESS_ONCE(rsp->gpnum),
+				       READ_ONCE(rsp->gpnum),
 				       TPS("newreq"));
 	}
 	raw_spin_unlock_irq(&rnp->lock);
@@ -1983,20 +2035,20 @@ static int __noreturn rcu_gp_kthread(void *arg)
 		/* Handle grace-period start. */
 		for (;;) {
 			trace_rcu_grace_period(rsp->name,
-					       ACCESS_ONCE(rsp->gpnum),
+					       READ_ONCE(rsp->gpnum),
 					       TPS("reqwait"));
 			rsp->gp_state = RCU_GP_WAIT_GPS;
 			wait_event_interruptible(rsp->gp_wq,
-						 ACCESS_ONCE(rsp->gp_flags) &
+						 READ_ONCE(rsp->gp_flags) &
 						 RCU_GP_FLAG_INIT);
 			/* Locking provides needed memory barrier. */
 			if (rcu_gp_init(rsp))
 				break;
 			cond_resched_rcu_qs();
-			ACCESS_ONCE(rsp->gp_activity) = jiffies;
+			WRITE_ONCE(rsp->gp_activity, jiffies);
 			WARN_ON(signal_pending(current));
 			trace_rcu_grace_period(rsp->name,
-					       ACCESS_ONCE(rsp->gpnum),
+					       READ_ONCE(rsp->gpnum),
 					       TPS("reqwaitsig"));
 		}
 
@@ -2012,39 +2064,39 @@ static int __noreturn rcu_gp_kthread(void *arg)
 			if (!ret)
 				rsp->jiffies_force_qs = jiffies + j;
 			trace_rcu_grace_period(rsp->name,
-					       ACCESS_ONCE(rsp->gpnum),
+					       READ_ONCE(rsp->gpnum),
 					       TPS("fqswait"));
 			rsp->gp_state = RCU_GP_WAIT_FQS;
 			ret = wait_event_interruptible_timeout(rsp->gp_wq,
-					((gf = ACCESS_ONCE(rsp->gp_flags)) &
+					((gf = READ_ONCE(rsp->gp_flags)) &
 					 RCU_GP_FLAG_FQS) ||
-					(!ACCESS_ONCE(rnp->qsmask) &&
+					(!READ_ONCE(rnp->qsmask) &&
 					 !rcu_preempt_blocked_readers_cgp(rnp)),
 					j);
 			/* Locking provides needed memory barriers. */
 			/* If grace period done, leave loop. */
-			if (!ACCESS_ONCE(rnp->qsmask) &&
+			if (!READ_ONCE(rnp->qsmask) &&
 			    !rcu_preempt_blocked_readers_cgp(rnp))
 				break;
 			/* If time for quiescent-state forcing, do it. */
 			if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) ||
 			    (gf & RCU_GP_FLAG_FQS)) {
 				trace_rcu_grace_period(rsp->name,
-						       ACCESS_ONCE(rsp->gpnum),
+						       READ_ONCE(rsp->gpnum),
 						       TPS("fqsstart"));
 				fqs_state = rcu_gp_fqs(rsp, fqs_state);
 				trace_rcu_grace_period(rsp->name,
-						       ACCESS_ONCE(rsp->gpnum),
+						       READ_ONCE(rsp->gpnum),
 						       TPS("fqsend"));
 				cond_resched_rcu_qs();
-				ACCESS_ONCE(rsp->gp_activity) = jiffies;
+				WRITE_ONCE(rsp->gp_activity, jiffies);
 			} else {
 				/* Deal with stray signal. */
 				cond_resched_rcu_qs();
-				ACCESS_ONCE(rsp->gp_activity) = jiffies;
+				WRITE_ONCE(rsp->gp_activity, jiffies);
 				WARN_ON(signal_pending(current));
 				trace_rcu_grace_period(rsp->name,
-						       ACCESS_ONCE(rsp->gpnum),
+						       READ_ONCE(rsp->gpnum),
 						       TPS("fqswaitsig"));
 			}
 			j = jiffies_till_next_fqs;
@@ -2086,8 +2138,8 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
 		 */
 		return false;
 	}
-	ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
-	trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
+	WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT);
+	trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum),
 			       TPS("newreq"));
 
 	/*
@@ -2137,6 +2189,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
 	__releases(rcu_get_root(rsp)->lock)
 {
 	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
+	WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
 	raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
 	rcu_gp_kthread_wake(rsp);
 }
@@ -2334,8 +2387,6 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
 	rcu_report_qs_rdp(rdp->cpu, rsp, rdp);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-
 /*
  * Send the specified CPU's RCU callbacks to the orphanage.  The
  * specified CPU must be offline, and the caller must hold the
@@ -2346,7 +2397,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
 			  struct rcu_node *rnp, struct rcu_data *rdp)
 {
 	/* No-CBs CPUs do not have orphanable callbacks. */
-	if (rcu_is_nocb_cpu(rdp->cpu))
+	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu))
 		return;
 
 	/*
@@ -2359,7 +2410,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
 		rsp->qlen += rdp->qlen;
 		rdp->n_cbs_orphaned += rdp->qlen;
 		rdp->qlen_lazy = 0;
-		ACCESS_ONCE(rdp->qlen) = 0;
+		WRITE_ONCE(rdp->qlen, 0);
 	}
 
 	/*
@@ -2405,7 +2456,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
 	struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
 
 	/* No-CBs CPUs are handled specially. */
-	if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
+	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+	    rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
 		return;
 
 	/* Do the accounting first. */
@@ -2452,6 +2504,9 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
 	RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
 	RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
 
+	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+		return;
+
 	RCU_TRACE(mask = rdp->grpmask);
 	trace_rcu_grace_period(rsp->name,
 			       rnp->gpnum + 1 - !!(rnp->qsmask & mask),
@@ -2480,7 +2535,8 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
 	long mask;
 	struct rcu_node *rnp = rnp_leaf;
 
-	if (rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
+	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+	    rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
 		return;
 	for (;;) {
 		mask = rnp->grpmask;
@@ -2511,6 +2567,9 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
 
+	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+		return;
+
 	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
 	mask = rdp->grpmask;
 	raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -2532,6 +2591,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
 
+	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+		return;
+
 	/* Adjust any no-longer-needed kthreads. */
 	rcu_boost_kthread_setaffinity(rnp, -1);
 
@@ -2546,26 +2608,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 		  cpu, rdp->qlen, rdp->nxtlist);
 }
 
-#else /* #ifdef CONFIG_HOTPLUG_CPU */
-
-static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
-{
-}
-
-static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
-{
-}
-
-static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
-{
-}
-
-static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
-
 /*
  * Invoke any RCU callbacks that have made it to the end of their grace
  * period.  Thottle as specified by rdp->blimit.
@@ -2580,7 +2622,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 	/* If no callbacks are ready, just return. */
 	if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
 		trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
-		trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
+		trace_rcu_batch_end(rsp->name, 0, !!READ_ONCE(rdp->nxtlist),
 				    need_resched(), is_idle_task(current),
 				    rcu_is_callbacks_kthread());
 		return;
@@ -2636,7 +2678,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 	}
 	smp_mb(); /* List handling before counting for rcu_barrier(). */
 	rdp->qlen_lazy -= count_lazy;
-	ACCESS_ONCE(rdp->qlen) = rdp->qlen - count;
+	WRITE_ONCE(rdp->qlen, rdp->qlen - count);
 	rdp->n_cbs_invoked += count;
 
 	/* Reinstate batch limit if we have worked down the excess. */
@@ -2730,10 +2772,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
 		mask = 0;
 		raw_spin_lock_irqsave(&rnp->lock, flags);
 		smp_mb__after_unlock_lock();
-		if (!rcu_gp_in_progress(rsp)) {
-			raw_spin_unlock_irqrestore(&rnp->lock, flags);
-			return;
-		}
 		if (rnp->qsmask == 0) {
 			if (rcu_state_p == &rcu_sched_state ||
 			    rsp != rcu_state_p ||
@@ -2763,8 +2801,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
 		bit = 1;
 		for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
 			if ((rnp->qsmask & bit) != 0) {
-				if ((rnp->qsmaskinit & bit) == 0)
-					*isidle = false; /* Pending hotplug. */
 				if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
 					mask |= bit;
 			}
@@ -2793,7 +2829,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
 	/* Funnel through hierarchy to reduce memory contention. */
 	rnp = __this_cpu_read(rsp->rda->mynode);
 	for (; rnp != NULL; rnp = rnp->parent) {
-		ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
+		ret = (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
 		      !raw_spin_trylock(&rnp->fqslock);
 		if (rnp_old != NULL)
 			raw_spin_unlock(&rnp_old->fqslock);
@@ -2809,13 +2845,12 @@ static void force_quiescent_state(struct rcu_state *rsp)
 	raw_spin_lock_irqsave(&rnp_old->lock, flags);
 	smp_mb__after_unlock_lock();
 	raw_spin_unlock(&rnp_old->fqslock);
-	if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
+	if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
 		rsp->n_force_qs_lh++;
 		raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
 		return;  /* Someone beat us to it. */
 	}
-	ACCESS_ONCE(rsp->gp_flags) =
-		ACCESS_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS;
+	WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
 	raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
 	rcu_gp_kthread_wake(rsp);
 }
@@ -2881,7 +2916,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
  */
 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
 {
-	if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
+	if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
 		return;
 	if (likely(!rsp->boost)) {
 		rcu_do_batch(rsp, rdp);
@@ -2972,7 +3007,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 	WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */
 	if (debug_rcu_head_queue(head)) {
 		/* Probable double call_rcu(), so leak the callback. */
-		ACCESS_ONCE(head->func) = rcu_leak_callback;
+		WRITE_ONCE(head->func, rcu_leak_callback);
 		WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n");
 		return;
 	}
@@ -3011,7 +3046,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 		if (!likely(rdp->nxtlist))
 			init_default_callback_list(rdp);
 	}
-	ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1;
+	WRITE_ONCE(rdp->qlen, rdp->qlen + 1);
 	if (lazy)
 		rdp->qlen_lazy++;
 	else
@@ -3287,7 +3322,7 @@ void synchronize_sched_expedited(void)
 	if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
 			 (ulong)atomic_long_read(&rsp->expedited_done) +
 			 ULONG_MAX / 8)) {
-		synchronize_sched();
+		wait_rcu_gp(call_rcu_sched);
 		atomic_long_inc(&rsp->expedited_wrap);
 		return;
 	}
@@ -3450,14 +3485,14 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 	}
 
 	/* Has another RCU grace period completed?  */
-	if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
+	if (READ_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
 		rdp->n_rp_gp_completed++;
 		return 1;
 	}
 
 	/* Has a new RCU grace period started? */
-	if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum ||
-	    unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */
+	if (READ_ONCE(rnp->gpnum) != rdp->gpnum ||
+	    unlikely(READ_ONCE(rdp->gpwrap))) { /* outside lock */
 		rdp->n_rp_gp_started++;
 		return 1;
 	}
@@ -3493,7 +3528,7 @@ static int rcu_pending(void)
  * non-NULL, store an indication of whether all callbacks are lazy.
  * (If there are no callbacks, all of them are deemed to be lazy.)
  */
-static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
+static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
 {
 	bool al = true;
 	bool hc = false;
@@ -3564,7 +3599,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
 {
 	int cpu;
 	struct rcu_data *rdp;
-	unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
+	unsigned long snap = READ_ONCE(rsp->n_barrier_done);
 	unsigned long snap_done;
 
 	_rcu_barrier_trace(rsp, "Begin", -1, snap);
@@ -3606,10 +3641,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
 
 	/*
 	 * Increment ->n_barrier_done to avoid duplicate work.  Use
-	 * ACCESS_ONCE() to prevent the compiler from speculating
+	 * WRITE_ONCE() to prevent the compiler from speculating
 	 * the increment to precede the early-exit check.
 	 */
-	ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
+	WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1);
 	WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
 	_rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
 	smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
@@ -3645,7 +3680,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
 				__call_rcu(&rdp->barrier_head,
 					   rcu_barrier_callback, rsp, cpu, 0);
 			}
-		} else if (ACCESS_ONCE(rdp->qlen)) {
+		} else if (READ_ONCE(rdp->qlen)) {
 			_rcu_barrier_trace(rsp, "OnlineQ", cpu,
 					   rsp->n_barrier_done);
 			smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
@@ -3665,7 +3700,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
 
 	/* Increment ->n_barrier_done to prevent duplicate work. */
 	smp_mb(); /* Keep increment after above mechanism. */
-	ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
+	WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1);
 	WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
 	_rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
 	smp_mb(); /* Keep increment before caller's subsequent code. */
@@ -3780,7 +3815,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 	rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
 	rdp->completed = rnp->completed;
 	rdp->passed_quiesce = false;
-	rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
+	rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu);
 	rdp->qs_pending = false;
 	trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -3924,16 +3959,16 @@ void rcu_scheduler_starting(void)
 
 /*
  * Compute the per-level fanout, either using the exact fanout specified
- * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
+ * or balancing the tree, depending on the rcu_fanout_exact boot parameter.
  */
 static void __init rcu_init_levelspread(struct rcu_state *rsp)
 {
 	int i;
 
-	if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) {
+	if (rcu_fanout_exact) {
 		rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
 		for (i = rcu_num_lvls - 2; i >= 0; i--)
-			rsp->levelspread[i] = CONFIG_RCU_FANOUT;
+			rsp->levelspread[i] = RCU_FANOUT;
 	} else {
 		int ccur;
 		int cprv;
@@ -3971,9 +4006,9 @@ static void __init rcu_init_one(struct rcu_state *rsp,
 
 	BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf));  /* Fix buf[] init! */
 
-	/* Silence gcc 4.8 warning about array index out of range. */
-	if (rcu_num_lvls > RCU_NUM_LVLS)
-		panic("rcu_init_one: rcu_num_lvls overflow");
+	/* Silence gcc 4.8 false positive about array index out of range. */
+	if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS)
+		panic("rcu_init_one: rcu_num_lvls out of range");
 
 	/* Initialize the level-tracking arrays. */
 
@@ -4059,7 +4094,7 @@ static void __init rcu_init_geometry(void)
 		jiffies_till_next_fqs = d;
 
 	/* If the compile-time values are accurate, just leave. */
-	if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
+	if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
 	    nr_cpu_ids == NR_CPUS)
 		return;
 	pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n",
@@ -4073,7 +4108,7 @@ static void __init rcu_init_geometry(void)
 	rcu_capacity[0] = 1;
 	rcu_capacity[1] = rcu_fanout_leaf;
 	for (i = 2; i <= MAX_RCU_LVLS; i++)
-		rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT;
+		rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;
 
 	/*
 	 * The boot-time rcu_fanout_leaf parameter is only permitted
@@ -4083,7 +4118,7 @@ static void __init rcu_init_geometry(void)
 	 * the configured number of CPUs.  Complain and fall back to the
 	 * compile-time values if these limits are exceeded.
 	 */
-	if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF ||
+	if (rcu_fanout_leaf < RCU_FANOUT_LEAF ||
 	    rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
 	    n > rcu_capacity[MAX_RCU_LVLS]) {
 		WARN_ON(1);
@@ -4109,6 +4144,28 @@ static void __init rcu_init_geometry(void)
 	rcu_num_nodes -= n;
 }
 
+/*
+ * Dump out the structure of the rcu_node combining tree associated
+ * with the rcu_state structure referenced by rsp.
+ */
+static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp)
+{
+	int level = 0;
+	struct rcu_node *rnp;
+
+	pr_info("rcu_node tree layout dump\n");
+	pr_info(" ");
+	rcu_for_each_node_breadth_first(rsp, rnp) {
+		if (rnp->level != level) {
+			pr_cont("\n");
+			pr_info(" ");
+			level = rnp->level;
+		}
+		pr_cont("%d:%d ^%d  ", rnp->grplo, rnp->grphi, rnp->grpnum);
+	}
+	pr_cont("\n");
+}
+
 void __init rcu_init(void)
 {
 	int cpu;
@@ -4119,6 +4176,8 @@ void __init rcu_init(void)
 	rcu_init_geometry();
 	rcu_init_one(&rcu_bh_state, &rcu_bh_data);
 	rcu_init_one(&rcu_sched_state, &rcu_sched_data);
+	if (dump_tree)
+		rcu_dump_rcu_node_tree(&rcu_sched_state);
 	__rcu_init_preempt();
 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index a69d3dab2ec4db..4adb7ca0bf47a2 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -35,11 +35,33 @@
  * In practice, this did work well going from three levels to four.
  * Of course, your mileage may vary.
  */
+
 #define MAX_RCU_LVLS 4
-#define RCU_FANOUT_1	      (CONFIG_RCU_FANOUT_LEAF)
-#define RCU_FANOUT_2	      (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
-#define RCU_FANOUT_3	      (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
-#define RCU_FANOUT_4	      (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
+
+#ifdef CONFIG_RCU_FANOUT
+#define RCU_FANOUT CONFIG_RCU_FANOUT
+#else /* #ifdef CONFIG_RCU_FANOUT */
+# ifdef CONFIG_64BIT
+# define RCU_FANOUT 64
+# else
+# define RCU_FANOUT 32
+# endif
+#endif /* #else #ifdef CONFIG_RCU_FANOUT */
+
+#ifdef CONFIG_RCU_FANOUT_LEAF
+#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF
+#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */
+# ifdef CONFIG_64BIT
+# define RCU_FANOUT_LEAF 64
+# else
+# define RCU_FANOUT_LEAF 32
+# endif
+#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */
+
+#define RCU_FANOUT_1	      (RCU_FANOUT_LEAF)
+#define RCU_FANOUT_2	      (RCU_FANOUT_1 * RCU_FANOUT)
+#define RCU_FANOUT_3	      (RCU_FANOUT_2 * RCU_FANOUT)
+#define RCU_FANOUT_4	      (RCU_FANOUT_3 * RCU_FANOUT)
 
 #if NR_CPUS <= RCU_FANOUT_1
 #  define RCU_NUM_LVLS	      1
@@ -170,7 +192,6 @@ struct rcu_node {
 				/*  if there is no such task.  If there */
 				/*  is no current expedited grace period, */
 				/*  then there can cannot be any such task. */
-#ifdef CONFIG_RCU_BOOST
 	struct list_head *boost_tasks;
 				/* Pointer to first task that needs to be */
 				/*  priority boosted, or NULL if no priority */
@@ -208,7 +229,6 @@ struct rcu_node {
 	unsigned long n_balk_nos;
 				/* Refused to boost: not sure why, though. */
 				/*  This can happen due to race conditions. */
-#endif /* #ifdef CONFIG_RCU_BOOST */
 #ifdef CONFIG_RCU_NOCB_CPU
 	wait_queue_head_t nocb_gp_wq[2];
 				/* Place for rcu_nocb_kthread() to wait GP. */
@@ -519,14 +539,11 @@ extern struct list_head rcu_struct_flavors;
  * RCU implementation internal declarations:
  */
 extern struct rcu_state rcu_sched_state;
-DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
 
 extern struct rcu_state rcu_bh_state;
-DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
 
 #ifdef CONFIG_PREEMPT_RCU
 extern struct rcu_state rcu_preempt_state;
-DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
 #endif /* #ifdef CONFIG_PREEMPT_RCU */
 
 #ifdef CONFIG_RCU_BOOST
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 8c0ec0f5a02702..c4e207dadfa33d 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -43,7 +43,17 @@ DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
 DEFINE_PER_CPU(char, rcu_cpu_has_work);
 
-#endif /* #ifdef CONFIG_RCU_BOOST */
+#else /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST,
+ * all uses are in dead code.  Provide a definition to keep the compiler
+ * happy, but add WARN_ON_ONCE() to complain if used in the wrong place.
+ * This probably needs to be excluded from -rt builds.
+ */
+#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
+
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
 
 #ifdef CONFIG_RCU_NOCB_CPU
 static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
@@ -60,11 +70,11 @@ static void __init rcu_bootup_announce_oddness(void)
 {
 	if (IS_ENABLED(CONFIG_RCU_TRACE))
 		pr_info("\tRCU debugfs-based tracing is enabled.\n");
-	if ((IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) ||
-	    (!IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32))
+	if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) ||
+	    (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32))
 		pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
-		       CONFIG_RCU_FANOUT);
-	if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT))
+		       RCU_FANOUT);
+	if (rcu_fanout_exact)
 		pr_info("\tHierarchical RCU autobalancing is disabled.\n");
 	if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
 		pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
@@ -76,10 +86,10 @@ static void __init rcu_bootup_announce_oddness(void)
 		pr_info("\tAdditional per-CPU info printed with stalls.\n");
 	if (NUM_RCU_LVL_4 != 0)
 		pr_info("\tFour-level hierarchy is enabled.\n");
-	if (CONFIG_RCU_FANOUT_LEAF != 16)
+	if (RCU_FANOUT_LEAF != 16)
 		pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
-			CONFIG_RCU_FANOUT_LEAF);
-	if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
+			RCU_FANOUT_LEAF);
+	if (rcu_fanout_leaf != RCU_FANOUT_LEAF)
 		pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
 	if (nr_cpu_ids != NR_CPUS)
 		pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
@@ -90,7 +100,8 @@ static void __init rcu_bootup_announce_oddness(void)
 #ifdef CONFIG_PREEMPT_RCU
 
 RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
-static struct rcu_state *rcu_state_p = &rcu_preempt_state;
+static struct rcu_state *const rcu_state_p = &rcu_preempt_state;
+static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data;
 
 static int rcu_preempted_readers_exp(struct rcu_node *rnp);
 static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
@@ -116,11 +127,11 @@ static void __init rcu_bootup_announce(void)
  */
 static void rcu_preempt_qs(void)
 {
-	if (!__this_cpu_read(rcu_preempt_data.passed_quiesce)) {
+	if (!__this_cpu_read(rcu_data_p->passed_quiesce)) {
 		trace_rcu_grace_period(TPS("rcu_preempt"),
-				       __this_cpu_read(rcu_preempt_data.gpnum),
+				       __this_cpu_read(rcu_data_p->gpnum),
 				       TPS("cpuqs"));
-		__this_cpu_write(rcu_preempt_data.passed_quiesce, 1);
+		__this_cpu_write(rcu_data_p->passed_quiesce, 1);
 		barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */
 		current->rcu_read_unlock_special.b.need_qs = false;
 	}
@@ -150,7 +161,7 @@ static void rcu_preempt_note_context_switch(void)
 	    !t->rcu_read_unlock_special.b.blocked) {
 
 		/* Possibly blocking in an RCU read-side critical section. */
-		rdp = this_cpu_ptr(rcu_preempt_state.rda);
+		rdp = this_cpu_ptr(rcu_state_p->rda);
 		rnp = rdp->mynode;
 		raw_spin_lock_irqsave(&rnp->lock, flags);
 		smp_mb__after_unlock_lock();
@@ -180,10 +191,9 @@ static void rcu_preempt_note_context_switch(void)
 		if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
 			list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
 			rnp->gp_tasks = &t->rcu_node_entry;
-#ifdef CONFIG_RCU_BOOST
-			if (rnp->boost_tasks != NULL)
+			if (IS_ENABLED(CONFIG_RCU_BOOST) &&
+			    rnp->boost_tasks != NULL)
 				rnp->boost_tasks = rnp->gp_tasks;
-#endif /* #ifdef CONFIG_RCU_BOOST */
 		} else {
 			list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
 			if (rnp->qsmask & rdp->grpmask)
@@ -263,9 +273,7 @@ void rcu_read_unlock_special(struct task_struct *t)
 	bool empty_exp_now;
 	unsigned long flags;
 	struct list_head *np;
-#ifdef CONFIG_RCU_BOOST
 	bool drop_boost_mutex = false;
-#endif /* #ifdef CONFIG_RCU_BOOST */
 	struct rcu_node *rnp;
 	union rcu_special special;
 
@@ -307,9 +315,11 @@ void rcu_read_unlock_special(struct task_struct *t)
 		t->rcu_read_unlock_special.b.blocked = false;
 
 		/*
-		 * Remove this task from the list it blocked on.  The
-		 * task can migrate while we acquire the lock, but at
-		 * most one time.  So at most two passes through loop.
+		 * Remove this task from the list it blocked on.  The task
+		 * now remains queued on the rcu_node corresponding to
+		 * the CPU it first blocked on, so the first attempt to
+		 * acquire the task's rcu_node's ->lock will succeed.
+		 * Keep the loop and add a WARN_ON() out of sheer paranoia.
 		 */
 		for (;;) {
 			rnp = t->rcu_blocked_node;
@@ -317,6 +327,7 @@ void rcu_read_unlock_special(struct task_struct *t)
 			smp_mb__after_unlock_lock();
 			if (rnp == t->rcu_blocked_node)
 				break;
+			WARN_ON_ONCE(1);
 			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
 		}
 		empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
@@ -331,12 +342,12 @@ void rcu_read_unlock_special(struct task_struct *t)
 			rnp->gp_tasks = np;
 		if (&t->rcu_node_entry == rnp->exp_tasks)
 			rnp->exp_tasks = np;
-#ifdef CONFIG_RCU_BOOST
-		if (&t->rcu_node_entry == rnp->boost_tasks)
-			rnp->boost_tasks = np;
-		/* Snapshot ->boost_mtx ownership with rcu_node lock held. */
-		drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
-#endif /* #ifdef CONFIG_RCU_BOOST */
+		if (IS_ENABLED(CONFIG_RCU_BOOST)) {
+			if (&t->rcu_node_entry == rnp->boost_tasks)
+				rnp->boost_tasks = np;
+			/* Snapshot ->boost_mtx ownership w/rnp->lock held. */
+			drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
+		}
 
 		/*
 		 * If this was the last task on the current list, and if
@@ -353,24 +364,21 @@ void rcu_read_unlock_special(struct task_struct *t)
 							 rnp->grplo,
 							 rnp->grphi,
 							 !!rnp->gp_tasks);
-			rcu_report_unblock_qs_rnp(&rcu_preempt_state,
-						  rnp, flags);
+			rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags);
 		} else {
 			raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		}
 
-#ifdef CONFIG_RCU_BOOST
 		/* Unboost if we were boosted. */
-		if (drop_boost_mutex)
+		if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
 			rt_mutex_unlock(&rnp->boost_mtx);
-#endif /* #ifdef CONFIG_RCU_BOOST */
 
 		/*
 		 * If this was the last task on the expedited lists,
 		 * then we need to report up the rcu_node hierarchy.
 		 */
 		if (!empty_exp && empty_exp_now)
-			rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
+			rcu_report_exp_rnp(rcu_state_p, rnp, true);
 	} else {
 		local_irq_restore(flags);
 	}
@@ -390,7 +398,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		return;
 	}
-	t = list_entry(rnp->gp_tasks,
+	t = list_entry(rnp->gp_tasks->prev,
 		       struct task_struct, rcu_node_entry);
 	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
 		sched_show_task(t);
@@ -447,7 +455,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
 	if (!rcu_preempt_blocked_readers_cgp(rnp))
 		return 0;
 	rcu_print_task_stall_begin(rnp);
-	t = list_entry(rnp->gp_tasks,
+	t = list_entry(rnp->gp_tasks->prev,
 		       struct task_struct, rcu_node_entry);
 	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
 		pr_cont(" P%d", t->pid);
@@ -491,8 +499,8 @@ static void rcu_preempt_check_callbacks(void)
 		return;
 	}
 	if (t->rcu_read_lock_nesting > 0 &&
-	    __this_cpu_read(rcu_preempt_data.qs_pending) &&
-	    !__this_cpu_read(rcu_preempt_data.passed_quiesce))
+	    __this_cpu_read(rcu_data_p->qs_pending) &&
+	    !__this_cpu_read(rcu_data_p->passed_quiesce))
 		t->rcu_read_unlock_special.b.need_qs = true;
 }
 
@@ -500,7 +508,7 @@ static void rcu_preempt_check_callbacks(void)
 
 static void rcu_preempt_do_callbacks(void)
 {
-	rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
+	rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
 }
 
 #endif /* #ifdef CONFIG_RCU_BOOST */
@@ -510,7 +518,7 @@ static void rcu_preempt_do_callbacks(void)
  */
 void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 {
-	__call_rcu(head, func, &rcu_preempt_state, -1, 0);
+	__call_rcu(head, func, rcu_state_p, -1, 0);
 }
 EXPORT_SYMBOL_GPL(call_rcu);
 
@@ -570,7 +578,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp)
 static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
 {
 	return !rcu_preempted_readers_exp(rnp) &&
-	       ACCESS_ONCE(rnp->expmask) == 0;
+	       READ_ONCE(rnp->expmask) == 0;
 }
 
 /*
@@ -711,12 +719,12 @@ sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
 void synchronize_rcu_expedited(void)
 {
 	struct rcu_node *rnp;
-	struct rcu_state *rsp = &rcu_preempt_state;
+	struct rcu_state *rsp = rcu_state_p;
 	unsigned long snap;
 	int trycount = 0;
 
 	smp_mb(); /* Caller's modifications seen first by other CPUs. */
-	snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
+	snap = READ_ONCE(sync_rcu_preempt_exp_count) + 1;
 	smp_mb(); /* Above access cannot bleed into critical section. */
 
 	/*
@@ -740,7 +748,7 @@ void synchronize_rcu_expedited(void)
 	 */
 	while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
 		if (ULONG_CMP_LT(snap,
-		    ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
+		    READ_ONCE(sync_rcu_preempt_exp_count))) {
 			put_online_cpus();
 			goto mb_ret; /* Others did our work for us. */
 		}
@@ -752,7 +760,7 @@ void synchronize_rcu_expedited(void)
 			return;
 		}
 	}
-	if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
+	if (ULONG_CMP_LT(snap, READ_ONCE(sync_rcu_preempt_exp_count))) {
 		put_online_cpus();
 		goto unlock_mb_ret; /* Others did our work for us. */
 	}
@@ -780,8 +788,7 @@ void synchronize_rcu_expedited(void)
 
 	/* Clean up and exit. */
 	smp_mb(); /* ensure expedited GP seen before counter increment. */
-	ACCESS_ONCE(sync_rcu_preempt_exp_count) =
-					sync_rcu_preempt_exp_count + 1;
+	WRITE_ONCE(sync_rcu_preempt_exp_count, sync_rcu_preempt_exp_count + 1);
 unlock_mb_ret:
 	mutex_unlock(&sync_rcu_preempt_exp_mutex);
 mb_ret:
@@ -799,7 +806,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
  */
 void rcu_barrier(void)
 {
-	_rcu_barrier(&rcu_preempt_state);
+	_rcu_barrier(rcu_state_p);
 }
 EXPORT_SYMBOL_GPL(rcu_barrier);
 
@@ -808,7 +815,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier);
  */
 static void __init __rcu_init_preempt(void)
 {
-	rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
+	rcu_init_one(rcu_state_p, rcu_data_p);
 }
 
 /*
@@ -831,7 +838,8 @@ void exit_rcu(void)
 
 #else /* #ifdef CONFIG_PREEMPT_RCU */
 
-static struct rcu_state *rcu_state_p = &rcu_sched_state;
+static struct rcu_state *const rcu_state_p = &rcu_sched_state;
+static struct rcu_data __percpu *const rcu_data_p = &rcu_sched_data;
 
 /*
  * Tell them what RCU they are running.
@@ -994,8 +1002,8 @@ static int rcu_boost(struct rcu_node *rnp)
 	struct task_struct *t;
 	struct list_head *tb;
 
-	if (ACCESS_ONCE(rnp->exp_tasks) == NULL &&
-	    ACCESS_ONCE(rnp->boost_tasks) == NULL)
+	if (READ_ONCE(rnp->exp_tasks) == NULL &&
+	    READ_ONCE(rnp->boost_tasks) == NULL)
 		return 0;  /* Nothing left to boost. */
 
 	raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1048,8 +1056,8 @@ static int rcu_boost(struct rcu_node *rnp)
 	rt_mutex_lock(&rnp->boost_mtx);
 	rt_mutex_unlock(&rnp->boost_mtx);  /* Then keep lockdep happy. */
 
-	return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
-	       ACCESS_ONCE(rnp->boost_tasks) != NULL;
+	return READ_ONCE(rnp->exp_tasks) != NULL ||
+	       READ_ONCE(rnp->boost_tasks) != NULL;
 }
 
 /*
@@ -1173,7 +1181,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
 	struct sched_param sp;
 	struct task_struct *t;
 
-	if (&rcu_preempt_state != rsp)
+	if (rcu_state_p != rsp)
 		return 0;
 
 	if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
@@ -1367,13 +1375,12 @@ static void rcu_prepare_kthreads(int cpu)
  * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
  * any flavor of RCU.
  */
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(unsigned long *delta_jiffies)
+int rcu_needs_cpu(u64 basemono, u64 *nextevt)
 {
-	*delta_jiffies = ULONG_MAX;
-	return rcu_cpu_has_callbacks(NULL);
+	*nextevt = KTIME_MAX;
+	return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
+	       ? 0 : rcu_cpu_has_callbacks(NULL);
 }
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 
 /*
  * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
@@ -1462,7 +1469,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
 		 * callbacks not yet ready to invoke.
 		 */
 		if ((rdp->completed != rnp->completed ||
-		     unlikely(ACCESS_ONCE(rdp->gpwrap))) &&
+		     unlikely(READ_ONCE(rdp->gpwrap))) &&
 		    rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
 			note_gp_changes(rsp, rdp);
 
@@ -1480,17 +1487,22 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
  *
  * The caller must have disabled interrupts.
  */
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(unsigned long *dj)
+int rcu_needs_cpu(u64 basemono, u64 *nextevt)
 {
 	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+	unsigned long dj;
+
+	if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) {
+		*nextevt = KTIME_MAX;
+		return 0;
+	}
 
 	/* Snapshot to detect later posting of non-lazy callback. */
 	rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
 
 	/* If no callbacks, RCU doesn't need the CPU. */
 	if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) {
-		*dj = ULONG_MAX;
+		*nextevt = KTIME_MAX;
 		return 0;
 	}
 
@@ -1504,14 +1516,14 @@ int rcu_needs_cpu(unsigned long *dj)
 
 	/* Request timer delay depending on laziness, and round. */
 	if (!rdtp->all_lazy) {
-		*dj = round_up(rcu_idle_gp_delay + jiffies,
+		dj = round_up(rcu_idle_gp_delay + jiffies,
 			       rcu_idle_gp_delay) - jiffies;
 	} else {
-		*dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
+		dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
 	}
+	*nextevt = basemono + dj * TICK_NSEC;
 	return 0;
 }
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 
 /*
  * Prepare a CPU for idle from an RCU perspective.  The first major task
@@ -1525,7 +1537,6 @@ int rcu_needs_cpu(unsigned long *dj)
  */
 static void rcu_prepare_for_idle(void)
 {
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
 	bool needwake;
 	struct rcu_data *rdp;
 	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
@@ -1533,8 +1544,11 @@ static void rcu_prepare_for_idle(void)
 	struct rcu_state *rsp;
 	int tne;
 
+	if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL))
+		return;
+
 	/* Handle nohz enablement switches conservatively. */
-	tne = ACCESS_ONCE(tick_nohz_active);
+	tne = READ_ONCE(tick_nohz_active);
 	if (tne != rdtp->tick_nohz_enabled_snap) {
 		if (rcu_cpu_has_callbacks(NULL))
 			invoke_rcu_core(); /* force nohz to see update. */
@@ -1580,7 +1594,6 @@ static void rcu_prepare_for_idle(void)
 		if (needwake)
 			rcu_gp_kthread_wake(rsp);
 	}
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 }
 
 /*
@@ -1590,12 +1603,11 @@ static void rcu_prepare_for_idle(void)
  */
 static void rcu_cleanup_after_idle(void)
 {
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
-	if (rcu_is_nocb_cpu(smp_processor_id()))
+	if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
+	    rcu_is_nocb_cpu(smp_processor_id()))
 		return;
 	if (rcu_try_advance_all_cbs())
 		invoke_rcu_core();
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 }
 
 /*
@@ -1760,7 +1772,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
 	       atomic_read(&rdtp->dynticks) & 0xfff,
 	       rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
 	       rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
-	       ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
+	       READ_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
 	       fast_no_hz);
 }
 
@@ -1898,11 +1910,11 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
 {
 	struct rcu_data *rdp_leader = rdp->nocb_leader;
 
-	if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
+	if (!READ_ONCE(rdp_leader->nocb_kthread))
 		return;
-	if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) {
+	if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
 		/* Prior smp_mb__after_atomic() orders against prior enqueue. */
-		ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false;
+		WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
 		wake_up(&rdp_leader->nocb_wq);
 	}
 }
@@ -1934,14 +1946,14 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
 	ret = atomic_long_read(&rdp->nocb_q_count);
 
 #ifdef CONFIG_PROVE_RCU
-	rhp = ACCESS_ONCE(rdp->nocb_head);
+	rhp = READ_ONCE(rdp->nocb_head);
 	if (!rhp)
-		rhp = ACCESS_ONCE(rdp->nocb_gp_head);
+		rhp = READ_ONCE(rdp->nocb_gp_head);
 	if (!rhp)
-		rhp = ACCESS_ONCE(rdp->nocb_follower_head);
+		rhp = READ_ONCE(rdp->nocb_follower_head);
 
 	/* Having no rcuo kthread but CBs after scheduler starts is bad! */
-	if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp &&
+	if (!READ_ONCE(rdp->nocb_kthread) && rhp &&
 	    rcu_scheduler_fully_active) {
 		/* RCU callback enqueued before CPU first came online??? */
 		pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
@@ -1975,12 +1987,12 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
 	atomic_long_add(rhcount, &rdp->nocb_q_count);
 	/* rcu_barrier() relies on ->nocb_q_count add before xchg. */
 	old_rhpp = xchg(&rdp->nocb_tail, rhtp);
-	ACCESS_ONCE(*old_rhpp) = rhp;
+	WRITE_ONCE(*old_rhpp, rhp);
 	atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
 	smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
 
 	/* If we are not being polled and there is a kthread, awaken it ... */
-	t = ACCESS_ONCE(rdp->nocb_kthread);
+	t = READ_ONCE(rdp->nocb_kthread);
 	if (rcu_nocb_poll || !t) {
 		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
 				    TPS("WakeNotPoll"));
@@ -2118,7 +2130,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
 	for (;;) {
 		wait_event_interruptible(
 			rnp->nocb_gp_wq[c & 0x1],
-			(d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
+			(d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
 		if (likely(d))
 			break;
 		WARN_ON(signal_pending(current));
@@ -2145,7 +2157,7 @@ wait_again:
 	if (!rcu_nocb_poll) {
 		trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
 		wait_event_interruptible(my_rdp->nocb_wq,
-				!ACCESS_ONCE(my_rdp->nocb_leader_sleep));
+				!READ_ONCE(my_rdp->nocb_leader_sleep));
 		/* Memory barrier handled by smp_mb() calls below and repoll. */
 	} else if (firsttime) {
 		firsttime = false; /* Don't drown trace log with "Poll"! */
@@ -2159,12 +2171,12 @@ wait_again:
 	 */
 	gotcbs = false;
 	for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
-		rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head);
+		rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
 		if (!rdp->nocb_gp_head)
 			continue;  /* No CBs here, try next follower. */
 
 		/* Move callbacks to wait-for-GP list, which is empty. */
-		ACCESS_ONCE(rdp->nocb_head) = NULL;
+		WRITE_ONCE(rdp->nocb_head, NULL);
 		rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
 		gotcbs = true;
 	}
@@ -2184,7 +2196,7 @@ wait_again:
 		my_rdp->nocb_leader_sleep = true;
 		smp_mb();  /* Ensure _sleep true before scan. */
 		for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
-			if (ACCESS_ONCE(rdp->nocb_head)) {
+			if (READ_ONCE(rdp->nocb_head)) {
 				/* Found CB, so short-circuit next wait. */
 				my_rdp->nocb_leader_sleep = false;
 				break;
@@ -2205,7 +2217,7 @@ wait_again:
 
 	/* Each pass through the following loop wakes a follower, if needed. */
 	for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
-		if (ACCESS_ONCE(rdp->nocb_head))
+		if (READ_ONCE(rdp->nocb_head))
 			my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
 		if (!rdp->nocb_gp_head)
 			continue; /* No CBs, so no need to wake follower. */
@@ -2241,7 +2253,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
 			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
 					    "FollowerSleep");
 			wait_event_interruptible(rdp->nocb_wq,
-						 ACCESS_ONCE(rdp->nocb_follower_head));
+						 READ_ONCE(rdp->nocb_follower_head));
 		} else if (firsttime) {
 			/* Don't drown trace log with "Poll"! */
 			firsttime = false;
@@ -2282,10 +2294,10 @@ static int rcu_nocb_kthread(void *arg)
 			nocb_follower_wait(rdp);
 
 		/* Pull the ready-to-invoke callbacks onto local list. */
-		list = ACCESS_ONCE(rdp->nocb_follower_head);
+		list = READ_ONCE(rdp->nocb_follower_head);
 		BUG_ON(!list);
 		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
-		ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
+		WRITE_ONCE(rdp->nocb_follower_head, NULL);
 		tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
 
 		/* Each pass through the following loop invokes a callback. */
@@ -2324,7 +2336,7 @@ static int rcu_nocb_kthread(void *arg)
 /* Is a deferred wakeup of rcu_nocb_kthread() required? */
 static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
 {
-	return ACCESS_ONCE(rdp->nocb_defer_wakeup);
+	return READ_ONCE(rdp->nocb_defer_wakeup);
 }
 
 /* Do a deferred wakeup of rcu_nocb_kthread(). */
@@ -2334,8 +2346,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
 
 	if (!rcu_nocb_need_deferred_wakeup(rdp))
 		return;
-	ndw = ACCESS_ONCE(rdp->nocb_defer_wakeup);
-	ACCESS_ONCE(rdp->nocb_defer_wakeup) = RCU_NOGP_WAKE_NOT;
+	ndw = READ_ONCE(rdp->nocb_defer_wakeup);
+	WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT);
 	wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE);
 	trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
 }
@@ -2448,7 +2460,7 @@ static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu)
 	t = kthread_run(rcu_nocb_kthread, rdp_spawn,
 			"rcuo%c/%d", rsp->abbr, cpu);
 	BUG_ON(IS_ERR(t));
-	ACCESS_ONCE(rdp_spawn->nocb_kthread) = t;
+	WRITE_ONCE(rdp_spawn->nocb_kthread, t);
 }
 
 /*
@@ -2663,7 +2675,7 @@ static void rcu_sysidle_enter(int irq)
 
 	/* Record start of fully idle period. */
 	j = jiffies;
-	ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
+	WRITE_ONCE(rdtp->dynticks_idle_jiffies, j);
 	smp_mb__before_atomic();
 	atomic_inc(&rdtp->dynticks_idle);
 	smp_mb__after_atomic();
@@ -2681,7 +2693,7 @@ static void rcu_sysidle_enter(int irq)
  */
 void rcu_sysidle_force_exit(void)
 {
-	int oldstate = ACCESS_ONCE(full_sysidle_state);
+	int oldstate = READ_ONCE(full_sysidle_state);
 	int newoldstate;
 
 	/*
@@ -2794,7 +2806,7 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
 	smp_mb(); /* Read counters before timestamps. */
 
 	/* Pick up timestamps. */
-	j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
+	j = READ_ONCE(rdtp->dynticks_idle_jiffies);
 	/* If this CPU entered idle more recently, update maxj timestamp. */
 	if (ULONG_CMP_LT(*maxj, j))
 		*maxj = j;
@@ -2831,11 +2843,11 @@ static unsigned long rcu_sysidle_delay(void)
 static void rcu_sysidle(unsigned long j)
 {
 	/* Check the current state. */
-	switch (ACCESS_ONCE(full_sysidle_state)) {
+	switch (READ_ONCE(full_sysidle_state)) {
 	case RCU_SYSIDLE_NOT:
 
 		/* First time all are idle, so note a short idle period. */
-		ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
+		WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT);
 		break;
 
 	case RCU_SYSIDLE_SHORT:
@@ -2873,7 +2885,7 @@ static void rcu_sysidle_cancel(void)
 {
 	smp_mb();
 	if (full_sysidle_state > RCU_SYSIDLE_SHORT)
-		ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
+		WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT);
 }
 
 /*
@@ -2925,7 +2937,7 @@ static void rcu_sysidle_cb(struct rcu_head *rhp)
 	smp_mb();  /* grace period precedes setting inuse. */
 
 	rshp = container_of(rhp, struct rcu_sysidle_head, rh);
-	ACCESS_ONCE(rshp->inuse) = 0;
+	WRITE_ONCE(rshp->inuse, 0);
 }
 
 /*
@@ -2936,7 +2948,7 @@ static void rcu_sysidle_cb(struct rcu_head *rhp)
 bool rcu_sys_is_idle(void)
 {
 	static struct rcu_sysidle_head rsh;
-	int rss = ACCESS_ONCE(full_sysidle_state);
+	int rss = READ_ONCE(full_sysidle_state);
 
 	if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
 		return false;
@@ -2964,7 +2976,7 @@ bool rcu_sys_is_idle(void)
 			}
 			rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
 			oldrss = rss;
-			rss = ACCESS_ONCE(full_sysidle_state);
+			rss = READ_ONCE(full_sysidle_state);
 		}
 	}
 
@@ -3048,10 +3060,10 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
 #ifdef CONFIG_NO_HZ_FULL
 	if (tick_nohz_full_cpu(smp_processor_id()) &&
 	    (!rcu_gp_in_progress(rsp) ||
-	     ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ)))
-		return 1;
+	     ULONG_CMP_LT(jiffies, READ_ONCE(rsp->gp_start) + HZ)))
+		return true;
 #endif /* #ifdef CONFIG_NO_HZ_FULL */
-	return 0;
+	return false;
 }
 
 /*
@@ -3077,7 +3089,7 @@ static void rcu_bind_gp_kthread(void)
 static void rcu_dynticks_task_enter(void)
 {
 #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
-	ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id();
+	WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
 #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
 }
 
@@ -3085,6 +3097,6 @@ static void rcu_dynticks_task_enter(void)
 static void rcu_dynticks_task_exit(void)
 {
 #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
-	ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1;
+	WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
 #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
 }
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index f92361efd0f55d..3ea7ffc7d5c4a7 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -277,7 +277,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
 	seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
 		   rsp->n_force_qs, rsp->n_force_qs_ngp,
 		   rsp->n_force_qs - rsp->n_force_qs_ngp,
-		   ACCESS_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
+		   READ_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
 	for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
 		if (rnp->level != level) {
 			seq_puts(m, "\n");
@@ -323,8 +323,8 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
 	struct rcu_node *rnp = &rsp->node[0];
 
 	raw_spin_lock_irqsave(&rnp->lock, flags);
-	completed = ACCESS_ONCE(rsp->completed);
-	gpnum = ACCESS_ONCE(rsp->gpnum);
+	completed = READ_ONCE(rsp->completed);
+	gpnum = READ_ONCE(rsp->gpnum);
 	if (completed == gpnum)
 		gpage = 0;
 	else
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 1f133350da01e3..afaecb7a799af2 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -150,14 +150,14 @@ void __rcu_read_unlock(void)
 		barrier();  /* critical section before exit code. */
 		t->rcu_read_lock_nesting = INT_MIN;
 		barrier();  /* assign before ->rcu_read_unlock_special load */
-		if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special.s)))
+		if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
 			rcu_read_unlock_special(t);
 		barrier();  /* ->rcu_read_unlock_special load before assign */
 		t->rcu_read_lock_nesting = 0;
 	}
 #ifdef CONFIG_PROVE_LOCKING
 	{
-		int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
+		int rrln = READ_ONCE(t->rcu_read_lock_nesting);
 
 		WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
 	}
@@ -389,17 +389,17 @@ module_param(rcu_cpu_stall_timeout, int, 0644);
 
 int rcu_jiffies_till_stall_check(void)
 {
-	int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
+	int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout);
 
 	/*
 	 * Limit check must be consistent with the Kconfig limits
 	 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
 	 */
 	if (till_stall_check < 3) {
-		ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
+		WRITE_ONCE(rcu_cpu_stall_timeout, 3);
 		till_stall_check = 3;
 	} else if (till_stall_check > 300) {
-		ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
+		WRITE_ONCE(rcu_cpu_stall_timeout, 300);
 		till_stall_check = 300;
 	}
 	return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
@@ -550,12 +550,12 @@ static void check_holdout_task(struct task_struct *t,
 {
 	int cpu;
 
-	if (!ACCESS_ONCE(t->rcu_tasks_holdout) ||
-	    t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) ||
-	    !ACCESS_ONCE(t->on_rq) ||
+	if (!READ_ONCE(t->rcu_tasks_holdout) ||
+	    t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) ||
+	    !READ_ONCE(t->on_rq) ||
 	    (IS_ENABLED(CONFIG_NO_HZ_FULL) &&
 	     !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
-		ACCESS_ONCE(t->rcu_tasks_holdout) = false;
+		WRITE_ONCE(t->rcu_tasks_holdout, false);
 		list_del_init(&t->rcu_tasks_holdout_list);
 		put_task_struct(t);
 		return;
@@ -639,11 +639,11 @@ static int __noreturn rcu_tasks_kthread(void *arg)
 		 */
 		rcu_read_lock();
 		for_each_process_thread(g, t) {
-			if (t != current && ACCESS_ONCE(t->on_rq) &&
+			if (t != current && READ_ONCE(t->on_rq) &&
 			    !is_idle_task(t)) {
 				get_task_struct(t);
-				t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw);
-				ACCESS_ONCE(t->rcu_tasks_holdout) = true;
+				t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw);
+				WRITE_ONCE(t->rcu_tasks_holdout, true);
 				list_add(&t->rcu_tasks_holdout_list,
 					 &rcu_tasks_holdouts);
 			}
@@ -672,7 +672,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
 			struct task_struct *t1;
 
 			schedule_timeout_interruptible(HZ);
-			rtst = ACCESS_ONCE(rcu_task_stall_timeout);
+			rtst = READ_ONCE(rcu_task_stall_timeout);
 			needreport = rtst > 0 &&
 				     time_after(jiffies, lastreport + rtst);
 			if (needreport)
@@ -728,7 +728,7 @@ static void rcu_spawn_tasks_kthread(void)
 	static struct task_struct *rcu_tasks_kthread_ptr;
 	struct task_struct *t;
 
-	if (ACCESS_ONCE(rcu_tasks_kthread_ptr)) {
+	if (READ_ONCE(rcu_tasks_kthread_ptr)) {
 		smp_mb(); /* Ensure caller sees full kthread. */
 		return;
 	}
@@ -740,7 +740,7 @@ static void rcu_spawn_tasks_kthread(void)
 	t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
 	BUG_ON(IS_ERR(t));
 	smp_mb(); /* Ensure others see full kthread. */
-	ACCESS_ONCE(rcu_tasks_kthread_ptr) = t;
+	WRITE_ONCE(rcu_tasks_kthread_ptr, t);
 	mutex_unlock(&rcu_tasks_kthread_mutex);
 }
 
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 46be8702487561..67687973ce80d6 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
-obj-y += core.o proc.o clock.o cputime.o
+obj-y += core.o loadavg.o clock.o cputime.o
 obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
 obj-y += wait.o completion.o idle.o
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index eae160dd669d9d..750ed601ddf78e 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -1,5 +1,3 @@
-#ifdef CONFIG_SCHED_AUTOGROUP
-
 #include "sched.h"
 
 #include <linux/proc_fs.h>
@@ -141,7 +139,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
 
 	p->signal->autogroup = autogroup_kref_get(ag);
 
-	if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
+	if (!READ_ONCE(sysctl_sched_autogroup_enabled))
 		goto out;
 
 	for_each_thread(p, t)
@@ -249,5 +247,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
 	return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
 }
 #endif /* CONFIG_SCHED_DEBUG */
-
-#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
index 8bd047142816de..890c95f2587a4d 100644
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/auto_group.h
@@ -29,7 +29,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
 static inline struct task_group *
 autogroup_task_group(struct task_struct *p, struct task_group *tg)
 {
-	int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+	int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);
 
 	if (enabled && task_wants_autogroup(p, tg))
 		return p->signal->autogroup->tg;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 123673291ffbb1..4eec60757b1626 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,26 +90,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
 
-void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
-{
-	unsigned long delta;
-	ktime_t soft, hard, now;
-
-	for (;;) {
-		if (hrtimer_active(period_timer))
-			break;
-
-		now = hrtimer_cb_get_time(period_timer);
-		hrtimer_forward(period_timer, now, period);
-
-		soft = hrtimer_get_softexpires(period_timer);
-		hard = hrtimer_get_expires(period_timer);
-		delta = ktime_to_ns(ktime_sub(hard, soft));
-		__hrtimer_start_range_ns(period_timer, soft, delta,
-					 HRTIMER_MODE_ABS_PINNED, 0);
-	}
-}
-
 DEFINE_MUTEX(sched_domains_mutex);
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
@@ -355,12 +335,11 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
 
 #ifdef CONFIG_SMP
 
-static int __hrtick_restart(struct rq *rq)
+static void __hrtick_restart(struct rq *rq)
 {
 	struct hrtimer *timer = &rq->hrtick_timer;
-	ktime_t time = hrtimer_get_softexpires(timer);
 
-	return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
+	hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
 }
 
 /*
@@ -440,8 +419,8 @@ void hrtick_start(struct rq *rq, u64 delay)
 	 * doesn't make sense. Rely on vruntime for fairness.
 	 */
 	delay = max_t(u64, delay, 10000LL);
-	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
-			HRTIMER_MODE_REL_PINNED, 0);
+	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
+		      HRTIMER_MODE_REL_PINNED);
 }
 
 static inline void init_hrtick(void)
@@ -511,7 +490,7 @@ static bool set_nr_and_not_polling(struct task_struct *p)
 static bool set_nr_if_polling(struct task_struct *p)
 {
 	struct thread_info *ti = task_thread_info(p);
-	typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+	typeof(ti->flags) old, val = READ_ONCE(ti->flags);
 
 	for (;;) {
 		if (!(val & _TIF_POLLING_NRFLAG))
@@ -541,6 +520,52 @@ static bool set_nr_if_polling(struct task_struct *p)
 #endif
 #endif
 
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+	struct wake_q_node *node = &task->wake_q;
+
+	/*
+	 * Atomically grab the task, if ->wake_q is !nil already it means
+	 * its already queued (either by us or someone else) and will get the
+	 * wakeup due to that.
+	 *
+	 * This cmpxchg() implies a full barrier, which pairs with the write
+	 * barrier implied by the wakeup in wake_up_list().
+	 */
+	if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
+		return;
+
+	get_task_struct(task);
+
+	/*
+	 * The head is context local, there can be no concurrency.
+	 */
+	*head->lastp = node;
+	head->lastp = &node->next;
+}
+
+void wake_up_q(struct wake_q_head *head)
+{
+	struct wake_q_node *node = head->first;
+
+	while (node != WAKE_Q_TAIL) {
+		struct task_struct *task;
+
+		task = container_of(node, struct task_struct, wake_q);
+		BUG_ON(!task);
+		/* task can safely be re-inserted now */
+		node = node->next;
+		task->wake_q.next = NULL;
+
+		/*
+		 * wake_up_process() implies a wmb() to pair with the queueing
+		 * in wake_q_add() so as not to miss wakeups.
+		 */
+		wake_up_process(task);
+		put_task_struct(task);
+	}
+}
+
 /*
  * resched_curr - mark rq's current task 'to be rescheduled now'.
  *
@@ -1049,7 +1074,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		if (p->sched_class->migrate_task_rq)
 			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
-		perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
+		perf_event_task_migrate(p);
 	}
 
 	__set_task_cpu(p, new_cpu);
@@ -2332,7 +2357,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 */
 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 
-	context_tracking_task_switch(prev, next);
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
 	barrier();
@@ -2397,9 +2421,9 @@ unsigned long nr_iowait_cpu(int cpu)
 
 void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
 {
-	struct rq *this = this_rq();
-	*nr_waiters = atomic_read(&this->nr_iowait);
-	*load = this->cpu_load[0];
+	struct rq *rq = this_rq();
+	*nr_waiters = atomic_read(&rq->nr_iowait);
+	*load = rq->load.weight;
 }
 
 #ifdef CONFIG_SMP
@@ -2497,6 +2521,7 @@ void scheduler_tick(void)
 	update_rq_clock(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
 	update_cpu_load_active(rq);
+	calc_global_load_tick(rq);
 	raw_spin_unlock(&rq->lock);
 
 	perf_event_task_tick();
@@ -2525,7 +2550,7 @@ void scheduler_tick(void)
 u64 scheduler_tick_max_deferment(void)
 {
 	struct rq *rq = this_rq();
-	unsigned long next, now = ACCESS_ONCE(jiffies);
+	unsigned long next, now = READ_ONCE(jiffies);
 
 	next = rq->last_sched_tick + HZ;
 
@@ -2726,9 +2751,7 @@ again:
  *          - return from syscall or exception to user-space
  *          - return from interrupt-handler to user-space
  *
- * WARNING: all callers must re-check need_resched() afterward and reschedule
- * accordingly in case an event triggered the need for rescheduling (such as
- * an interrupt waking up a task) while preemption was disabled in __schedule().
+ * WARNING: must be called with preemption disabled!
  */
 static void __sched __schedule(void)
 {
@@ -2737,7 +2760,6 @@ static void __sched __schedule(void)
 	struct rq *rq;
 	int cpu;
 
-	preempt_disable();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	rcu_note_context_switch();
@@ -2801,8 +2823,6 @@ static void __sched __schedule(void)
 		raw_spin_unlock_irq(&rq->lock);
 
 	post_schedule(rq);
-
-	sched_preempt_enable_no_resched();
 }
 
 static inline void sched_submit_work(struct task_struct *tsk)
@@ -2823,7 +2843,9 @@ asmlinkage __visible void __sched schedule(void)
 
 	sched_submit_work(tsk);
 	do {
+		preempt_disable();
 		__schedule();
+		sched_preempt_enable_no_resched();
 	} while (need_resched());
 }
 EXPORT_SYMBOL(schedule);
@@ -2862,15 +2884,14 @@ void __sched schedule_preempt_disabled(void)
 static void __sched notrace preempt_schedule_common(void)
 {
 	do {
-		__preempt_count_add(PREEMPT_ACTIVE);
+		preempt_active_enter();
 		__schedule();
-		__preempt_count_sub(PREEMPT_ACTIVE);
+		preempt_active_exit();
 
 		/*
 		 * Check again in case we missed a preemption opportunity
 		 * between schedule and now.
 		 */
-		barrier();
 	} while (need_resched());
 }
 
@@ -2917,7 +2938,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
 		return;
 
 	do {
-		__preempt_count_add(PREEMPT_ACTIVE);
+		preempt_active_enter();
 		/*
 		 * Needs preempt disabled in case user_exit() is traced
 		 * and the tracer calls preempt_enable_notrace() causing
@@ -2927,8 +2948,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
 		__schedule();
 		exception_exit(prev_ctx);
 
-		__preempt_count_sub(PREEMPT_ACTIVE);
-		barrier();
+		preempt_active_exit();
 	} while (need_resched());
 }
 EXPORT_SYMBOL_GPL(preempt_schedule_context);
@@ -2952,17 +2972,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
 	prev_state = exception_enter();
 
 	do {
-		__preempt_count_add(PREEMPT_ACTIVE);
+		preempt_active_enter();
 		local_irq_enable();
 		__schedule();
 		local_irq_disable();
-		__preempt_count_sub(PREEMPT_ACTIVE);
-
-		/*
-		 * Check again in case we missed a preemption opportunity
-		 * between schedule and now.
-		 */
-		barrier();
+		preempt_active_exit();
 	} while (need_resched());
 
 	exception_exit(prev_state);
@@ -5314,7 +5328,7 @@ static struct notifier_block migration_notifier = {
 	.priority = CPU_PRI_MIGRATION,
 };
 
-static void __cpuinit set_cpu_rq_start_time(void)
+static void set_cpu_rq_start_time(void)
 {
 	int cpu = smp_processor_id();
 	struct rq *rq = cpu_rq(cpu);
@@ -7032,6 +7046,9 @@ void __init sched_init_smp(void)
 	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
 	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
+	/* nohz_full won't take effect without isolating the cpus. */
+	tick_nohz_full_add_cpus_to(cpu_isolated_map);
+
 	sched_init_numa();
 
 	/*
@@ -7734,11 +7751,11 @@ static long sched_group_rt_runtime(struct task_group *tg)
 	return rt_runtime_us;
 }
 
-static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
 {
 	u64 rt_runtime, rt_period;
 
-	rt_period = (u64)rt_period_us * NSEC_PER_USEC;
+	rt_period = rt_period_us * NSEC_PER_USEC;
 	rt_runtime = tg->rt_bandwidth.rt_runtime;
 
 	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
@@ -8105,10 +8122,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 
 	__refill_cfs_bandwidth_runtime(cfs_b);
 	/* restart the period timer (if active) to handle new period expiry */
-	if (runtime_enabled && cfs_b->timer_active) {
-		/* force a reprogram */
-		__start_cfs_bandwidth(cfs_b, true);
-	}
+	if (runtime_enabled)
+		start_cfs_bandwidth(cfs_b);
 	raw_spin_unlock_irq(&cfs_b->lock);
 
 	for_each_online_cpu(i) {
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8394b1ee600c38..f5a64ffad176f1 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -567,7 +567,7 @@ static void cputime_advance(cputime_t *counter, cputime_t new)
 {
 	cputime_t old;
 
-	while (new > (old = ACCESS_ONCE(*counter)))
+	while (new > (old = READ_ONCE(*counter)))
 		cmpxchg_cputime(counter, old, new);
 }
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5e95145088fd37..7a08d590990e74 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -503,8 +503,6 @@ static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
 	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
 	struct rq *rq = rq_of_dl_rq(dl_rq);
 	ktime_t now, act;
-	ktime_t soft, hard;
-	unsigned long range;
 	s64 delta;
 
 	if (boosted)
@@ -527,15 +525,9 @@ static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
 	if (ktime_us_delta(act, now) < 0)
 		return 0;
 
-	hrtimer_set_expires(&dl_se->dl_timer, act);
+	hrtimer_start(&dl_se->dl_timer, act, HRTIMER_MODE_ABS);
 
-	soft = hrtimer_get_softexpires(&dl_se->dl_timer);
-	hard = hrtimer_get_expires(&dl_se->dl_timer);
-	range = ktime_to_ns(ktime_sub(hard, soft));
-	__hrtimer_start_range_ns(&dl_se->dl_timer, soft,
-				 range, HRTIMER_MODE_ABS, 0);
-
-	return hrtimer_active(&dl_se->dl_timer);
+	return 1;
 }
 
 /*
@@ -995,7 +987,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
 	rq = cpu_rq(cpu);
 
 	rcu_read_lock();
-	curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+	curr = READ_ONCE(rq->curr); /* unlocked access */
 
 	/*
 	 * If we are dealing with a -deadline task, we must
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index a245c1fc6f0a61..f94724eda407ec 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -230,8 +230,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #endif
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
-	SEQ_printf(m, "  .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
-			cfs_rq->tg->cfs_bandwidth.timer_active);
 	SEQ_printf(m, "  .%-30s: %d\n", "throttled",
 			cfs_rq->throttled);
 	SEQ_printf(m, "  .%-30s: %d\n", "throttle_count",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ffeaa4105e48a3..d597aea4303041 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -141,9 +141,9 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w)
  *
  * This idea comes from the SD scheduler of Con Kolivas:
  */
-static int get_update_sysctl_factor(void)
+static unsigned int get_update_sysctl_factor(void)
 {
-	unsigned int cpus = min_t(int, num_online_cpus(), 8);
+	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
 	unsigned int factor;
 
 	switch (sysctl_sched_tunable_scaling) {
@@ -576,7 +576,7 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 		loff_t *ppos)
 {
 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-	int factor = get_update_sysctl_factor();
+	unsigned int factor = get_update_sysctl_factor();
 
 	if (ret || !write)
 		return ret;
@@ -834,7 +834,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
 
 static unsigned int task_scan_min(struct task_struct *p)
 {
-	unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
+	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
 	unsigned int scan, floor;
 	unsigned int windows = 1;
 
@@ -1794,7 +1794,12 @@ static void task_numa_placement(struct task_struct *p)
 	u64 runtime, period;
 	spinlock_t *group_lock = NULL;
 
-	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+	/*
+	 * The p->mm->numa_scan_seq field gets updated without
+	 * exclusive access. Use READ_ONCE() here to ensure
+	 * that the field is read in a single access:
+	 */
+	seq = READ_ONCE(p->mm->numa_scan_seq);
 	if (p->numa_scan_seq == seq)
 		return;
 	p->numa_scan_seq = seq;
@@ -1938,7 +1943,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 	}
 
 	rcu_read_lock();
-	tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+	tsk = READ_ONCE(cpu_rq(cpu)->curr);
 
 	if (!cpupid_match_pid(tsk, cpupid))
 		goto no_join;
@@ -2107,7 +2112,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 
 static void reset_ptenuma_scan(struct task_struct *p)
 {
-	ACCESS_ONCE(p->mm->numa_scan_seq)++;
+	/*
+	 * We only did a read acquisition of the mmap sem, so
+	 * p->mm->numa_scan_seq is written to without exclusive access
+	 * and the update is not guaranteed to be atomic. That's not
+	 * much of an issue though, since this is just used for
+	 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
+	 * expensive, to avoid any form of compiler optimizations:
+	 */
+	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
 	p->mm->numa_scan_offset = 0;
 }
 
@@ -3476,16 +3489,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	if (cfs_b->quota == RUNTIME_INF)
 		amount = min_amount;
 	else {
-		/*
-		 * If the bandwidth pool has become inactive, then at least one
-		 * period must have elapsed since the last consumption.
-		 * Refresh the global state and ensure bandwidth timer becomes
-		 * active.
-		 */
-		if (!cfs_b->timer_active) {
-			__refill_cfs_bandwidth_runtime(cfs_b);
-			__start_cfs_bandwidth(cfs_b, false);
-		}
+		start_cfs_bandwidth(cfs_b);
 
 		if (cfs_b->runtime > 0) {
 			amount = min(cfs_b->runtime, min_amount);
@@ -3634,6 +3638,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	struct sched_entity *se;
 	long task_delta, dequeue = 1;
+	bool empty;
 
 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
 
@@ -3663,13 +3668,21 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	cfs_rq->throttled = 1;
 	cfs_rq->throttled_clock = rq_clock(rq);
 	raw_spin_lock(&cfs_b->lock);
+	empty = list_empty(&cfs_rq->throttled_list);
+
 	/*
 	 * Add to the _head_ of the list, so that an already-started
 	 * distribute_cfs_runtime will not see us
 	 */
 	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
-	if (!cfs_b->timer_active)
-		__start_cfs_bandwidth(cfs_b, false);
+
+	/*
+	 * If we're the first throttled task, make sure the bandwidth
+	 * timer is running.
+	 */
+	if (empty)
+		start_cfs_bandwidth(cfs_b);
+
 	raw_spin_unlock(&cfs_b->lock);
 }
 
@@ -3784,13 +3797,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 	if (cfs_b->idle && !throttled)
 		goto out_deactivate;
 
-	/*
-	 * if we have relooped after returning idle once, we need to update our
-	 * status as actually running, so that other cpus doing
-	 * __start_cfs_bandwidth will stop trying to cancel us.
-	 */
-	cfs_b->timer_active = 1;
-
 	__refill_cfs_bandwidth_runtime(cfs_b);
 
 	if (!throttled) {
@@ -3835,7 +3841,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 	return 0;
 
 out_deactivate:
-	cfs_b->timer_active = 0;
 	return 1;
 }
 
@@ -3850,7 +3855,7 @@ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
  * Are we near the end of the current quota period?
  *
  * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
- * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
+ * hrtimer base being cleared by hrtimer_start. In the case of
  * migrate_hrtimers, base is never cleared, so we are fine.
  */
 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
@@ -3878,8 +3883,9 @@ static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
 	if (runtime_refresh_within(cfs_b, min_left))
 		return;
 
-	start_bandwidth_timer(&cfs_b->slack_timer,
-				ns_to_ktime(cfs_bandwidth_slack_period));
+	hrtimer_start(&cfs_b->slack_timer,
+			ns_to_ktime(cfs_bandwidth_slack_period),
+			HRTIMER_MODE_REL);
 }
 
 /* we know any runtime found here is valid as update_curr() precedes return */
@@ -3999,6 +4005,7 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
 {
 	struct cfs_bandwidth *cfs_b =
 		container_of(timer, struct cfs_bandwidth, slack_timer);
+
 	do_sched_cfs_slack_timer(cfs_b);
 
 	return HRTIMER_NORESTART;
@@ -4008,20 +4015,19 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
 {
 	struct cfs_bandwidth *cfs_b =
 		container_of(timer, struct cfs_bandwidth, period_timer);
-	ktime_t now;
 	int overrun;
 	int idle = 0;
 
 	raw_spin_lock(&cfs_b->lock);
 	for (;;) {
-		now = hrtimer_cb_get_time(timer);
-		overrun = hrtimer_forward(timer, now, cfs_b->period);
-
+		overrun = hrtimer_forward_now(timer, cfs_b->period);
 		if (!overrun)
 			break;
 
 		idle = do_sched_cfs_period_timer(cfs_b, overrun);
 	}
+	if (idle)
+		cfs_b->period_active = 0;
 	raw_spin_unlock(&cfs_b->lock);
 
 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
@@ -4035,7 +4041,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	cfs_b->period = ns_to_ktime(default_cfs_period());
 
 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
-	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
 	cfs_b->period_timer.function = sched_cfs_period_timer;
 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
@@ -4047,28 +4053,15 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
 }
 
-/* requires cfs_b->lock, may release to reprogram timer */
-void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
+void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
-	/*
-	 * The timer may be active because we're trying to set a new bandwidth
-	 * period or because we're racing with the tear-down path
-	 * (timer_active==0 becomes visible before the hrtimer call-back
-	 * terminates).  In either case we ensure that it's re-programmed
-	 */
-	while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
-	       hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
-		/* bounce the lock to allow do_sched_cfs_period_timer to run */
-		raw_spin_unlock(&cfs_b->lock);
-		cpu_relax();
-		raw_spin_lock(&cfs_b->lock);
-		/* if someone else restarted the timer then we're done */
-		if (!force && cfs_b->timer_active)
-			return;
-	}
+	lockdep_assert_held(&cfs_b->lock);
 
-	cfs_b->timer_active = 1;
-	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
+	if (!cfs_b->period_active) {
+		cfs_b->period_active = 1;
+		hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
+		hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
+	}
 }
 
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -4323,6 +4316,189 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 }
 
 #ifdef CONFIG_SMP
+
+/*
+ * per rq 'load' arrray crap; XXX kill this.
+ */
+
+/*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With this power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT		7
+static const unsigned char
+		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+					{0, 0, 0, 0, 0, 0, 0, 0},
+					{64, 32, 8, 0, 0, 0, 0, 0},
+					{96, 72, 40, 12, 1, 0, 0},
+					{112, 98, 75, 43, 15, 1, 0},
+					{120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+	int j = 0;
+
+	if (!missed_updates)
+		return load;
+
+	if (missed_updates >= degrade_zero_ticks[idx])
+		return 0;
+
+	if (idx == 1)
+		return load >> missed_updates;
+
+	while (missed_updates) {
+		if (missed_updates % 2)
+			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+		missed_updates >>= 1;
+		j++;
+	}
+	return load;
+}
+
+/*
+ * Update rq->cpu_load[] statistics. This function is usually called every
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
+ */
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+			      unsigned long pending_updates)
+{
+	int i, scale;
+
+	this_rq->nr_load_updates++;
+
+	/* Update our load: */
+	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+		unsigned long old_load, new_load;
+
+		/* scale is effectively 1 << i now, and >> i divides by scale */
+
+		old_load = this_rq->cpu_load[i];
+		old_load = decay_load_missed(old_load, pending_updates - 1, i);
+		new_load = this_load;
+		/*
+		 * Round up the averaging division if load is increasing. This
+		 * prevents us from getting stuck on 9 if the load is 10, for
+		 * example.
+		 */
+		if (new_load > old_load)
+			new_load += scale - 1;
+
+		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
+	}
+
+	sched_avg_update(this_rq);
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+static void update_idle_cpu_load(struct rq *this_rq)
+{
+	unsigned long curr_jiffies = READ_ONCE(jiffies);
+	unsigned long load = this_rq->cfs.runnable_load_avg;
+	unsigned long pending_updates;
+
+	/*
+	 * bail if there's load or we're actually up-to-date.
+	 */
+	if (load || curr_jiffies == this_rq->last_load_update_tick)
+		return;
+
+	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+	this_rq->last_load_update_tick = curr_jiffies;
+
+	__update_cpu_load(this_rq, load, pending_updates);
+}
+
+/*
+ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+	struct rq *this_rq = this_rq();
+	unsigned long curr_jiffies = READ_ONCE(jiffies);
+	unsigned long pending_updates;
+
+	if (curr_jiffies == this_rq->last_load_update_tick)
+		return;
+
+	raw_spin_lock(&this_rq->lock);
+	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+	if (pending_updates) {
+		this_rq->last_load_update_tick = curr_jiffies;
+		/*
+		 * We were idle, this means load 0, the current load might be
+		 * !0 due to remote wakeups and the sort.
+		 */
+		__update_cpu_load(this_rq, 0, pending_updates);
+	}
+	raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ */
+
+/*
+ * Called from scheduler_tick()
+ */
+void update_cpu_load_active(struct rq *this_rq)
+{
+	unsigned long load = this_rq->cfs.runnable_load_avg;
+	/*
+	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+	 */
+	this_rq->last_load_update_tick = jiffies;
+	__update_cpu_load(this_rq, load, 1);
+}
+
 /* Used instead of source_load when we know the type == 0 */
 static unsigned long weighted_cpuload(const int cpu)
 {
@@ -4375,7 +4551,7 @@ static unsigned long capacity_orig_of(int cpu)
 static unsigned long cpu_avg_load_per_task(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
+	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
 	unsigned long load_avg = rq->cfs.runnable_load_avg;
 
 	if (nr_running)
@@ -5467,10 +5643,15 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
 }
 
 #ifdef CONFIG_NUMA_BALANCING
-/* Returns true if the destination node has incurred more faults */
+/*
+ * Returns true if the destination node is the preferred node.
+ * Needs to match fbq_classify_rq(): if there is a runnable task
+ * that is not on its preferred node, we should identify it.
+ */
 static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
 {
 	struct numa_group *numa_group = rcu_dereference(p->numa_group);
+	unsigned long src_faults, dst_faults;
 	int src_nid, dst_nid;
 
 	if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
@@ -5484,29 +5665,30 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
 	if (src_nid == dst_nid)
 		return false;
 
-	if (numa_group) {
-		/* Task is already in the group's interleave set. */
-		if (node_isset(src_nid, numa_group->active_nodes))
-			return false;
-
-		/* Task is moving into the group's interleave set. */
-		if (node_isset(dst_nid, numa_group->active_nodes))
-			return true;
-
-		return group_faults(p, dst_nid) > group_faults(p, src_nid);
-	}
-
 	/* Encourage migration to the preferred node. */
 	if (dst_nid == p->numa_preferred_nid)
 		return true;
 
-	return task_faults(p, dst_nid) > task_faults(p, src_nid);
+	/* Migrating away from the preferred node is bad. */
+	if (src_nid == p->numa_preferred_nid)
+		return false;
+
+	if (numa_group) {
+		src_faults = group_faults(p, src_nid);
+		dst_faults = group_faults(p, dst_nid);
+	} else {
+		src_faults = task_faults(p, src_nid);
+		dst_faults = task_faults(p, dst_nid);
+	}
+
+	return dst_faults > src_faults;
 }
 
 
 static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 {
 	struct numa_group *numa_group = rcu_dereference(p->numa_group);
+	unsigned long src_faults, dst_faults;
 	int src_nid, dst_nid;
 
 	if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5521,23 +5703,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 	if (src_nid == dst_nid)
 		return false;
 
-	if (numa_group) {
-		/* Task is moving within/into the group's interleave set. */
-		if (node_isset(dst_nid, numa_group->active_nodes))
-			return false;
+	/* Migrating away from the preferred node is bad. */
+	if (src_nid == p->numa_preferred_nid)
+		return true;
 
-		/* Task is moving out of the group's interleave set. */
-		if (node_isset(src_nid, numa_group->active_nodes))
-			return true;
+	/* Encourage migration to the preferred node. */
+	if (dst_nid == p->numa_preferred_nid)
+		return false;
 
-		return group_faults(p, dst_nid) < group_faults(p, src_nid);
+	if (numa_group) {
+		src_faults = group_faults(p, src_nid);
+		dst_faults = group_faults(p, dst_nid);
+	} else {
+		src_faults = task_faults(p, src_nid);
+		dst_faults = task_faults(p, dst_nid);
 	}
 
-	/* Migrating away from the preferred node is always bad. */
-	if (src_nid == p->numa_preferred_nid)
-		return true;
-
-	return task_faults(p, dst_nid) < task_faults(p, src_nid);
+	return dst_faults < src_faults;
 }
 
 #else
@@ -6037,8 +6219,8 @@ static unsigned long scale_rt_capacity(int cpu)
 	 * Since we're reading these variables without serialization make sure
 	 * we read them once before doing sanity checks on them.
 	 */
-	age_stamp = ACCESS_ONCE(rq->age_stamp);
-	avg = ACCESS_ONCE(rq->rt_avg);
+	age_stamp = READ_ONCE(rq->age_stamp);
+	avg = READ_ONCE(rq->rt_avg);
 	delta = __rq_clock_broken(rq) - age_stamp;
 
 	if (unlikely(delta < 0))
diff --git a/kernel/sched/proc.c b/kernel/sched/loadavg.c
index 8ecd552fe4f222..ef7159012cf366 100644
--- a/kernel/sched/proc.c
+++ b/kernel/sched/loadavg.c
@@ -1,7 +1,9 @@
 /*
- *  kernel/sched/proc.c
+ * kernel/sched/loadavg.c
  *
- *  Kernel load calculations, forked from sched/core.c
+ * This file contains the magic bits required to compute the global loadavg
+ * figure. Its a silly number but people think its important. We go through
+ * great pains to make it work on big machines and tickless kernels.
  */
 
 #include <linux/export.h>
@@ -81,7 +83,7 @@ long calc_load_fold_active(struct rq *this_rq)
 	long nr_active, delta = 0;
 
 	nr_active = this_rq->nr_running;
-	nr_active += (long) this_rq->nr_uninterruptible;
+	nr_active += (long)this_rq->nr_uninterruptible;
 
 	if (nr_active != this_rq->calc_load_active) {
 		delta = nr_active - this_rq->calc_load_active;
@@ -186,6 +188,7 @@ void calc_load_enter_idle(void)
 	delta = calc_load_fold_active(this_rq);
 	if (delta) {
 		int idx = calc_load_write_idx();
+
 		atomic_long_add(delta, &calc_load_idle[idx]);
 	}
 }
@@ -241,18 +244,20 @@ fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
 {
 	unsigned long result = 1UL << frac_bits;
 
-	if (n) for (;;) {
-		if (n & 1) {
-			result *= x;
-			result += 1UL << (frac_bits - 1);
-			result >>= frac_bits;
+	if (n) {
+		for (;;) {
+			if (n & 1) {
+				result *= x;
+				result += 1UL << (frac_bits - 1);
+				result >>= frac_bits;
+			}
+			n >>= 1;
+			if (!n)
+				break;
+			x *= x;
+			x += 1UL << (frac_bits - 1);
+			x >>= frac_bits;
 		}
-		n >>= 1;
-		if (!n)
-			break;
-		x *= x;
-		x += 1UL << (frac_bits - 1);
-		x >>= frac_bits;
 	}
 
 	return result;
@@ -285,7 +290,6 @@ static unsigned long
 calc_load_n(unsigned long load, unsigned long exp,
 	    unsigned long active, unsigned int n)
 {
-
 	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
 }
 
@@ -339,6 +343,8 @@ static inline void calc_global_nohz(void) { }
 /*
  * calc_load - update the avenrun load estimates 10 ticks after the
  * CPUs have updated calc_load_tasks.
+ *
+ * Called from the global timer code.
  */
 void calc_global_load(unsigned long ticks)
 {
@@ -370,10 +376,10 @@ void calc_global_load(unsigned long ticks)
 }
 
 /*
- * Called from update_cpu_load() to periodically update this CPU's
+ * Called from scheduler_tick() to periodically update this CPU's
  * active count.
  */
-static void calc_load_account_active(struct rq *this_rq)
+void calc_global_load_tick(struct rq *this_rq)
 {
 	long delta;
 
@@ -386,199 +392,3 @@ static void calc_load_account_active(struct rq *this_rq)
 
 	this_rq->calc_load_update += LOAD_FREQ;
 }
-
-/*
- * End of global load-average stuff
- */
-
-/*
- * The exact cpuload at various idx values, calculated at every tick would be
- * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
- *
- * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
- * on nth tick when cpu may be busy, then we have:
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
- *
- * decay_load_missed() below does efficient calculation of
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
- *
- * The calculation is approximated on a 128 point scale.
- * degrade_zero_ticks is the number of ticks after which load at any
- * particular idx is approximated to be zero.
- * degrade_factor is a precomputed table, a row for each load idx.
- * Each column corresponds to degradation factor for a power of two ticks,
- * based on 128 point scale.
- * Example:
- * row 2, col 3 (=12) says that the degradation at load idx 2 after
- * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
- *
- * With this power of 2 load factors, we can degrade the load n times
- * by looking at 1 bits in n and doing as many mult/shift instead of
- * n mult/shifts needed by the exact degradation.
- */
-#define DEGRADE_SHIFT		7
-static const unsigned char
-		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const unsigned char
-		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
-					{0, 0, 0, 0, 0, 0, 0, 0},
-					{64, 32, 8, 0, 0, 0, 0, 0},
-					{96, 72, 40, 12, 1, 0, 0},
-					{112, 98, 75, 43, 15, 1, 0},
-					{120, 112, 98, 76, 45, 16, 2} };
-
-/*
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
- * would be when CPU is idle and so we just decay the old load without
- * adding any new load.
- */
-static unsigned long
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
-{
-	int j = 0;
-
-	if (!missed_updates)
-		return load;
-
-	if (missed_updates >= degrade_zero_ticks[idx])
-		return 0;
-
-	if (idx == 1)
-		return load >> missed_updates;
-
-	while (missed_updates) {
-		if (missed_updates % 2)
-			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
-
-		missed_updates >>= 1;
-		j++;
-	}
-	return load;
-}
-
-/*
- * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC). With tickless idle this will not be called
- * every tick. We fix it up based on jiffies.
- */
-static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
-			      unsigned long pending_updates)
-{
-	int i, scale;
-
-	this_rq->nr_load_updates++;
-
-	/* Update our load: */
-	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
-	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
-		unsigned long old_load, new_load;
-
-		/* scale is effectively 1 << i now, and >> i divides by scale */
-
-		old_load = this_rq->cpu_load[i];
-		old_load = decay_load_missed(old_load, pending_updates - 1, i);
-		new_load = this_load;
-		/*
-		 * Round up the averaging division if load is increasing. This
-		 * prevents us from getting stuck on 9 if the load is 10, for
-		 * example.
-		 */
-		if (new_load > old_load)
-			new_load += scale - 1;
-
-		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
-	}
-
-	sched_avg_update(this_rq);
-}
-
-#ifdef CONFIG_SMP
-static inline unsigned long get_rq_runnable_load(struct rq *rq)
-{
-	return rq->cfs.runnable_load_avg;
-}
-#else
-static inline unsigned long get_rq_runnable_load(struct rq *rq)
-{
-	return rq->load.weight;
-}
-#endif
-
-#ifdef CONFIG_NO_HZ_COMMON
-/*
- * There is no sane way to deal with nohz on smp when using jiffies because the
- * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
- *
- * Therefore we cannot use the delta approach from the regular tick since that
- * would seriously skew the load calculation. However we'll make do for those
- * updates happening while idle (nohz_idle_balance) or coming out of idle
- * (tick_nohz_idle_exit).
- *
- * This means we might still be one tick off for nohz periods.
- */
-
-/*
- * Called from nohz_idle_balance() to update the load ratings before doing the
- * idle balance.
- */
-void update_idle_cpu_load(struct rq *this_rq)
-{
-	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
-	unsigned long load = get_rq_runnable_load(this_rq);
-	unsigned long pending_updates;
-
-	/*
-	 * bail if there's load or we're actually up-to-date.
-	 */
-	if (load || curr_jiffies == this_rq->last_load_update_tick)
-		return;
-
-	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-	this_rq->last_load_update_tick = curr_jiffies;
-
-	__update_cpu_load(this_rq, load, pending_updates);
-}
-
-/*
- * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
- */
-void update_cpu_load_nohz(void)
-{
-	struct rq *this_rq = this_rq();
-	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
-	unsigned long pending_updates;
-
-	if (curr_jiffies == this_rq->last_load_update_tick)
-		return;
-
-	raw_spin_lock(&this_rq->lock);
-	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-	if (pending_updates) {
-		this_rq->last_load_update_tick = curr_jiffies;
-		/*
-		 * We were idle, this means load 0, the current load might be
-		 * !0 due to remote wakeups and the sort.
-		 */
-		__update_cpu_load(this_rq, 0, pending_updates);
-	}
-	raw_spin_unlock(&this_rq->lock);
-}
-#endif /* CONFIG_NO_HZ */
-
-/*
- * Called from scheduler_tick()
- */
-void update_cpu_load_active(struct rq *this_rq)
-{
-	unsigned long load = get_rq_runnable_load(this_rq);
-	/*
-	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
-	 */
-	this_rq->last_load_update_tick = jiffies;
-	__update_cpu_load(this_rq, load, 1);
-
-	calc_load_account_active(this_rq);
-}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 575da76a3874a8..7d7093c51f8d16 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -18,19 +18,22 @@ static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
 {
 	struct rt_bandwidth *rt_b =
 		container_of(timer, struct rt_bandwidth, rt_period_timer);
-	ktime_t now;
-	int overrun;
 	int idle = 0;
+	int overrun;
 
+	raw_spin_lock(&rt_b->rt_runtime_lock);
 	for (;;) {
-		now = hrtimer_cb_get_time(timer);
-		overrun = hrtimer_forward(timer, now, rt_b->rt_period);
-
+		overrun = hrtimer_forward_now(timer, rt_b->rt_period);
 		if (!overrun)
 			break;
 
+		raw_spin_unlock(&rt_b->rt_runtime_lock);
 		idle = do_sched_rt_period_timer(rt_b, overrun);
+		raw_spin_lock(&rt_b->rt_runtime_lock);
 	}
+	if (idle)
+		rt_b->rt_period_active = 0;
+	raw_spin_unlock(&rt_b->rt_runtime_lock);
 
 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
 }
@@ -52,11 +55,12 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 		return;
 
-	if (hrtimer_active(&rt_b->rt_period_timer))
-		return;
-
 	raw_spin_lock(&rt_b->rt_runtime_lock);
-	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
+	if (!rt_b->rt_period_active) {
+		rt_b->rt_period_active = 1;
+		hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period);
+		hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
+	}
 	raw_spin_unlock(&rt_b->rt_runtime_lock);
 }
 
@@ -1323,7 +1327,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
 	rq = cpu_rq(cpu);
 
 	rcu_read_lock();
-	curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+	curr = READ_ONCE(rq->curr); /* unlocked access */
 
 	/*
 	 * If the current task on @p's runqueue is an RT task, then
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e0e1299939588a..f10a445910c9a3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -26,8 +26,14 @@ extern __read_mostly int scheduler_running;
 extern unsigned long calc_load_update;
 extern atomic_long_t calc_load_tasks;
 
+extern void calc_global_load_tick(struct rq *this_rq);
 extern long calc_load_fold_active(struct rq *this_rq);
+
+#ifdef CONFIG_SMP
 extern void update_cpu_load_active(struct rq *this_rq);
+#else
+static inline void update_cpu_load_active(struct rq *this_rq) { }
+#endif
 
 /*
  * Helpers for converting nanosecond timing to jiffy resolution
@@ -131,6 +137,7 @@ struct rt_bandwidth {
 	ktime_t			rt_period;
 	u64			rt_runtime;
 	struct hrtimer		rt_period_timer;
+	unsigned int		rt_period_active;
 };
 
 void __dl_clear_params(struct task_struct *p);
@@ -215,7 +222,7 @@ struct cfs_bandwidth {
 	s64 hierarchical_quota;
 	u64 runtime_expires;
 
-	int idle, timer_active;
+	int idle, period_active;
 	struct hrtimer period_timer, slack_timer;
 	struct list_head throttled_cfs_rq;
 
@@ -306,7 +313,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 
 extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
-extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
+extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
 extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
 
 extern void free_rt_sched_group(struct task_group *tg);
@@ -707,7 +714,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
 static inline u64 __rq_clock_broken(struct rq *rq)
 {
-	return ACCESS_ONCE(rq->clock);
+	return READ_ONCE(rq->clock);
 }
 
 static inline u64 rq_clock(struct rq *rq)
@@ -1298,8 +1305,6 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
 
 unsigned long to_ratio(u64 period, u64 runtime);
 
-extern void update_idle_cpu_load(struct rq *this_rq);
-
 extern void init_task_runnable_average(struct task_struct *p);
 
 static inline void add_nr_running(struct rq *rq, unsigned count)
@@ -1406,8 +1411,6 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
 static inline void sched_avg_update(struct rq *rq) { }
 #endif
 
-extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
-
 /*
  * __task_rq_lock - lock the rq @p resides on.
  */
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 4ab7043396569f..077ebbd5e10f14 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk)
 {
 	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 
-	if (!cputimer->running)
+	/* Check if cputimer isn't running. This is accessed without locking. */
+	if (!READ_ONCE(cputimer->running))
 		return false;
 
 	/*
@@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
 	if (!cputimer_running(tsk))
 		return;
 
-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.utime += cputime;
-	raw_spin_unlock(&cputimer->lock);
+	atomic64_add(cputime, &cputimer->cputime_atomic.utime);
 }
 
 /**
@@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
 	if (!cputimer_running(tsk))
 		return;
 
-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.stime += cputime;
-	raw_spin_unlock(&cputimer->lock);
+	atomic64_add(cputime, &cputimer->cputime_atomic.stime);
 }
 
 /**
@@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
 	if (!cputimer_running(tsk))
 		return;
 
-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.sum_exec_runtime += ns;
-	raw_spin_unlock(&cputimer->lock);
+	atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
 }
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 852143a79f367f..052e02672d1242 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -341,7 +341,7 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
 	 * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
 	 * an event.
 	 */
-	set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
+	smp_store_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
 
 	return timeout;
 }
@@ -354,7 +354,7 @@ int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
 	 * doesn't imply write barrier and the users expects write
 	 * barrier semantics on wakeup functions.  The following
 	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
-	 * and is paired with set_mb() in wait_woken().
+	 * and is paired with smp_store_mb() in wait_woken().
 	 */
 	smp_wmb(); /* C */
 	wait->flags |= WQ_FLAG_WOKEN;
@@ -601,7 +601,7 @@ EXPORT_SYMBOL(bit_wait_io);
 
 __sched int bit_wait_timeout(struct wait_bit_key *word)
 {
-	unsigned long now = ACCESS_ONCE(jiffies);
+	unsigned long now = READ_ONCE(jiffies);
 	if (signal_pending_state(current->state, current))
 		return 1;
 	if (time_after_eq(now, word->timeout))
@@ -613,7 +613,7 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);
 
 __sched int bit_wait_io_timeout(struct wait_bit_key *word)
 {
-	unsigned long now = ACCESS_ONCE(jiffies);
+	unsigned long now = READ_ONCE(jiffies);
 	if (signal_pending_state(current->state, current))
 		return 1;
 	if (time_after_eq(now, word->timeout))
diff --git a/kernel/signal.c b/kernel/signal.c
index d4972504f2f18b..836df8dac6ccd1 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -245,7 +245,7 @@ static inline void print_dropped_signal(int sig)
  * RETURNS:
  * %true if @mask is set, %false if made noop because @task was dying.
  */
-bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask)
+bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask)
 {
 	BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
 			JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
@@ -297,7 +297,7 @@ void task_clear_jobctl_trapping(struct task_struct *task)
  * CONTEXT:
  * Must be called with @task->sighand->siglock held.
  */
-void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask)
+void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask)
 {
 	BUG_ON(mask & ~JOBCTL_PENDING_MASK);
 
@@ -1995,7 +1995,7 @@ static bool do_signal_stop(int signr)
 	struct signal_struct *sig = current->signal;
 
 	if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
-		unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
+		unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
 		struct task_struct *t;
 
 		/* signr will be recorded in task->jobctl for retries */
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 01f0312419b3cb..ffc4cc3dcd47b4 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -13,19 +13,4 @@ obj-$(CONFIG_TIMER_STATS)			+= timer_stats.o
 obj-$(CONFIG_DEBUG_FS)				+= timekeeping_debug.o
 obj-$(CONFIG_TEST_UDELAY)			+= test_udelay.o
 
-$(obj)/time.o: $(obj)/timeconst.h
-
-quiet_cmd_hzfile = HZFILE  $@
-      cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
-
-targets += hz.bc
-$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
-	$(call if_changed,hzfile)
-
-quiet_cmd_bc  = BC      $@
-      cmd_bc  = bc -q $(filter-out FORCE,$^) > $@
-
-targets += timeconst.h
-$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
-	$(call if_changed,bc)
-
+$(obj)/time.o: $(objtree)/include/config/
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 1b001ed1edb945..7fbba635a54998 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -317,19 +317,16 @@ EXPORT_SYMBOL_GPL(alarm_init);
  * @alarm: ptr to alarm to set
  * @start: time to run the alarm
  */
-int alarm_start(struct alarm *alarm, ktime_t start)
+void alarm_start(struct alarm *alarm, ktime_t start)
 {
 	struct alarm_base *base = &alarm_bases[alarm->type];
 	unsigned long flags;
-	int ret;
 
 	spin_lock_irqsave(&base->lock, flags);
 	alarm->node.expires = start;
 	alarmtimer_enqueue(base, alarm);
-	ret = hrtimer_start(&alarm->timer, alarm->node.expires,
-				HRTIMER_MODE_ABS);
+	hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
 	spin_unlock_irqrestore(&base->lock, flags);
-	return ret;
 }
 EXPORT_SYMBOL_GPL(alarm_start);
 
@@ -338,12 +335,12 @@ EXPORT_SYMBOL_GPL(alarm_start);
  * @alarm: ptr to alarm to set
  * @start: time relative to now to run the alarm
  */
-int alarm_start_relative(struct alarm *alarm, ktime_t start)
+void alarm_start_relative(struct alarm *alarm, ktime_t start)
 {
 	struct alarm_base *base = &alarm_bases[alarm->type];
 
 	start = ktime_add(start, base->gettime());
-	return alarm_start(alarm, start);
+	alarm_start(alarm, start);
 }
 EXPORT_SYMBOL_GPL(alarm_start_relative);
 
@@ -495,12 +492,12 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
  */
 static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
 {
-	clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
-
 	if (!alarmtimer_get_rtcdev())
 		return -EINVAL;
 
-	return hrtimer_get_res(baseid, tp);
+	tp->tv_sec = 0;
+	tp->tv_nsec = hrtimer_resolution;
+	return 0;
 }
 
 /**
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 637a09461c1d9a..2397b97320d811 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -94,8 +94,8 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
 }
 EXPORT_SYMBOL_GPL(clockevent_delta2ns);
 
-static int __clockevents_set_state(struct clock_event_device *dev,
-				   enum clock_event_state state)
+static int __clockevents_switch_state(struct clock_event_device *dev,
+				      enum clock_event_state state)
 {
 	/* Transition with legacy set_mode() callback */
 	if (dev->set_mode) {
@@ -134,32 +134,44 @@ static int __clockevents_set_state(struct clock_event_device *dev,
 			return -ENOSYS;
 		return dev->set_state_oneshot(dev);
 
+	case CLOCK_EVT_STATE_ONESHOT_STOPPED:
+		/* Core internal bug */
+		if (WARN_ONCE(!clockevent_state_oneshot(dev),
+			      "Current state: %d\n",
+			      clockevent_get_state(dev)))
+			return -EINVAL;
+
+		if (dev->set_state_oneshot_stopped)
+			return dev->set_state_oneshot_stopped(dev);
+		else
+			return -ENOSYS;
+
 	default:
 		return -ENOSYS;
 	}
 }
 
 /**
- * clockevents_set_state - set the operating state of a clock event device
+ * clockevents_switch_state - set the operating state of a clock event device
  * @dev:	device to modify
  * @state:	new state
  *
  * Must be called with interrupts disabled !
  */
-void clockevents_set_state(struct clock_event_device *dev,
-			   enum clock_event_state state)
+void clockevents_switch_state(struct clock_event_device *dev,
+			      enum clock_event_state state)
 {
-	if (dev->state != state) {
-		if (__clockevents_set_state(dev, state))
+	if (clockevent_get_state(dev) != state) {
+		if (__clockevents_switch_state(dev, state))
 			return;
 
-		dev->state = state;
+		clockevent_set_state(dev, state);
 
 		/*
 		 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
 		 * on it, so fix it up and emit a warning:
 		 */
-		if (state == CLOCK_EVT_STATE_ONESHOT) {
+		if (clockevent_state_oneshot(dev)) {
 			if (unlikely(!dev->mult)) {
 				dev->mult = 1;
 				WARN_ON(1);
@@ -174,7 +186,7 @@ void clockevents_set_state(struct clock_event_device *dev,
  */
 void clockevents_shutdown(struct clock_event_device *dev)
 {
-	clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
+	clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
 	dev->next_event.tv64 = KTIME_MAX;
 }
 
@@ -248,7 +260,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
 		delta = dev->min_delta_ns;
 		dev->next_event = ktime_add_ns(ktime_get(), delta);
 
-		if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
+		if (clockevent_state_shutdown(dev))
 			return 0;
 
 		dev->retries++;
@@ -285,7 +297,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
 	delta = dev->min_delta_ns;
 	dev->next_event = ktime_add_ns(ktime_get(), delta);
 
-	if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
+	if (clockevent_state_shutdown(dev))
 		return 0;
 
 	dev->retries++;
@@ -317,9 +329,13 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
 
 	dev->next_event = expires;
 
-	if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
+	if (clockevent_state_shutdown(dev))
 		return 0;
 
+	/* We must be in ONESHOT state here */
+	WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n",
+		  clockevent_get_state(dev));
+
 	/* Shortcut for clockevent devices that can deal with ktime. */
 	if (dev->features & CLOCK_EVT_FEAT_KTIME)
 		return dev->set_next_ktime(expires, dev);
@@ -362,7 +378,7 @@ static int clockevents_replace(struct clock_event_device *ced)
 	struct clock_event_device *dev, *newdev = NULL;
 
 	list_for_each_entry(dev, &clockevent_devices, list) {
-		if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED)
+		if (dev == ced || !clockevent_state_detached(dev))
 			continue;
 
 		if (!tick_check_replacement(newdev, dev))
@@ -388,7 +404,7 @@ static int clockevents_replace(struct clock_event_device *ced)
 static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu)
 {
 	/* Fast track. Device is unused */
-	if (ced->state == CLOCK_EVT_STATE_DETACHED) {
+	if (clockevent_state_detached(ced)) {
 		list_del_init(&ced->list);
 		return 0;
 	}
@@ -445,7 +461,8 @@ static int clockevents_sanity_check(struct clock_event_device *dev)
 	if (dev->set_mode) {
 		/* We shouldn't be supporting new modes now */
 		WARN_ON(dev->set_state_periodic || dev->set_state_oneshot ||
-			dev->set_state_shutdown || dev->tick_resume);
+			dev->set_state_shutdown || dev->tick_resume ||
+			dev->set_state_oneshot_stopped);
 
 		BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
 		return 0;
@@ -480,7 +497,7 @@ void clockevents_register_device(struct clock_event_device *dev)
 	BUG_ON(clockevents_sanity_check(dev));
 
 	/* Initialize state to DETACHED */
-	dev->state = CLOCK_EVT_STATE_DETACHED;
+	clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
 
 	if (!dev->cpumask) {
 		WARN_ON(num_possible_cpus() > 1);
@@ -545,11 +562,11 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq)
 {
 	clockevents_config(dev, freq);
 
-	if (dev->state == CLOCK_EVT_STATE_ONESHOT)
+	if (clockevent_state_oneshot(dev))
 		return clockevents_program_event(dev, dev->next_event, false);
 
-	if (dev->state == CLOCK_EVT_STATE_PERIODIC)
-		return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
+	if (clockevent_state_periodic(dev))
+		return __clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC);
 
 	return 0;
 }
@@ -603,13 +620,13 @@ void clockevents_exchange_device(struct clock_event_device *old,
 	 */
 	if (old) {
 		module_put(old->owner);
-		clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED);
+		clockevents_switch_state(old, CLOCK_EVT_STATE_DETACHED);
 		list_del(&old->list);
 		list_add(&old->list, &clockevents_released);
 	}
 
 	if (new) {
-		BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED);
+		BUG_ON(!clockevent_state_detached(new));
 		clockevents_shutdown(new);
 	}
 }
@@ -622,7 +639,7 @@ void clockevents_suspend(void)
 	struct clock_event_device *dev;
 
 	list_for_each_entry_reverse(dev, &clockevent_devices, list)
-		if (dev->suspend)
+		if (dev->suspend && dev->mode != CLOCK_EVT_MODE_UNUSED)
 			dev->suspend(dev);
 }
 
@@ -634,7 +651,7 @@ void clockevents_resume(void)
 	struct clock_event_device *dev;
 
 	list_for_each_entry(dev, &clockevent_devices, list)
-		if (dev->resume)
+		if (dev->resume && dev->mode != CLOCK_EVT_MODE_UNUSED)
 			dev->resume(dev);
 }
 
@@ -665,7 +682,7 @@ void tick_cleanup_dead_cpu(int cpu)
 		if (cpumask_test_cpu(cpu, dev->cpumask) &&
 		    cpumask_weight(dev->cpumask) == 1 &&
 		    !tick_is_broadcast_device(dev)) {
-			BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED);
+			BUG_ON(!clockevent_state_detached(dev));
 			list_del(&dev->list);
 		}
 	}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 93ef7190bdeaad..7cf3668560604d 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -66,7 +66,6 @@
  */
 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 {
-
 	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
 	.clock_base =
 	{
@@ -74,25 +73,21 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 			.index = HRTIMER_BASE_MONOTONIC,
 			.clockid = CLOCK_MONOTONIC,
 			.get_time = &ktime_get,
-			.resolution = KTIME_LOW_RES,
 		},
 		{
 			.index = HRTIMER_BASE_REALTIME,
 			.clockid = CLOCK_REALTIME,
 			.get_time = &ktime_get_real,
-			.resolution = KTIME_LOW_RES,
 		},
 		{
 			.index = HRTIMER_BASE_BOOTTIME,
 			.clockid = CLOCK_BOOTTIME,
 			.get_time = &ktime_get_boottime,
-			.resolution = KTIME_LOW_RES,
 		},
 		{
 			.index = HRTIMER_BASE_TAI,
 			.clockid = CLOCK_TAI,
 			.get_time = &ktime_get_clocktai,
-			.resolution = KTIME_LOW_RES,
 		},
 	}
 };
@@ -109,27 +104,6 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
 	return hrtimer_clock_to_base_table[clock_id];
 }
 
-
-/*
- * Get the coarse grained time at the softirq based on xtime and
- * wall_to_monotonic.
- */
-static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
-{
-	ktime_t xtim, mono, boot, tai;
-	ktime_t off_real, off_boot, off_tai;
-
-	mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai);
-	boot = ktime_add(mono, off_boot);
-	xtim = ktime_add(mono, off_real);
-	tai = ktime_add(mono, off_tai);
-
-	base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
-	base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
-	base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
-	base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai;
-}
-
 /*
  * Functions and macros which are different for UP/SMP systems are kept in a
  * single place
@@ -443,24 +417,35 @@ static inline void debug_deactivate(struct hrtimer *timer)
 }
 
 #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
+static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base,
+					     struct hrtimer *timer)
+{
+#ifdef CONFIG_HIGH_RES_TIMERS
+	cpu_base->next_timer = timer;
+#endif
+}
+
 static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
 {
 	struct hrtimer_clock_base *base = cpu_base->clock_base;
 	ktime_t expires, expires_next = { .tv64 = KTIME_MAX };
-	int i;
+	unsigned int active = cpu_base->active_bases;
 
-	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
+	hrtimer_update_next_timer(cpu_base, NULL);
+	for (; active; base++, active >>= 1) {
 		struct timerqueue_node *next;
 		struct hrtimer *timer;
 
-		next = timerqueue_getnext(&base->active);
-		if (!next)
+		if (!(active & 0x01))
 			continue;
 
+		next = timerqueue_getnext(&base->active);
 		timer = container_of(next, struct hrtimer, node);
 		expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
-		if (expires.tv64 < expires_next.tv64)
+		if (expires.tv64 < expires_next.tv64) {
 			expires_next = expires;
+			hrtimer_update_next_timer(cpu_base, timer);
+		}
 	}
 	/*
 	 * clock_was_set() might have changed base->offset of any of
@@ -473,6 +458,16 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
 }
 #endif
 
+static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
+{
+	ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
+	ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
+	ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
+
+	return ktime_get_update_offsets_now(&base->clock_was_set_seq,
+					    offs_real, offs_boot, offs_tai);
+}
+
 /* High resolution timer related functions */
 #ifdef CONFIG_HIGH_RES_TIMERS
 
@@ -480,6 +475,8 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
  * High resolution timer enabled ?
  */
 static int hrtimer_hres_enabled __read_mostly  = 1;
+unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
+EXPORT_SYMBOL_GPL(hrtimer_resolution);
 
 /*
  * Enable / Disable high resolution mode
@@ -508,9 +505,14 @@ static inline int hrtimer_is_hres_enabled(void)
 /*
  * Is the high resolution mode active ?
  */
+static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
+{
+	return cpu_base->hres_active;
+}
+
 static inline int hrtimer_hres_active(void)
 {
-	return __this_cpu_read(hrtimer_bases.hres_active);
+	return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases));
 }
 
 /*
@@ -521,7 +523,12 @@ static inline int hrtimer_hres_active(void)
 static void
 hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
 {
-	ktime_t expires_next = __hrtimer_get_next_event(cpu_base);
+	ktime_t expires_next;
+
+	if (!cpu_base->hres_active)
+		return;
+
+	expires_next = __hrtimer_get_next_event(cpu_base);
 
 	if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
 		return;
@@ -545,63 +552,53 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
 	if (cpu_base->hang_detected)
 		return;
 
-	if (cpu_base->expires_next.tv64 != KTIME_MAX)
-		tick_program_event(cpu_base->expires_next, 1);
+	tick_program_event(cpu_base->expires_next, 1);
 }
 
 /*
- * Shared reprogramming for clock_realtime and clock_monotonic
- *
  * When a timer is enqueued and expires earlier than the already enqueued
  * timers, we have to check, whether it expires earlier than the timer for
  * which the clock event device was armed.
  *
- * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming
- * and no expiry check happens. The timer gets enqueued into the rbtree. The
- * reprogramming and expiry check is done in the hrtimer_interrupt or in the
- * softirq.
- *
  * Called with interrupts disabled and base->cpu_base.lock held
  */
-static int hrtimer_reprogram(struct hrtimer *timer,
-			     struct hrtimer_clock_base *base)
+static void hrtimer_reprogram(struct hrtimer *timer,
+			      struct hrtimer_clock_base *base)
 {
 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
 	ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
-	int res;
 
 	WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
 
 	/*
-	 * When the callback is running, we do not reprogram the clock event
-	 * device. The timer callback is either running on a different CPU or
-	 * the callback is executed in the hrtimer_interrupt context. The
-	 * reprogramming is handled either by the softirq, which called the
-	 * callback or at the end of the hrtimer_interrupt.
+	 * If the timer is not on the current cpu, we cannot reprogram
+	 * the other cpus clock event device.
 	 */
-	if (hrtimer_callback_running(timer))
-		return 0;
+	if (base->cpu_base != cpu_base)
+		return;
+
+	/*
+	 * If the hrtimer interrupt is running, then it will
+	 * reevaluate the clock bases and reprogram the clock event
+	 * device. The callbacks are always executed in hard interrupt
+	 * context so we don't need an extra check for a running
+	 * callback.
+	 */
+	if (cpu_base->in_hrtirq)
+		return;
 
 	/*
 	 * CLOCK_REALTIME timer might be requested with an absolute
-	 * expiry time which is less than base->offset. Nothing wrong
-	 * about that, just avoid to call into the tick code, which
-	 * has now objections against negative expiry values.
+	 * expiry time which is less than base->offset. Set it to 0.
 	 */
 	if (expires.tv64 < 0)
-		return -ETIME;
+		expires.tv64 = 0;
 
 	if (expires.tv64 >= cpu_base->expires_next.tv64)
-		return 0;
+		return;
 
-	/*
-	 * When the target cpu of the timer is currently executing
-	 * hrtimer_interrupt(), then we do not touch the clock event
-	 * device. hrtimer_interrupt() will reevaluate all clock bases
-	 * before reprogramming the device.
-	 */
-	if (cpu_base->in_hrtirq)
-		return 0;
+	/* Update the pointer to the next expiring timer */
+	cpu_base->next_timer = timer;
 
 	/*
 	 * If a hang was detected in the last timer interrupt then we
@@ -610,15 +607,14 @@ static int hrtimer_reprogram(struct hrtimer *timer,
 	 * to make progress.
 	 */
 	if (cpu_base->hang_detected)
-		return 0;
+		return;
 
 	/*
-	 * Clockevents returns -ETIME, when the event was in the past.
+	 * Program the timer hardware. We enforce the expiry for
+	 * events which are already in the past.
 	 */
-	res = tick_program_event(expires, 0);
-	if (!IS_ERR_VALUE(res))
-		cpu_base->expires_next = expires;
-	return res;
+	cpu_base->expires_next = expires;
+	tick_program_event(expires, 1);
 }
 
 /*
@@ -630,15 +626,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
 	base->hres_active = 0;
 }
 
-static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
-{
-	ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
-	ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
-	ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
-
-	return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai);
-}
-
 /*
  * Retrigger next event is called after clock was set
  *
@@ -648,7 +635,7 @@ static void retrigger_next_event(void *arg)
 {
 	struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
 
-	if (!hrtimer_hres_active())
+	if (!base->hres_active)
 		return;
 
 	raw_spin_lock(&base->lock);
@@ -662,29 +649,19 @@ static void retrigger_next_event(void *arg)
  */
 static int hrtimer_switch_to_hres(void)
 {
-	int i, cpu = smp_processor_id();
-	struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
-	unsigned long flags;
-
-	if (base->hres_active)
-		return 1;
-
-	local_irq_save(flags);
+	struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
 
 	if (tick_init_highres()) {
-		local_irq_restore(flags);
 		printk(KERN_WARNING "Could not switch to high resolution "
-				    "mode on CPU %d\n", cpu);
+				    "mode on CPU %d\n", base->cpu);
 		return 0;
 	}
 	base->hres_active = 1;
-	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
-		base->clock_base[i].resolution = KTIME_HIGH_RES;
+	hrtimer_resolution = HIGH_RES_NSEC;
 
 	tick_setup_sched_timer();
 	/* "Retrigger" the interrupt to get things going */
 	retrigger_next_event(NULL);
-	local_irq_restore(flags);
 	return 1;
 }
 
@@ -706,6 +683,7 @@ void clock_was_set_delayed(void)
 
 #else
 
+static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; }
 static inline int hrtimer_hres_active(void) { return 0; }
 static inline int hrtimer_is_hres_enabled(void) { return 0; }
 static inline int hrtimer_switch_to_hres(void) { return 0; }
@@ -803,6 +781,14 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
  *
  * Forward the timer expiry so it will expire in the future.
  * Returns the number of overruns.
+ *
+ * Can be safely called from the callback function of @timer. If
+ * called from other contexts @timer must neither be enqueued nor
+ * running the callback and the caller needs to take care of
+ * serialization.
+ *
+ * Note: This only updates the timer expiry value and does not requeue
+ * the timer.
  */
 u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
 {
@@ -814,8 +800,11 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
 	if (delta.tv64 < 0)
 		return 0;
 
-	if (interval.tv64 < timer->base->resolution.tv64)
-		interval.tv64 = timer->base->resolution.tv64;
+	if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED))
+		return 0;
+
+	if (interval.tv64 < hrtimer_resolution)
+		interval.tv64 = hrtimer_resolution;
 
 	if (unlikely(delta.tv64 >= interval.tv64)) {
 		s64 incr = ktime_to_ns(interval);
@@ -849,7 +838,6 @@ static int enqueue_hrtimer(struct hrtimer *timer,
 {
 	debug_activate(timer);
 
-	timerqueue_add(&base->active, &timer->node);
 	base->cpu_base->active_bases |= 1 << base->index;
 
 	/*
@@ -858,7 +846,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
 	 */
 	timer->state |= HRTIMER_STATE_ENQUEUED;
 
-	return (&timer->node == base->active.next);
+	return timerqueue_add(&base->active, &timer->node);
 }
 
 /*
@@ -875,29 +863,28 @@ static void __remove_hrtimer(struct hrtimer *timer,
 			     struct hrtimer_clock_base *base,
 			     unsigned long newstate, int reprogram)
 {
-	struct timerqueue_node *next_timer;
-	if (!(timer->state & HRTIMER_STATE_ENQUEUED))
-		goto out;
+	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
+	unsigned int state = timer->state;
+
+	timer->state = newstate;
+	if (!(state & HRTIMER_STATE_ENQUEUED))
+		return;
+
+	if (!timerqueue_del(&base->active, &timer->node))
+		cpu_base->active_bases &= ~(1 << base->index);
 
-	next_timer = timerqueue_getnext(&base->active);
-	timerqueue_del(&base->active, &timer->node);
-	if (&timer->node == next_timer) {
 #ifdef CONFIG_HIGH_RES_TIMERS
-		/* Reprogram the clock event device. if enabled */
-		if (reprogram && hrtimer_hres_active()) {
-			ktime_t expires;
-
-			expires = ktime_sub(hrtimer_get_expires(timer),
-					    base->offset);
-			if (base->cpu_base->expires_next.tv64 == expires.tv64)
-				hrtimer_force_reprogram(base->cpu_base, 1);
-		}
+	/*
+	 * Note: If reprogram is false we do not update
+	 * cpu_base->next_timer. This happens when we remove the first
+	 * timer on a remote cpu. No harm as we never dereference
+	 * cpu_base->next_timer. So the worst thing what can happen is
+	 * an superflous call to hrtimer_force_reprogram() on the
+	 * remote cpu later on if the same timer gets enqueued again.
+	 */
+	if (reprogram && timer == cpu_base->next_timer)
+		hrtimer_force_reprogram(cpu_base, 1);
 #endif
-	}
-	if (!timerqueue_getnext(&base->active))
-		base->cpu_base->active_bases &= ~(1 << base->index);
-out:
-	timer->state = newstate;
 }
 
 /*
@@ -933,18 +920,25 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 	return 0;
 }
 
-int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
-		unsigned long delta_ns, const enum hrtimer_mode mode,
-		int wakeup)
+/**
+ * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
+ * @timer:	the timer to be added
+ * @tim:	expiry time
+ * @delta_ns:	"slack" range for the timer
+ * @mode:	expiry mode: absolute (HRTIMER_MODE_ABS) or
+ *		relative (HRTIMER_MODE_REL)
+ */
+void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+			    unsigned long delta_ns, const enum hrtimer_mode mode)
 {
 	struct hrtimer_clock_base *base, *new_base;
 	unsigned long flags;
-	int ret, leftmost;
+	int leftmost;
 
 	base = lock_hrtimer_base(timer, &flags);
 
 	/* Remove an active timer from the queue: */
-	ret = remove_hrtimer(timer, base);
+	remove_hrtimer(timer, base);
 
 	if (mode & HRTIMER_MODE_REL) {
 		tim = ktime_add_safe(tim, base->get_time());
@@ -956,7 +950,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 		 * timeouts. This will go away with the GTOD framework.
 		 */
 #ifdef CONFIG_TIME_LOW_RES
-		tim = ktime_add_safe(tim, base->resolution);
+		tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution));
 #endif
 	}
 
@@ -968,11 +962,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 	timer_stats_hrtimer_set_start_info(timer);
 
 	leftmost = enqueue_hrtimer(timer, new_base);
-
-	if (!leftmost) {
-		unlock_hrtimer_base(timer, &flags);
-		return ret;
-	}
+	if (!leftmost)
+		goto unlock;
 
 	if (!hrtimer_is_hres_active(timer)) {
 		/*
@@ -980,73 +971,15 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 		 * on dynticks target.
 		 */
 		wake_up_nohz_cpu(new_base->cpu_base->cpu);
-	} else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) &&
-			hrtimer_reprogram(timer, new_base)) {
-		/*
-		 * Only allow reprogramming if the new base is on this CPU.
-		 * (it might still be on another CPU if the timer was pending)
-		 *
-		 * XXX send_remote_softirq() ?
-		 */
-		if (wakeup) {
-			/*
-			 * We need to drop cpu_base->lock to avoid a
-			 * lock ordering issue vs. rq->lock.
-			 */
-			raw_spin_unlock(&new_base->cpu_base->lock);
-			raise_softirq_irqoff(HRTIMER_SOFTIRQ);
-			local_irq_restore(flags);
-			return ret;
-		} else {
-			__raise_softirq_irqoff(HRTIMER_SOFTIRQ);
-		}
+	} else {
+		hrtimer_reprogram(timer, new_base);
 	}
-
+unlock:
 	unlock_hrtimer_base(timer, &flags);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns);
-
-/**
- * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
- * @timer:	the timer to be added
- * @tim:	expiry time
- * @delta_ns:	"slack" range for the timer
- * @mode:	expiry mode: absolute (HRTIMER_MODE_ABS) or
- *		relative (HRTIMER_MODE_REL)
- *
- * Returns:
- *  0 on success
- *  1 when the timer was active
- */
-int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
-		unsigned long delta_ns, const enum hrtimer_mode mode)
-{
-	return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1);
 }
 EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
 
 /**
- * hrtimer_start - (re)start an hrtimer on the current CPU
- * @timer:	the timer to be added
- * @tim:	expiry time
- * @mode:	expiry mode: absolute (HRTIMER_MODE_ABS) or
- *		relative (HRTIMER_MODE_REL)
- *
- * Returns:
- *  0 on success
- *  1 when the timer was active
- */
-int
-hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
-{
-	return __hrtimer_start_range_ns(timer, tim, 0, mode, 1);
-}
-EXPORT_SYMBOL_GPL(hrtimer_start);
-
-
-/**
  * hrtimer_try_to_cancel - try to deactivate a timer
  * @timer:	hrtimer to stop
  *
@@ -1062,6 +995,15 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
 	unsigned long flags;
 	int ret = -1;
 
+	/*
+	 * Check lockless first. If the timer is not active (neither
+	 * enqueued nor running the callback, nothing to do here.  The
+	 * base lock does not serialize against a concurrent enqueue,
+	 * so we can avoid taking it.
+	 */
+	if (!hrtimer_active(timer))
+		return 0;
+
 	base = lock_hrtimer_base(timer, &flags);
 
 	if (!hrtimer_callback_running(timer))
@@ -1115,26 +1057,22 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
 /**
  * hrtimer_get_next_event - get the time until next expiry event
  *
- * Returns the delta to the next expiry event or KTIME_MAX if no timer
- * is pending.
+ * Returns the next expiry time or KTIME_MAX if no timer is pending.
  */
-ktime_t hrtimer_get_next_event(void)
+u64 hrtimer_get_next_event(void)
 {
 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
-	ktime_t mindelta = { .tv64 = KTIME_MAX };
+	u64 expires = KTIME_MAX;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&cpu_base->lock, flags);
 
-	if (!hrtimer_hres_active())
-		mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base),
-				     ktime_get());
+	if (!__hrtimer_hres_active(cpu_base))
+		expires = __hrtimer_get_next_event(cpu_base).tv64;
 
 	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
 
-	if (mindelta.tv64 < 0)
-		mindelta.tv64 = 0;
-	return mindelta;
+	return expires;
 }
 #endif
 
@@ -1176,30 +1114,10 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 }
 EXPORT_SYMBOL_GPL(hrtimer_init);
 
-/**
- * hrtimer_get_res - get the timer resolution for a clock
- * @which_clock: which clock to query
- * @tp:		 pointer to timespec variable to store the resolution
- *
- * Store the resolution of the clock selected by @which_clock in the
- * variable pointed to by @tp.
- */
-int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
-{
-	struct hrtimer_cpu_base *cpu_base;
-	int base = hrtimer_clockid_to_base(which_clock);
-
-	cpu_base = raw_cpu_ptr(&hrtimer_bases);
-	*tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(hrtimer_get_res);
-
-static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
+static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
+			  struct hrtimer_clock_base *base,
+			  struct hrtimer *timer, ktime_t *now)
 {
-	struct hrtimer_clock_base *base = timer->base;
-	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
 	enum hrtimer_restart (*fn)(struct hrtimer *);
 	int restart;
 
@@ -1225,55 +1143,32 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
 	 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
 	 * we do not reprogramm the event hardware. Happens either in
 	 * hrtimer_start_range_ns() or in hrtimer_interrupt()
+	 *
+	 * Note: Because we dropped the cpu_base->lock above,
+	 * hrtimer_start_range_ns() can have popped in and enqueued the timer
+	 * for us already.
 	 */
-	if (restart != HRTIMER_NORESTART) {
-		BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
+	if (restart != HRTIMER_NORESTART &&
+	    !(timer->state & HRTIMER_STATE_ENQUEUED))
 		enqueue_hrtimer(timer, base);
-	}
 
 	WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
 
 	timer->state &= ~HRTIMER_STATE_CALLBACK;
 }
 
-#ifdef CONFIG_HIGH_RES_TIMERS
-
-/*
- * High resolution timer interrupt
- * Called with interrupts disabled
- */
-void hrtimer_interrupt(struct clock_event_device *dev)
+static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
 {
-	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
-	ktime_t expires_next, now, entry_time, delta;
-	int i, retries = 0;
-
-	BUG_ON(!cpu_base->hres_active);
-	cpu_base->nr_events++;
-	dev->next_event.tv64 = KTIME_MAX;
-
-	raw_spin_lock(&cpu_base->lock);
-	entry_time = now = hrtimer_update_base(cpu_base);
-retry:
-	cpu_base->in_hrtirq = 1;
-	/*
-	 * We set expires_next to KTIME_MAX here with cpu_base->lock
-	 * held to prevent that a timer is enqueued in our queue via
-	 * the migration code. This does not affect enqueueing of
-	 * timers which run their callback and need to be requeued on
-	 * this CPU.
-	 */
-	cpu_base->expires_next.tv64 = KTIME_MAX;
+	struct hrtimer_clock_base *base = cpu_base->clock_base;
+	unsigned int active = cpu_base->active_bases;
 
-	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
-		struct hrtimer_clock_base *base;
+	for (; active; base++, active >>= 1) {
 		struct timerqueue_node *node;
 		ktime_t basenow;
 
-		if (!(cpu_base->active_bases & (1 << i)))
+		if (!(active & 0x01))
 			continue;
 
-		base = cpu_base->clock_base + i;
 		basenow = ktime_add(now, base->offset);
 
 		while ((node = timerqueue_getnext(&base->active))) {
@@ -1296,9 +1191,42 @@ retry:
 			if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
 				break;
 
-			__run_hrtimer(timer, &basenow);
+			__run_hrtimer(cpu_base, base, timer, &basenow);
 		}
 	}
+}
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+
+/*
+ * High resolution timer interrupt
+ * Called with interrupts disabled
+ */
+void hrtimer_interrupt(struct clock_event_device *dev)
+{
+	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+	ktime_t expires_next, now, entry_time, delta;
+	int retries = 0;
+
+	BUG_ON(!cpu_base->hres_active);
+	cpu_base->nr_events++;
+	dev->next_event.tv64 = KTIME_MAX;
+
+	raw_spin_lock(&cpu_base->lock);
+	entry_time = now = hrtimer_update_base(cpu_base);
+retry:
+	cpu_base->in_hrtirq = 1;
+	/*
+	 * We set expires_next to KTIME_MAX here with cpu_base->lock
+	 * held to prevent that a timer is enqueued in our queue via
+	 * the migration code. This does not affect enqueueing of
+	 * timers which run their callback and need to be requeued on
+	 * this CPU.
+	 */
+	cpu_base->expires_next.tv64 = KTIME_MAX;
+
+	__hrtimer_run_queues(cpu_base, now);
+
 	/* Reevaluate the clock bases for the next expiry */
 	expires_next = __hrtimer_get_next_event(cpu_base);
 	/*
@@ -1310,8 +1238,7 @@ retry:
 	raw_spin_unlock(&cpu_base->lock);
 
 	/* Reprogramming necessary ? */
-	if (expires_next.tv64 == KTIME_MAX ||
-	    !tick_program_event(expires_next, 0)) {
+	if (!tick_program_event(expires_next, 0)) {
 		cpu_base->hang_detected = 0;
 		return;
 	}
@@ -1344,8 +1271,8 @@ retry:
 	cpu_base->hang_detected = 1;
 	raw_spin_unlock(&cpu_base->lock);
 	delta = ktime_sub(now, entry_time);
-	if (delta.tv64 > cpu_base->max_hang_time.tv64)
-		cpu_base->max_hang_time = delta;
+	if ((unsigned int)delta.tv64 > cpu_base->max_hang_time)
+		cpu_base->max_hang_time = (unsigned int) delta.tv64;
 	/*
 	 * Limit it to a sensible value as we enforce a longer
 	 * delay. Give the CPU at least 100ms to catch up.
@@ -1363,7 +1290,7 @@ retry:
  * local version of hrtimer_peek_ahead_timers() called with interrupts
  * disabled.
  */
-static void __hrtimer_peek_ahead_timers(void)
+static inline void __hrtimer_peek_ahead_timers(void)
 {
 	struct tick_device *td;
 
@@ -1375,29 +1302,6 @@ static void __hrtimer_peek_ahead_timers(void)
 		hrtimer_interrupt(td->evtdev);
 }
 
-/**
- * hrtimer_peek_ahead_timers -- run soft-expired timers now
- *
- * hrtimer_peek_ahead_timers will peek at the timer queue of
- * the current cpu and check if there are any timers for which
- * the soft expires time has passed. If any such timers exist,
- * they are run immediately and then removed from the timer queue.
- *
- */
-void hrtimer_peek_ahead_timers(void)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	__hrtimer_peek_ahead_timers();
-	local_irq_restore(flags);
-}
-
-static void run_hrtimer_softirq(struct softirq_action *h)
-{
-	hrtimer_peek_ahead_timers();
-}
-
 #else /* CONFIG_HIGH_RES_TIMERS */
 
 static inline void __hrtimer_peek_ahead_timers(void) { }
@@ -1405,66 +1309,32 @@ static inline void __hrtimer_peek_ahead_timers(void) { }
 #endif	/* !CONFIG_HIGH_RES_TIMERS */
 
 /*
- * Called from timer softirq every jiffy, expire hrtimers:
- *
- * For HRT its the fall back code to run the softirq in the timer
- * softirq context in case the hrtimer initialization failed or has
- * not been done yet.
+ * Called from run_local_timers in hardirq context every jiffy
  */
-void hrtimer_run_pending(void)
+void hrtimer_run_queues(void)
 {
-	if (hrtimer_hres_active())
+	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+	ktime_t now;
+
+	if (__hrtimer_hres_active(cpu_base))
 		return;
 
 	/*
-	 * This _is_ ugly: We have to check in the softirq context,
-	 * whether we can switch to highres and / or nohz mode. The
-	 * clocksource switch happens in the timer interrupt with
-	 * xtime_lock held. Notification from there only sets the
-	 * check bit in the tick_oneshot code, otherwise we might
-	 * deadlock vs. xtime_lock.
+	 * This _is_ ugly: We have to check periodically, whether we
+	 * can switch to highres and / or nohz mode. The clocksource
+	 * switch happens with xtime_lock held. Notification from
+	 * there only sets the check bit in the tick_oneshot code,
+	 * otherwise we might deadlock vs. xtime_lock.
 	 */
-	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
+	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) {
 		hrtimer_switch_to_hres();
-}
-
-/*
- * Called from hardirq context every jiffy
- */
-void hrtimer_run_queues(void)
-{
-	struct timerqueue_node *node;
-	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
-	struct hrtimer_clock_base *base;
-	int index, gettime = 1;
-
-	if (hrtimer_hres_active())
 		return;
-
-	for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
-		base = &cpu_base->clock_base[index];
-		if (!timerqueue_getnext(&base->active))
-			continue;
-
-		if (gettime) {
-			hrtimer_get_softirq_time(cpu_base);
-			gettime = 0;
-		}
-
-		raw_spin_lock(&cpu_base->lock);
-
-		while ((node = timerqueue_getnext(&base->active))) {
-			struct hrtimer *timer;
-
-			timer = container_of(node, struct hrtimer, node);
-			if (base->softirq_time.tv64 <=
-					hrtimer_get_expires_tv64(timer))
-				break;
-
-			__run_hrtimer(timer, &base->softirq_time);
-		}
-		raw_spin_unlock(&cpu_base->lock);
 	}
+
+	raw_spin_lock(&cpu_base->lock);
+	now = hrtimer_update_base(cpu_base);
+	__hrtimer_run_queues(cpu_base, now);
+	raw_spin_unlock(&cpu_base->lock);
 }
 
 /*
@@ -1497,8 +1367,6 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
 	do {
 		set_current_state(TASK_INTERRUPTIBLE);
 		hrtimer_start_expires(&t->timer, mode);
-		if (!hrtimer_active(&t->timer))
-			t->task = NULL;
 
 		if (likely(t->task))
 			freezable_schedule();
@@ -1731,9 +1599,6 @@ void __init hrtimers_init(void)
 	hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
 			  (void *)(long)smp_processor_id());
 	register_cpu_notifier(&hrtimers_nb);
-#ifdef CONFIG_HIGH_RES_TIMERS
-	open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
-#endif
 }
 
 /**
@@ -1772,8 +1637,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
 	hrtimer_init_sleeper(&t, current);
 
 	hrtimer_start_expires(&t.timer, mode);
-	if (!hrtimer_active(&t.timer))
-		t.task = NULL;
 
 	if (likely(t.task))
 		schedule();
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0075da74abf0c5..892e3dae0aac41 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 	return 0;
 }
 
-static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
+/*
+ * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
+ * to avoid race conditions with concurrent updates to cputime.
+ */
+static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
 {
-	if (b->utime > a->utime)
-		a->utime = b->utime;
+	u64 curr_cputime;
+retry:
+	curr_cputime = atomic64_read(cputime);
+	if (sum_cputime > curr_cputime) {
+		if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
+			goto retry;
+	}
+}
 
-	if (b->stime > a->stime)
-		a->stime = b->stime;
+static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum)
+{
+	__update_gt_cputime(&cputime_atomic->utime, sum->utime);
+	__update_gt_cputime(&cputime_atomic->stime, sum->stime);
+	__update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);
+}
 
-	if (b->sum_exec_runtime > a->sum_exec_runtime)
-		a->sum_exec_runtime = b->sum_exec_runtime;
+/* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */
+static inline void sample_cputime_atomic(struct task_cputime *times,
+					 struct task_cputime_atomic *atomic_times)
+{
+	times->utime = atomic64_read(&atomic_times->utime);
+	times->stime = atomic64_read(&atomic_times->stime);
+	times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime);
 }
 
 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 {
 	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 	struct task_cputime sum;
-	unsigned long flags;
 
-	if (!cputimer->running) {
+	/* Check if cputimer isn't running. This is accessed without locking. */
+	if (!READ_ONCE(cputimer->running)) {
 		/*
 		 * The POSIX timer interface allows for absolute time expiry
 		 * values through the TIMER_ABSTIME flag, therefore we have
-		 * to synchronize the timer to the clock every time we start
-		 * it.
+		 * to synchronize the timer to the clock every time we start it.
 		 */
 		thread_group_cputime(tsk, &sum);
-		raw_spin_lock_irqsave(&cputimer->lock, flags);
-		cputimer->running = 1;
-		update_gt_cputime(&cputimer->cputime, &sum);
-	} else
-		raw_spin_lock_irqsave(&cputimer->lock, flags);
-	*times = cputimer->cputime;
-	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+		update_gt_cputime(&cputimer->cputime_atomic, &sum);
+
+		/*
+		 * We're setting cputimer->running without a lock. Ensure
+		 * this only gets written to in one operation. We set
+		 * running after update_gt_cputime() as a small optimization,
+		 * but barriers are not required because update_gt_cputime()
+		 * can handle concurrent updates.
+		 */
+		WRITE_ONCE(cputimer->running, 1);
+	}
+	sample_cputime_atomic(times, &cputimer->cputime_atomic);
 }
 
 /*
@@ -582,7 +605,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
 	if (!task_cputime_zero(&tsk->cputime_expires))
 		return false;
 
-	if (tsk->signal->cputimer.running)
+	/* Check if cputimer is running. This is accessed without locking. */
+	if (READ_ONCE(tsk->signal->cputimer.running))
 		return false;
 
 	return true;
@@ -852,10 +876,10 @@ static void check_thread_timers(struct task_struct *tsk,
 	/*
 	 * Check for the special case thread timers.
 	 */
-	soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
+	soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
 	if (soft != RLIM_INFINITY) {
 		unsigned long hard =
-			ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
+			READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
 
 		if (hard != RLIM_INFINITY &&
 		    tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -882,14 +906,12 @@ static void check_thread_timers(struct task_struct *tsk,
 	}
 }
 
-static void stop_process_timers(struct signal_struct *sig)
+static inline void stop_process_timers(struct signal_struct *sig)
 {
 	struct thread_group_cputimer *cputimer = &sig->cputimer;
-	unsigned long flags;
 
-	raw_spin_lock_irqsave(&cputimer->lock, flags);
-	cputimer->running = 0;
-	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+	/* Turn off cputimer->running. This is done without locking. */
+	WRITE_ONCE(cputimer->running, 0);
 }
 
 static u32 onecputick;
@@ -958,11 +980,11 @@ static void check_process_timers(struct task_struct *tsk,
 			 SIGPROF);
 	check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
 			 SIGVTALRM);
-	soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+	soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
 	if (soft != RLIM_INFINITY) {
 		unsigned long psecs = cputime_to_secs(ptime);
 		unsigned long hard =
-			ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
+			READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
 		cputime_t x;
 		if (psecs >= hard) {
 			/*
@@ -1111,12 +1133,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 	}
 
 	sig = tsk->signal;
-	if (sig->cputimer.running) {
+	/* Check if cputimer is running. This is accessed without locking. */
+	if (READ_ONCE(sig->cputimer.running)) {
 		struct task_cputime group_sample;
 
-		raw_spin_lock(&sig->cputimer.lock);
-		group_sample = sig->cputimer.cputime;
-		raw_spin_unlock(&sig->cputimer.lock);
+		sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);
 
 		if (task_cputime_expired(&group_sample, &sig->cputime_expires))
 			return 1;
@@ -1157,7 +1178,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 	 * If there are any active process wide timers (POSIX 1.b, itimers,
 	 * RLIMIT_CPU) cputimer must be running.
 	 */
-	if (tsk->signal->cputimer.running)
+	if (READ_ONCE(tsk->signal->cputimer.running))
 		check_process_timers(tsk, &firing);
 
 	/*
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 31ea01f42e1f08..31d11ac9fa4739 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -272,13 +272,20 @@ static int posix_get_tai(clockid_t which_clock, struct timespec *tp)
 	return 0;
 }
 
+static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec *tp)
+{
+	tp->tv_sec = 0;
+	tp->tv_nsec = hrtimer_resolution;
+	return 0;
+}
+
 /*
  * Initialize everything, well, just everything in Posix clocks/timers ;)
  */
 static __init int init_posix_timers(void)
 {
 	struct k_clock clock_realtime = {
-		.clock_getres	= hrtimer_get_res,
+		.clock_getres	= posix_get_hrtimer_res,
 		.clock_get	= posix_clock_realtime_get,
 		.clock_set	= posix_clock_realtime_set,
 		.clock_adj	= posix_clock_realtime_adj,
@@ -290,7 +297,7 @@ static __init int init_posix_timers(void)
 		.timer_del	= common_timer_del,
 	};
 	struct k_clock clock_monotonic = {
-		.clock_getres	= hrtimer_get_res,
+		.clock_getres	= posix_get_hrtimer_res,
 		.clock_get	= posix_ktime_get_ts,
 		.nsleep		= common_nsleep,
 		.nsleep_restart	= hrtimer_nanosleep_restart,
@@ -300,7 +307,7 @@ static __init int init_posix_timers(void)
 		.timer_del	= common_timer_del,
 	};
 	struct k_clock clock_monotonic_raw = {
-		.clock_getres	= hrtimer_get_res,
+		.clock_getres	= posix_get_hrtimer_res,
 		.clock_get	= posix_get_monotonic_raw,
 	};
 	struct k_clock clock_realtime_coarse = {
@@ -312,7 +319,7 @@ static __init int init_posix_timers(void)
 		.clock_get	= posix_get_monotonic_coarse,
 	};
 	struct k_clock clock_tai = {
-		.clock_getres	= hrtimer_get_res,
+		.clock_getres	= posix_get_hrtimer_res,
 		.clock_get	= posix_get_tai,
 		.nsleep		= common_nsleep,
 		.nsleep_restart	= hrtimer_nanosleep_restart,
@@ -322,7 +329,7 @@ static __init int init_posix_timers(void)
 		.timer_del	= common_timer_del,
 	};
 	struct k_clock clock_boottime = {
-		.clock_getres	= hrtimer_get_res,
+		.clock_getres	= posix_get_hrtimer_res,
 		.clock_get	= posix_get_boottime,
 		.nsleep		= common_nsleep,
 		.nsleep_restart	= hrtimer_nanosleep_restart,
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 6aac4beedbbe23..3e7db49a2381d1 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -22,6 +22,7 @@ static void bc_set_mode(enum clock_event_mode mode,
 			struct clock_event_device *bc)
 {
 	switch (mode) {
+	case CLOCK_EVT_MODE_UNUSED:
 	case CLOCK_EVT_MODE_SHUTDOWN:
 		/*
 		 * Note, we cannot cancel the timer here as we might
@@ -66,9 +67,11 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
 	 * hrtimer_{start/cancel} functions call into tracing,
 	 * calls to these functions must be bound within RCU_NONIDLE.
 	 */
-	RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ?
-		!hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) :
-			0);
+	RCU_NONIDLE({
+			bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0;
+			if (bc_moved)
+				hrtimer_start(&bctimer, expires,
+					      HRTIMER_MODE_ABS_PINNED);});
 	if (bc_moved) {
 		/* Bind the "device" to the cpu */
 		bc->bound_on = smp_processor_id();
@@ -99,10 +102,13 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
 {
 	ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer);
 
-	if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX)
+	switch (ce_broadcast_hrtimer.mode) {
+	case CLOCK_EVT_MODE_ONESHOT:
+		if (ce_broadcast_hrtimer.next_event.tv64 != KTIME_MAX)
+			return HRTIMER_RESTART;
+	default:
 		return HRTIMER_NORESTART;
-
-	return HRTIMER_RESTART;
+	}
 }
 
 void tick_setup_hrtimer_broadcast(void)
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 7e8ca4f448a88c..d39f32cdd1b59c 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -255,18 +255,18 @@ int tick_receive_broadcast(void)
 /*
  * Broadcast the event to the cpus, which are set in the mask (mangled).
  */
-static void tick_do_broadcast(struct cpumask *mask)
+static bool tick_do_broadcast(struct cpumask *mask)
 {
 	int cpu = smp_processor_id();
 	struct tick_device *td;
+	bool local = false;
 
 	/*
 	 * Check, if the current cpu is in the mask
 	 */
 	if (cpumask_test_cpu(cpu, mask)) {
 		cpumask_clear_cpu(cpu, mask);
-		td = &per_cpu(tick_cpu_device, cpu);
-		td->evtdev->event_handler(td->evtdev);
+		local = true;
 	}
 
 	if (!cpumask_empty(mask)) {
@@ -279,16 +279,17 @@ static void tick_do_broadcast(struct cpumask *mask)
 		td = &per_cpu(tick_cpu_device, cpumask_first(mask));
 		td->evtdev->broadcast(mask);
 	}
+	return local;
 }
 
 /*
  * Periodic broadcast:
  * - invoke the broadcast handlers
  */
-static void tick_do_periodic_broadcast(void)
+static bool tick_do_periodic_broadcast(void)
 {
 	cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask);
-	tick_do_broadcast(tmpmask);
+	return tick_do_broadcast(tmpmask);
 }
 
 /*
@@ -296,34 +297,26 @@ static void tick_do_periodic_broadcast(void)
  */
 static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
 {
-	ktime_t next;
+	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+	bool bc_local;
 
 	raw_spin_lock(&tick_broadcast_lock);
+	bc_local = tick_do_periodic_broadcast();
 
-	tick_do_periodic_broadcast();
+	if (clockevent_state_oneshot(dev)) {
+		ktime_t next = ktime_add(dev->next_event, tick_period);
 
-	/*
-	 * The device is in periodic mode. No reprogramming necessary:
-	 */
-	if (dev->state == CLOCK_EVT_STATE_PERIODIC)
-		goto unlock;
+		clockevents_program_event(dev, next, true);
+	}
+	raw_spin_unlock(&tick_broadcast_lock);
 
 	/*
-	 * Setup the next period for devices, which do not have
-	 * periodic mode. We read dev->next_event first and add to it
-	 * when the event already expired. clockevents_program_event()
-	 * sets dev->next_event only when the event is really
-	 * programmed to the device.
+	 * We run the handler of the local cpu after dropping
+	 * tick_broadcast_lock because the handler might deadlock when
+	 * trying to switch to oneshot mode.
 	 */
-	for (next = dev->next_event; ;) {
-		next = ktime_add(next, tick_period);
-
-		if (!clockevents_program_event(dev, next, false))
-			goto unlock;
-		tick_do_periodic_broadcast();
-	}
-unlock:
-	raw_spin_unlock(&tick_broadcast_lock);
+	if (bc_local)
+		td->evtdev->event_handler(td->evtdev);
 }
 
 /**
@@ -532,23 +525,19 @@ static void tick_broadcast_set_affinity(struct clock_event_device *bc,
 	irq_set_affinity(bc->irq, bc->cpumask);
 }
 
-static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
-				    ktime_t expires, int force)
+static void tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
+				     ktime_t expires)
 {
-	int ret;
-
-	if (bc->state != CLOCK_EVT_STATE_ONESHOT)
-		clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
+	if (!clockevent_state_oneshot(bc))
+		clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
 
-	ret = clockevents_program_event(bc, expires, force);
-	if (!ret)
-		tick_broadcast_set_affinity(bc, cpumask_of(cpu));
-	return ret;
+	clockevents_program_event(bc, expires, 1);
+	tick_broadcast_set_affinity(bc, cpumask_of(cpu));
 }
 
 static void tick_resume_broadcast_oneshot(struct clock_event_device *bc)
 {
-	clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
+	clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
 }
 
 /*
@@ -566,7 +555,7 @@ void tick_check_oneshot_broadcast_this_cpu(void)
 		 * switched over, leave the device alone.
 		 */
 		if (td->mode == TICKDEV_MODE_ONESHOT) {
-			clockevents_set_state(td->evtdev,
+			clockevents_switch_state(td->evtdev,
 					      CLOCK_EVT_STATE_ONESHOT);
 		}
 	}
@@ -580,9 +569,9 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
 	struct tick_device *td;
 	ktime_t now, next_event;
 	int cpu, next_cpu = 0;
+	bool bc_local;
 
 	raw_spin_lock(&tick_broadcast_lock);
-again:
 	dev->next_event.tv64 = KTIME_MAX;
 	next_event.tv64 = KTIME_MAX;
 	cpumask_clear(tmpmask);
@@ -624,7 +613,7 @@ again:
 	/*
 	 * Wakeup the cpus which have an expired event.
 	 */
-	tick_do_broadcast(tmpmask);
+	bc_local = tick_do_broadcast(tmpmask);
 
 	/*
 	 * Two reasons for reprogram:
@@ -636,15 +625,15 @@ again:
 	 * - There are pending events on sleeping CPUs which were not
 	 * in the event mask
 	 */
-	if (next_event.tv64 != KTIME_MAX) {
-		/*
-		 * Rearm the broadcast device. If event expired,
-		 * repeat the above
-		 */
-		if (tick_broadcast_set_event(dev, next_cpu, next_event, 0))
-			goto again;
-	}
+	if (next_event.tv64 != KTIME_MAX)
+		tick_broadcast_set_event(dev, next_cpu, next_event);
+
 	raw_spin_unlock(&tick_broadcast_lock);
+
+	if (bc_local) {
+		td = this_cpu_ptr(&tick_cpu_device);
+		td->evtdev->event_handler(td->evtdev);
+	}
 }
 
 static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu)
@@ -670,7 +659,7 @@ static void broadcast_shutdown_local(struct clock_event_device *bc,
 		if (dev->next_event.tv64 < bc->next_event.tv64)
 			return;
 	}
-	clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
+	clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
 }
 
 /**
@@ -726,7 +715,7 @@ int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
 			 */
 			if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) &&
 			    dev->next_event.tv64 < bc->next_event.tv64)
-				tick_broadcast_set_event(bc, cpu, dev->next_event, 1);
+				tick_broadcast_set_event(bc, cpu, dev->next_event);
 		}
 		/*
 		 * If the current CPU owns the hrtimer broadcast
@@ -740,7 +729,7 @@ int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
 			cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
 	} else {
 		if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
-			clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+			clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
 			/*
 			 * The cpu which was handling the broadcast
 			 * timer marked this cpu in the broadcast
@@ -842,7 +831,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 
 	/* Set it up only once ! */
 	if (bc->event_handler != tick_handle_oneshot_broadcast) {
-		int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC;
+		int was_periodic = clockevent_state_periodic(bc);
 
 		bc->event_handler = tick_handle_oneshot_broadcast;
 
@@ -858,10 +847,10 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 			   tick_broadcast_oneshot_mask, tmpmask);
 
 		if (was_periodic && !cpumask_empty(tmpmask)) {
-			clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
+			clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
 			tick_broadcast_init_next_event(tmpmask,
 						       tick_next_period);
-			tick_broadcast_set_event(bc, cpu, tick_next_period, 1);
+			tick_broadcast_set_event(bc, cpu, tick_next_period);
 		} else
 			bc->next_event.tv64 = KTIME_MAX;
 	} else {
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 51508465153cf3..76446cb5dfe1a5 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -103,7 +103,17 @@ void tick_handle_periodic(struct clock_event_device *dev)
 
 	tick_periodic(cpu);
 
-	if (dev->state != CLOCK_EVT_STATE_ONESHOT)
+#if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_NO_HZ_COMMON)
+	/*
+	 * The cpu might have transitioned to HIGHRES or NOHZ mode via
+	 * update_process_times() -> run_local_timers() ->
+	 * hrtimer_run_queues().
+	 */
+	if (dev->event_handler != tick_handle_periodic)
+		return;
+#endif
+
+	if (!clockevent_state_oneshot(dev))
 		return;
 	for (;;) {
 		/*
@@ -141,7 +151,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
 
 	if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
 	    !tick_broadcast_oneshot_active()) {
-		clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
+		clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC);
 	} else {
 		unsigned long seq;
 		ktime_t next;
@@ -151,7 +161,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
 			next = tick_next_period;
 		} while (read_seqretry(&jiffies_lock, seq));
 
-		clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+		clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
 
 		for (;;) {
 			if (!clockevents_program_event(dev, next, false))
@@ -368,7 +378,7 @@ void tick_shutdown(unsigned int cpu)
 		 * Prevent that the clock events layer tries to call
 		 * the set mode function!
 		 */
-		dev->state = CLOCK_EVT_STATE_DETACHED;
+		clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
 		dev->mode = CLOCK_EVT_MODE_UNUSED;
 		clockevents_exchange_device(dev, NULL);
 		dev->event_handler = clockevents_handle_noop;
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index b64fdd8054c56b..ec2208aabdd180 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -36,11 +36,22 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
 	return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
 }
 
+static inline enum clock_event_state clockevent_get_state(struct clock_event_device *dev)
+{
+	return dev->state_use_accessors;
+}
+
+static inline void clockevent_set_state(struct clock_event_device *dev,
+					enum clock_event_state state)
+{
+	dev->state_use_accessors = state;
+}
+
 extern void clockevents_shutdown(struct clock_event_device *dev);
 extern void clockevents_exchange_device(struct clock_event_device *old,
 					struct clock_event_device *new);
-extern void clockevents_set_state(struct clock_event_device *dev,
-				 enum clock_event_state state);
+extern void clockevents_switch_state(struct clock_event_device *dev,
+				     enum clock_event_state state);
 extern int clockevents_program_event(struct clock_event_device *dev,
 				     ktime_t expires, bool force);
 extern void clockevents_handle_noop(struct clock_event_device *dev);
@@ -137,3 +148,5 @@ extern void tick_nohz_init(void);
 # else
 static inline void tick_nohz_init(void) { }
 #endif
+
+extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 67a64b1670bfdb..b51344652330a2 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -28,6 +28,22 @@ int tick_program_event(ktime_t expires, int force)
 {
 	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
 
+	if (unlikely(expires.tv64 == KTIME_MAX)) {
+		/*
+		 * We don't need the clock event device any more, stop it.
+		 */
+		clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED);
+		return 0;
+	}
+
+	if (unlikely(clockevent_state_oneshot_stopped(dev))) {
+		/*
+		 * We need the clock event again, configure it in ONESHOT mode
+		 * before using it.
+		 */
+		clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
+	}
+
 	return clockevents_program_event(dev, expires, force);
 }
 
@@ -38,7 +54,7 @@ void tick_resume_oneshot(void)
 {
 	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
 
-	clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+	clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
 	clockevents_program_event(dev, ktime_get(), true);
 }
 
@@ -50,7 +66,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
 			ktime_t next_event)
 {
 	newdev->event_handler = handler;
-	clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT);
+	clockevents_switch_state(newdev, CLOCK_EVT_STATE_ONESHOT);
 	clockevents_program_event(newdev, next_event, true);
 }
 
@@ -81,7 +97,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
 
 	td->mode = TICKDEV_MODE_ONESHOT;
 	dev->event_handler = handler;
-	clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+	clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
 	tick_broadcast_switch_to_oneshot();
 	return 0;
 }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 914259128145e2..812f7a3b989836 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -565,156 +565,144 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
 }
 EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
 
+static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
+{
+	hrtimer_cancel(&ts->sched_timer);
+	hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
+
+	/* Forward the time to expire in the future */
+	hrtimer_forward(&ts->sched_timer, now, tick_period);
+
+	if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+		hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
+	else
+		tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+}
+
 static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 					 ktime_t now, int cpu)
 {
-	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
-	ktime_t last_update, expires, ret = { .tv64 = 0 };
-	unsigned long rcu_delta_jiffies;
 	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
-	u64 time_delta;
-
-	time_delta = timekeeping_max_deferment();
+	u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
+	unsigned long seq, basejiff;
+	ktime_t	tick;
 
 	/* Read jiffies and the time when jiffies were updated last */
 	do {
 		seq = read_seqbegin(&jiffies_lock);
-		last_update = last_jiffies_update;
-		last_jiffies = jiffies;
+		basemono = last_jiffies_update.tv64;
+		basejiff = jiffies;
 	} while (read_seqretry(&jiffies_lock, seq));
+	ts->last_jiffies = basejiff;
 
-	if (rcu_needs_cpu(&rcu_delta_jiffies) ||
+	if (rcu_needs_cpu(basemono, &next_rcu) ||
 	    arch_needs_cpu() || irq_work_needs_cpu()) {
-		next_jiffies = last_jiffies + 1;
-		delta_jiffies = 1;
+		next_tick = basemono + TICK_NSEC;
 	} else {
-		/* Get the next timer wheel timer */
-		next_jiffies = get_next_timer_interrupt(last_jiffies);
-		delta_jiffies = next_jiffies - last_jiffies;
-		if (rcu_delta_jiffies < delta_jiffies) {
-			next_jiffies = last_jiffies + rcu_delta_jiffies;
-			delta_jiffies = rcu_delta_jiffies;
-		}
+		/*
+		 * Get the next pending timer. If high resolution
+		 * timers are enabled this only takes the timer wheel
+		 * timers into account. If high resolution timers are
+		 * disabled this also looks at the next expiring
+		 * hrtimer.
+		 */
+		next_tmr = get_next_timer_interrupt(basejiff, basemono);
+		ts->next_timer = next_tmr;
+		/* Take the next rcu event into account */
+		next_tick = next_rcu < next_tmr ? next_rcu : next_tmr;
 	}
 
 	/*
-	 * Do not stop the tick, if we are only one off (or less)
-	 * or if the cpu is required for RCU:
+	 * If the tick is due in the next period, keep it ticking or
+	 * restart it proper.
 	 */
-	if (!ts->tick_stopped && delta_jiffies <= 1)
-		goto out;
-
-	/* Schedule the tick, if we are at least one jiffie off */
-	if ((long)delta_jiffies >= 1) {
-
-		/*
-		 * If this cpu is the one which updates jiffies, then
-		 * give up the assignment and let it be taken by the
-		 * cpu which runs the tick timer next, which might be
-		 * this cpu as well. If we don't drop this here the
-		 * jiffies might be stale and do_timer() never
-		 * invoked. Keep track of the fact that it was the one
-		 * which had the do_timer() duty last. If this cpu is
-		 * the one which had the do_timer() duty last, we
-		 * limit the sleep time to the timekeeping
-		 * max_deferement value which we retrieved
-		 * above. Otherwise we can sleep as long as we want.
-		 */
-		if (cpu == tick_do_timer_cpu) {
-			tick_do_timer_cpu = TICK_DO_TIMER_NONE;
-			ts->do_timer_last = 1;
-		} else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
-			time_delta = KTIME_MAX;
-			ts->do_timer_last = 0;
-		} else if (!ts->do_timer_last) {
-			time_delta = KTIME_MAX;
+	delta = next_tick - basemono;
+	if (delta <= (u64)TICK_NSEC) {
+		tick.tv64 = 0;
+		if (!ts->tick_stopped)
+			goto out;
+		if (delta == 0) {
+			/* Tick is stopped, but required now. Enforce it */
+			tick_nohz_restart(ts, now);
+			goto out;
 		}
+	}
+
+	/*
+	 * If this cpu is the one which updates jiffies, then give up
+	 * the assignment and let it be taken by the cpu which runs
+	 * the tick timer next, which might be this cpu as well. If we
+	 * don't drop this here the jiffies might be stale and
+	 * do_timer() never invoked. Keep track of the fact that it
+	 * was the one which had the do_timer() duty last. If this cpu
+	 * is the one which had the do_timer() duty last, we limit the
+	 * sleep time to the timekeeping max_deferement value.
+	 * Otherwise we can sleep as long as we want.
+	 */
+	delta = timekeeping_max_deferment();
+	if (cpu == tick_do_timer_cpu) {
+		tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+		ts->do_timer_last = 1;
+	} else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
+		delta = KTIME_MAX;
+		ts->do_timer_last = 0;
+	} else if (!ts->do_timer_last) {
+		delta = KTIME_MAX;
+	}
 
 #ifdef CONFIG_NO_HZ_FULL
-		if (!ts->inidle) {
-			time_delta = min(time_delta,
-					 scheduler_tick_max_deferment());
-		}
+	/* Limit the tick delta to the maximum scheduler deferment */
+	if (!ts->inidle)
+		delta = min(delta, scheduler_tick_max_deferment());
 #endif
 
-		/*
-		 * calculate the expiry time for the next timer wheel
-		 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
-		 * that there is no timer pending or at least extremely
-		 * far into the future (12 days for HZ=1000). In this
-		 * case we set the expiry to the end of time.
-		 */
-		if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
-			/*
-			 * Calculate the time delta for the next timer event.
-			 * If the time delta exceeds the maximum time delta
-			 * permitted by the current clocksource then adjust
-			 * the time delta accordingly to ensure the
-			 * clocksource does not wrap.
-			 */
-			time_delta = min_t(u64, time_delta,
-					   tick_period.tv64 * delta_jiffies);
-		}
-
-		if (time_delta < KTIME_MAX)
-			expires = ktime_add_ns(last_update, time_delta);
-		else
-			expires.tv64 = KTIME_MAX;
-
-		/* Skip reprogram of event if its not changed */
-		if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
-			goto out;
+	/* Calculate the next expiry time */
+	if (delta < (KTIME_MAX - basemono))
+		expires = basemono + delta;
+	else
+		expires = KTIME_MAX;
 
-		ret = expires;
+	expires = min_t(u64, expires, next_tick);
+	tick.tv64 = expires;
 
-		/*
-		 * nohz_stop_sched_tick can be called several times before
-		 * the nohz_restart_sched_tick is called. This happens when
-		 * interrupts arrive which do not cause a reschedule. In the
-		 * first call we save the current tick time, so we can restart
-		 * the scheduler tick in nohz_restart_sched_tick.
-		 */
-		if (!ts->tick_stopped) {
-			nohz_balance_enter_idle(cpu);
-			calc_load_enter_idle();
+	/* Skip reprogram of event if its not changed */
+	if (ts->tick_stopped && (expires == dev->next_event.tv64))
+		goto out;
 
-			ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
-			ts->tick_stopped = 1;
-			trace_tick_stop(1, " ");
-		}
+	/*
+	 * nohz_stop_sched_tick can be called several times before
+	 * the nohz_restart_sched_tick is called. This happens when
+	 * interrupts arrive which do not cause a reschedule. In the
+	 * first call we save the current tick time, so we can restart
+	 * the scheduler tick in nohz_restart_sched_tick.
+	 */
+	if (!ts->tick_stopped) {
+		nohz_balance_enter_idle(cpu);
+		calc_load_enter_idle();
 
-		/*
-		 * If the expiration time == KTIME_MAX, then
-		 * in this case we simply stop the tick timer.
-		 */
-		 if (unlikely(expires.tv64 == KTIME_MAX)) {
-			if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
-				hrtimer_cancel(&ts->sched_timer);
-			goto out;
-		}
+		ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
+		ts->tick_stopped = 1;
+		trace_tick_stop(1, " ");
+	}
 
-		if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
-			hrtimer_start(&ts->sched_timer, expires,
-				      HRTIMER_MODE_ABS_PINNED);
-			/* Check, if the timer was already in the past */
-			if (hrtimer_active(&ts->sched_timer))
-				goto out;
-		} else if (!tick_program_event(expires, 0))
-				goto out;
-		/*
-		 * We are past the event already. So we crossed a
-		 * jiffie boundary. Update jiffies and raise the
-		 * softirq.
-		 */
-		tick_do_update_jiffies64(ktime_get());
+	/*
+	 * If the expiration time == KTIME_MAX, then we simply stop
+	 * the tick timer.
+	 */
+	if (unlikely(expires == KTIME_MAX)) {
+		if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+			hrtimer_cancel(&ts->sched_timer);
+		goto out;
 	}
-	raise_softirq_irqoff(TIMER_SOFTIRQ);
+
+	if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+		hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED);
+	else
+		tick_program_event(tick, 1);
 out:
-	ts->next_jiffies = next_jiffies;
-	ts->last_jiffies = last_jiffies;
+	/* Update the estimated sleep length */
 	ts->sleep_length = ktime_sub(dev->next_event, now);
-
-	return ret;
+	return tick;
 }
 
 static void tick_nohz_full_stop_tick(struct tick_sched *ts)
@@ -876,32 +864,6 @@ ktime_t tick_nohz_get_sleep_length(void)
 	return ts->sleep_length;
 }
 
-static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
-{
-	hrtimer_cancel(&ts->sched_timer);
-	hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
-
-	while (1) {
-		/* Forward the time to expire in the future */
-		hrtimer_forward(&ts->sched_timer, now, tick_period);
-
-		if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
-			hrtimer_start_expires(&ts->sched_timer,
-					      HRTIMER_MODE_ABS_PINNED);
-			/* Check, if the timer was already in the past */
-			if (hrtimer_active(&ts->sched_timer))
-				break;
-		} else {
-			if (!tick_program_event(
-				hrtimer_get_expires(&ts->sched_timer), 0))
-				break;
-		}
-		/* Reread time and update jiffies */
-		now = ktime_get();
-		tick_do_update_jiffies64(now);
-	}
-}
-
 static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
 {
 	/* Update jiffies first */
@@ -972,12 +934,6 @@ void tick_nohz_idle_exit(void)
 	local_irq_enable();
 }
 
-static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
-{
-	hrtimer_forward(&ts->sched_timer, now, tick_period);
-	return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
-}
-
 /*
  * The nohz low res interrupt handler
  */
@@ -996,10 +952,8 @@ static void tick_nohz_handler(struct clock_event_device *dev)
 	if (unlikely(ts->tick_stopped))
 		return;
 
-	while (tick_nohz_reprogram(ts, now)) {
-		now = ktime_get();
-		tick_do_update_jiffies64(now);
-	}
+	hrtimer_forward(&ts->sched_timer, now, tick_period);
+	tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
 }
 
 /**
@@ -1013,11 +967,9 @@ static void tick_nohz_switch_to_nohz(void)
 	if (!tick_nohz_enabled)
 		return;
 
-	local_irq_disable();
-	if (tick_switch_to_oneshot(tick_nohz_handler)) {
-		local_irq_enable();
+	if (tick_switch_to_oneshot(tick_nohz_handler))
 		return;
-	}
+
 	tick_nohz_active = 1;
 	ts->nohz_mode = NOHZ_MODE_LOWRES;
 
@@ -1029,13 +981,9 @@ static void tick_nohz_switch_to_nohz(void)
 	/* Get the next period */
 	next = tick_init_jiffy_update();
 
-	for (;;) {
-		hrtimer_set_expires(&ts->sched_timer, next);
-		if (!tick_program_event(next, 0))
-			break;
-		next = ktime_add(next, tick_period);
-	}
-	local_irq_enable();
+	hrtimer_forward_now(&ts->sched_timer, tick_period);
+	hrtimer_set_expires(&ts->sched_timer, next);
+	tick_program_event(next, 1);
 }
 
 /*
@@ -1167,15 +1115,8 @@ void tick_setup_sched_timer(void)
 		hrtimer_add_expires_ns(&ts->sched_timer, offset);
 	}
 
-	for (;;) {
-		hrtimer_forward(&ts->sched_timer, now, tick_period);
-		hrtimer_start_expires(&ts->sched_timer,
-				      HRTIMER_MODE_ABS_PINNED);
-		/* Check, if the timer was already in the past */
-		if (hrtimer_active(&ts->sched_timer))
-			break;
-		now = ktime_get();
-	}
+	hrtimer_forward(&ts->sched_timer, now, tick_period);
+	hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
 
 #ifdef CONFIG_NO_HZ_COMMON
 	if (tick_nohz_enabled) {
@@ -1227,7 +1168,7 @@ void tick_oneshot_notify(void)
  * Called cyclic from the hrtimer softirq (driven by the timer
  * softirq) allow_nohz signals, that we can switch into low-res nohz
  * mode, because high resolution timers are disabled (either compile
- * or runtime).
+ * or runtime). Called with interrupts disabled.
  */
 int tick_check_oneshot_change(int allow_nohz)
 {
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 28b5da3e1a176e..42fdf4958bccd1 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -57,7 +57,7 @@ struct tick_sched {
 	ktime_t				iowait_sleeptime;
 	ktime_t				sleep_length;
 	unsigned long			last_jiffies;
-	unsigned long			next_jiffies;
+	u64				next_timer;
 	ktime_t				idle_expires;
 	int				do_timer_last;
 };
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 2c85b7724af4b0..972e3bbac96399 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -41,7 +41,7 @@
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 
-#include "timeconst.h"
+#include <generated/timeconst.h>
 #include "timekeeping.h"
 
 /*
@@ -173,6 +173,10 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
 		return error;
 
 	if (tz) {
+		/* Verify we're witin the +-15 hrs range */
+		if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60)
+			return -EINVAL;
+
 		sys_tz = *tz;
 		update_vsyscall_tz();
 		if (firsttime) {
@@ -483,9 +487,11 @@ struct timespec64 ns_to_timespec64(const s64 nsec)
 }
 EXPORT_SYMBOL(ns_to_timespec64);
 #endif
-/*
- * When we convert to jiffies then we interpret incoming values
- * the following way:
+/**
+ * msecs_to_jiffies: - convert milliseconds to jiffies
+ * @m:	time in milliseconds
+ *
+ * conversion is done as follows:
  *
  * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
  *
@@ -493,51 +499,28 @@ EXPORT_SYMBOL(ns_to_timespec64);
  *   MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
  *
  * - all other values are converted to jiffies by either multiplying
- *   the input value by a factor or dividing it with a factor
- *
- * We must also be careful about 32-bit overflows.
+ *   the input value by a factor or dividing it with a factor and
+ *   handling any 32-bit overflows.
+ *   for the details see __msecs_to_jiffies()
+ *
+ * msecs_to_jiffies() checks for the passed in value being a constant
+ * via __builtin_constant_p() allowing gcc to eliminate most of the
+ * code, __msecs_to_jiffies() is called if the value passed does not
+ * allow constant folding and the actual conversion must be done at
+ * runtime.
+ * the _msecs_to_jiffies helpers are the HZ dependent conversion
+ * routines found in include/linux/jiffies.h
  */
-unsigned long msecs_to_jiffies(const unsigned int m)
+unsigned long __msecs_to_jiffies(const unsigned int m)
 {
 	/*
 	 * Negative value, means infinite timeout:
 	 */
 	if ((int)m < 0)
 		return MAX_JIFFY_OFFSET;
-
-#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
-	/*
-	 * HZ is equal to or smaller than 1000, and 1000 is a nice
-	 * round multiple of HZ, divide with the factor between them,
-	 * but round upwards:
-	 */
-	return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
-#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
-	/*
-	 * HZ is larger than 1000, and HZ is a nice round multiple of
-	 * 1000 - simply multiply with the factor between them.
-	 *
-	 * But first make sure the multiplication result cannot
-	 * overflow:
-	 */
-	if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
-		return MAX_JIFFY_OFFSET;
-
-	return m * (HZ / MSEC_PER_SEC);
-#else
-	/*
-	 * Generic case - multiply, round and divide. But first
-	 * check that if we are doing a net multiplication, that
-	 * we wouldn't overflow:
-	 */
-	if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
-		return MAX_JIFFY_OFFSET;
-
-	return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
-		>> MSEC_TO_HZ_SHR32;
-#endif
+	return _msecs_to_jiffies(m);
 }
-EXPORT_SYMBOL(msecs_to_jiffies);
+EXPORT_SYMBOL(__msecs_to_jiffies);
 
 unsigned long usecs_to_jiffies(const unsigned int u)
 {
diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc
index 511bdf2cafdaa2..c7388dee86358a 100644
--- a/kernel/time/timeconst.bc
+++ b/kernel/time/timeconst.bc
@@ -50,7 +50,7 @@ define timeconst(hz) {
 	print "#include <linux/types.h>\n\n"
 
 	print "#if HZ != ", hz, "\n"
-	print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n"
+	print "#error \qinclude/generated/timeconst.h has the wrong HZ value!\q\n"
 	print "#endif\n\n"
 
 	if (hz < 2) {
@@ -105,4 +105,5 @@ define timeconst(hz) {
 	halt
 }
 
+hz = read();
 timeconst(hz)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 266dafe8f015a8..0d784b34bda208 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -118,18 +118,6 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
 
 #ifdef CONFIG_DEBUG_TIMEKEEPING
 #define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
-/*
- * These simple flag variables are managed
- * without locks, which is racy, but ok since
- * we don't really care about being super
- * precise about how many events were seen,
- * just that a problem was observed.
- */
-static int timekeeping_underflow_seen;
-static int timekeeping_overflow_seen;
-
-/* last_warning is only modified under the timekeeping lock */
-static long timekeeping_last_warning;
 
 static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
 {
@@ -149,29 +137,30 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
 		}
 	}
 
-	if (timekeeping_underflow_seen) {
-		if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
+	if (tk->underflow_seen) {
+		if (jiffies - tk->last_warning > WARNING_FREQ) {
 			printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name);
 			printk_deferred("         Please report this, consider using a different clocksource, if possible.\n");
 			printk_deferred("         Your kernel is probably still fine.\n");
-			timekeeping_last_warning = jiffies;
+			tk->last_warning = jiffies;
 		}
-		timekeeping_underflow_seen = 0;
+		tk->underflow_seen = 0;
 	}
 
-	if (timekeeping_overflow_seen) {
-		if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
+	if (tk->overflow_seen) {
+		if (jiffies - tk->last_warning > WARNING_FREQ) {
 			printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name);
 			printk_deferred("         Please report this, consider using a different clocksource, if possible.\n");
 			printk_deferred("         Your kernel is probably still fine.\n");
-			timekeeping_last_warning = jiffies;
+			tk->last_warning = jiffies;
 		}
-		timekeeping_overflow_seen = 0;
+		tk->overflow_seen = 0;
 	}
 }
 
 static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
 {
+	struct timekeeper *tk = &tk_core.timekeeper;
 	cycle_t now, last, mask, max, delta;
 	unsigned int seq;
 
@@ -197,13 +186,13 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
 	 * mask-relative negative values.
 	 */
 	if (unlikely((~delta & mask) < (mask >> 3))) {
-		timekeeping_underflow_seen = 1;
+		tk->underflow_seen = 1;
 		delta = 0;
 	}
 
 	/* Cap delta value to the max_cycles values to avoid mult overflows */
 	if (unlikely(delta > max)) {
-		timekeeping_overflow_seen = 1;
+		tk->overflow_seen = 1;
 		delta = tkr->clock->max_cycles;
 	}
 
@@ -577,6 +566,9 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
 
 	update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
 	update_fast_timekeeper(&tk->tkr_raw,  &tk_fast_raw);
+
+	if (action & TK_CLOCK_WAS_SET)
+		tk->clock_was_set_seq++;
 }
 
 /**
@@ -674,6 +666,23 @@ ktime_t ktime_get(void)
 }
 EXPORT_SYMBOL_GPL(ktime_get);
 
+u32 ktime_get_resolution_ns(void)
+{
+	struct timekeeper *tk = &tk_core.timekeeper;
+	unsigned int seq;
+	u32 nsecs;
+
+	WARN_ON(timekeeping_suspended);
+
+	do {
+		seq = read_seqcount_begin(&tk_core.seq);
+		nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift;
+	} while (read_seqcount_retry(&tk_core.seq, seq));
+
+	return nsecs;
+}
+EXPORT_SYMBOL_GPL(ktime_get_resolution_ns);
+
 static ktime_t *offsets[TK_OFFS_MAX] = {
 	[TK_OFFS_REAL]	= &tk_core.timekeeper.offs_real,
 	[TK_OFFS_BOOT]	= &tk_core.timekeeper.offs_boot,
@@ -1154,28 +1163,20 @@ void __weak read_persistent_clock64(struct timespec64 *ts64)
 }
 
 /**
- * read_boot_clock -  Return time of the system start.
+ * read_boot_clock64 -  Return time of the system start.
  *
  * Weak dummy function for arches that do not yet support it.
  * Function to read the exact time the system has been started.
- * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
+ * Returns a timespec64 with tv_sec=0 and tv_nsec=0 if unsupported.
  *
  *  XXX - Do be sure to remove it once all arches implement it.
  */
-void __weak read_boot_clock(struct timespec *ts)
+void __weak read_boot_clock64(struct timespec64 *ts)
 {
 	ts->tv_sec = 0;
 	ts->tv_nsec = 0;
 }
 
-void __weak read_boot_clock64(struct timespec64 *ts64)
-{
-	struct timespec ts;
-
-	read_boot_clock(&ts);
-	*ts64 = timespec_to_timespec64(ts);
-}
-
 /* Flag for if timekeeping_resume() has injected sleeptime */
 static bool sleeptime_injected;
 
@@ -1901,47 +1902,20 @@ void do_timer(unsigned long ticks)
 }
 
 /**
- * ktime_get_update_offsets_tick - hrtimer helper
- * @offs_real:	pointer to storage for monotonic -> realtime offset
- * @offs_boot:	pointer to storage for monotonic -> boottime offset
- * @offs_tai:	pointer to storage for monotonic -> clock tai offset
- *
- * Returns monotonic time at last tick and various offsets
- */
-ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot,
-							ktime_t *offs_tai)
-{
-	struct timekeeper *tk = &tk_core.timekeeper;
-	unsigned int seq;
-	ktime_t base;
-	u64 nsecs;
-
-	do {
-		seq = read_seqcount_begin(&tk_core.seq);
-
-		base = tk->tkr_mono.base;
-		nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
-
-		*offs_real = tk->offs_real;
-		*offs_boot = tk->offs_boot;
-		*offs_tai = tk->offs_tai;
-	} while (read_seqcount_retry(&tk_core.seq, seq));
-
-	return ktime_add_ns(base, nsecs);
-}
-
-#ifdef CONFIG_HIGH_RES_TIMERS
-/**
  * ktime_get_update_offsets_now - hrtimer helper
+ * @cwsseq:	pointer to check and store the clock was set sequence number
  * @offs_real:	pointer to storage for monotonic -> realtime offset
  * @offs_boot:	pointer to storage for monotonic -> boottime offset
  * @offs_tai:	pointer to storage for monotonic -> clock tai offset
  *
- * Returns current monotonic time and updates the offsets
+ * Returns current monotonic time and updates the offsets if the
+ * sequence number in @cwsseq and timekeeper.clock_was_set_seq are
+ * different.
+ *
  * Called from hrtimer_interrupt() or retrigger_next_event()
  */
-ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
-							ktime_t *offs_tai)
+ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
+				     ktime_t *offs_boot, ktime_t *offs_tai)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	unsigned int seq;
@@ -1953,15 +1927,16 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
 
 		base = tk->tkr_mono.base;
 		nsecs = timekeeping_get_ns(&tk->tkr_mono);
-
-		*offs_real = tk->offs_real;
-		*offs_boot = tk->offs_boot;
-		*offs_tai = tk->offs_tai;
+		if (*cwsseq != tk->clock_was_set_seq) {
+			*cwsseq = tk->clock_was_set_seq;
+			*offs_real = tk->offs_real;
+			*offs_boot = tk->offs_boot;
+			*offs_tai = tk->offs_tai;
+		}
 	} while (read_seqcount_retry(&tk_core.seq, seq));
 
 	return ktime_add_ns(base, nsecs);
 }
-#endif
 
 /**
  * do_adjtimex() - Accessor function to NTP __do_adjtimex function
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index ead8794b9a4e47..704f595ce83f03 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -3,19 +3,16 @@
 /*
  * Internal interfaces for kernel/time/
  */
-extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real,
-						ktime_t *offs_boot,
-						ktime_t *offs_tai);
-extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real,
-						ktime_t *offs_boot,
-						ktime_t *offs_tai);
+extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq,
+					    ktime_t *offs_real,
+					    ktime_t *offs_boot,
+					    ktime_t *offs_tai);
 
 extern int timekeeping_valid_for_hres(void);
 extern u64 timekeeping_max_deferment(void);
 extern int timekeeping_inject_offset(struct timespec *ts);
 extern s32 timekeeping_get_tai_offset(void);
 extern void timekeeping_set_tai_offset(s32 tai_offset);
-extern void timekeeping_clocktai(struct timespec *ts);
 extern int timekeeping_suspend(void);
 extern void timekeeping_resume(void);
 
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2ece3aa5069cad..7775d454a204fe 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -49,6 +49,8 @@
 #include <asm/timex.h>
 #include <asm/io.h>
 
+#include "tick-internal.h"
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/timer.h>
 
@@ -434,7 +436,7 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 	 * require special care against races with idle_cpu(), lets deal
 	 * with that later.
 	 */
-	if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu))
+	if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(base->cpu))
 		wake_up_nohz_cpu(base->cpu);
 }
 
@@ -648,7 +650,7 @@ static inline void
 debug_activate(struct timer_list *timer, unsigned long expires)
 {
 	debug_timer_activate(timer);
-	trace_timer_start(timer, expires);
+	trace_timer_start(timer, expires, tbase_get_deferrable(timer->base));
 }
 
 static inline void debug_deactivate(struct timer_list *timer)
@@ -1311,54 +1313,48 @@ cascade:
  * Check, if the next hrtimer event is before the next timer wheel
  * event:
  */
-static unsigned long cmp_next_hrtimer_event(unsigned long now,
-					    unsigned long expires)
+static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
 {
-	ktime_t hr_delta = hrtimer_get_next_event();
-	struct timespec tsdelta;
-	unsigned long delta;
-
-	if (hr_delta.tv64 == KTIME_MAX)
-		return expires;
+	u64 nextevt = hrtimer_get_next_event();
 
 	/*
-	 * Expired timer available, let it expire in the next tick
+	 * If high resolution timers are enabled
+	 * hrtimer_get_next_event() returns KTIME_MAX.
 	 */
-	if (hr_delta.tv64 <= 0)
-		return now + 1;
-
-	tsdelta = ktime_to_timespec(hr_delta);
-	delta = timespec_to_jiffies(&tsdelta);
+	if (expires <= nextevt)
+		return expires;
 
 	/*
-	 * Limit the delta to the max value, which is checked in
-	 * tick_nohz_stop_sched_tick():
+	 * If the next timer is already expired, return the tick base
+	 * time so the tick is fired immediately.
 	 */
-	if (delta > NEXT_TIMER_MAX_DELTA)
-		delta = NEXT_TIMER_MAX_DELTA;
+	if (nextevt <= basem)
+		return basem;
 
 	/*
-	 * Take rounding errors in to account and make sure, that it
-	 * expires in the next tick. Otherwise we go into an endless
-	 * ping pong due to tick_nohz_stop_sched_tick() retriggering
-	 * the timer softirq
+	 * Round up to the next jiffie. High resolution timers are
+	 * off, so the hrtimers are expired in the tick and we need to
+	 * make sure that this tick really expires the timer to avoid
+	 * a ping pong of the nohz stop code.
+	 *
+	 * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3
 	 */
-	if (delta < 1)
-		delta = 1;
-	now += delta;
-	if (time_before(now, expires))
-		return now;
-	return expires;
+	return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC;
 }
 
 /**
- * get_next_timer_interrupt - return the jiffy of the next pending timer
- * @now: current time (in jiffies)
+ * get_next_timer_interrupt - return the time (clock mono) of the next timer
+ * @basej:	base time jiffies
+ * @basem:	base time clock monotonic
+ *
+ * Returns the tick aligned clock monotonic time of the next pending
+ * timer or KTIME_MAX if no timer is pending.
  */
-unsigned long get_next_timer_interrupt(unsigned long now)
+u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 {
 	struct tvec_base *base = __this_cpu_read(tvec_bases);
-	unsigned long expires = now + NEXT_TIMER_MAX_DELTA;
+	u64 expires = KTIME_MAX;
+	unsigned long nextevt;
 
 	/*
 	 * Pretend that there is no timer pending if the cpu is offline.
@@ -1371,14 +1367,15 @@ unsigned long get_next_timer_interrupt(unsigned long now)
 	if (base->active_timers) {
 		if (time_before_eq(base->next_timer, base->timer_jiffies))
 			base->next_timer = __next_timer_interrupt(base);
-		expires = base->next_timer;
+		nextevt = base->next_timer;
+		if (time_before_eq(nextevt, basej))
+			expires = basem;
+		else
+			expires = basem + (nextevt - basej) * TICK_NSEC;
 	}
 	spin_unlock(&base->lock);
 
-	if (time_before_eq(expires, now))
-		return now;
-
-	return cmp_next_hrtimer_event(now, expires);
+	return cmp_next_hrtimer_event(basem, expires);
 }
 #endif
 
@@ -1409,8 +1406,6 @@ static void run_timer_softirq(struct softirq_action *h)
 {
 	struct tvec_base *base = __this_cpu_read(tvec_bases);
 
-	hrtimer_run_pending();
-
 	if (time_after_eq(jiffies, base->timer_jiffies))
 		__run_timers(base);
 }
@@ -1697,14 +1692,14 @@ unsigned long msleep_interruptible(unsigned int msecs)
 
 EXPORT_SYMBOL(msleep_interruptible);
 
-static int __sched do_usleep_range(unsigned long min, unsigned long max)
+static void __sched do_usleep_range(unsigned long min, unsigned long max)
 {
 	ktime_t kmin;
 	unsigned long delta;
 
 	kmin = ktime_set(0, min * NSEC_PER_USEC);
 	delta = (max - min) * NSEC_PER_USEC;
-	return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
+	schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
 }
 
 /**
@@ -1712,7 +1707,7 @@ static int __sched do_usleep_range(unsigned long min, unsigned long max)
  * @min: Minimum time in usecs to sleep
  * @max: Maximum time in usecs to sleep
  */
-void usleep_range(unsigned long min, unsigned long max)
+void __sched usleep_range(unsigned long min, unsigned long max)
 {
 	__set_current_state(TASK_UNINTERRUPTIBLE);
 	do_usleep_range(min, max);
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index e878c2e0ba45e0..1327004429be6f 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -35,13 +35,20 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
  * This allows printing both to /proc/timer_list and
  * to the console (on SysRq-Q):
  */
-#define SEQ_printf(m, x...)			\
- do {						\
-	if (m)					\
-		seq_printf(m, x);		\
-	else					\
-		printk(x);			\
- } while (0)
+__printf(2, 3)
+static void SEQ_printf(struct seq_file *m, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+
+	if (m)
+		seq_vprintf(m, fmt, args);
+	else
+		vprintk(fmt, args);
+
+	va_end(args);
+}
 
 static void print_name_offset(struct seq_file *m, void *sym)
 {
@@ -120,10 +127,10 @@ static void
 print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
 {
 	SEQ_printf(m, "  .base:       %pK\n", base);
-	SEQ_printf(m, "  .index:      %d\n",
-			base->index);
-	SEQ_printf(m, "  .resolution: %Lu nsecs\n",
-			(unsigned long long)ktime_to_ns(base->resolution));
+	SEQ_printf(m, "  .index:      %d\n", base->index);
+
+	SEQ_printf(m, "  .resolution: %u nsecs\n", (unsigned) hrtimer_resolution);
+
 	SEQ_printf(m,   "  .get_time:   ");
 	print_name_offset(m, base->get_time);
 	SEQ_printf(m,   "\n");
@@ -158,7 +165,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
 	P(nr_events);
 	P(nr_retries);
 	P(nr_hangs);
-	P_ns(max_hang_time);
+	P(max_hang_time);
 #endif
 #undef P
 #undef P_ns
@@ -184,7 +191,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
 		P_ns(idle_sleeptime);
 		P_ns(iowait_sleeptime);
 		P(last_jiffies);
-		P(next_jiffies);
+		P(next_timer);
 		P_ns(idle_expires);
 		SEQ_printf(m, "jiffies: %Lu\n",
 			   (unsigned long long)jiffies);
@@ -251,6 +258,12 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
 			SEQ_printf(m, "\n");
 		}
 
+		if (dev->set_state_oneshot_stopped) {
+			SEQ_printf(m, " oneshot stopped: ");
+			print_name_offset(m, dev->set_state_oneshot_stopped);
+			SEQ_printf(m, "\n");
+		}
+
 		if (dev->tick_resume) {
 			SEQ_printf(m, " resume:   ");
 			print_name_offset(m, dev->tick_resume);
@@ -269,11 +282,11 @@ static void timer_list_show_tickdevices_header(struct seq_file *m)
 {
 #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
 	print_tickdevice(m, tick_get_broadcast_device(), -1);
-	SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
-		   cpumask_bits(tick_get_broadcast_mask())[0]);
+	SEQ_printf(m, "tick_broadcast_mask: %*pb\n",
+		   cpumask_pr_args(tick_get_broadcast_mask()));
 #ifdef CONFIG_TICK_ONESHOT
-	SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
-		   cpumask_bits(tick_get_broadcast_oneshot_mask())[0]);
+	SEQ_printf(m, "tick_broadcast_oneshot_mask: %*pb\n",
+		   cpumask_pr_args(tick_get_broadcast_oneshot_mask()));
 #endif
 	SEQ_printf(m, "\n");
 #endif
@@ -282,7 +295,7 @@ static void timer_list_show_tickdevices_header(struct seq_file *m)
 
 static inline void timer_list_header(struct seq_file *m, u64 now)
 {
-	SEQ_printf(m, "Timer List Version: v0.7\n");
+	SEQ_printf(m, "Timer List Version: v0.8\n");
 	SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
 	SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
 	SEQ_printf(m, "\n");
diff --git a/kernel/torture.c b/kernel/torture.c
index dd70993c266c38..3e4840633d3ee7 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -409,7 +409,7 @@ static void (*torture_shutdown_hook)(void);
  */
 void torture_shutdown_absorb(const char *title)
 {
-	while (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
+	while (READ_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
 		pr_notice("torture thread %s parking due to system shutdown\n",
 			  title);
 		schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
@@ -480,9 +480,9 @@ static int torture_shutdown_notify(struct notifier_block *unused1,
 				   unsigned long unused2, void *unused3)
 {
 	mutex_lock(&fullstop_mutex);
-	if (ACCESS_ONCE(fullstop) == FULLSTOP_DONTSTOP) {
+	if (READ_ONCE(fullstop) == FULLSTOP_DONTSTOP) {
 		VERBOSE_TOROUT_STRING("Unscheduled system shutdown detected");
-		ACCESS_ONCE(fullstop) = FULLSTOP_SHUTDOWN;
+		WRITE_ONCE(fullstop, FULLSTOP_SHUTDOWN);
 	} else {
 		pr_warn("Concurrent rmmod and shutdown illegal!\n");
 	}
@@ -523,13 +523,13 @@ static int stutter;
  */
 void stutter_wait(const char *title)
 {
-	while (ACCESS_ONCE(stutter_pause_test) ||
-	       (torture_runnable && !ACCESS_ONCE(*torture_runnable))) {
+	while (READ_ONCE(stutter_pause_test) ||
+	       (torture_runnable && !READ_ONCE(*torture_runnable))) {
 		if (stutter_pause_test)
-			if (ACCESS_ONCE(stutter_pause_test) == 1)
+			if (READ_ONCE(stutter_pause_test) == 1)
 				schedule_timeout_interruptible(1);
 			else
-				while (ACCESS_ONCE(stutter_pause_test))
+				while (READ_ONCE(stutter_pause_test))
 					cond_resched();
 		else
 			schedule_timeout_interruptible(round_jiffies_relative(HZ));
@@ -549,14 +549,14 @@ static int torture_stutter(void *arg)
 		if (!torture_must_stop()) {
 			if (stutter > 1) {
 				schedule_timeout_interruptible(stutter - 1);
-				ACCESS_ONCE(stutter_pause_test) = 2;
+				WRITE_ONCE(stutter_pause_test, 2);
 			}
 			schedule_timeout_interruptible(1);
-			ACCESS_ONCE(stutter_pause_test) = 1;
+			WRITE_ONCE(stutter_pause_test, 1);
 		}
 		if (!torture_must_stop())
 			schedule_timeout_interruptible(stutter);
-		ACCESS_ONCE(stutter_pause_test) = 0;
+		WRITE_ONCE(stutter_pause_test, 0);
 		torture_shutdown_absorb("torture_stutter");
 	} while (!torture_must_stop());
 	torture_kthread_stopping("torture_stutter");
@@ -642,13 +642,13 @@ EXPORT_SYMBOL_GPL(torture_init_end);
 bool torture_cleanup_begin(void)
 {
 	mutex_lock(&fullstop_mutex);
-	if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
+	if (READ_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
 		pr_warn("Concurrent rmmod and shutdown illegal!\n");
 		mutex_unlock(&fullstop_mutex);
 		schedule_timeout_uninterruptible(10);
 		return true;
 	}
-	ACCESS_ONCE(fullstop) = FULLSTOP_RMMOD;
+	WRITE_ONCE(fullstop, FULLSTOP_RMMOD);
 	mutex_unlock(&fullstop_mutex);
 	torture_shutdown_cleanup();
 	torture_shuffle_cleanup();
@@ -681,7 +681,7 @@ EXPORT_SYMBOL_GPL(torture_must_stop);
  */
 bool torture_must_stop_irq(void)
 {
-	return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP;
+	return READ_ONCE(fullstop) != FULLSTOP_DONTSTOP;
 }
 EXPORT_SYMBOL_GPL(torture_must_stop_irq);
 
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index f3d0e3e2b30ae3..eb3997bcbf8936 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1233,6 +1233,7 @@ config RCU_TORTURE_TEST
 	depends on DEBUG_KERNEL
 	select TORTURE_TEST
 	select SRCU
+	select TASKS_RCU
 	default n
 	help
 	  This option provides a kernel module that runs torture tests
@@ -1261,12 +1262,38 @@ config RCU_TORTURE_TEST_RUNNABLE
 	  Say N here if you want the RCU torture tests to start only
 	  after being manually enabled via /proc.
 
+config RCU_TORTURE_TEST_SLOW_PREINIT
+	bool "Slow down RCU grace-period pre-initialization to expose races"
+	depends on RCU_TORTURE_TEST
+	help
+	  This option delays grace-period pre-initialization (the
+	  propagation of CPU-hotplug changes up the rcu_node combining
+	  tree) for a few jiffies between initializing each pair of
+	  consecutive rcu_node structures.  This helps to expose races
+	  involving grace-period pre-initialization, in other words, it
+	  makes your kernel less stable.  It can also greatly increase
+	  grace-period latency, especially on systems with large numbers
+	  of CPUs.  This is useful when torture-testing RCU, but in
+	  almost no other circumstance.
+
+	  Say Y here if you want your system to crash and hang more often.
+	  Say N if you want a sane system.
+
+config RCU_TORTURE_TEST_SLOW_PREINIT_DELAY
+	int "How much to slow down RCU grace-period pre-initialization"
+	range 0 5
+	default 3
+	depends on RCU_TORTURE_TEST_SLOW_PREINIT
+	help
+	  This option specifies the number of jiffies to wait between
+	  each rcu_node structure pre-initialization step.
+
 config RCU_TORTURE_TEST_SLOW_INIT
 	bool "Slow down RCU grace-period initialization to expose races"
 	depends on RCU_TORTURE_TEST
 	help
-	  This option makes grace-period initialization block for a
-	  few jiffies between initializing each pair of consecutive
+	  This option delays grace-period initialization for a few
+	  jiffies between initializing each pair of consecutive
 	  rcu_node structures.	This helps to expose races involving
 	  grace-period initialization, in other words, it makes your
 	  kernel less stable.  It can also greatly increase grace-period
@@ -1286,6 +1313,30 @@ config RCU_TORTURE_TEST_SLOW_INIT_DELAY
 	  This option specifies the number of jiffies to wait between
 	  each rcu_node structure initialization.
 
+config RCU_TORTURE_TEST_SLOW_CLEANUP
+	bool "Slow down RCU grace-period cleanup to expose races"
+	depends on RCU_TORTURE_TEST
+	help
+	  This option delays grace-period cleanup for a few jiffies
+	  between cleaning up each pair of consecutive rcu_node
+	  structures.  This helps to expose races involving grace-period
+	  cleanup, in other words, it makes your kernel less stable.
+	  It can also greatly increase grace-period latency, especially
+	  on systems with large numbers of CPUs.  This is useful when
+	  torture-testing RCU, but in almost no other circumstance.
+
+	  Say Y here if you want your system to crash and hang more often.
+	  Say N if you want a sane system.
+
+config RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY
+	int "How much to slow down RCU grace-period cleanup"
+	range 0 5
+	default 3
+	depends on RCU_TORTURE_TEST_SLOW_CLEANUP
+	help
+	  This option specifies the number of jiffies to wait between
+	  each rcu_node structure cleanup operation.
+
 config RCU_CPU_STALL_TIMEOUT
 	int "RCU CPU stall timeout in seconds"
 	depends on RCU_STALL_COMMON
@@ -1322,6 +1373,17 @@ config RCU_TRACE
 	  Say Y here if you want to enable RCU tracing
 	  Say N if you are unsure.
 
+config RCU_EQS_DEBUG
+	bool "Use this when adding any sort of NO_HZ support to your arch"
+	depends on DEBUG_KERNEL
+	help
+	  This option provides consistency checks in RCU's handling of
+	  NO_HZ.  These checks have proven quite helpful in detecting
+	  bugs in arch-specific NO_HZ code.
+
+	  Say N here if you need ultimate kernel/user switch latencies
+	  Say Y if you are unsure
+
 endmenu # "RCU Debugging"
 
 config DEBUG_BLOCK_EXT_DEVT
diff --git a/lib/cpu_rmap.c b/lib/cpu_rmap.c
index 4f134d8907a7d0..f610b2a10b3eda 100644
--- a/lib/cpu_rmap.c
+++ b/lib/cpu_rmap.c
@@ -191,7 +191,7 @@ int cpu_rmap_update(struct cpu_rmap *rmap, u16 index,
 	/* Update distances based on topology */
 	for_each_cpu(cpu, update_mask) {
 		if (cpu_rmap_copy_neigh(rmap, cpu,
-					topology_thread_cpumask(cpu), 1))
+					topology_sibling_cpumask(cpu), 1))
 			continue;
 		if (cpu_rmap_copy_neigh(rmap, cpu,
 					topology_core_cpumask(cpu), 2))
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 3d2aa27b845b53..061550de77bc04 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -33,7 +33,7 @@
 #include <linux/string.h>
 #include <linux/bitops.h>
 #include <linux/rcupdate.h>
-#include <linux/preempt_mask.h>		/* in_interrupt() */
+#include <linux/preempt.h>		/* in_interrupt() */
 
 
 /*
diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h
index b7595484a8150c..8fe9d9662abbcd 100644
--- a/lib/raid6/x86.h
+++ b/lib/raid6/x86.h
@@ -23,7 +23,7 @@
 
 #ifdef __KERNEL__ /* Real code */
 
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 
 #else /* Dummy code for user space testing */
 
diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c
index fe9a32591c2498..3a5f2b366d84ed 100644
--- a/lib/strnlen_user.c
+++ b/lib/strnlen_user.c
@@ -85,7 +85,8 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count,
  * @str: The string to measure.
  * @count: Maximum count (including NUL character)
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Get the size of a NUL-terminated string in user space.
  *
@@ -121,7 +122,8 @@ EXPORT_SYMBOL(strnlen_user);
  * strlen_user: - Get the size of a user string INCLUDING final NUL.
  * @str: The string to measure.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Get the size of a NUL-terminated string in user space.
  *
diff --git a/lib/timerqueue.c b/lib/timerqueue.c
index a382e4a3260916..782ae8ca2c06f2 100644
--- a/lib/timerqueue.c
+++ b/lib/timerqueue.c
@@ -36,7 +36,7 @@
  * Adds the timer node to the timerqueue, sorted by the
  * node's expires value.
  */
-void timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
+bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
 {
 	struct rb_node **p = &head->head.rb_node;
 	struct rb_node *parent = NULL;
@@ -56,8 +56,11 @@ void timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
 	rb_link_node(&node->node, parent, p);
 	rb_insert_color(&node->node, &head->head);
 
-	if (!head->next || node->expires.tv64 < head->next->expires.tv64)
+	if (!head->next || node->expires.tv64 < head->next->expires.tv64) {
 		head->next = node;
+		return true;
+	}
+	return false;
 }
 EXPORT_SYMBOL_GPL(timerqueue_add);
 
@@ -69,7 +72,7 @@ EXPORT_SYMBOL_GPL(timerqueue_add);
  *
  * Removes the timer node from the timerqueue.
  */
-void timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
+bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
 {
 	WARN_ON_ONCE(RB_EMPTY_NODE(&node->node));
 
@@ -82,6 +85,7 @@ void timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
 	}
 	rb_erase(&node->node, &head->head);
 	RB_CLEAR_NODE(&node->node);
+	return head->next != NULL;
 }
 EXPORT_SYMBOL_GPL(timerqueue_del);
 
diff --git a/mm/memory.c b/mm/memory.c
index 22e037e3364e0f..17734c3c1183ed 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3737,7 +3737,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
 }
 
 #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
-void might_fault(void)
+void __might_fault(const char *file, int line)
 {
 	/*
 	 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
@@ -3747,21 +3747,15 @@ void might_fault(void)
 	 */
 	if (segment_eq(get_fs(), KERNEL_DS))
 		return;
-
-	/*
-	 * it would be nicer only to annotate paths which are not under
-	 * pagefault_disable, however that requires a larger audit and
-	 * providing helpers like get_user_atomic.
-	 */
-	if (in_atomic())
+	if (pagefault_disabled())
 		return;
-
-	__might_sleep(__FILE__, __LINE__, 0);
-
+	__might_sleep(file, line, 0);
+#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
 	if (current->mm)
 		might_lock_read(&current->mm->mmap_sem);
+#endif
 }
-EXPORT_SYMBOL(might_fault);
+EXPORT_SYMBOL(__might_fault);
 #endif
 
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index d93cbc5715f45b..05badbb588659e 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2263,8 +2263,6 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
 		do {
 			set_current_state(TASK_INTERRUPTIBLE);
 			hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
-			if (!hrtimer_active(&t.timer))
-				t.task = NULL;
 
 			if (likely(t.task))
 				schedule();
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index c5b9db84d069d4..f06aa01d60fd34 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1879,13 +1879,10 @@ EXPORT_SYMBOL(tcf_destroy_chain);
 #ifdef CONFIG_PROC_FS
 static int psched_show(struct seq_file *seq, void *v)
 {
-	struct timespec ts;
-
-	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
 	seq_printf(seq, "%08x %08x %08x %08x\n",
 		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
 		   1000000,
-		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
+		   (u32)NSEC_PER_SEC / hrtimer_resolution);
 
 	return 0;
 }
diff --git a/sound/core/hrtimer.c b/sound/core/hrtimer.c
index 886be7da989d1a..f845ecf7e17293 100644
--- a/sound/core/hrtimer.c
+++ b/sound/core/hrtimer.c
@@ -121,16 +121,9 @@ static struct snd_timer *mytimer;
 static int __init snd_hrtimer_init(void)
 {
 	struct snd_timer *timer;
-	struct timespec tp;
 	int err;
 
-	hrtimer_get_res(CLOCK_MONOTONIC, &tp);
-	if (tp.tv_sec > 0 || !tp.tv_nsec) {
-		pr_err("snd-hrtimer: Invalid resolution %u.%09u",
-			   (unsigned)tp.tv_sec, (unsigned)tp.tv_nsec);
-		return -EINVAL;
-	}
-	resolution = tp.tv_nsec;
+	resolution = hrtimer_resolution;
 
 	/* Create a new timer and set up the fields */
 	err = snd_timer_global_new("hrtimer", SNDRV_TIMER_GLOBAL_HRTIMER,
diff --git a/sound/drivers/pcsp/pcsp.c b/sound/drivers/pcsp/pcsp.c
index d9647bd84d0f49..27e25bb78c9782 100644
--- a/sound/drivers/pcsp/pcsp.c
+++ b/sound/drivers/pcsp/pcsp.c
@@ -42,16 +42,13 @@ struct snd_pcsp pcsp_chip;
 static int snd_pcsp_create(struct snd_card *card)
 {
 	static struct snd_device_ops ops = { };
-	struct timespec tp;
-	int err;
-	int div, min_div, order;
-
-	hrtimer_get_res(CLOCK_MONOTONIC, &tp);
+	unsigned int resolution = hrtimer_resolution;
+	int err, div, min_div, order;
 
 	if (!nopcm) {
-		if (tp.tv_sec || tp.tv_nsec > PCSP_MAX_PERIOD_NS) {
+		if (resolution > PCSP_MAX_PERIOD_NS) {
 			printk(KERN_ERR "PCSP: Timer resolution is not sufficient "
-				"(%linS)\n", tp.tv_nsec);
+				"(%unS)\n", resolution);
 			printk(KERN_ERR "PCSP: Make sure you have HPET and ACPI "
 				"enabled.\n");
 			printk(KERN_ERR "PCSP: Turned into nopcm mode.\n");
@@ -59,13 +56,13 @@ static int snd_pcsp_create(struct snd_card *card)
 		}
 	}
 
-	if (loops_per_jiffy >= PCSP_MIN_LPJ && tp.tv_nsec <= PCSP_MIN_PERIOD_NS)
+	if (loops_per_jiffy >= PCSP_MIN_LPJ && resolution <= PCSP_MIN_PERIOD_NS)
 		min_div = MIN_DIV;
 	else
 		min_div = MAX_DIV;
 #if PCSP_DEBUG
-	printk(KERN_DEBUG "PCSP: lpj=%li, min_div=%i, res=%li\n",
-	       loops_per_jiffy, min_div, tp.tv_nsec);
+	printk(KERN_DEBUG "PCSP: lpj=%li, min_div=%i, res=%u\n",
+	       loops_per_jiffy, min_div, resolution);
 #endif
 
 	div = MAX_DIV / min_div;
diff --git a/tools/Makefile b/tools/Makefile
index 9a617adc6675dc..b35102721cbbc8 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -1,3 +1,8 @@
+# Some of the tools (perf) use same make variables
+# as in kernel build.
+export srctree=
+export objtree=
+
 include scripts/Makefile.include
 
 help:
@@ -47,11 +52,16 @@ cgroup firewire hv guest usb virtio vm net: FORCE
 liblockdep: FORCE
 	$(call descend,lib/lockdep)
 
-libapikfs: FORCE
+libapi: FORCE
 	$(call descend,lib/api)
 
-perf: libapikfs FORCE
-	$(call descend,$@)
+# The perf build does not follow the descend function setup,
+# invoking it via it's own make rule.
+PERF_O   = $(if $(O),$(O)/tools/perf,)
+
+perf: FORCE
+	$(Q)mkdir -p $(PERF_O) .
+	$(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir=
 
 selftests: FORCE
 	$(call descend,testing/$@)
@@ -97,10 +107,10 @@ cgroup_clean hv_clean firewire_clean lguest_clean usb_clean virtio_clean vm_clea
 liblockdep_clean:
 	$(call descend,lib/lockdep,clean)
 
-libapikfs_clean:
+libapi_clean:
 	$(call descend,lib/api,clean)
 
-perf_clean: libapikfs_clean
+perf_clean:
 	$(call descend,$(@:_clean=),clean)
 
 selftests_clean:
diff --git a/tools/arch/alpha/include/asm/barrier.h b/tools/arch/alpha/include/asm/barrier.h
new file mode 100644
index 00000000000000..95df19c9548234
--- /dev/null
+++ b/tools/arch/alpha/include/asm/barrier.h
@@ -0,0 +1,8 @@
+#ifndef __TOOLS_LINUX_ASM_ALPHA_BARRIER_H
+#define __TOOLS_LINUX_ASM_ALPHA_BARRIER_H
+
+#define mb()	__asm__ __volatile__("mb": : :"memory")
+#define rmb()	__asm__ __volatile__("mb": : :"memory")
+#define wmb()	__asm__ __volatile__("wmb": : :"memory")
+
+#endif		/* __TOOLS_LINUX_ASM_ALPHA_BARRIER_H */
diff --git a/tools/arch/arm/include/asm/barrier.h b/tools/arch/arm/include/asm/barrier.h
new file mode 100644
index 00000000000000..005c618a0ab081
--- /dev/null
+++ b/tools/arch/arm/include/asm/barrier.h
@@ -0,0 +1,12 @@
+#ifndef _TOOLS_LINUX_ASM_ARM_BARRIER_H
+#define _TOOLS_LINUX_ASM_ARM_BARRIER_H
+
+/*
+ * Use the __kuser_memory_barrier helper in the CPU helper page. See
+ * arch/arm/kernel/entry-armv.S in the kernel source for details.
+ */
+#define mb()		((void(*)(void))0xffff0fa0)()
+#define wmb()		((void(*)(void))0xffff0fa0)()
+#define rmb()		((void(*)(void))0xffff0fa0)()
+
+#endif /* _TOOLS_LINUX_ASM_ARM_BARRIER_H */
diff --git a/tools/arch/arm64/include/asm/barrier.h b/tools/arch/arm64/include/asm/barrier.h
new file mode 100644
index 00000000000000..a0483c8e01429c
--- /dev/null
+++ b/tools/arch/arm64/include/asm/barrier.h
@@ -0,0 +1,16 @@
+#ifndef _TOOLS_LINUX_ASM_AARCH64_BARRIER_H
+#define _TOOLS_LINUX_ASM_AARCH64_BARRIER_H
+
+/*
+ * From tools/perf/perf-sys.h, last modified in:
+ * f428ebd184c82a7914b2aa7e9f868918aaf7ea78 perf tools: Fix AAAAARGH64 memory barriers
+ *
+ * XXX: arch/arm64/include/asm/barrier.h in the kernel sources use dsb, is this
+ * a case like for arm32 where we do things differently in userspace?
+ */
+
+#define mb()		asm volatile("dmb ish" ::: "memory")
+#define wmb()		asm volatile("dmb ishst" ::: "memory")
+#define rmb()		asm volatile("dmb ishld" ::: "memory")
+
+#endif /* _TOOLS_LINUX_ASM_AARCH64_BARRIER_H */
diff --git a/tools/arch/ia64/include/asm/barrier.h b/tools/arch/ia64/include/asm/barrier.h
new file mode 100644
index 00000000000000..e4422b4b634e6b
--- /dev/null
+++ b/tools/arch/ia64/include/asm/barrier.h
@@ -0,0 +1,48 @@
+/*
+ * Copied from the kernel sources to tools/:
+ *
+ * Memory barrier definitions.  This is based on information published
+ * in the Processor Abstraction Layer and the System Abstraction Layer
+ * manual.
+ *
+ * Copyright (C) 1998-2003 Hewlett-Packard Co
+ *	David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com>
+ * Copyright (C) 1999 Don Dugger <don.dugger@intel.com>
+ */
+#ifndef _TOOLS_LINUX_ASM_IA64_BARRIER_H
+#define _TOOLS_LINUX_ASM_IA64_BARRIER_H
+
+#include <linux/compiler.h>
+
+/*
+ * Macros to force memory ordering.  In these descriptions, "previous"
+ * and "subsequent" refer to program order; "visible" means that all
+ * architecturally visible effects of a memory access have occurred
+ * (at a minimum, this means the memory has been read or written).
+ *
+ *   wmb():	Guarantees that all preceding stores to memory-
+ *		like regions are visible before any subsequent
+ *		stores and that all following stores will be
+ *		visible only after all previous stores.
+ *   rmb():	Like wmb(), but for reads.
+ *   mb():	wmb()/rmb() combo, i.e., all previous memory
+ *		accesses are visible before all subsequent
+ *		accesses and vice versa.  This is also known as
+ *		a "fence."
+ *
+ * Note: "mb()" and its variants cannot be used as a fence to order
+ * accesses to memory mapped I/O registers.  For that, mf.a needs to
+ * be used.  However, we don't want to always use mf.a because (a)
+ * it's (presumably) much slower than mf and (b) mf.a is supported for
+ * sequential memory pages only.
+ */
+
+/* XXX From arch/ia64/include/uapi/asm/gcc_intrin.h */
+#define ia64_mf()       asm volatile ("mf" ::: "memory")
+
+#define mb()		ia64_mf()
+#define rmb()		mb()
+#define wmb()		mb()
+
+#endif /* _TOOLS_LINUX_ASM_IA64_BARRIER_H */
diff --git a/tools/arch/mips/include/asm/barrier.h b/tools/arch/mips/include/asm/barrier.h
new file mode 100644
index 00000000000000..80f96f7556e32e
--- /dev/null
+++ b/tools/arch/mips/include/asm/barrier.h
@@ -0,0 +1,20 @@
+#ifndef _TOOLS_LINUX_ASM_MIPS_BARRIER_H
+#define _TOOLS_LINUX_ASM_MIPS_BARRIER_H
+/*
+ * FIXME: This came from tools/perf/perf-sys.h, where it was first introduced
+ * in c1e028ef40b8d6943b767028ba17d4f2ba020edb, more work needed to make it
+ * more closely follow the Linux kernel arch/mips/include/asm/barrier.h file.
+ * Probably when we continue work on tools/ Kconfig support to have all the
+ * CONFIG_ needed for properly doing that.
+ */
+#define mb()		asm volatile(					\
+				".set	mips2\n\t"			\
+				"sync\n\t"				\
+				".set	mips0"				\
+				: /* no output */			\
+				: /* no input */			\
+				: "memory")
+#define wmb()	mb()
+#define rmb()	mb()
+
+#endif /* _TOOLS_LINUX_ASM_MIPS_BARRIER_H */
diff --git a/tools/arch/powerpc/include/asm/barrier.h b/tools/arch/powerpc/include/asm/barrier.h
new file mode 100644
index 00000000000000..b23aee8e6d9083
--- /dev/null
+++ b/tools/arch/powerpc/include/asm/barrier.h
@@ -0,0 +1,29 @@
+/*
+ * Copied from the kernel sources:
+ *
+ * Copyright (C) 1999 Cort Dougan <cort@cs.nmt.edu>
+ */
+#ifndef _TOOLS_LINUX_ASM_POWERPC_BARRIER_H
+#define _TOOLS_LINUX_ASM_POWERPC_BARRIER_H
+
+/*
+ * Memory barrier.
+ * The sync instruction guarantees that all memory accesses initiated
+ * by this processor have been performed (with respect to all other
+ * mechanisms that access memory).  The eieio instruction is a barrier
+ * providing an ordering (separately) for (a) cacheable stores and (b)
+ * loads and stores to non-cacheable memory (e.g. I/O devices).
+ *
+ * mb() prevents loads and stores being reordered across this point.
+ * rmb() prevents loads being reordered across this point.
+ * wmb() prevents stores being reordered across this point.
+ *
+ * *mb() variants without smp_ prefix must order all types of memory
+ * operations with one another. sync is the only instruction sufficient
+ * to do this.
+ */
+#define mb()   __asm__ __volatile__ ("sync" : : : "memory")
+#define rmb()  __asm__ __volatile__ ("sync" : : : "memory")
+#define wmb()  __asm__ __volatile__ ("sync" : : : "memory")
+
+#endif /* _TOOLS_LINUX_ASM_POWERPC_BARRIER_H */
diff --git a/tools/arch/s390/include/asm/barrier.h b/tools/arch/s390/include/asm/barrier.h
new file mode 100644
index 00000000000000..f85141266b92c5
--- /dev/null
+++ b/tools/arch/s390/include/asm/barrier.h
@@ -0,0 +1,30 @@
+/*
+ * Copied from the kernel sources:
+ *
+ * Copyright IBM Corp. 1999, 2009
+ *
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+
+#ifndef __TOOLS_LINUX_ASM_BARRIER_H
+#define __TOOLS_LINUX_ASM_BARRIER_H
+
+/*
+ * Force strict CPU ordering.
+ * And yes, this is required on UP too when we're talking
+ * to devices.
+ */
+
+#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
+/* Fast-BCR without checkpoint synchronization */
+#define __ASM_BARRIER "bcr 14,0\n"
+#else
+#define __ASM_BARRIER "bcr 15,0\n"
+#endif
+
+#define mb() do {  asm volatile(__ASM_BARRIER : : : "memory"); } while (0)
+
+#define rmb()				mb()
+#define wmb()				mb()
+
+#endif /* __TOOLS_LIB_ASM_BARRIER_H */
diff --git a/tools/arch/sh/include/asm/barrier.h b/tools/arch/sh/include/asm/barrier.h
new file mode 100644
index 00000000000000..c18fd7599b9719
--- /dev/null
+++ b/tools/arch/sh/include/asm/barrier.h
@@ -0,0 +1,32 @@
+/*
+ * Copied from the kernel sources:
+ *
+ * Copyright (C) 1999, 2000  Niibe Yutaka  &  Kaz Kojima
+ * Copyright (C) 2002 Paul Mundt
+ */
+#ifndef __TOOLS_LINUX_ASM_SH_BARRIER_H
+#define __TOOLS_LINUX_ASM_SH_BARRIER_H
+
+/*
+ * A brief note on ctrl_barrier(), the control register write barrier.
+ *
+ * Legacy SH cores typically require a sequence of 8 nops after
+ * modification of a control register in order for the changes to take
+ * effect. On newer cores (like the sh4a and sh5) this is accomplished
+ * with icbi.
+ *
+ * Also note that on sh4a in the icbi case we can forego a synco for the
+ * write barrier, as it's not necessary for control registers.
+ *
+ * Historically we have only done this type of barrier for the MMUCR, but
+ * it's also necessary for the CCR, so we make it generic here instead.
+ */
+#if defined(__SH4A__) || defined(__SH5__)
+#define mb()		__asm__ __volatile__ ("synco": : :"memory")
+#define rmb()		mb()
+#define wmb()		mb()
+#endif
+
+#include <asm-generic/barrier.h>
+
+#endif /* __TOOLS_LINUX_ASM_SH_BARRIER_H */
diff --git a/tools/arch/sparc/include/asm/barrier.h b/tools/arch/sparc/include/asm/barrier.h
new file mode 100644
index 00000000000000..8c017b3b13911a
--- /dev/null
+++ b/tools/arch/sparc/include/asm/barrier.h
@@ -0,0 +1,8 @@
+#ifndef ___TOOLS_LINUX_ASM_SPARC_BARRIER_H
+#define ___TOOLS_LINUX_ASM_SPARC_BARRIER_H
+#if defined(__sparc__) && defined(__arch64__)
+#include "barrier_64.h"
+#else
+#include "barrier_32.h"
+#endif
+#endif
diff --git a/tools/arch/sparc/include/asm/barrier_32.h b/tools/arch/sparc/include/asm/barrier_32.h
new file mode 100644
index 00000000000000..c5eadd0a7233a8
--- /dev/null
+++ b/tools/arch/sparc/include/asm/barrier_32.h
@@ -0,0 +1,6 @@
+#ifndef __TOOLS_PERF_SPARC_BARRIER_H
+#define __TOOLS_PERF_SPARC_BARRIER_H
+
+#include <asm-generic/barrier.h>
+
+#endif /* !(__TOOLS_PERF_SPARC_BARRIER_H) */
diff --git a/tools/arch/sparc/include/asm/barrier_64.h b/tools/arch/sparc/include/asm/barrier_64.h
new file mode 100644
index 00000000000000..9a7d7322c3f785
--- /dev/null
+++ b/tools/arch/sparc/include/asm/barrier_64.h
@@ -0,0 +1,42 @@
+#ifndef __TOOLS_LINUX_SPARC64_BARRIER_H
+#define __TOOLS_LINUX_SPARC64_BARRIER_H
+
+/* Copied from the kernel sources to tools/:
+ *
+ * These are here in an effort to more fully work around Spitfire Errata
+ * #51.  Essentially, if a memory barrier occurs soon after a mispredicted
+ * branch, the chip can stop executing instructions until a trap occurs.
+ * Therefore, if interrupts are disabled, the chip can hang forever.
+ *
+ * It used to be believed that the memory barrier had to be right in the
+ * delay slot, but a case has been traced recently wherein the memory barrier
+ * was one instruction after the branch delay slot and the chip still hung.
+ * The offending sequence was the following in sym_wakeup_done() of the
+ * sym53c8xx_2 driver:
+ *
+ *	call	sym_ccb_from_dsa, 0
+ *	 movge	%icc, 0, %l0
+ *	brz,pn	%o0, .LL1303
+ *	 mov	%o0, %l2
+ *	membar	#LoadLoad
+ *
+ * The branch has to be mispredicted for the bug to occur.  Therefore, we put
+ * the memory barrier explicitly into a "branch always, predicted taken"
+ * delay slot to avoid the problem case.
+ */
+#define membar_safe(type) \
+do {	__asm__ __volatile__("ba,pt	%%xcc, 1f\n\t" \
+			     " membar	" type "\n" \
+			     "1:\n" \
+			     : : : "memory"); \
+} while (0)
+
+/* The kernel always executes in TSO memory model these days,
+ * and furthermore most sparc64 chips implement more stringent
+ * memory ordering than required by the specifications.
+ */
+#define mb()	membar_safe("#StoreLoad")
+#define rmb()	__asm__ __volatile__("":::"memory")
+#define wmb()	__asm__ __volatile__("":::"memory")
+
+#endif /* !(__TOOLS_LINUX_SPARC64_BARRIER_H) */
diff --git a/tools/arch/tile/include/asm/barrier.h b/tools/arch/tile/include/asm/barrier.h
new file mode 100644
index 00000000000000..7d3692c3d4ac2b
--- /dev/null
+++ b/tools/arch/tile/include/asm/barrier.h
@@ -0,0 +1,15 @@
+#ifndef _TOOLS_LINUX_ASM_TILE_BARRIER_H
+#define _TOOLS_LINUX_ASM_TILE_BARRIER_H
+/*
+ * FIXME: This came from tools/perf/perf-sys.h, where it was first introduced
+ * in 620830b6954913647b7c7f68920cf48eddf6ad92, more work needed to make it
+ * more closely follow the Linux kernel arch/tile/include/asm/barrier.h file.
+ * Probably when we continue work on tools/ Kconfig support to have all the
+ * CONFIG_ needed for properly doing that.
+ */
+
+#define mb()		asm volatile ("mf" ::: "memory")
+#define wmb()		mb()
+#define rmb()		mb()
+
+#endif /* _TOOLS_LINUX_ASM_TILE_BARRIER_H */
diff --git a/tools/arch/x86/include/asm/atomic.h b/tools/arch/x86/include/asm/atomic.h
new file mode 100644
index 00000000000000..059e33e94260ab
--- /dev/null
+++ b/tools/arch/x86/include/asm/atomic.h
@@ -0,0 +1,65 @@
+#ifndef _TOOLS_LINUX_ASM_X86_ATOMIC_H
+#define _TOOLS_LINUX_ASM_X86_ATOMIC_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include "rmwcc.h"
+
+#define LOCK_PREFIX "\n\tlock; "
+
+/*
+ * Atomic operations that C can't guarantee us.  Useful for
+ * resource counting etc..
+ */
+
+#define ATOMIC_INIT(i)	{ (i) }
+
+/**
+ * atomic_read - read atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically reads the value of @v.
+ */
+static inline int atomic_read(const atomic_t *v)
+{
+	return ACCESS_ONCE((v)->counter);
+}
+
+/**
+ * atomic_set - set atomic variable
+ * @v: pointer of type atomic_t
+ * @i: required value
+ *
+ * Atomically sets the value of @v to @i.
+ */
+static inline void atomic_set(atomic_t *v, int i)
+{
+	v->counter = i;
+}
+
+/**
+ * atomic_inc - increment atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically increments @v by 1.
+ */
+static inline void atomic_inc(atomic_t *v)
+{
+	asm volatile(LOCK_PREFIX "incl %0"
+		     : "+m" (v->counter));
+}
+
+/**
+ * atomic_dec_and_test - decrement and test
+ * @v: pointer of type atomic_t
+ *
+ * Atomically decrements @v by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline int atomic_dec_and_test(atomic_t *v)
+{
+	GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e");
+}
+
+#endif /* _TOOLS_LINUX_ASM_X86_ATOMIC_H */
diff --git a/tools/arch/x86/include/asm/barrier.h b/tools/arch/x86/include/asm/barrier.h
new file mode 100644
index 00000000000000..f366d8e550e4ae
--- /dev/null
+++ b/tools/arch/x86/include/asm/barrier.h
@@ -0,0 +1,28 @@
+#ifndef _TOOLS_LINUX_ASM_X86_BARRIER_H
+#define _TOOLS_LINUX_ASM_X86_BARRIER_H
+
+/*
+ * Copied from the Linux kernel sources, and also moving code
+ * out from tools/perf/perf-sys.h so as to make it be located
+ * in a place similar as in the kernel sources.
+ *
+ * Force strict CPU ordering.
+ * And yes, this is required on UP too when we're talking
+ * to devices.
+ */
+
+#if defined(__i386__)
+/*
+ * Some non-Intel clones support out of order store. wmb() ceases to be a
+ * nop for these.
+ */
+#define mb()	asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#define rmb()	asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#define wmb()	asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#elif defined(__x86_64__)
+#define mb() 	asm volatile("mfence":::"memory")
+#define rmb()	asm volatile("lfence":::"memory")
+#define wmb()	asm volatile("sfence" ::: "memory")
+#endif
+
+#endif /* _TOOLS_LINUX_ASM_X86_BARRIER_H */
diff --git a/tools/arch/x86/include/asm/rmwcc.h b/tools/arch/x86/include/asm/rmwcc.h
new file mode 100644
index 00000000000000..a6669bc069395c
--- /dev/null
+++ b/tools/arch/x86/include/asm/rmwcc.h
@@ -0,0 +1,41 @@
+#ifndef _TOOLS_LINUX_ASM_X86_RMWcc
+#define _TOOLS_LINUX_ASM_X86_RMWcc
+
+#ifdef CC_HAVE_ASM_GOTO
+
+#define __GEN_RMWcc(fullop, var, cc, ...)				\
+do {									\
+	asm_volatile_goto (fullop "; j" cc " %l[cc_label]"		\
+			: : "m" (var), ## __VA_ARGS__ 			\
+			: "memory" : cc_label);				\
+	return 0;							\
+cc_label:								\
+	return 1;							\
+} while (0)
+
+#define GEN_UNARY_RMWcc(op, var, arg0, cc) 				\
+	__GEN_RMWcc(op " " arg0, var, cc)
+
+#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc)			\
+	__GEN_RMWcc(op " %1, " arg0, var, cc, vcon (val))
+
+#else /* !CC_HAVE_ASM_GOTO */
+
+#define __GEN_RMWcc(fullop, var, cc, ...)				\
+do {									\
+	char c;								\
+	asm volatile (fullop "; set" cc " %1"				\
+			: "+m" (var), "=qm" (c)				\
+			: __VA_ARGS__ : "memory");			\
+	return c != 0;							\
+} while (0)
+
+#define GEN_UNARY_RMWcc(op, var, arg0, cc)				\
+	__GEN_RMWcc(op " " arg0, var, cc)
+
+#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc)			\
+	__GEN_RMWcc(op " %2, " arg0, var, cc, vcon (val))
+
+#endif /* CC_HAVE_ASM_GOTO */
+
+#endif /* _TOOLS_LINUX_ASM_X86_RMWcc */
diff --git a/tools/arch/xtensa/include/asm/barrier.h b/tools/arch/xtensa/include/asm/barrier.h
new file mode 100644
index 00000000000000..583800bd725907
--- /dev/null
+++ b/tools/arch/xtensa/include/asm/barrier.h
@@ -0,0 +1,18 @@
+/*
+ * Copied from the kernel sources to tools/:
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2001 - 2012 Tensilica Inc.
+ */
+
+#ifndef _TOOLS_LINUX_XTENSA_SYSTEM_H
+#define _TOOLS_LINUX_XTENSA_SYSTEM_H
+
+#define mb()  ({ __asm__ __volatile__("memw" : : : "memory"); })
+#define rmb() barrier()
+#define wmb() mb()
+
+#endif /* _TOOLS_LINUX_XTENSA_SYSTEM_H */
diff --git a/tools/build/Makefile.build b/tools/build/Makefile.build
index 10df57237a66d2..69c35cf09cad25 100644
--- a/tools/build/Makefile.build
+++ b/tools/build/Makefile.build
@@ -37,7 +37,7 @@ subdir-obj-y :=
 
 # Build definitions
 build-file := $(dir)/Build
-include $(build-file)
+-include $(build-file)
 
 quiet_cmd_flex  = FLEX     $@
 quiet_cmd_bison = BISON    $@
diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature
index 3a0b0ca2a28c1b..2975632d51e234 100644
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@@ -27,7 +27,7 @@ endef
 #   the rule that uses them - an example for that is the 'bionic'
 #   feature check. ]
 #
-FEATURE_TESTS =			\
+FEATURE_TESTS ?=			\
 	backtrace			\
 	dwarf				\
 	fortify-source			\
@@ -53,7 +53,7 @@ FEATURE_TESTS =			\
 	zlib				\
 	lzma
 
-FEATURE_DISPLAY =			\
+FEATURE_DISPLAY ?=			\
 	dwarf				\
 	glibc				\
 	gtk2				\
diff --git a/tools/build/tests/ex/Build b/tools/build/tests/ex/Build
index 0e6c3e6767e6c5..70d876237c5709 100644
--- a/tools/build/tests/ex/Build
+++ b/tools/build/tests/ex/Build
@@ -2,6 +2,7 @@ ex-y += ex.o
 ex-y += a.o
 ex-y += b.o
 ex-y += empty/
+ex-y += empty2/
 
 libex-y += c.o
 libex-y += d.o
diff --git a/tools/build/tests/ex/empty2/README b/tools/build/tests/ex/empty2/README
new file mode 100644
index 00000000000000..2107cc5bf5a9f9
--- /dev/null
+++ b/tools/build/tests/ex/empty2/README
@@ -0,0 +1,2 @@
+This directory is left intentionally without Build file
+to test proper nesting into Build-less directories.
diff --git a/tools/include/asm-generic/atomic-gcc.h b/tools/include/asm-generic/atomic-gcc.h
new file mode 100644
index 00000000000000..2ba78c9f570108
--- /dev/null
+++ b/tools/include/asm-generic/atomic-gcc.h
@@ -0,0 +1,63 @@
+#ifndef __TOOLS_ASM_GENERIC_ATOMIC_H
+#define __TOOLS_ASM_GENERIC_ATOMIC_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+/*
+ * Atomic operations that C can't guarantee us.  Useful for
+ * resource counting etc..
+ *
+ * Excerpts obtained from the Linux kernel sources.
+ */
+
+#define ATOMIC_INIT(i)	{ (i) }
+
+/**
+ * atomic_read - read atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically reads the value of @v.
+ */
+static inline int atomic_read(const atomic_t *v)
+{
+	return ACCESS_ONCE((v)->counter);
+}
+
+/**
+ * atomic_set - set atomic variable
+ * @v: pointer of type atomic_t
+ * @i: required value
+ *
+ * Atomically sets the value of @v to @i.
+ */
+static inline void atomic_set(atomic_t *v, int i)
+{
+        v->counter = i;
+}
+
+/**
+ * atomic_inc - increment atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically increments @v by 1.
+ */
+static inline void atomic_inc(atomic_t *v)
+{
+	__sync_add_and_fetch(&v->counter, 1);
+}
+
+/**
+ * atomic_dec_and_test - decrement and test
+ * @v: pointer of type atomic_t
+ *
+ * Atomically decrements @v by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline int atomic_dec_and_test(atomic_t *v)
+{
+	return __sync_sub_and_fetch(&v->counter, 1) == 0;
+}
+
+#endif /* __TOOLS_ASM_GENERIC_ATOMIC_H */
diff --git a/tools/include/asm-generic/barrier.h b/tools/include/asm-generic/barrier.h
new file mode 100644
index 00000000000000..47b933903eaf3d
--- /dev/null
+++ b/tools/include/asm-generic/barrier.h
@@ -0,0 +1,44 @@
+/*
+ * Copied from the kernel sources to tools/perf/:
+ *
+ * Generic barrier definitions, originally based on MN10300 definitions.
+ *
+ * It should be possible to use these on really simple architectures,
+ * but it serves more as a starting point for new ports.
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+#ifndef __TOOLS_LINUX_ASM_GENERIC_BARRIER_H
+#define __TOOLS_LINUX_ASM_GENERIC_BARRIER_H
+
+#ifndef __ASSEMBLY__
+
+#include <linux/compiler.h>
+
+/*
+ * Force strict CPU ordering. And yes, this is required on UP too when we're
+ * talking to devices.
+ *
+ * Fall back to compiler barriers if nothing better is provided.
+ */
+
+#ifndef mb
+#define mb()	barrier()
+#endif
+
+#ifndef rmb
+#define rmb()	mb()
+#endif
+
+#ifndef wmb
+#define wmb()	mb()
+#endif
+
+#endif /* !__ASSEMBLY__ */
+#endif /* __TOOLS_LINUX_ASM_GENERIC_BARRIER_H */
diff --git a/tools/include/asm/atomic.h b/tools/include/asm/atomic.h
new file mode 100644
index 00000000000000..70794f538a8601
--- /dev/null
+++ b/tools/include/asm/atomic.h
@@ -0,0 +1,10 @@
+#ifndef __TOOLS_LINUX_ASM_ATOMIC_H
+#define __TOOLS_LINUX_ASM_ATOMIC_H
+
+#if defined(__i386__) || defined(__x86_64__)
+#include "../../arch/x86/include/asm/atomic.h"
+#else
+#include <asm-generic/atomic-gcc.h>
+#endif
+
+#endif /* __TOOLS_LINUX_ASM_ATOMIC_H */
diff --git a/tools/include/asm/barrier.h b/tools/include/asm/barrier.h
new file mode 100644
index 00000000000000..ac66ac5946852d
--- /dev/null
+++ b/tools/include/asm/barrier.h
@@ -0,0 +1,27 @@
+#if defined(__i386__) || defined(__x86_64__)
+#include "../../arch/x86/include/asm/barrier.h"
+#elif defined(__arm__)
+#include "../../arch/arm/include/asm/barrier.h"
+#elif defined(__aarch64__)
+#include "../../arch/arm64/include/asm/barrier.h"
+#elif defined(__powerpc__)
+#include "../../arch/powerpc/include/asm/barrier.h"
+#elif defined(__s390__)
+#include "../../arch/s390/include/asm/barrier.h"
+#elif defined(__sh__)
+#include "../../arch/sh/include/asm/barrier.h"
+#elif defined(__sparc__)
+#include "../../arch/sparc/include/asm/barrier.h"
+#elif defined(__tile__)
+#include "../../arch/tile/include/asm/barrier.h"
+#elif defined(__alpha__)
+#include "../../arch/alpha/include/asm/barrier.h"
+#elif defined(__mips__)
+#include "../../arch/mips/include/asm/barrier.h"
+#elif defined(__ia64__)
+#include "../../arch/ia64/include/asm/barrier.h"
+#elif defined(__xtensa__)
+#include "../../arch/xtensa/include/asm/barrier.h"
+#else
+#include <asm-generic/barrier.h>
+#endif
diff --git a/tools/include/linux/atomic.h b/tools/include/linux/atomic.h
new file mode 100644
index 00000000000000..4e3d3d18ebab62
--- /dev/null
+++ b/tools/include/linux/atomic.h
@@ -0,0 +1,6 @@
+#ifndef __TOOLS_LINUX_ATOMIC_H
+#define __TOOLS_LINUX_ATOMIC_H
+
+#include <asm/atomic.h>
+
+#endif /* __TOOLS_LINUX_ATOMIC_H */
diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h
index 88461f09cc860b..f0e72674c52d2c 100644
--- a/tools/include/linux/compiler.h
+++ b/tools/include/linux/compiler.h
@@ -1,6 +1,10 @@
 #ifndef _TOOLS_LINUX_COMPILER_H_
 #define _TOOLS_LINUX_COMPILER_H_
 
+/* Optimization barrier */
+/* The "volatile" is due to gcc bugs */
+#define barrier() __asm__ __volatile__("": : :"memory")
+
 #ifndef __always_inline
 # define __always_inline	inline __attribute__((always_inline))
 #endif
diff --git a/tools/include/linux/types.h b/tools/include/linux/types.h
index b5cf25e05df2ec..8ebf6278b2ef23 100644
--- a/tools/include/linux/types.h
+++ b/tools/include/linux/types.h
@@ -60,6 +60,14 @@ typedef __u32 __bitwise __be32;
 typedef __u64 __bitwise __le64;
 typedef __u64 __bitwise __be64;
 
+typedef struct {
+	int counter;
+} atomic_t;
+
+#ifndef __aligned_u64
+# define __aligned_u64 __u64 __attribute__((aligned(8)))
+#endif
+
 struct list_head {
 	struct list_head *next, *prev;
 };
diff --git a/tools/lib/traceevent/Makefile b/tools/lib/traceevent/Makefile
index d410da335e3dae..84640394ebf987 100644
--- a/tools/lib/traceevent/Makefile
+++ b/tools/lib/traceevent/Makefile
@@ -34,9 +34,15 @@ INSTALL = install
 DESTDIR ?=
 DESTDIR_SQ = '$(subst ','\'',$(DESTDIR))'
 
+LP64 := $(shell echo __LP64__ | ${CC} ${CFLAGS} -E -x c - | tail -n 1)
+ifeq ($(LP64), 1)
+  libdir_relative = lib64
+else
+  libdir_relative = lib
+endif
+
 prefix ?= /usr/local
-bindir_relative = bin
-bindir = $(prefix)/$(bindir_relative)
+libdir = $(prefix)/$(libdir_relative)
 man_dir = $(prefix)/share/man
 man_dir_SQ = '$(subst ','\'',$(man_dir))'
 
@@ -58,7 +64,7 @@ ifeq ($(prefix),$(HOME))
 override plugin_dir = $(HOME)/.traceevent/plugins
 set_plugin_dir := 0
 else
-override plugin_dir = $(prefix)/lib/traceevent/plugins
+override plugin_dir = $(libdir)/traceevent/plugins
 endif
 endif
 
@@ -85,11 +91,11 @@ srctree := $(patsubst %/,%,$(dir $(srctree)))
 #$(info Determined 'srctree' to be $(srctree))
 endif
 
-export prefix bindir src obj
+export prefix libdir src obj
 
 # Shell quotes
-bindir_SQ = $(subst ','\'',$(bindir))
-bindir_relative_SQ = $(subst ','\'',$(bindir_relative))
+libdir_SQ = $(subst ','\'',$(libdir))
+libdir_relative_SQ = $(subst ','\'',$(libdir_relative))
 plugin_dir_SQ = $(subst ','\'',$(plugin_dir))
 
 LIB_FILE = libtraceevent.a libtraceevent.so
@@ -240,7 +246,7 @@ endef
 
 install_lib: all_cmd install_plugins
 	$(call QUIET_INSTALL, $(LIB_FILE)) \
-		$(call do_install,$(LIB_FILE),$(bindir_SQ))
+		$(call do_install,$(LIB_FILE),$(libdir_SQ))
 
 install_plugins: $(PLUGINS)
 	$(call QUIET_INSTALL, trace_plugins) \
diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c
index 29f94f6f0d9e9e..cc25f059ab3dfc 100644
--- a/tools/lib/traceevent/event-parse.c
+++ b/tools/lib/traceevent/event-parse.c
@@ -1387,7 +1387,7 @@ static int event_read_fields(struct event_format *event, struct format_field **f
 			do_warning_event(event, "%s: no type found", __func__);
 			goto fail;
 		}
-		field->name = last_token;
+		field->name = field->alias = last_token;
 
 		if (test_type(type, EVENT_OP))
 			goto fail;
@@ -1469,7 +1469,7 @@ static int event_read_fields(struct event_format *event, struct format_field **f
 				size_dynamic = type_size(field->name);
 				free_token(field->name);
 				strcat(field->type, brackets);
-				field->name = token;
+				field->name = field->alias = token;
 				type = read_token(&token);
 			} else {
 				char *new_type;
@@ -6444,6 +6444,8 @@ void pevent_ref(struct pevent *pevent)
 void pevent_free_format_field(struct format_field *field)
 {
 	free(field->type);
+	if (field->alias != field->name)
+		free(field->alias);
 	free(field->name);
 	free(field);
 }
diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h
index 86a5839fb048e8..063b1971eb3528 100644
--- a/tools/lib/traceevent/event-parse.h
+++ b/tools/lib/traceevent/event-parse.h
@@ -191,6 +191,7 @@ struct format_field {
 	struct event_format	*event;
 	char			*type;
 	char			*name;
+	char			*alias;
 	int			offset;
 	int			size;
 	unsigned int		arraylen;
diff --git a/tools/lib/traceevent/plugin_cfg80211.c b/tools/lib/traceevent/plugin_cfg80211.c
index 4592d84383188e..ec57d0c1fbc2b6 100644
--- a/tools/lib/traceevent/plugin_cfg80211.c
+++ b/tools/lib/traceevent/plugin_cfg80211.c
@@ -4,6 +4,19 @@
 #include <endian.h>
 #include "event-parse.h"
 
+/*
+ * From glibc endian.h, for older systems where it is not present, e.g.: RHEL5,
+ * Fedora6.
+ */
+#ifndef le16toh
+# if __BYTE_ORDER == __LITTLE_ENDIAN
+#  define le16toh(x) (x)
+# else
+#  define le16toh(x) __bswap_16 (x)
+# endif
+#endif
+
+
 static unsigned long long
 process___le16_to_cpup(struct trace_seq *s, unsigned long long *args)
 {
diff --git a/tools/perf/Documentation/callchain-overhead-calculation.txt b/tools/perf/Documentation/callchain-overhead-calculation.txt
new file mode 100644
index 00000000000000..1a757927195ede
--- /dev/null
+++ b/tools/perf/Documentation/callchain-overhead-calculation.txt
@@ -0,0 +1,108 @@
+Overhead calculation
+--------------------
+The overhead can be shown in two columns as 'Children' and 'Self' when
+perf collects callchains.  The 'self' overhead is simply calculated by
+adding all period values of the entry - usually a function (symbol).
+This is the value that perf shows traditionally and sum of all the
+'self' overhead values should be 100%.
+
+The 'children' overhead is calculated by adding all period values of
+the child functions so that it can show the total overhead of the
+higher level functions even if they don't directly execute much.
+'Children' here means functions that are called from another (parent)
+function.
+
+It might be confusing that the sum of all the 'children' overhead
+values exceeds 100% since each of them is already an accumulation of
+'self' overhead of its child functions.  But with this enabled, users
+can find which function has the most overhead even if samples are
+spread over the children.
+
+Consider the following example; there are three functions like below.
+
+-----------------------
+void foo(void) {
+    /* do something */
+}
+
+void bar(void) {
+    /* do something */
+    foo();
+}
+
+int main(void) {
+    bar()
+    return 0;
+}
+-----------------------
+
+In this case 'foo' is a child of 'bar', and 'bar' is an immediate
+child of 'main' so 'foo' also is a child of 'main'.  In other words,
+'main' is a parent of 'foo' and 'bar', and 'bar' is a parent of 'foo'.
+
+Suppose all samples are recorded in 'foo' and 'bar' only.  When it's
+recorded with callchains the output will show something like below
+in the usual (self-overhead-only) output of perf report:
+
+----------------------------------
+Overhead  Symbol
+........  .....................
+  60.00%  foo
+          |
+          --- foo
+              bar
+              main
+              __libc_start_main
+
+  40.00%  bar
+          |
+          --- bar
+              main
+              __libc_start_main
+----------------------------------
+
+When the --children option is enabled, the 'self' overhead values of
+child functions (i.e. 'foo' and 'bar') are added to the parents to
+calculate the 'children' overhead.  In this case the report could be
+displayed as:
+
+-------------------------------------------
+Children      Self  Symbol
+........  ........  ....................
+ 100.00%     0.00%  __libc_start_main
+          |
+          --- __libc_start_main
+
+ 100.00%     0.00%  main
+          |
+          --- main
+              __libc_start_main
+
+ 100.00%    40.00%  bar
+          |
+          --- bar
+              main
+              __libc_start_main
+
+  60.00%    60.00%  foo
+          |
+          --- foo
+              bar
+              main
+              __libc_start_main
+-------------------------------------------
+
+In the above output, the 'self' overhead of 'foo' (60%) was add to the
+'children' overhead of 'bar', 'main' and '\_\_libc_start_main'.
+Likewise, the 'self' overhead of 'bar' (40%) was added to the
+'children' overhead of 'main' and '\_\_libc_start_main'.
+
+So '\_\_libc_start_main' and 'main' are shown first since they have
+same (100%) 'children' overhead (even though they have zero 'self'
+overhead) and they are the parents of 'foo' and 'bar'.
+
+Since v3.16 the 'children' overhead is shown by default and the output
+is sorted by its values. The 'children' overhead is disabled by
+specifying --no-children option on the command line or by adding
+'report.children = false' or 'top.children = false' in the perf config
+file.
diff --git a/tools/perf/Documentation/perf-bench.txt b/tools/perf/Documentation/perf-bench.txt
index f6480cbf309b40..bf3d0644bf1066 100644
--- a/tools/perf/Documentation/perf-bench.txt
+++ b/tools/perf/Documentation/perf-bench.txt
@@ -210,6 +210,9 @@ Suite for evaluating hash tables.
 *wake*::
 Suite for evaluating wake calls.
 
+*wake-parallel*::
+Suite for evaluating parallel wake calls.
+
 *requeue*::
 Suite for evaluating requeue calls.
 
diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt
index dc7442cf3d7f80..b876ae312699b9 100644
--- a/tools/perf/Documentation/perf-inject.txt
+++ b/tools/perf/Documentation/perf-inject.txt
@@ -44,6 +44,33 @@ OPTIONS
 --kallsyms=<file>::
 	kallsyms pathname
 
+--itrace::
+	Decode Instruction Tracing data, replacing it with synthesized events.
+	Options are:
+
+		i	synthesize instructions events
+		b	synthesize branches events
+		c	synthesize branches events (calls only)
+		r	synthesize branches events (returns only)
+		x	synthesize transactions events
+		e	synthesize error events
+		d	create a debug log
+		g	synthesize a call chain (use with i or x)
+
+	The default is all events i.e. the same as --itrace=ibxe
+
+	In addition, the period (default 100000) for instructions events
+	can be specified in units of:
+
+		i	instructions
+		t	ticks
+		ms	milliseconds
+		us	microseconds
+		ns	nanoseconds (default)
+
+	Also the call chain size (default 16, max. 1024) for instructions or
+	transactions events can be specified.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1]
diff --git a/tools/perf/Documentation/perf-kmem.txt b/tools/perf/Documentation/perf-kmem.txt
index 23219c65c16f77..ff0f433b3fce1e 100644
--- a/tools/perf/Documentation/perf-kmem.txt
+++ b/tools/perf/Documentation/perf-kmem.txt
@@ -37,7 +37,11 @@ OPTIONS
 
 -s <key[,key2...]>::
 --sort=<key[,key2...]>::
-	Sort the output (default: frag,hit,bytes)
+	Sort the output (default: 'frag,hit,bytes' for slab and 'bytes,hit'
+	for page).  Available sort keys are 'ptr, callsite, bytes, hit,
+	pingpong, frag' for slab and 'page, callsite, bytes, hit, order,
+	migtype, gfp' for page.  This option should be preceded by one of the
+	mode selection options - i.e. --slab, --page, --alloc and/or --caller.
 
 -l <num>::
 --line=<num>::
@@ -52,6 +56,11 @@ OPTIONS
 --page::
 	Analyze page allocator events
 
+--live::
+	Show live page stat.  The perf kmem shows total allocation stat by
+	default, but this option shows live (currently allocated) pages
+	instead.  (This option works with --page option only)
+
 SEE ALSO
 --------
 linkperf:perf-record[1]
diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
index 239609c09f83a1..3a8a9ba2b0412a 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -14,11 +14,13 @@ or
 or
 'perf probe' [options] --del='[GROUP:]EVENT' [...]
 or
-'perf probe' --list
+'perf probe' --list[=[GROUP:]EVENT]
 or
 'perf probe' [options] --line='LINE'
 or
 'perf probe' [options] --vars='PROBEPOINT'
+or
+'perf probe' [options] --funcs
 
 DESCRIPTION
 -----------
@@ -64,8 +66,8 @@ OPTIONS
 	classes(e.g. [a-z], [!A-Z]).
 
 -l::
---list::
-	List up current probe events.
+--list[=[GROUP:]EVENT]::
+	List up current probe events. This can also accept filtering patterns of event names.
 
 -L::
 --line=::
@@ -81,10 +83,15 @@ OPTIONS
 	(Only for --vars) Show external defined variables in addition to local
 	variables.
 
+--no-inlines::
+	(Only for --add) Search only for non-inlined functions. The functions
+	which do not have instances are ignored.
+
 -F::
---funcs::
+--funcs[=FILTER]::
 	Show available functions in given module or kernel. With -x/--exec,
 	can also list functions in a user space executable / shared library.
+	This also can accept a FILTER rule argument.
 
 --filter=FILTER::
 	(Only for --vars and --funcs) Set filter. FILTER is a combination of glob
@@ -148,7 +155,7 @@ Each probe argument follows below syntax.
  [NAME=]LOCALVAR|$retval|%REG|@SYMBOL[:TYPE]
 
 'NAME' specifies the name of this argument (optional). You can use the name of local variable, local data structure member (e.g. var->field, var.field2), local array with fixed index (e.g. array[1], var->array[0], var->pointer[2]), or kprobe-tracer argument format (e.g. $retval, %ax, etc). Note that the name of this argument will be set as the last member name if you specify a local data structure member (e.g. field2 for 'var->field1.field2'.)
-'$vars' special argument is also available for NAME, it is expanded to the local variables which can access at given probe point.
+'$vars' and '$params' special arguments are also available for NAME, '$vars' is expanded to the local variables (including function parameters) which can access at given probe point. '$params' is expanded to only the function parameters.
 'TYPE' casts the type of this argument (optional). If omitted, perf probe automatically set the type based on debuginfo. You can specify 'string' type only for the local variable or structure member which is an array of or a pointer to 'char' or 'unsigned char' type.
 
 On x86 systems %REG is always the short form of the register: for example %AX. %RAX or %EAX is not valid.
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 4847a793de6516..280533ebf9df43 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -108,6 +108,8 @@ OPTIONS
 	Number of mmap data pages (must be a power of two) or size
 	specification with appended unit character - B/K/M/G. The
 	size is rounded up to have nearest pages power of two value.
+	Also, by adding a comma, the number of mmap pages for AUX
+	area tracing can be specified.
 
 --group::
 	Put all events in a single event group.  This precedes the --event
@@ -145,7 +147,8 @@ OPTIONS
 
 -s::
 --stat::
-	Per thread counts.
+	Record per-thread event counts.  Use it with 'perf report -T' to see
+	the values.
 
 -d::
 --data::
@@ -257,6 +260,13 @@ records. See clock_gettime(). In particular CLOCK_MONOTONIC and
 CLOCK_MONOTONIC_RAW are supported, some events might also allow
 CLOCK_BOOTTIME, CLOCK_REALTIME and CLOCK_TAI.
 
+-S::
+--snapshot::
+Select AUX area tracing Snapshot Mode. This option is valid only with an
+AUX area tracing event. Optionally the number of bytes to capture per
+snapshot can be specified. In Snapshot Mode, trace data is captured only when
+signal SIGUSR2 is received.
+
 SEE ALSO
 --------
 linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 4879cf63882482..c33b69f3374fda 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -34,7 +34,8 @@ OPTIONS
 
 -T::
 --threads::
-	Show per-thread event counters
+	Show per-thread event counters.  The input data file should be recorded
+	with -s option.
 -c::
 --comms=::
 	Only consider symbols in these comms. CSV that understands
@@ -193,6 +194,7 @@ OPTIONS
 	Accumulate callchain of children to parent entry so that then can
 	show up in the output.  The output will have a new "Children" column
 	and will be sorted on the data.  It requires callchains are recorded.
+	See the `overhead calculation' section for more details.
 
 --max-stack::
 	Set the stack depth limit when parsing the callchain, anything
@@ -323,6 +325,37 @@ OPTIONS
 --header-only::
 	Show only perf.data header (forces --stdio).
 
+--itrace::
+	Options for decoding instruction tracing data. The options are:
+
+		i	synthesize instructions events
+		b	synthesize branches events
+		c	synthesize branches events (calls only)
+		r	synthesize branches events (returns only)
+		x	synthesize transactions events
+		e	synthesize error events
+		d	create a debug log
+		g	synthesize a call chain (use with i or x)
+
+	The default is all events i.e. the same as --itrace=ibxe
+
+	In addition, the period (default 100000) for instructions events
+	can be specified in units of:
+
+		i	instructions
+		t	ticks
+		ms	milliseconds
+		us	microseconds
+		ns	nanoseconds (default)
+
+	Also the call chain size (default 16, max. 1024) for instructions or
+	transactions events can be specified.
+
+	To disable decoding entirely, use --no-itrace.
+
+
+include::callchain-overhead-calculation.txt[]
+
 SEE ALSO
 --------
 linkperf:perf-stat[1], linkperf:perf-annotate[1]
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 79445750fcb322..c82df572fac2ed 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -115,7 +115,8 @@ OPTIONS
 -f::
 --fields::
         Comma separated list of fields to print. Options are:
-        comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff, srcline, period.
+        comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
+	srcline, period, flags.
         Field list can be prepended with the type, trace, sw or hw,
         to indicate to which event type the field list applies.
         e.g., -f sw:comm,tid,time,ip,sym  and -f trace:time,cpu,trace
@@ -165,6 +166,12 @@ OPTIONS
 
 	At this point usage is displayed, and perf-script exits.
 
+	The flags field is synthesized and may have a value when Instruction
+	Trace decoding. The flags are "bcrosyiABEx" which stand for branch,
+	call, return, conditional, system, asynchronous, interrupt,
+	transaction abort, trace begin, trace end, and in transaction,
+	respectively.
+
 	Finally, a user may not set fields to none for all event types.
 	i.e., -f "" is not allowed.
 
@@ -221,6 +228,34 @@ OPTIONS
 --header-only
 	Show only perf.data header.
 
+--itrace::
+	Options for decoding instruction tracing data. The options are:
+
+		i	synthesize instructions events
+		b	synthesize branches events
+		c	synthesize branches events (calls only)
+		r	synthesize branches events (returns only)
+		x	synthesize transactions events
+		e	synthesize error events
+		d	create a debug log
+		g	synthesize a call chain (use with i or x)
+
+	The default is all events i.e. the same as --itrace=ibxe
+
+	In addition, the period (default 100000) for instructions events
+	can be specified in units of:
+
+		i	instructions
+		t	ticks
+		ms	milliseconds
+		us	microseconds
+		ns	nanoseconds (default)
+
+	Also the call chain size (default 16, max. 1024) for instructions or
+	transactions events can be specified.
+
+	To disable decoding entirely, use --no-itrace.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-script-perl[1],
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
index 3265b107051880..9e5b07eb7d3509 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -168,7 +168,7 @@ Default is to monitor all CPUS.
 	Accumulate callchain of children to parent entry so that then can
 	show up in the output.  The output will have a new "Children" column
 	and will be sorted on the data.  It requires -g/--call-graph option
-	enabled.
+	enabled.  See the `overhead calculation' section for more details.
 
 --max-stack::
 	Set the stack depth limit when parsing the callchain, anything
@@ -234,6 +234,7 @@ INTERACTIVE PROMPTING KEYS
 
 Pressing any unmapped key displays a menu, and prompts for input.
 
+include::callchain-overhead-calculation.txt[]
 
 SEE ALSO
 --------
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
index ba03fd5d1a5476..1db9c8b79880e2 100644
--- a/tools/perf/Documentation/perf-trace.txt
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -35,7 +35,7 @@ OPTIONS
 
 -e::
 --expr::
-	List of events to show, currently only syscall names.
+	List of syscalls to show, currently only syscall names.
 	Prefixing with ! shows all syscalls but the ones specified.  You may
 	need to escape it.
 
diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST
index 11ccbb22ea2b8f..a83cf75164e1e6 100644
--- a/tools/perf/MANIFEST
+++ b/tools/perf/MANIFEST
@@ -1,12 +1,30 @@
 tools/perf
+tools/arch/alpha/include/asm/barrier.h
+tools/arch/arm/include/asm/barrier.h
+tools/arch/ia64/include/asm/barrier.h
+tools/arch/mips/include/asm/barrier.h
+tools/arch/powerpc/include/asm/barrier.h
+tools/arch/s390/include/asm/barrier.h
+tools/arch/sh/include/asm/barrier.h
+tools/arch/sparc/include/asm/barrier.h
+tools/arch/sparc/include/asm/barrier_32.h
+tools/arch/sparc/include/asm/barrier_64.h
+tools/arch/tile/include/asm/barrier.h
+tools/arch/x86/include/asm/barrier.h
+tools/arch/xtensa/include/asm/barrier.h
 tools/scripts
 tools/build
+tools/arch/x86/include/asm/atomic.h
+tools/arch/x86/include/asm/rmwcc.h
 tools/lib/traceevent
 tools/lib/api
 tools/lib/symbol/kallsyms.c
 tools/lib/symbol/kallsyms.h
 tools/lib/util/find_next_bit.c
+tools/include/asm/atomic.h
+tools/include/asm/barrier.h
 tools/include/asm/bug.h
+tools/include/asm-generic/barrier.h
 tools/include/asm-generic/bitops/arch_hweight.h
 tools/include/asm-generic/bitops/atomic.h
 tools/include/asm-generic/bitops/const_hweight.h
@@ -17,6 +35,7 @@ tools/include/asm-generic/bitops/fls64.h
 tools/include/asm-generic/bitops/fls.h
 tools/include/asm-generic/bitops/hweight.h
 tools/include/asm-generic/bitops.h
+tools/include/linux/atomic.h
 tools/include/linux/bitops.h
 tools/include/linux/compiler.h
 tools/include/linux/export.h
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index c43a2051759157..5816a3bb7e9fb0 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -73,6 +73,8 @@ include config/utilities.mak
 # for CTF data format.
 #
 # Define NO_LZMA if you do not want to support compressed (xz) kernel modules
+#
+# Define NO_AUXTRACE if you do not want AUX area tracing support
 
 ifeq ($(srctree),)
 srctree := $(patsubst %/,%,$(dir $(shell pwd)))
@@ -462,7 +464,7 @@ check: $(OUTPUT)common-cmds.h
 
 install-gtk:
 
-install-bin: all install-gtk
+install-tools: all install-gtk
 	$(call QUIET_INSTALL, binaries) \
 		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'; \
 		$(INSTALL) $(OUTPUT)perf '$(DESTDIR_SQ)$(bindir_SQ)'; \
@@ -500,12 +502,16 @@ endif
 	$(call QUIET_INSTALL, perf_completion-script) \
 		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(sysconfdir_SQ)/bash_completion.d'; \
 		$(INSTALL) perf-completion.sh '$(DESTDIR_SQ)$(sysconfdir_SQ)/bash_completion.d/perf'
+
+install-tests: all install-gtk
 	$(call QUIET_INSTALL, tests) \
 		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests'; \
 		$(INSTALL) tests/attr.py '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests'; \
 		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr'; \
 		$(INSTALL) tests/attr/* '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr'
 
+install-bin: install-tools install-tests
+
 install: install-bin try-install-man install-traceevent-plugins
 
 install-python_ext:
diff --git a/tools/perf/arch/arm64/Build b/tools/perf/arch/arm64/Build
index 54afe4a467e7d9..41bf61da476a4c 100644
--- a/tools/perf/arch/arm64/Build
+++ b/tools/perf/arch/arm64/Build
@@ -1 +1,2 @@
 libperf-y += util/
+libperf-$(CONFIG_DWARF_UNWIND) += tests/
diff --git a/tools/perf/arch/arm64/include/perf_regs.h b/tools/perf/arch/arm64/include/perf_regs.h
index 1d3f39c3aa564f..4e5af27e3fbfa5 100644
--- a/tools/perf/arch/arm64/include/perf_regs.h
+++ b/tools/perf/arch/arm64/include/perf_regs.h
@@ -5,8 +5,11 @@
 #include <linux/types.h>
 #include <asm/perf_regs.h>
 
+void perf_regs_load(u64 *regs);
+
 #define PERF_REGS_MASK	((1ULL << PERF_REG_ARM64_MAX) - 1)
 #define PERF_REGS_MAX	PERF_REG_ARM64_MAX
+#define PERF_SAMPLE_REGS_ABI	PERF_SAMPLE_REGS_ABI_64
 
 #define PERF_REG_IP	PERF_REG_ARM64_PC
 #define PERF_REG_SP	PERF_REG_ARM64_SP
diff --git a/tools/perf/arch/arm64/tests/Build b/tools/perf/arch/arm64/tests/Build
new file mode 100644
index 00000000000000..b30eff9bcc83fd
--- /dev/null
+++ b/tools/perf/arch/arm64/tests/Build
@@ -0,0 +1,2 @@
+libperf-y += regs_load.o
+libperf-y += dwarf-unwind.o
diff --git a/tools/perf/arch/arm64/tests/dwarf-unwind.c b/tools/perf/arch/arm64/tests/dwarf-unwind.c
new file mode 100644
index 00000000000000..cf04a4c91c596e
--- /dev/null
+++ b/tools/perf/arch/arm64/tests/dwarf-unwind.c
@@ -0,0 +1,61 @@
+#include <string.h>
+#include "perf_regs.h"
+#include "thread.h"
+#include "map.h"
+#include "event.h"
+#include "debug.h"
+#include "tests/tests.h"
+
+#define STACK_SIZE 8192
+
+static int sample_ustack(struct perf_sample *sample,
+		struct thread *thread, u64 *regs)
+{
+	struct stack_dump *stack = &sample->user_stack;
+	struct map *map;
+	unsigned long sp;
+	u64 stack_size, *buf;
+
+	buf = malloc(STACK_SIZE);
+	if (!buf) {
+		pr_debug("failed to allocate sample uregs data\n");
+		return -1;
+	}
+
+	sp = (unsigned long) regs[PERF_REG_ARM64_SP];
+
+	map = map_groups__find(thread->mg, MAP__VARIABLE, (u64) sp);
+	if (!map) {
+		pr_debug("failed to get stack map\n");
+		free(buf);
+		return -1;
+	}
+
+	stack_size = map->end - sp;
+	stack_size = stack_size > STACK_SIZE ? STACK_SIZE : stack_size;
+
+	memcpy(buf, (void *) sp, stack_size);
+	stack->data = (char *) buf;
+	stack->size = stack_size;
+	return 0;
+}
+
+int test__arch_unwind_sample(struct perf_sample *sample,
+		struct thread *thread)
+{
+	struct regs_dump *regs = &sample->user_regs;
+	u64 *buf;
+
+	buf = calloc(1, sizeof(u64) * PERF_REGS_MAX);
+	if (!buf) {
+		pr_debug("failed to allocate sample uregs data\n");
+		return -1;
+	}
+
+	perf_regs_load(buf);
+	regs->abi  = PERF_SAMPLE_REGS_ABI;
+	regs->regs = buf;
+	regs->mask = PERF_REGS_MASK;
+
+	return sample_ustack(sample, thread, buf);
+}
diff --git a/tools/perf/arch/arm64/tests/regs_load.S b/tools/perf/arch/arm64/tests/regs_load.S
new file mode 100644
index 00000000000000..025b46e579a6fa
--- /dev/null
+++ b/tools/perf/arch/arm64/tests/regs_load.S
@@ -0,0 +1,46 @@
+#include <linux/linkage.h>
+
+.text
+.type perf_regs_load,%function
+#define STR_REG(r)	str x##r, [x0, 8 * r]
+#define LDR_REG(r)	ldr x##r, [x0, 8 * r]
+#define SP	(8 * 31)
+#define PC	(8 * 32)
+ENTRY(perf_regs_load)
+	STR_REG(0)
+	STR_REG(1)
+	STR_REG(2)
+	STR_REG(3)
+	STR_REG(4)
+	STR_REG(5)
+	STR_REG(6)
+	STR_REG(7)
+	STR_REG(8)
+	STR_REG(9)
+	STR_REG(10)
+	STR_REG(11)
+	STR_REG(12)
+	STR_REG(13)
+	STR_REG(14)
+	STR_REG(15)
+	STR_REG(16)
+	STR_REG(17)
+	STR_REG(18)
+	STR_REG(19)
+	STR_REG(20)
+	STR_REG(21)
+	STR_REG(22)
+	STR_REG(23)
+	STR_REG(24)
+	STR_REG(25)
+	STR_REG(26)
+	STR_REG(27)
+	STR_REG(28)
+	STR_REG(29)
+	STR_REG(30)
+	mov x1, sp
+	str x1, [x0, #SP]
+	str x30, [x0, #PC]
+	LDR_REG(1)
+	ret
+ENDPROC(perf_regs_load)
diff --git a/tools/perf/arch/common.c b/tools/perf/arch/common.c
index 49776f190abfab..b7bb42c4469401 100644
--- a/tools/perf/arch/common.c
+++ b/tools/perf/arch/common.c
@@ -61,7 +61,7 @@ const char *const mips_triplets[] = {
 static bool lookup_path(char *name)
 {
 	bool found = false;
-	char *path, *tmp;
+	char *path, *tmp = NULL;
 	char buf[PATH_MAX];
 	char *env = getenv("PATH");
 
diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build
index 0af6e9b3f72857..7b8b0d1a1b6260 100644
--- a/tools/perf/arch/powerpc/util/Build
+++ b/tools/perf/arch/powerpc/util/Build
@@ -1,4 +1,5 @@
 libperf-y += header.o
+libperf-y += sym-handling.o
 
 libperf-$(CONFIG_DWARF) += dwarf-regs.o
 libperf-$(CONFIG_DWARF) += skip-callchain-idx.o
diff --git a/tools/perf/arch/powerpc/util/sym-handling.c b/tools/perf/arch/powerpc/util/sym-handling.c
new file mode 100644
index 00000000000000..bbc1a50768dd5d
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/sym-handling.c
@@ -0,0 +1,82 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * Copyright (C) 2015 Naveen N. Rao, IBM Corporation
+ */
+
+#include "debug.h"
+#include "symbol.h"
+#include "map.h"
+#include "probe-event.h"
+
+#ifdef HAVE_LIBELF_SUPPORT
+bool elf__needs_adjust_symbols(GElf_Ehdr ehdr)
+{
+	return ehdr.e_type == ET_EXEC ||
+	       ehdr.e_type == ET_REL ||
+	       ehdr.e_type == ET_DYN;
+}
+
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+void arch__elf_sym_adjust(GElf_Sym *sym)
+{
+	sym->st_value += PPC64_LOCAL_ENTRY_OFFSET(sym->st_other);
+}
+#endif
+#endif
+
+#if !defined(_CALL_ELF) || _CALL_ELF != 2
+int arch__choose_best_symbol(struct symbol *syma,
+			     struct symbol *symb __maybe_unused)
+{
+	char *sym = syma->name;
+
+	/* Skip over any initial dot */
+	if (*sym == '.')
+		sym++;
+
+	/* Avoid "SyS" kernel syscall aliases */
+	if (strlen(sym) >= 3 && !strncmp(sym, "SyS", 3))
+		return SYMBOL_B;
+	if (strlen(sym) >= 10 && !strncmp(sym, "compat_SyS", 10))
+		return SYMBOL_B;
+
+	return SYMBOL_A;
+}
+
+/* Allow matching against dot variants */
+int arch__compare_symbol_names(const char *namea, const char *nameb)
+{
+	/* Skip over initial dot */
+	if (*namea == '.')
+		namea++;
+	if (*nameb == '.')
+		nameb++;
+
+	return strcmp(namea, nameb);
+}
+#endif
+
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+bool arch__prefers_symtab(void)
+{
+	return true;
+}
+
+#define PPC64LE_LEP_OFFSET	8
+
+void arch__fix_tev_from_maps(struct perf_probe_event *pev,
+			     struct probe_trace_event *tev, struct map *map)
+{
+	/*
+	 * ppc64 ABIv2 local entry point is currently always 2 instructions
+	 * (8 bytes) after the global entry point.
+	 */
+	if (!pev->uprobes && map->dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS) {
+		tev->point.address += PPC64LE_LEP_OFFSET;
+		tev->point.offset += PPC64LE_LEP_OFFSET;
+	}
+}
+#endif
diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build
index 5ce98023d518fc..c3ab760e06b4d7 100644
--- a/tools/perf/bench/Build
+++ b/tools/perf/bench/Build
@@ -3,6 +3,7 @@ perf-y += sched-pipe.o
 perf-y += mem-memcpy.o
 perf-y += futex-hash.o
 perf-y += futex-wake.o
+perf-y += futex-wake-parallel.o
 perf-y += futex-requeue.o
 
 perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o
diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h
index 3c4dd44d45cb7b..70b2f718cc2179 100644
--- a/tools/perf/bench/bench.h
+++ b/tools/perf/bench/bench.h
@@ -33,6 +33,8 @@ extern int bench_mem_memcpy(int argc, const char **argv,
 extern int bench_mem_memset(int argc, const char **argv, const char *prefix);
 extern int bench_futex_hash(int argc, const char **argv, const char *prefix);
 extern int bench_futex_wake(int argc, const char **argv, const char *prefix);
+extern int bench_futex_wake_parallel(int argc, const char **argv,
+				     const char *prefix);
 extern int bench_futex_requeue(int argc, const char **argv, const char *prefix);
 
 #define BENCH_FORMAT_DEFAULT_STR	"default"
diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c
new file mode 100644
index 00000000000000..6d8c9fa2a16c87
--- /dev/null
+++ b/tools/perf/bench/futex-wake-parallel.c
@@ -0,0 +1,294 @@
+/*
+ * Copyright (C) 2015 Davidlohr Bueso.
+ *
+ * Block a bunch of threads and let parallel waker threads wakeup an
+ * equal amount of them. The program output reflects the avg latency
+ * for each individual thread to service its share of work. Ultimately
+ * it can be used to measure futex_wake() changes.
+ */
+
+#include "../perf.h"
+#include "../util/util.h"
+#include "../util/stat.h"
+#include "../util/parse-options.h"
+#include "../util/header.h"
+#include "bench.h"
+#include "futex.h"
+
+#include <err.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <pthread.h>
+
+struct thread_data {
+	pthread_t worker;
+	unsigned int nwoken;
+	struct timeval runtime;
+};
+
+static unsigned int nwakes = 1;
+
+/* all threads will block on the same futex -- hash bucket chaos ;) */
+static u_int32_t futex = 0;
+
+static pthread_t *blocked_worker;
+static bool done = false, silent = false, fshared = false;
+static unsigned int nblocked_threads = 0, nwaking_threads = 0;
+static pthread_mutex_t thread_lock;
+static pthread_cond_t thread_parent, thread_worker;
+static struct stats waketime_stats, wakeup_stats;
+static unsigned int ncpus, threads_starting;
+static int futex_flag = 0;
+
+static const struct option options[] = {
+	OPT_UINTEGER('t', "threads", &nblocked_threads, "Specify amount of threads"),
+	OPT_UINTEGER('w', "nwakers", &nwaking_threads, "Specify amount of waking threads"),
+	OPT_BOOLEAN( 's', "silent",  &silent,   "Silent mode: do not display data/details"),
+	OPT_BOOLEAN( 'S', "shared",  &fshared,  "Use shared futexes instead of private ones"),
+	OPT_END()
+};
+
+static const char * const bench_futex_wake_parallel_usage[] = {
+	"perf bench futex wake-parallel <options>",
+	NULL
+};
+
+static void *waking_workerfn(void *arg)
+{
+	struct thread_data *waker = (struct thread_data *) arg;
+	struct timeval start, end;
+
+	gettimeofday(&start, NULL);
+
+	waker->nwoken = futex_wake(&futex, nwakes, futex_flag);
+	if (waker->nwoken != nwakes)
+		warnx("couldn't wakeup all tasks (%d/%d)",
+		      waker->nwoken, nwakes);
+
+	gettimeofday(&end, NULL);
+	timersub(&end, &start, &waker->runtime);
+
+	pthread_exit(NULL);
+	return NULL;
+}
+
+static void wakeup_threads(struct thread_data *td, pthread_attr_t thread_attr)
+{
+	unsigned int i;
+
+	pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE);
+
+	/* create and block all threads */
+	for (i = 0; i < nwaking_threads; i++) {
+		/*
+		 * Thread creation order will impact per-thread latency
+		 * as it will affect the order to acquire the hb spinlock.
+		 * For now let the scheduler decide.
+		 */
+		if (pthread_create(&td[i].worker, &thread_attr,
+				   waking_workerfn, (void *)&td[i]))
+			err(EXIT_FAILURE, "pthread_create");
+	}
+
+	for (i = 0; i < nwaking_threads; i++)
+		if (pthread_join(td[i].worker, NULL))
+			err(EXIT_FAILURE, "pthread_join");
+}
+
+static void *blocked_workerfn(void *arg __maybe_unused)
+{
+	pthread_mutex_lock(&thread_lock);
+	threads_starting--;
+	if (!threads_starting)
+		pthread_cond_signal(&thread_parent);
+	pthread_cond_wait(&thread_worker, &thread_lock);
+	pthread_mutex_unlock(&thread_lock);
+
+	while (1) { /* handle spurious wakeups */
+		if (futex_wait(&futex, 0, NULL, futex_flag) != EINTR)
+			break;
+	}
+
+	pthread_exit(NULL);
+	return NULL;
+}
+
+static void block_threads(pthread_t *w, pthread_attr_t thread_attr)
+{
+	cpu_set_t cpu;
+	unsigned int i;
+
+	threads_starting = nblocked_threads;
+
+	/* create and block all threads */
+	for (i = 0; i < nblocked_threads; i++) {
+		CPU_ZERO(&cpu);
+		CPU_SET(i % ncpus, &cpu);
+
+		if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu))
+			err(EXIT_FAILURE, "pthread_attr_setaffinity_np");
+
+		if (pthread_create(&w[i], &thread_attr, blocked_workerfn, NULL))
+			err(EXIT_FAILURE, "pthread_create");
+	}
+}
+
+static void print_run(struct thread_data *waking_worker, unsigned int run_num)
+{
+	unsigned int i, wakeup_avg;
+	double waketime_avg, waketime_stddev;
+	struct stats __waketime_stats, __wakeup_stats;
+
+	init_stats(&__wakeup_stats);
+	init_stats(&__waketime_stats);
+
+	for (i = 0; i < nwaking_threads; i++) {
+		update_stats(&__waketime_stats, waking_worker[i].runtime.tv_usec);
+		update_stats(&__wakeup_stats, waking_worker[i].nwoken);
+	}
+
+	waketime_avg = avg_stats(&__waketime_stats);
+	waketime_stddev = stddev_stats(&__waketime_stats);
+	wakeup_avg = avg_stats(&__wakeup_stats);
+
+	printf("[Run %d]: Avg per-thread latency (waking %d/%d threads) "
+	       "in %.4f ms (+-%.2f%%)\n", run_num + 1, wakeup_avg,
+	       nblocked_threads, waketime_avg/1e3,
+	       rel_stddev_stats(waketime_stddev, waketime_avg));
+}
+
+static void print_summary(void)
+{
+	unsigned int wakeup_avg;
+	double waketime_avg, waketime_stddev;
+
+	waketime_avg = avg_stats(&waketime_stats);
+	waketime_stddev = stddev_stats(&waketime_stats);
+	wakeup_avg = avg_stats(&wakeup_stats);
+
+	printf("Avg per-thread latency (waking %d/%d threads) in %.4f ms (+-%.2f%%)\n",
+	       wakeup_avg,
+	       nblocked_threads,
+	       waketime_avg/1e3,
+	       rel_stddev_stats(waketime_stddev, waketime_avg));
+}
+
+
+static void do_run_stats(struct thread_data *waking_worker)
+{
+	unsigned int i;
+
+	for (i = 0; i < nwaking_threads; i++) {
+		update_stats(&waketime_stats, waking_worker[i].runtime.tv_usec);
+		update_stats(&wakeup_stats, waking_worker[i].nwoken);
+	}
+
+}
+
+static void toggle_done(int sig __maybe_unused,
+			siginfo_t *info __maybe_unused,
+			void *uc __maybe_unused)
+{
+	done = true;
+}
+
+int bench_futex_wake_parallel(int argc, const char **argv,
+			      const char *prefix __maybe_unused)
+{
+	int ret = 0;
+	unsigned int i, j;
+	struct sigaction act;
+	pthread_attr_t thread_attr;
+	struct thread_data *waking_worker;
+
+	argc = parse_options(argc, argv, options,
+			     bench_futex_wake_parallel_usage, 0);
+	if (argc) {
+		usage_with_options(bench_futex_wake_parallel_usage, options);
+		exit(EXIT_FAILURE);
+	}
+
+	sigfillset(&act.sa_mask);
+	act.sa_sigaction = toggle_done;
+	sigaction(SIGINT, &act, NULL);
+
+	ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+	if (!nblocked_threads)
+		nblocked_threads = ncpus;
+
+	/* some sanity checks */
+	if (nwaking_threads > nblocked_threads || !nwaking_threads)
+		nwaking_threads = nblocked_threads;
+
+	if (nblocked_threads % nwaking_threads)
+		errx(EXIT_FAILURE, "Must be perfectly divisible");
+	/*
+	 * Each thread will wakeup nwakes tasks in
+	 * a single futex_wait call.
+	 */
+	nwakes = nblocked_threads/nwaking_threads;
+
+	blocked_worker = calloc(nblocked_threads, sizeof(*blocked_worker));
+	if (!blocked_worker)
+		err(EXIT_FAILURE, "calloc");
+
+	if (!fshared)
+		futex_flag = FUTEX_PRIVATE_FLAG;
+
+	printf("Run summary [PID %d]: blocking on %d threads (at [%s] "
+	       "futex %p), %d threads waking up %d at a time.\n\n",
+	       getpid(), nblocked_threads, fshared ? "shared":"private",
+	       &futex, nwaking_threads, nwakes);
+
+	init_stats(&wakeup_stats);
+	init_stats(&waketime_stats);
+
+	pthread_attr_init(&thread_attr);
+	pthread_mutex_init(&thread_lock, NULL);
+	pthread_cond_init(&thread_parent, NULL);
+	pthread_cond_init(&thread_worker, NULL);
+
+	for (j = 0; j < bench_repeat && !done; j++) {
+		waking_worker = calloc(nwaking_threads, sizeof(*waking_worker));
+		if (!waking_worker)
+			err(EXIT_FAILURE, "calloc");
+
+		/* create, launch & block all threads */
+		block_threads(blocked_worker, thread_attr);
+
+		/* make sure all threads are already blocked */
+		pthread_mutex_lock(&thread_lock);
+		while (threads_starting)
+			pthread_cond_wait(&thread_parent, &thread_lock);
+		pthread_cond_broadcast(&thread_worker);
+		pthread_mutex_unlock(&thread_lock);
+
+		usleep(100000);
+
+		/* Ok, all threads are patiently blocked, start waking folks up */
+		wakeup_threads(waking_worker, thread_attr);
+
+		for (i = 0; i < nblocked_threads; i++) {
+			ret = pthread_join(blocked_worker[i], NULL);
+			if (ret)
+				err(EXIT_FAILURE, "pthread_join");
+		}
+
+		do_run_stats(waking_worker);
+		if (!silent)
+			print_run(waking_worker, j);
+
+		free(waking_worker);
+	}
+
+	/* cleanup & report results */
+	pthread_cond_destroy(&thread_parent);
+	pthread_cond_destroy(&thread_worker);
+	pthread_mutex_destroy(&thread_lock);
+	pthread_attr_destroy(&thread_attr);
+
+	print_summary();
+
+	free(blocked_worker);
+	return ret;
+}
diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c
index 929f762be47e97..e5e41d3bdce724 100644
--- a/tools/perf/bench/futex-wake.c
+++ b/tools/perf/bench/futex-wake.c
@@ -60,7 +60,12 @@ static void *workerfn(void *arg __maybe_unused)
 	pthread_cond_wait(&thread_worker, &thread_lock);
 	pthread_mutex_unlock(&thread_lock);
 
-	futex_wait(&futex1, 0, NULL, futex_flag);
+	while (1) {
+		if (futex_wait(&futex1, 0, NULL, futex_flag) != EINTR)
+			break;
+	}
+
+	pthread_exit(NULL);
 	return NULL;
 }
 
diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
index ba5efa4710b558..870b7e665a2032 100644
--- a/tools/perf/bench/numa.c
+++ b/tools/perf/bench/numa.c
@@ -8,6 +8,7 @@
 #include "../builtin.h"
 #include "../util/util.h"
 #include "../util/parse-options.h"
+#include "../util/cloexec.h"
 
 #include "bench.h"
 
@@ -23,6 +24,7 @@
 #include <pthread.h>
 #include <sys/mman.h>
 #include <sys/time.h>
+#include <sys/resource.h>
 #include <sys/wait.h>
 #include <sys/prctl.h>
 #include <sys/types.h>
@@ -51,6 +53,9 @@ struct thread_data {
 	unsigned int		loops_done;
 	u64			val;
 	u64			runtime_ns;
+	u64			system_time_ns;
+	u64			user_time_ns;
+	double			speed_gbs;
 	pthread_mutex_t		*process_lock;
 };
 
@@ -1042,6 +1047,7 @@ static void *worker_thread(void *__tdata)
 	u64 bytes_done;
 	long work_done;
 	u32 l;
+	struct rusage rusage;
 
 	bind_to_cpumask(td->bind_cpumask);
 	bind_to_memnode(td->bind_node);
@@ -1194,6 +1200,13 @@ static void *worker_thread(void *__tdata)
 	timersub(&stop, &start0, &diff);
 	td->runtime_ns = diff.tv_sec * 1000000000ULL;
 	td->runtime_ns += diff.tv_usec * 1000ULL;
+	td->speed_gbs = bytes_done / (td->runtime_ns / 1e9) / 1e9;
+
+	getrusage(RUSAGE_THREAD, &rusage);
+	td->system_time_ns = rusage.ru_stime.tv_sec * 1000000000ULL;
+	td->system_time_ns += rusage.ru_stime.tv_usec * 1000ULL;
+	td->user_time_ns = rusage.ru_utime.tv_sec * 1000000000ULL;
+	td->user_time_ns += rusage.ru_utime.tv_usec * 1000ULL;
 
 	free_data(thread_data, g->p.bytes_thread);
 
@@ -1420,7 +1433,7 @@ static int __bench_numa(const char *name)
 	double runtime_sec_min;
 	int wait_stat;
 	double bytes;
-	int i, t;
+	int i, t, p;
 
 	if (init())
 		return -1;
@@ -1556,6 +1569,24 @@ static int __bench_numa(const char *name)
 	print_res(name, bytes / runtime_sec_max / 1e9,
 		"GB/sec,", "total-speed",	"GB/sec total speed");
 
+	if (g->p.show_details >= 2) {
+		char tname[32];
+		struct thread_data *td;
+		for (p = 0; p < g->p.nr_proc; p++) {
+			for (t = 0; t < g->p.nr_threads; t++) {
+				memset(tname, 0, 32);
+				td = g->threads + p*g->p.nr_threads + t;
+				snprintf(tname, 32, "process%d:thread%d", p, t);
+				print_res(tname, td->speed_gbs,
+					"GB/sec",	"thread-speed", "GB/sec/thread speed");
+				print_res(tname, td->system_time_ns / 1e9,
+					"secs",	"thread-system-time", "system CPU time/thread");
+				print_res(tname, td->user_time_ns / 1e9,
+					"secs",	"thread-user-time", "user CPU time/thread");
+			}
+		}
+	}
+
 	free(pids);
 
 	deinit();
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 71bf7451c0cad1..4e08c2d2090e57 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -59,6 +59,10 @@ static int perf_evsel__add_sample(struct perf_evsel *evsel,
 	    (al->sym == NULL ||
 	     strcmp(ann->sym_hist_filter, al->sym->name) != 0)) {
 		/* We're only interested in a symbol named sym_hist_filter */
+		/*
+		 * FIXME: why isn't this done in the symbol_filter when loading
+		 * the DSO?
+		 */
 		if (al->sym != NULL) {
 			rb_erase(&al->sym->rb_node,
 				 &al->map->dso->symbols[al->map->type]);
@@ -84,6 +88,7 @@ static int process_sample_event(struct perf_tool *tool,
 {
 	struct perf_annotate *ann = container_of(tool, struct perf_annotate, tool);
 	struct addr_location al;
+	int ret = 0;
 
 	if (perf_event__preprocess_sample(event, machine, &al, sample) < 0) {
 		pr_warning("problem processing %d event, skipping it.\n",
@@ -92,15 +97,16 @@ static int process_sample_event(struct perf_tool *tool,
 	}
 
 	if (ann->cpu_list && !test_bit(sample->cpu, ann->cpu_bitmap))
-		return 0;
+		goto out_put;
 
 	if (!al.filtered && perf_evsel__add_sample(evsel, sample, &al, ann)) {
 		pr_warning("problem incrementing symbol count, "
 			   "skipping event\n");
-		return -1;
+		ret = -1;
 	}
-
-	return 0;
+out_put:
+	addr_location__put(&al);
+	return ret;
 }
 
 static int hist_entry__tty_annotate(struct hist_entry *he,
@@ -283,7 +289,6 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused)
 		},
 	};
 	struct perf_data_file file = {
-		.path  = input_name,
 		.mode  = PERF_DATA_MODE_READ,
 	};
 	const struct option options[] = {
@@ -340,6 +345,8 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused)
 	else if (annotate.use_gtk)
 		use_browser = 2;
 
+	file.path  = input_name;
+
 	setup_browser(true);
 
 	annotate.session = perf_session__new(&file, false, &annotate.tool);
diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c
index b9a56fa8333065..b5314e452ec7f2 100644
--- a/tools/perf/builtin-bench.c
+++ b/tools/perf/builtin-bench.c
@@ -58,6 +58,7 @@ static struct bench mem_benchmarks[] = {
 static struct bench futex_benchmarks[] = {
 	{ "hash",	"Benchmark for futex hash table",               bench_futex_hash	},
 	{ "wake",	"Benchmark for futex wake calls",               bench_futex_wake	},
+	{ "wake-parallel", "Benchmark for parallel futex wake calls",   bench_futex_wake_parallel },
 	{ "requeue",	"Benchmark for futex requeue calls",            bench_futex_requeue	},
 	{ "all",	"Test all futex benchmarks",			NULL			},
 	{ NULL,		NULL,						NULL			}
diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c
index feb420f74c2d9f..9fe93c8d4fcff1 100644
--- a/tools/perf/builtin-buildid-list.c
+++ b/tools/perf/builtin-buildid-list.c
@@ -69,6 +69,15 @@ static int perf_session__list_build_ids(bool force, bool with_hits)
 	session = perf_session__new(&file, false, &build_id__mark_dso_hit_ops);
 	if (session == NULL)
 		return -1;
+
+	/*
+	 * We take all buildids when the file contains AUX area tracing data
+	 * because we do not decode the trace because it would take too long.
+	 */
+	if (!perf_data_file__is_pipe(&file) &&
+	    perf_header__has_feat(&session->header, HEADER_AUXTRACE))
+		with_hits = false;
+
 	/*
 	 * in pipe-mode, the only way to get the buildids is to parse
 	 * the record stream. Buildids are stored as RECORD_HEADER_BUILD_ID
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index df6307b4050aaa..daaa7dca9c3ba8 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -328,6 +328,7 @@ static int diff__process_sample_event(struct perf_tool *tool __maybe_unused,
 {
 	struct addr_location al;
 	struct hists *hists = evsel__hists(evsel);
+	int ret = -1;
 
 	if (perf_event__preprocess_sample(event, machine, &al, sample) < 0) {
 		pr_warning("problem processing %d event, skipping it.\n",
@@ -338,7 +339,7 @@ static int diff__process_sample_event(struct perf_tool *tool __maybe_unused,
 	if (hists__add_entry(hists, &al, sample->period,
 			     sample->weight, sample->transaction)) {
 		pr_warning("problem incrementing symbol period, skipping event\n");
-		return -1;
+		goto out_put;
 	}
 
 	/*
@@ -350,8 +351,10 @@ static int diff__process_sample_event(struct perf_tool *tool __maybe_unused,
 	hists->stats.total_period += sample->period;
 	if (!al.filtered)
 		hists->stats.total_non_filtered_period += sample->period;
-
-	return 0;
+	ret = 0;
+out_put:
+	addr_location__put(&al);
+	return ret;
 }
 
 static struct perf_tool tool = {
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 40a33d7334cce2..52ec66b236076c 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -16,6 +16,7 @@
 #include "util/debug.h"
 #include "util/build-id.h"
 #include "util/data.h"
+#include "util/auxtrace.h"
 
 #include "util/parse-options.h"
 
@@ -26,10 +27,12 @@ struct perf_inject {
 	struct perf_session	*session;
 	bool			build_ids;
 	bool			sched_stat;
+	bool			have_auxtrace;
 	const char		*input_name;
 	struct perf_data_file	output;
 	u64			bytes_written;
 	struct list_head	samples;
+	struct itrace_synth_opts itrace_synth_opts;
 };
 
 struct event_entry {
@@ -38,14 +41,11 @@ struct event_entry {
 	union perf_event event[0];
 };
 
-static int perf_event__repipe_synth(struct perf_tool *tool,
-				    union perf_event *event)
+static int output_bytes(struct perf_inject *inject, void *buf, size_t sz)
 {
-	struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
 	ssize_t size;
 
-	size = perf_data_file__write(&inject->output, event,
-				     event->header.size);
+	size = perf_data_file__write(&inject->output, buf, sz);
 	if (size < 0)
 		return -errno;
 
@@ -53,6 +53,15 @@ static int perf_event__repipe_synth(struct perf_tool *tool,
 	return 0;
 }
 
+static int perf_event__repipe_synth(struct perf_tool *tool,
+				    union perf_event *event)
+{
+	struct perf_inject *inject = container_of(tool, struct perf_inject,
+						  tool);
+
+	return output_bytes(inject, event, event->header.size);
+}
+
 static int perf_event__repipe_oe_synth(struct perf_tool *tool,
 				       union perf_event *event,
 				       struct ordered_events *oe __maybe_unused)
@@ -86,6 +95,79 @@ static int perf_event__repipe_attr(struct perf_tool *tool,
 	return perf_event__repipe_synth(tool, event);
 }
 
+#ifdef HAVE_AUXTRACE_SUPPORT
+
+static int copy_bytes(struct perf_inject *inject, int fd, off_t size)
+{
+	char buf[4096];
+	ssize_t ssz;
+	int ret;
+
+	while (size > 0) {
+		ssz = read(fd, buf, min(size, (off_t)sizeof(buf)));
+		if (ssz < 0)
+			return -errno;
+		ret = output_bytes(inject, buf, ssz);
+		if (ret)
+			return ret;
+		size -= ssz;
+	}
+
+	return 0;
+}
+
+static s64 perf_event__repipe_auxtrace(struct perf_tool *tool,
+				       union perf_event *event,
+				       struct perf_session *session
+				       __maybe_unused)
+{
+	struct perf_inject *inject = container_of(tool, struct perf_inject,
+						  tool);
+	int ret;
+
+	inject->have_auxtrace = true;
+
+	if (!inject->output.is_pipe) {
+		off_t offset;
+
+		offset = lseek(inject->output.fd, 0, SEEK_CUR);
+		if (offset == -1)
+			return -errno;
+		ret = auxtrace_index__auxtrace_event(&session->auxtrace_index,
+						     event, offset);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (perf_data_file__is_pipe(session->file) || !session->one_mmap) {
+		ret = output_bytes(inject, event, event->header.size);
+		if (ret < 0)
+			return ret;
+		ret = copy_bytes(inject, perf_data_file__fd(session->file),
+				 event->auxtrace.size);
+	} else {
+		ret = output_bytes(inject, event,
+				   event->header.size + event->auxtrace.size);
+	}
+	if (ret < 0)
+		return ret;
+
+	return event->auxtrace.size;
+}
+
+#else
+
+static s64
+perf_event__repipe_auxtrace(struct perf_tool *tool __maybe_unused,
+			    union perf_event *event __maybe_unused,
+			    struct perf_session *session __maybe_unused)
+{
+	pr_err("AUX area tracing not supported\n");
+	return -EINVAL;
+}
+
+#endif
+
 static int perf_event__repipe(struct perf_tool *tool,
 			      union perf_event *event,
 			      struct perf_sample *sample __maybe_unused,
@@ -155,6 +237,32 @@ static int perf_event__repipe_fork(struct perf_tool *tool,
 	return err;
 }
 
+static int perf_event__repipe_comm(struct perf_tool *tool,
+				   union perf_event *event,
+				   struct perf_sample *sample,
+				   struct machine *machine)
+{
+	int err;
+
+	err = perf_event__process_comm(tool, event, sample, machine);
+	perf_event__repipe(tool, event, sample, machine);
+
+	return err;
+}
+
+static int perf_event__repipe_exit(struct perf_tool *tool,
+				   union perf_event *event,
+				   struct perf_sample *sample,
+				   struct machine *machine)
+{
+	int err;
+
+	err = perf_event__process_exit(tool, event, sample, machine);
+	perf_event__repipe(tool, event, sample, machine);
+
+	return err;
+}
+
 static int perf_event__repipe_tracing_data(struct perf_tool *tool,
 					   union perf_event *event,
 					   struct perf_session *session)
@@ -167,6 +275,18 @@ static int perf_event__repipe_tracing_data(struct perf_tool *tool,
 	return err;
 }
 
+static int perf_event__repipe_id_index(struct perf_tool *tool,
+				       union perf_event *event,
+				       struct perf_session *session)
+{
+	int err;
+
+	perf_event__repipe_synth(tool, event);
+	err = perf_event__process_id_index(tool, event, session);
+
+	return err;
+}
+
 static int dso__read_build_id(struct dso *dso)
 {
 	if (dso->has_build_id)
@@ -245,6 +365,7 @@ static int perf_event__inject_buildid(struct perf_tool *tool,
 		}
 	}
 
+	thread__put(thread);
 repipe:
 	perf_event__repipe(tool, event, sample, machine);
 	return 0;
@@ -351,16 +472,20 @@ static int __cmd_inject(struct perf_inject *inject)
 	struct perf_session *session = inject->session;
 	struct perf_data_file *file_out = &inject->output;
 	int fd = perf_data_file__fd(file_out);
+	u64 output_data_offset;
 
 	signal(SIGINT, sig_handler);
 
-	if (inject->build_ids || inject->sched_stat) {
+	if (inject->build_ids || inject->sched_stat ||
+	    inject->itrace_synth_opts.set) {
 		inject->tool.mmap	  = perf_event__repipe_mmap;
 		inject->tool.mmap2	  = perf_event__repipe_mmap2;
 		inject->tool.fork	  = perf_event__repipe_fork;
 		inject->tool.tracing_data = perf_event__repipe_tracing_data;
 	}
 
+	output_data_offset = session->header.data_offset;
+
 	if (inject->build_ids) {
 		inject->tool.sample = perf_event__inject_buildid;
 	} else if (inject->sched_stat) {
@@ -379,17 +504,43 @@ static int __cmd_inject(struct perf_inject *inject)
 			else if (!strncmp(name, "sched:sched_stat_", 17))
 				evsel->handler = perf_inject__sched_stat;
 		}
+	} else if (inject->itrace_synth_opts.set) {
+		session->itrace_synth_opts = &inject->itrace_synth_opts;
+		inject->itrace_synth_opts.inject = true;
+		inject->tool.comm	    = perf_event__repipe_comm;
+		inject->tool.exit	    = perf_event__repipe_exit;
+		inject->tool.id_index	    = perf_event__repipe_id_index;
+		inject->tool.auxtrace_info  = perf_event__process_auxtrace_info;
+		inject->tool.auxtrace	    = perf_event__process_auxtrace;
+		inject->tool.ordered_events = true;
+		inject->tool.ordering_requires_timestamps = true;
+		/* Allow space in the header for new attributes */
+		output_data_offset = 4096;
 	}
 
+	if (!inject->itrace_synth_opts.set)
+		auxtrace_index__free(&session->auxtrace_index);
+
 	if (!file_out->is_pipe)
-		lseek(fd, session->header.data_offset, SEEK_SET);
+		lseek(fd, output_data_offset, SEEK_SET);
 
 	ret = perf_session__process_events(session);
 
 	if (!file_out->is_pipe) {
-		if (inject->build_ids)
+		if (inject->build_ids) {
 			perf_header__set_feat(&session->header,
 					      HEADER_BUILD_ID);
+			if (inject->have_auxtrace)
+				dsos__hit_all(session);
+		}
+		/*
+		 * The AUX areas have been removed and replaced with
+		 * synthesized hardware events, so clear the feature flag.
+		 */
+		if (inject->itrace_synth_opts.set)
+			perf_header__clear_feat(&session->header,
+						HEADER_AUXTRACE);
+		session->header.data_offset = output_data_offset;
 		session->header.data_size = inject->bytes_written;
 		perf_session__write_header(session, session->evlist, fd, true);
 	}
@@ -408,11 +559,16 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
 			.fork		= perf_event__repipe,
 			.exit		= perf_event__repipe,
 			.lost		= perf_event__repipe,
+			.aux		= perf_event__repipe,
+			.itrace_start	= perf_event__repipe,
 			.read		= perf_event__repipe_sample,
 			.throttle	= perf_event__repipe,
 			.unthrottle	= perf_event__repipe,
 			.attr		= perf_event__repipe_attr,
 			.tracing_data	= perf_event__repipe_op2_synth,
+			.auxtrace_info	= perf_event__repipe_op2_synth,
+			.auxtrace	= perf_event__repipe_auxtrace,
+			.auxtrace_error	= perf_event__repipe_op2_synth,
 			.finished_round	= perf_event__repipe_oe_synth,
 			.build_id	= perf_event__repipe_op2_synth,
 			.id_index	= perf_event__repipe_op2_synth,
@@ -444,6 +600,9 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
 		OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name, "file",
 			   "kallsyms pathname"),
 		OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
+		OPT_CALLBACK_OPTARG(0, "itrace", &inject.itrace_synth_opts,
+				    NULL, "opts", "Instruction Tracing options",
+				    itrace_parse_synth_opts),
 		OPT_END()
 	};
 	const char * const inject_usage[] = {
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 1634186d537cdc..950f296dfcf7a4 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -10,6 +10,7 @@
 #include "util/header.h"
 #include "util/session.h"
 #include "util/tool.h"
+#include "util/callchain.h"
 
 #include "util/parse-options.h"
 #include "util/trace-event.h"
@@ -21,14 +22,19 @@
 #include <linux/rbtree.h>
 #include <linux/string.h>
 #include <locale.h>
+#include <regex.h>
 
 static int	kmem_slab;
 static int	kmem_page;
 
 static long	kmem_page_size;
+static enum {
+	KMEM_SLAB,
+	KMEM_PAGE,
+} kmem_default = KMEM_SLAB;  /* for backward compatibility */
 
 struct alloc_stat;
-typedef int (*sort_fn_t)(struct alloc_stat *, struct alloc_stat *);
+typedef int (*sort_fn_t)(void *, void *);
 
 static int			alloc_flag;
 static int			caller_flag;
@@ -179,8 +185,8 @@ static int perf_evsel__process_alloc_node_event(struct perf_evsel *evsel,
 	return ret;
 }
 
-static int ptr_cmp(struct alloc_stat *, struct alloc_stat *);
-static int callsite_cmp(struct alloc_stat *, struct alloc_stat *);
+static int ptr_cmp(void *, void *);
+static int slab_callsite_cmp(void *, void *);
 
 static struct alloc_stat *search_alloc_stat(unsigned long ptr,
 					    unsigned long call_site,
@@ -221,7 +227,8 @@ static int perf_evsel__process_free_event(struct perf_evsel *evsel,
 		s_alloc->pingpong++;
 
 		s_caller = search_alloc_stat(0, s_alloc->call_site,
-					     &root_caller_stat, callsite_cmp);
+					     &root_caller_stat,
+					     slab_callsite_cmp);
 		if (!s_caller)
 			return -1;
 		s_caller->pingpong++;
@@ -241,6 +248,8 @@ static unsigned long nr_page_fails;
 static unsigned long nr_page_nomatch;
 
 static bool use_pfn;
+static bool live_page;
+static struct perf_session *kmem_session;
 
 #define MAX_MIGRATE_TYPES  6
 #define MAX_PAGE_ORDER     11
@@ -250,6 +259,7 @@ static int order_stats[MAX_PAGE_ORDER][MAX_MIGRATE_TYPES];
 struct page_stat {
 	struct rb_node 	node;
 	u64 		page;
+	u64 		callsite;
 	int 		order;
 	unsigned 	gfp_flags;
 	unsigned 	migrate_type;
@@ -259,13 +269,158 @@ struct page_stat {
 	int 		nr_free;
 };
 
-static struct rb_root page_tree;
+static struct rb_root page_live_tree;
 static struct rb_root page_alloc_tree;
 static struct rb_root page_alloc_sorted;
+static struct rb_root page_caller_tree;
+static struct rb_root page_caller_sorted;
 
-static struct page_stat *search_page(unsigned long page, bool create)
+struct alloc_func {
+	u64 start;
+	u64 end;
+	char *name;
+};
+
+static int nr_alloc_funcs;
+static struct alloc_func *alloc_func_list;
+
+static int funcmp(const void *a, const void *b)
+{
+	const struct alloc_func *fa = a;
+	const struct alloc_func *fb = b;
+
+	if (fa->start > fb->start)
+		return 1;
+	else
+		return -1;
+}
+
+static int callcmp(const void *a, const void *b)
+{
+	const struct alloc_func *fa = a;
+	const struct alloc_func *fb = b;
+
+	if (fb->start <= fa->start && fa->end < fb->end)
+		return 0;
+
+	if (fa->start > fb->start)
+		return 1;
+	else
+		return -1;
+}
+
+static int build_alloc_func_list(void)
 {
-	struct rb_node **node = &page_tree.rb_node;
+	int ret;
+	struct map *kernel_map;
+	struct symbol *sym;
+	struct rb_node *node;
+	struct alloc_func *func;
+	struct machine *machine = &kmem_session->machines.host;
+	regex_t alloc_func_regex;
+	const char pattern[] = "^_?_?(alloc|get_free|get_zeroed)_pages?";
+
+	ret = regcomp(&alloc_func_regex, pattern, REG_EXTENDED);
+	if (ret) {
+		char err[BUFSIZ];
+
+		regerror(ret, &alloc_func_regex, err, sizeof(err));
+		pr_err("Invalid regex: %s\n%s", pattern, err);
+		return -EINVAL;
+	}
+
+	kernel_map = machine->vmlinux_maps[MAP__FUNCTION];
+	if (map__load(kernel_map, NULL) < 0) {
+		pr_err("cannot load kernel map\n");
+		return -ENOENT;
+	}
+
+	map__for_each_symbol(kernel_map, sym, node) {
+		if (regexec(&alloc_func_regex, sym->name, 0, NULL, 0))
+			continue;
+
+		func = realloc(alloc_func_list,
+			       (nr_alloc_funcs + 1) * sizeof(*func));
+		if (func == NULL)
+			return -ENOMEM;
+
+		pr_debug("alloc func: %s\n", sym->name);
+		func[nr_alloc_funcs].start = sym->start;
+		func[nr_alloc_funcs].end   = sym->end;
+		func[nr_alloc_funcs].name  = sym->name;
+
+		alloc_func_list = func;
+		nr_alloc_funcs++;
+	}
+
+	qsort(alloc_func_list, nr_alloc_funcs, sizeof(*func), funcmp);
+
+	regfree(&alloc_func_regex);
+	return 0;
+}
+
+/*
+ * Find first non-memory allocation function from callchain.
+ * The allocation functions are in the 'alloc_func_list'.
+ */
+static u64 find_callsite(struct perf_evsel *evsel, struct perf_sample *sample)
+{
+	struct addr_location al;
+	struct machine *machine = &kmem_session->machines.host;
+	struct callchain_cursor_node *node;
+
+	if (alloc_func_list == NULL) {
+		if (build_alloc_func_list() < 0)
+			goto out;
+	}
+
+	al.thread = machine__findnew_thread(machine, sample->pid, sample->tid);
+	sample__resolve_callchain(sample, NULL, evsel, &al, 16);
+
+	callchain_cursor_commit(&callchain_cursor);
+	while (true) {
+		struct alloc_func key, *caller;
+		u64 addr;
+
+		node = callchain_cursor_current(&callchain_cursor);
+		if (node == NULL)
+			break;
+
+		key.start = key.end = node->ip;
+		caller = bsearch(&key, alloc_func_list, nr_alloc_funcs,
+				 sizeof(key), callcmp);
+		if (!caller) {
+			/* found */
+			if (node->map)
+				addr = map__unmap_ip(node->map, node->ip);
+			else
+				addr = node->ip;
+
+			return addr;
+		} else
+			pr_debug3("skipping alloc function: %s\n", caller->name);
+
+		callchain_cursor_advance(&callchain_cursor);
+	}
+
+out:
+	pr_debug2("unknown callsite: %"PRIx64 "\n", sample->ip);
+	return sample->ip;
+}
+
+struct sort_dimension {
+	const char		name[20];
+	sort_fn_t		cmp;
+	struct list_head	list;
+};
+
+static LIST_HEAD(page_alloc_sort_input);
+static LIST_HEAD(page_caller_sort_input);
+
+static struct page_stat *
+__page_stat__findnew_page(struct page_stat *pstat, bool create)
+{
+	struct rb_node **node = &page_live_tree.rb_node;
 	struct rb_node *parent = NULL;
 	struct page_stat *data;
 
@@ -275,7 +430,7 @@ static struct page_stat *search_page(unsigned long page, bool create)
 		parent = *node;
 		data = rb_entry(*node, struct page_stat, node);
 
-		cmp = data->page - page;
+		cmp = data->page - pstat->page;
 		if (cmp < 0)
 			node = &parent->rb_left;
 		else if (cmp > 0)
@@ -289,49 +444,48 @@ static struct page_stat *search_page(unsigned long page, bool create)
 
 	data = zalloc(sizeof(*data));
 	if (data != NULL) {
-		data->page = page;
+		data->page = pstat->page;
+		data->order = pstat->order;
+		data->gfp_flags = pstat->gfp_flags;
+		data->migrate_type = pstat->migrate_type;
 
 		rb_link_node(&data->node, parent, node);
-		rb_insert_color(&data->node, &page_tree);
+		rb_insert_color(&data->node, &page_live_tree);
 	}
 
 	return data;
 }
 
-static int page_stat_cmp(struct page_stat *a, struct page_stat *b)
+static struct page_stat *page_stat__find_page(struct page_stat *pstat)
 {
-	if (a->page > b->page)
-		return -1;
-	if (a->page < b->page)
-		return 1;
-	if (a->order > b->order)
-		return -1;
-	if (a->order < b->order)
-		return 1;
-	if (a->migrate_type > b->migrate_type)
-		return -1;
-	if (a->migrate_type < b->migrate_type)
-		return 1;
-	if (a->gfp_flags > b->gfp_flags)
-		return -1;
-	if (a->gfp_flags < b->gfp_flags)
-		return 1;
-	return 0;
+	return __page_stat__findnew_page(pstat, false);
+}
+
+static struct page_stat *page_stat__findnew_page(struct page_stat *pstat)
+{
+	return __page_stat__findnew_page(pstat, true);
 }
 
-static struct page_stat *search_page_alloc_stat(struct page_stat *pstat, bool create)
+static struct page_stat *
+__page_stat__findnew_alloc(struct page_stat *pstat, bool create)
 {
 	struct rb_node **node = &page_alloc_tree.rb_node;
 	struct rb_node *parent = NULL;
 	struct page_stat *data;
+	struct sort_dimension *sort;
 
 	while (*node) {
-		s64 cmp;
+		int cmp = 0;
 
 		parent = *node;
 		data = rb_entry(*node, struct page_stat, node);
 
-		cmp = page_stat_cmp(data, pstat);
+		list_for_each_entry(sort, &page_alloc_sort_input, list) {
+			cmp = sort->cmp(pstat, data);
+			if (cmp)
+				break;
+		}
+
 		if (cmp < 0)
 			node = &parent->rb_left;
 		else if (cmp > 0)
@@ -357,6 +511,71 @@ static struct page_stat *search_page_alloc_stat(struct page_stat *pstat, bool cr
 	return data;
 }
 
+static struct page_stat *page_stat__find_alloc(struct page_stat *pstat)
+{
+	return __page_stat__findnew_alloc(pstat, false);
+}
+
+static struct page_stat *page_stat__findnew_alloc(struct page_stat *pstat)
+{
+	return __page_stat__findnew_alloc(pstat, true);
+}
+
+static struct page_stat *
+__page_stat__findnew_caller(struct page_stat *pstat, bool create)
+{
+	struct rb_node **node = &page_caller_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct page_stat *data;
+	struct sort_dimension *sort;
+
+	while (*node) {
+		int cmp = 0;
+
+		parent = *node;
+		data = rb_entry(*node, struct page_stat, node);
+
+		list_for_each_entry(sort, &page_caller_sort_input, list) {
+			cmp = sort->cmp(pstat, data);
+			if (cmp)
+				break;
+		}
+
+		if (cmp < 0)
+			node = &parent->rb_left;
+		else if (cmp > 0)
+			node = &parent->rb_right;
+		else
+			return data;
+	}
+
+	if (!create)
+		return NULL;
+
+	data = zalloc(sizeof(*data));
+	if (data != NULL) {
+		data->callsite = pstat->callsite;
+		data->order = pstat->order;
+		data->gfp_flags = pstat->gfp_flags;
+		data->migrate_type = pstat->migrate_type;
+
+		rb_link_node(&data->node, parent, node);
+		rb_insert_color(&data->node, &page_caller_tree);
+	}
+
+	return data;
+}
+
+static struct page_stat *page_stat__find_caller(struct page_stat *pstat)
+{
+	return __page_stat__findnew_caller(pstat, false);
+}
+
+static struct page_stat *page_stat__findnew_caller(struct page_stat *pstat)
+{
+	return __page_stat__findnew_caller(pstat, true);
+}
+
 static bool valid_page(u64 pfn_or_page)
 {
 	if (use_pfn && pfn_or_page == -1UL)
@@ -366,6 +585,176 @@ static bool valid_page(u64 pfn_or_page)
 	return true;
 }
 
+struct gfp_flag {
+	unsigned int flags;
+	char *compact_str;
+	char *human_readable;
+};
+
+static struct gfp_flag *gfps;
+static int nr_gfps;
+
+static int gfpcmp(const void *a, const void *b)
+{
+	const struct gfp_flag *fa = a;
+	const struct gfp_flag *fb = b;
+
+	return fa->flags - fb->flags;
+}
+
+/* see include/trace/events/gfpflags.h */
+static const struct {
+	const char *original;
+	const char *compact;
+} gfp_compact_table[] = {
+	{ "GFP_TRANSHUGE",		"THP" },
+	{ "GFP_HIGHUSER_MOVABLE",	"HUM" },
+	{ "GFP_HIGHUSER",		"HU" },
+	{ "GFP_USER",			"U" },
+	{ "GFP_TEMPORARY",		"TMP" },
+	{ "GFP_KERNEL",			"K" },
+	{ "GFP_NOFS",			"NF" },
+	{ "GFP_ATOMIC",			"A" },
+	{ "GFP_NOIO",			"NI" },
+	{ "GFP_HIGH",			"H" },
+	{ "GFP_WAIT",			"W" },
+	{ "GFP_IO",			"I" },
+	{ "GFP_COLD",			"CO" },
+	{ "GFP_NOWARN",			"NWR" },
+	{ "GFP_REPEAT",			"R" },
+	{ "GFP_NOFAIL",			"NF" },
+	{ "GFP_NORETRY",		"NR" },
+	{ "GFP_COMP",			"C" },
+	{ "GFP_ZERO",			"Z" },
+	{ "GFP_NOMEMALLOC",		"NMA" },
+	{ "GFP_MEMALLOC",		"MA" },
+	{ "GFP_HARDWALL",		"HW" },
+	{ "GFP_THISNODE",		"TN" },
+	{ "GFP_RECLAIMABLE",		"RC" },
+	{ "GFP_MOVABLE",		"M" },
+	{ "GFP_NOTRACK",		"NT" },
+	{ "GFP_NO_KSWAPD",		"NK" },
+	{ "GFP_OTHER_NODE",		"ON" },
+	{ "GFP_NOWAIT",			"NW" },
+};
+
+static size_t max_gfp_len;
+
+static char *compact_gfp_flags(char *gfp_flags)
+{
+	char *orig_flags = strdup(gfp_flags);
+	char *new_flags = NULL;
+	char *str, *pos = NULL;
+	size_t len = 0;
+
+	if (orig_flags == NULL)
+		return NULL;
+
+	str = strtok_r(orig_flags, "|", &pos);
+	while (str) {
+		size_t i;
+		char *new;
+		const char *cpt;
+
+		for (i = 0; i < ARRAY_SIZE(gfp_compact_table); i++) {
+			if (strcmp(gfp_compact_table[i].original, str))
+				continue;
+
+			cpt = gfp_compact_table[i].compact;
+			new = realloc(new_flags, len + strlen(cpt) + 2);
+			if (new == NULL) {
+				free(new_flags);
+				return NULL;
+			}
+
+			new_flags = new;
+
+			if (!len) {
+				strcpy(new_flags, cpt);
+			} else {
+				strcat(new_flags, "|");
+				strcat(new_flags, cpt);
+				len++;
+			}
+
+			len += strlen(cpt);
+		}
+
+		str = strtok_r(NULL, "|", &pos);
+	}
+
+	if (max_gfp_len < len)
+		max_gfp_len = len;
+
+	free(orig_flags);
+	return new_flags;
+}
+
+static char *compact_gfp_string(unsigned long gfp_flags)
+{
+	struct gfp_flag key = {
+		.flags = gfp_flags,
+	};
+	struct gfp_flag *gfp;
+
+	gfp = bsearch(&key, gfps, nr_gfps, sizeof(*gfps), gfpcmp);
+	if (gfp)
+		return gfp->compact_str;
+
+	return NULL;
+}
+
+static int parse_gfp_flags(struct perf_evsel *evsel, struct perf_sample *sample,
+			   unsigned int gfp_flags)
+{
+	struct pevent_record record = {
+		.cpu = sample->cpu,
+		.data = sample->raw_data,
+		.size = sample->raw_size,
+	};
+	struct trace_seq seq;
+	char *str, *pos = NULL;
+
+	if (nr_gfps) {
+		struct gfp_flag key = {
+			.flags = gfp_flags,
+		};
+
+		if (bsearch(&key, gfps, nr_gfps, sizeof(*gfps), gfpcmp))
+			return 0;
+	}
+
+	trace_seq_init(&seq);
+	pevent_event_info(&seq, evsel->tp_format, &record);
+
+	str = strtok_r(seq.buffer, " ", &pos);
+	while (str) {
+		if (!strncmp(str, "gfp_flags=", 10)) {
+			struct gfp_flag *new;
+
+			new = realloc(gfps, (nr_gfps + 1) * sizeof(*gfps));
+			if (new == NULL)
+				return -ENOMEM;
+
+			gfps = new;
+			new += nr_gfps++;
+
+			new->flags = gfp_flags;
+			new->human_readable = strdup(str + 10);
+			new->compact_str = compact_gfp_flags(str + 10);
+			if (!new->human_readable || !new->compact_str)
+				return -ENOMEM;
+
+			qsort(gfps, nr_gfps, sizeof(*gfps), gfpcmp);
+		}
+
+		str = strtok_r(NULL, " ", &pos);
+	}
+
+	trace_seq_destroy(&seq);
+	return 0;
+}
+
 static int perf_evsel__process_page_alloc_event(struct perf_evsel *evsel,
 						struct perf_sample *sample)
 {
@@ -375,6 +764,7 @@ static int perf_evsel__process_page_alloc_event(struct perf_evsel *evsel,
 	unsigned int migrate_type = perf_evsel__intval(evsel, sample,
 						       "migratetype");
 	u64 bytes = kmem_page_size << order;
+	u64 callsite;
 	struct page_stat *pstat;
 	struct page_stat this = {
 		.order = order,
@@ -397,20 +787,36 @@ static int perf_evsel__process_page_alloc_event(struct perf_evsel *evsel,
 		return 0;
 	}
 
+	if (parse_gfp_flags(evsel, sample, gfp_flags) < 0)
+		return -1;
+
+	callsite = find_callsite(evsel, sample);
+
 	/*
 	 * This is to find the current page (with correct gfp flags and
 	 * migrate type) at free event.
 	 */
-	pstat = search_page(page, true);
+	this.page = page;
+	pstat = page_stat__findnew_page(&this);
 	if (pstat == NULL)
 		return -ENOMEM;
 
-	pstat->order = order;
-	pstat->gfp_flags = gfp_flags;
-	pstat->migrate_type = migrate_type;
+	pstat->nr_alloc++;
+	pstat->alloc_bytes += bytes;
+	pstat->callsite = callsite;
+
+	if (!live_page) {
+		pstat = page_stat__findnew_alloc(&this);
+		if (pstat == NULL)
+			return -ENOMEM;
 
-	this.page = page;
-	pstat = search_page_alloc_stat(&this, true);
+		pstat->nr_alloc++;
+		pstat->alloc_bytes += bytes;
+		pstat->callsite = callsite;
+	}
+
+	this.callsite = callsite;
+	pstat = page_stat__findnew_caller(&this);
 	if (pstat == NULL)
 		return -ENOMEM;
 
@@ -441,7 +847,8 @@ static int perf_evsel__process_page_free_event(struct perf_evsel *evsel,
 	nr_page_frees++;
 	total_page_free_bytes += bytes;
 
-	pstat = search_page(page, false);
+	this.page = page;
+	pstat = page_stat__find_page(&this);
 	if (pstat == NULL) {
 		pr_debug2("missing free at page %"PRIx64" (order: %d)\n",
 			  page, order);
@@ -452,20 +859,41 @@ static int perf_evsel__process_page_free_event(struct perf_evsel *evsel,
 		return 0;
 	}
 
-	this.page = page;
 	this.gfp_flags = pstat->gfp_flags;
 	this.migrate_type = pstat->migrate_type;
+	this.callsite = pstat->callsite;
 
-	rb_erase(&pstat->node, &page_tree);
+	rb_erase(&pstat->node, &page_live_tree);
 	free(pstat);
 
-	pstat = search_page_alloc_stat(&this, false);
+	if (live_page) {
+		order_stats[this.order][this.migrate_type]--;
+	} else {
+		pstat = page_stat__find_alloc(&this);
+		if (pstat == NULL)
+			return -ENOMEM;
+
+		pstat->nr_free++;
+		pstat->free_bytes += bytes;
+	}
+
+	pstat = page_stat__find_caller(&this);
 	if (pstat == NULL)
 		return -ENOENT;
 
 	pstat->nr_free++;
 	pstat->free_bytes += bytes;
 
+	if (live_page) {
+		pstat->nr_alloc--;
+		pstat->alloc_bytes -= bytes;
+
+		if (pstat->nr_alloc == 0) {
+			rb_erase(&pstat->node, &page_caller_tree);
+			free(pstat);
+		}
+	}
+
 	return 0;
 }
 
@@ -478,6 +906,7 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused,
 				struct perf_evsel *evsel,
 				struct machine *machine)
 {
+	int err = 0;
 	struct thread *thread = machine__findnew_thread(machine, sample->pid,
 							sample->tid);
 
@@ -491,10 +920,12 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused,
 
 	if (evsel->handler != NULL) {
 		tracepoint_handler f = evsel->handler;
-		return f(evsel, sample);
+		err = f(evsel, sample);
 	}
 
-	return 0;
+	thread__put(thread);
+
+	return err;
 }
 
 static struct perf_tool perf_kmem = {
@@ -576,41 +1007,111 @@ static const char * const migrate_type_str[] = {
 	"UNKNOWN",
 };
 
-static void __print_page_result(struct rb_root *root,
-				struct perf_session *session __maybe_unused,
-				int n_lines)
+static void __print_page_alloc_result(struct perf_session *session, int n_lines)
 {
-	struct rb_node *next = rb_first(root);
+	struct rb_node *next = rb_first(&page_alloc_sorted);
+	struct machine *machine = &session->machines.host;
 	const char *format;
+	int gfp_len = max(strlen("GFP flags"), max_gfp_len);
 
-	printf("\n%.80s\n", graph_dotted_line);
-	printf(" %-16s | Total alloc (KB) | Hits      | Order | Mig.type | GFP flags\n",
-	       use_pfn ? "PFN" : "Page");
-	printf("%.80s\n", graph_dotted_line);
+	printf("\n%.105s\n", graph_dotted_line);
+	printf(" %-16s | %5s alloc (KB) | Hits      | Order | Mig.type | %-*s | Callsite\n",
+	       use_pfn ? "PFN" : "Page", live_page ? "Live" : "Total",
+	       gfp_len, "GFP flags");
+	printf("%.105s\n", graph_dotted_line);
 
 	if (use_pfn)
-		format = " %16llu | %'16llu | %'9d | %5d | %8s |  %08lx\n";
+		format = " %16llu | %'16llu | %'9d | %5d | %8s | %-*s | %s\n";
 	else
-		format = " %016llx | %'16llu | %'9d | %5d | %8s |  %08lx\n";
+		format = " %016llx | %'16llu | %'9d | %5d | %8s | %-*s | %s\n";
 
 	while (next && n_lines--) {
 		struct page_stat *data;
+		struct symbol *sym;
+		struct map *map;
+		char buf[32];
+		char *caller = buf;
 
 		data = rb_entry(next, struct page_stat, node);
+		sym = machine__find_kernel_function(machine, data->callsite,
+						    &map, NULL);
+		if (sym && sym->name)
+			caller = sym->name;
+		else
+			scnprintf(buf, sizeof(buf), "%"PRIx64, data->callsite);
 
 		printf(format, (unsigned long long)data->page,
 		       (unsigned long long)data->alloc_bytes / 1024,
 		       data->nr_alloc, data->order,
 		       migrate_type_str[data->migrate_type],
-		       (unsigned long)data->gfp_flags);
+		       gfp_len, compact_gfp_string(data->gfp_flags), caller);
 
 		next = rb_next(next);
 	}
 
-	if (n_lines == -1)
-		printf(" ...              | ...              | ...       | ...   | ...      | ...     \n");
+	if (n_lines == -1) {
+		printf(" ...              | ...              | ...       | ...   | ...      | %-*s | ...\n",
+		       gfp_len, "...");
+	}
+
+	printf("%.105s\n", graph_dotted_line);
+}
+
+static void __print_page_caller_result(struct perf_session *session, int n_lines)
+{
+	struct rb_node *next = rb_first(&page_caller_sorted);
+	struct machine *machine = &session->machines.host;
+	int gfp_len = max(strlen("GFP flags"), max_gfp_len);
+
+	printf("\n%.105s\n", graph_dotted_line);
+	printf(" %5s alloc (KB) | Hits      | Order | Mig.type | %-*s | Callsite\n",
+	       live_page ? "Live" : "Total", gfp_len, "GFP flags");
+	printf("%.105s\n", graph_dotted_line);
+
+	while (next && n_lines--) {
+		struct page_stat *data;
+		struct symbol *sym;
+		struct map *map;
+		char buf[32];
+		char *caller = buf;
+
+		data = rb_entry(next, struct page_stat, node);
+		sym = machine__find_kernel_function(machine, data->callsite,
+						    &map, NULL);
+		if (sym && sym->name)
+			caller = sym->name;
+		else
+			scnprintf(buf, sizeof(buf), "%"PRIx64, data->callsite);
+
+		printf(" %'16llu | %'9d | %5d | %8s | %-*s | %s\n",
+		       (unsigned long long)data->alloc_bytes / 1024,
+		       data->nr_alloc, data->order,
+		       migrate_type_str[data->migrate_type],
+		       gfp_len, compact_gfp_string(data->gfp_flags), caller);
+
+		next = rb_next(next);
+	}
+
+	if (n_lines == -1) {
+		printf(" ...              | ...       | ...   | ...      | %-*s | ...\n",
+		       gfp_len, "...");
+	}
 
-	printf("%.80s\n", graph_dotted_line);
+	printf("%.105s\n", graph_dotted_line);
+}
+
+static void print_gfp_flags(void)
+{
+	int i;
+
+	printf("#\n");
+	printf("# GFP flags\n");
+	printf("# ---------\n");
+	for (i = 0; i < nr_gfps; i++) {
+		printf("# %08x: %*s: %s\n", gfps[i].flags,
+		       (int) max_gfp_len, gfps[i].compact_str,
+		       gfps[i].human_readable);
+	}
 }
 
 static void print_slab_summary(void)
@@ -682,8 +1183,12 @@ static void print_slab_result(struct perf_session *session)
 
 static void print_page_result(struct perf_session *session)
 {
+	if (caller_flag || alloc_flag)
+		print_gfp_flags();
+	if (caller_flag)
+		__print_page_caller_result(session, caller_lines);
 	if (alloc_flag)
-		__print_page_result(&page_alloc_sorted, session, alloc_lines);
+		__print_page_alloc_result(session, alloc_lines);
 	print_page_summary();
 }
 
@@ -695,14 +1200,10 @@ static void print_result(struct perf_session *session)
 		print_page_result(session);
 }
 
-struct sort_dimension {
-	const char		name[20];
-	sort_fn_t		cmp;
-	struct list_head	list;
-};
-
-static LIST_HEAD(caller_sort);
-static LIST_HEAD(alloc_sort);
+static LIST_HEAD(slab_caller_sort);
+static LIST_HEAD(slab_alloc_sort);
+static LIST_HEAD(page_caller_sort);
+static LIST_HEAD(page_alloc_sort);
 
 static void sort_slab_insert(struct rb_root *root, struct alloc_stat *data,
 			     struct list_head *sort_list)
@@ -751,10 +1252,12 @@ static void __sort_slab_result(struct rb_root *root, struct rb_root *root_sorted
 	}
 }
 
-static void sort_page_insert(struct rb_root *root, struct page_stat *data)
+static void sort_page_insert(struct rb_root *root, struct page_stat *data,
+			     struct list_head *sort_list)
 {
 	struct rb_node **new = &root->rb_node;
 	struct rb_node *parent = NULL;
+	struct sort_dimension *sort;
 
 	while (*new) {
 		struct page_stat *this;
@@ -763,8 +1266,11 @@ static void sort_page_insert(struct rb_root *root, struct page_stat *data)
 		this = rb_entry(*new, struct page_stat, node);
 		parent = *new;
 
-		/* TODO: support more sort key */
-		cmp = data->alloc_bytes - this->alloc_bytes;
+		list_for_each_entry(sort, sort_list, list) {
+			cmp = sort->cmp(data, this);
+			if (cmp)
+				break;
+		}
 
 		if (cmp > 0)
 			new = &parent->rb_left;
@@ -776,7 +1282,8 @@ static void sort_page_insert(struct rb_root *root, struct page_stat *data)
 	rb_insert_color(&data->node, root);
 }
 
-static void __sort_page_result(struct rb_root *root, struct rb_root *root_sorted)
+static void __sort_page_result(struct rb_root *root, struct rb_root *root_sorted,
+			       struct list_head *sort_list)
 {
 	struct rb_node *node;
 	struct page_stat *data;
@@ -788,7 +1295,7 @@ static void __sort_page_result(struct rb_root *root, struct rb_root *root_sorted
 
 		rb_erase(node, root);
 		data = rb_entry(node, struct page_stat, node);
-		sort_page_insert(root_sorted, data);
+		sort_page_insert(root_sorted, data, sort_list);
 	}
 }
 
@@ -796,12 +1303,20 @@ static void sort_result(void)
 {
 	if (kmem_slab) {
 		__sort_slab_result(&root_alloc_stat, &root_alloc_sorted,
-				   &alloc_sort);
+				   &slab_alloc_sort);
 		__sort_slab_result(&root_caller_stat, &root_caller_sorted,
-				   &caller_sort);
+				   &slab_caller_sort);
 	}
 	if (kmem_page) {
-		__sort_page_result(&page_alloc_tree, &page_alloc_sorted);
+		if (live_page)
+			__sort_page_result(&page_live_tree, &page_alloc_sorted,
+					   &page_alloc_sort);
+		else
+			__sort_page_result(&page_alloc_tree, &page_alloc_sorted,
+					   &page_alloc_sort);
+
+		__sort_page_result(&page_caller_tree, &page_caller_sorted,
+				   &page_caller_sort);
 	}
 }
 
@@ -850,8 +1365,12 @@ out:
 	return err;
 }
 
-static int ptr_cmp(struct alloc_stat *l, struct alloc_stat *r)
+/* slab sort keys */
+static int ptr_cmp(void *a, void *b)
 {
+	struct alloc_stat *l = a;
+	struct alloc_stat *r = b;
+
 	if (l->ptr < r->ptr)
 		return -1;
 	else if (l->ptr > r->ptr)
@@ -864,8 +1383,11 @@ static struct sort_dimension ptr_sort_dimension = {
 	.cmp	= ptr_cmp,
 };
 
-static int callsite_cmp(struct alloc_stat *l, struct alloc_stat *r)
+static int slab_callsite_cmp(void *a, void *b)
 {
+	struct alloc_stat *l = a;
+	struct alloc_stat *r = b;
+
 	if (l->call_site < r->call_site)
 		return -1;
 	else if (l->call_site > r->call_site)
@@ -875,11 +1397,14 @@ static int callsite_cmp(struct alloc_stat *l, struct alloc_stat *r)
 
 static struct sort_dimension callsite_sort_dimension = {
 	.name	= "callsite",
-	.cmp	= callsite_cmp,
+	.cmp	= slab_callsite_cmp,
 };
 
-static int hit_cmp(struct alloc_stat *l, struct alloc_stat *r)
+static int hit_cmp(void *a, void *b)
 {
+	struct alloc_stat *l = a;
+	struct alloc_stat *r = b;
+
 	if (l->hit < r->hit)
 		return -1;
 	else if (l->hit > r->hit)
@@ -892,8 +1417,11 @@ static struct sort_dimension hit_sort_dimension = {
 	.cmp	= hit_cmp,
 };
 
-static int bytes_cmp(struct alloc_stat *l, struct alloc_stat *r)
+static int bytes_cmp(void *a, void *b)
 {
+	struct alloc_stat *l = a;
+	struct alloc_stat *r = b;
+
 	if (l->bytes_alloc < r->bytes_alloc)
 		return -1;
 	else if (l->bytes_alloc > r->bytes_alloc)
@@ -906,9 +1434,11 @@ static struct sort_dimension bytes_sort_dimension = {
 	.cmp	= bytes_cmp,
 };
 
-static int frag_cmp(struct alloc_stat *l, struct alloc_stat *r)
+static int frag_cmp(void *a, void *b)
 {
 	double x, y;
+	struct alloc_stat *l = a;
+	struct alloc_stat *r = b;
 
 	x = fragmentation(l->bytes_req, l->bytes_alloc);
 	y = fragmentation(r->bytes_req, r->bytes_alloc);
@@ -925,8 +1455,11 @@ static struct sort_dimension frag_sort_dimension = {
 	.cmp	= frag_cmp,
 };
 
-static int pingpong_cmp(struct alloc_stat *l, struct alloc_stat *r)
+static int pingpong_cmp(void *a, void *b)
 {
+	struct alloc_stat *l = a;
+	struct alloc_stat *r = b;
+
 	if (l->pingpong < r->pingpong)
 		return -1;
 	else if (l->pingpong > r->pingpong)
@@ -939,7 +1472,135 @@ static struct sort_dimension pingpong_sort_dimension = {
 	.cmp	= pingpong_cmp,
 };
 
-static struct sort_dimension *avail_sorts[] = {
+/* page sort keys */
+static int page_cmp(void *a, void *b)
+{
+	struct page_stat *l = a;
+	struct page_stat *r = b;
+
+	if (l->page < r->page)
+		return -1;
+	else if (l->page > r->page)
+		return 1;
+	return 0;
+}
+
+static struct sort_dimension page_sort_dimension = {
+	.name	= "page",
+	.cmp	= page_cmp,
+};
+
+static int page_callsite_cmp(void *a, void *b)
+{
+	struct page_stat *l = a;
+	struct page_stat *r = b;
+
+	if (l->callsite < r->callsite)
+		return -1;
+	else if (l->callsite > r->callsite)
+		return 1;
+	return 0;
+}
+
+static struct sort_dimension page_callsite_sort_dimension = {
+	.name	= "callsite",
+	.cmp	= page_callsite_cmp,
+};
+
+static int page_hit_cmp(void *a, void *b)
+{
+	struct page_stat *l = a;
+	struct page_stat *r = b;
+
+	if (l->nr_alloc < r->nr_alloc)
+		return -1;
+	else if (l->nr_alloc > r->nr_alloc)
+		return 1;
+	return 0;
+}
+
+static struct sort_dimension page_hit_sort_dimension = {
+	.name	= "hit",
+	.cmp	= page_hit_cmp,
+};
+
+static int page_bytes_cmp(void *a, void *b)
+{
+	struct page_stat *l = a;
+	struct page_stat *r = b;
+
+	if (l->alloc_bytes < r->alloc_bytes)
+		return -1;
+	else if (l->alloc_bytes > r->alloc_bytes)
+		return 1;
+	return 0;
+}
+
+static struct sort_dimension page_bytes_sort_dimension = {
+	.name	= "bytes",
+	.cmp	= page_bytes_cmp,
+};
+
+static int page_order_cmp(void *a, void *b)
+{
+	struct page_stat *l = a;
+	struct page_stat *r = b;
+
+	if (l->order < r->order)
+		return -1;
+	else if (l->order > r->order)
+		return 1;
+	return 0;
+}
+
+static struct sort_dimension page_order_sort_dimension = {
+	.name	= "order",
+	.cmp	= page_order_cmp,
+};
+
+static int migrate_type_cmp(void *a, void *b)
+{
+	struct page_stat *l = a;
+	struct page_stat *r = b;
+
+	/* for internal use to find free'd page */
+	if (l->migrate_type == -1U)
+		return 0;
+
+	if (l->migrate_type < r->migrate_type)
+		return -1;
+	else if (l->migrate_type > r->migrate_type)
+		return 1;
+	return 0;
+}
+
+static struct sort_dimension migrate_type_sort_dimension = {
+	.name	= "migtype",
+	.cmp	= migrate_type_cmp,
+};
+
+static int gfp_flags_cmp(void *a, void *b)
+{
+	struct page_stat *l = a;
+	struct page_stat *r = b;
+
+	/* for internal use to find free'd page */
+	if (l->gfp_flags == -1U)
+		return 0;
+
+	if (l->gfp_flags < r->gfp_flags)
+		return -1;
+	else if (l->gfp_flags > r->gfp_flags)
+		return 1;
+	return 0;
+}
+
+static struct sort_dimension gfp_flags_sort_dimension = {
+	.name	= "gfp",
+	.cmp	= gfp_flags_cmp,
+};
+
+static struct sort_dimension *slab_sorts[] = {
 	&ptr_sort_dimension,
 	&callsite_sort_dimension,
 	&hit_sort_dimension,
@@ -948,16 +1609,44 @@ static struct sort_dimension *avail_sorts[] = {
 	&pingpong_sort_dimension,
 };
 
-#define NUM_AVAIL_SORTS	((int)ARRAY_SIZE(avail_sorts))
+static struct sort_dimension *page_sorts[] = {
+	&page_sort_dimension,
+	&page_callsite_sort_dimension,
+	&page_hit_sort_dimension,
+	&page_bytes_sort_dimension,
+	&page_order_sort_dimension,
+	&migrate_type_sort_dimension,
+	&gfp_flags_sort_dimension,
+};
+
+static int slab_sort_dimension__add(const char *tok, struct list_head *list)
+{
+	struct sort_dimension *sort;
+	int i;
+
+	for (i = 0; i < (int)ARRAY_SIZE(slab_sorts); i++) {
+		if (!strcmp(slab_sorts[i]->name, tok)) {
+			sort = memdup(slab_sorts[i], sizeof(*slab_sorts[i]));
+			if (!sort) {
+				pr_err("%s: memdup failed\n", __func__);
+				return -1;
+			}
+			list_add_tail(&sort->list, list);
+			return 0;
+		}
+	}
+
+	return -1;
+}
 
-static int sort_dimension__add(const char *tok, struct list_head *list)
+static int page_sort_dimension__add(const char *tok, struct list_head *list)
 {
 	struct sort_dimension *sort;
 	int i;
 
-	for (i = 0; i < NUM_AVAIL_SORTS; i++) {
-		if (!strcmp(avail_sorts[i]->name, tok)) {
-			sort = memdup(avail_sorts[i], sizeof(*avail_sorts[i]));
+	for (i = 0; i < (int)ARRAY_SIZE(page_sorts); i++) {
+		if (!strcmp(page_sorts[i]->name, tok)) {
+			sort = memdup(page_sorts[i], sizeof(*page_sorts[i]));
 			if (!sort) {
 				pr_err("%s: memdup failed\n", __func__);
 				return -1;
@@ -970,7 +1659,33 @@ static int sort_dimension__add(const char *tok, struct list_head *list)
 	return -1;
 }
 
-static int setup_sorting(struct list_head *sort_list, const char *arg)
+static int setup_slab_sorting(struct list_head *sort_list, const char *arg)
+{
+	char *tok;
+	char *str = strdup(arg);
+	char *pos = str;
+
+	if (!str) {
+		pr_err("%s: strdup failed\n", __func__);
+		return -1;
+	}
+
+	while (true) {
+		tok = strsep(&pos, ",");
+		if (!tok)
+			break;
+		if (slab_sort_dimension__add(tok, sort_list) < 0) {
+			error("Unknown slab --sort key: '%s'", tok);
+			free(str);
+			return -1;
+		}
+	}
+
+	free(str);
+	return 0;
+}
+
+static int setup_page_sorting(struct list_head *sort_list, const char *arg)
 {
 	char *tok;
 	char *str = strdup(arg);
@@ -985,8 +1700,8 @@ static int setup_sorting(struct list_head *sort_list, const char *arg)
 		tok = strsep(&pos, ",");
 		if (!tok)
 			break;
-		if (sort_dimension__add(tok, sort_list) < 0) {
-			error("Unknown --sort key: '%s'", tok);
+		if (page_sort_dimension__add(tok, sort_list) < 0) {
+			error("Unknown page --sort key: '%s'", tok);
 			free(str);
 			return -1;
 		}
@@ -1002,10 +1717,18 @@ static int parse_sort_opt(const struct option *opt __maybe_unused,
 	if (!arg)
 		return -1;
 
-	if (caller_flag > alloc_flag)
-		return setup_sorting(&caller_sort, arg);
-	else
-		return setup_sorting(&alloc_sort, arg);
+	if (kmem_page > kmem_slab ||
+	    (kmem_page == 0 && kmem_slab == 0 && kmem_default == KMEM_PAGE)) {
+		if (caller_flag > alloc_flag)
+			return setup_page_sorting(&page_caller_sort, arg);
+		else
+			return setup_page_sorting(&page_alloc_sort, arg);
+	} else {
+		if (caller_flag > alloc_flag)
+			return setup_slab_sorting(&slab_caller_sort, arg);
+		else
+			return setup_slab_sorting(&slab_alloc_sort, arg);
+	}
 
 	return 0;
 }
@@ -1084,7 +1807,7 @@ static int __cmd_record(int argc, const char **argv)
 	if (kmem_slab)
 		rec_argc += ARRAY_SIZE(slab_events);
 	if (kmem_page)
-		rec_argc += ARRAY_SIZE(page_events);
+		rec_argc += ARRAY_SIZE(page_events) + 1; /* for -g */
 
 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
 
@@ -1099,6 +1822,8 @@ static int __cmd_record(int argc, const char **argv)
 			rec_argv[i] = strdup(slab_events[j]);
 	}
 	if (kmem_page) {
+		rec_argv[i++] = strdup("-g");
+
 		for (j = 0; j < ARRAY_SIZE(page_events); j++, i++)
 			rec_argv[i] = strdup(page_events[j]);
 	}
@@ -1109,9 +1834,26 @@ static int __cmd_record(int argc, const char **argv)
 	return cmd_record(i, rec_argv, NULL);
 }
 
+static int kmem_config(const char *var, const char *value, void *cb)
+{
+	if (!strcmp(var, "kmem.default")) {
+		if (!strcmp(value, "slab"))
+			kmem_default = KMEM_SLAB;
+		else if (!strcmp(value, "page"))
+			kmem_default = KMEM_PAGE;
+		else
+			pr_err("invalid default value ('slab' or 'page' required): %s\n",
+			       value);
+		return 0;
+	}
+
+	return perf_default_config(var, value, cb);
+}
+
 int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
 {
-	const char * const default_sort_order = "frag,hit,bytes";
+	const char * const default_slab_sort = "frag,hit,bytes";
+	const char * const default_page_sort = "bytes,hit";
 	struct perf_data_file file = {
 		.mode = PERF_DATA_MODE_READ,
 	};
@@ -1124,8 +1866,8 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
 	OPT_CALLBACK_NOOPT(0, "alloc", NULL, NULL,
 			   "show per-allocation statistics", parse_alloc_opt),
 	OPT_CALLBACK('s', "sort", NULL, "key[,key2...]",
-		     "sort by keys: ptr, call_site, bytes, hit, pingpong, frag",
-		     parse_sort_opt),
+		     "sort by keys: ptr, callsite, bytes, hit, pingpong, frag, "
+		     "page, order, migtype, gfp", parse_sort_opt),
 	OPT_CALLBACK('l', "line", NULL, "num", "show n lines", parse_line_opt),
 	OPT_BOOLEAN(0, "raw-ip", &raw_ip, "show raw ip instead of symbol"),
 	OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
@@ -1133,6 +1875,7 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
 			   parse_slab_opt),
 	OPT_CALLBACK_NOOPT(0, "page", NULL, NULL, "Analyze page allocator",
 			   parse_page_opt),
+	OPT_BOOLEAN(0, "live", &live_page, "Show live page stat"),
 	OPT_END()
 	};
 	const char *const kmem_subcommands[] = { "record", "stat", NULL };
@@ -1142,15 +1885,21 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
 	};
 	struct perf_session *session;
 	int ret = -1;
+	const char errmsg[] = "No %s allocation events found.  Have you run 'perf kmem record --%s'?\n";
 
+	perf_config(kmem_config, NULL);
 	argc = parse_options_subcommand(argc, argv, kmem_options,
 					kmem_subcommands, kmem_usage, 0);
 
 	if (!argc)
 		usage_with_options(kmem_usage, kmem_options);
 
-	if (kmem_slab == 0 && kmem_page == 0)
-		kmem_slab = 1;  /* for backward compatibility */
+	if (kmem_slab == 0 && kmem_page == 0) {
+		if (kmem_default == KMEM_SLAB)
+			kmem_slab = 1;
+		else
+			kmem_page = 1;
+	}
 
 	if (!strncmp(argv[0], "rec", 3)) {
 		symbol__init(NULL);
@@ -1159,19 +1908,30 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
 
 	file.path = input_name;
 
-	session = perf_session__new(&file, false, &perf_kmem);
+	kmem_session = session = perf_session__new(&file, false, &perf_kmem);
 	if (session == NULL)
 		return -1;
 
+	if (kmem_slab) {
+		if (!perf_evlist__find_tracepoint_by_name(session->evlist,
+							  "kmem:kmalloc")) {
+			pr_err(errmsg, "slab", "slab");
+			return -1;
+		}
+	}
+
 	if (kmem_page) {
-		struct perf_evsel *evsel = perf_evlist__first(session->evlist);
+		struct perf_evsel *evsel;
 
-		if (evsel == NULL || evsel->tp_format == NULL) {
-			pr_err("invalid event found.. aborting\n");
+		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
+							     "kmem:mm_page_alloc");
+		if (evsel == NULL) {
+			pr_err(errmsg, "page", "page");
 			return -1;
 		}
 
 		kmem_page_size = pevent_get_page_size(evsel->tp_format->pevent);
+		symbol_conf.use_callchain = true;
 	}
 
 	symbol__init(&session->header.env);
@@ -1182,11 +1942,21 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
 		if (cpu__setup_cpunode_map())
 			goto out_delete;
 
-		if (list_empty(&caller_sort))
-			setup_sorting(&caller_sort, default_sort_order);
-		if (list_empty(&alloc_sort))
-			setup_sorting(&alloc_sort, default_sort_order);
-
+		if (list_empty(&slab_caller_sort))
+			setup_slab_sorting(&slab_caller_sort, default_slab_sort);
+		if (list_empty(&slab_alloc_sort))
+			setup_slab_sorting(&slab_alloc_sort, default_slab_sort);
+		if (list_empty(&page_caller_sort))
+			setup_page_sorting(&page_caller_sort, default_page_sort);
+		if (list_empty(&page_alloc_sort))
+			setup_page_sorting(&page_alloc_sort, default_page_sort);
+
+		if (kmem_page) {
+			setup_page_sorting(&page_alloc_sort_input,
+					   "page,order,migtype,gfp");
+			setup_page_sorting(&page_caller_sort_input,
+					   "callsite,order,migtype,gfp");
+		}
 		ret = __cmd_kmem(session);
 	} else
 		usage_with_options(kmem_usage, kmem_options);
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index 1f9338f6109cdb..15fecd3dc5d8c4 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -651,6 +651,7 @@ static int process_sample_event(struct perf_tool *tool,
 				struct perf_evsel *evsel,
 				struct machine *machine)
 {
+	int err = 0;
 	struct thread *thread;
 	struct perf_kvm_stat *kvm = container_of(tool, struct perf_kvm_stat,
 						 tool);
@@ -666,9 +667,10 @@ static int process_sample_event(struct perf_tool *tool,
 	}
 
 	if (!handle_kvm_event(kvm, thread, evsel, sample))
-		return -1;
+		err = -1;
 
-	return 0;
+	thread__put(thread);
+	return err;
 }
 
 static int cpu_isa_config(struct perf_kvm_stat *kvm)
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index d49c2ab85fc2dd..de16aaed516e60 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -769,6 +769,7 @@ static void dump_threads(void)
 		t = perf_session__findnew(session, st->tid);
 		pr_info("%10d: %s\n", st->tid, thread__comm_str(t));
 		node = rb_next(node);
+		thread__put(t);
 	};
 }
 
@@ -810,6 +811,7 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused,
 				struct perf_evsel *evsel,
 				struct machine *machine)
 {
+	int err = 0;
 	struct thread *thread = machine__findnew_thread(machine, sample->pid,
 							sample->tid);
 
@@ -821,10 +823,12 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused,
 
 	if (evsel->handler != NULL) {
 		tracepoint_handler f = evsel->handler;
-		return f(evsel, sample);
+		err = f(evsel, sample);
 	}
 
-	return 0;
+	thread__put(thread);
+
+	return err;
 }
 
 static void sort_result(void)
diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
index 675216e08bfcd0..da2ec06f0742dc 100644
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -74,7 +74,7 @@ dump_raw_samples(struct perf_tool *tool,
 	}
 
 	if (al.filtered || (mem->hide_unresolved && al.sym == NULL))
-		return 0;
+		goto out_put;
 
 	if (al.map != NULL)
 		al.map->dso->hit = 1;
@@ -103,7 +103,8 @@ dump_raw_samples(struct perf_tool *tool,
 		symbol_conf.field_sep,
 		al.map ? (al.map->dso ? al.map->dso->long_name : "???") : "???",
 		al.sym ? al.sym->name : "???");
-
+out_put:
+	addr_location__put(&al);
 	return 0;
 }
 
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index f7b1af67e9f686..1272559fa22d9e 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -44,25 +44,19 @@
 
 #define DEFAULT_VAR_FILTER "!__k???tab_* & !__crc_*"
 #define DEFAULT_FUNC_FILTER "!_*"
+#define DEFAULT_LIST_FILTER "*:*"
 
 /* Session management structure */
 static struct {
+	int command;	/* Command short_name */
 	bool list_events;
-	bool force_add;
-	bool show_lines;
-	bool show_vars;
-	bool show_ext_vars;
-	bool show_funcs;
-	bool mod_events;
 	bool uprobes;
 	bool quiet;
 	bool target_used;
 	int nevents;
 	struct perf_probe_event events[MAX_PROBES];
-	struct strlist *dellist;
 	struct line_range line_range;
 	char *target;
-	int max_probe_points;
 	struct strfilter *filter;
 } params;
 
@@ -93,6 +87,28 @@ static int parse_probe_event(const char *str)
 	return ret;
 }
 
+static int params_add_filter(const char *str)
+{
+	const char *err = NULL;
+	int ret = 0;
+
+	pr_debug2("Add filter: %s\n", str);
+	if (!params.filter) {
+		params.filter = strfilter__new(str, &err);
+		if (!params.filter)
+			ret = err ? -EINVAL : -ENOMEM;
+	} else
+		ret = strfilter__or(params.filter, str, &err);
+
+	if (ret == -EINVAL) {
+		pr_err("Filter parse error at %td.\n", err - str + 1);
+		pr_err("Source: \"%s\"\n", str);
+		pr_err("         %*c\n", (int)(err - str + 1), '^');
+	}
+
+	return ret;
+}
+
 static int set_target(const char *ptr)
 {
 	int found = 0;
@@ -152,34 +168,11 @@ static int parse_probe_event_argv(int argc, const char **argv)
 
 		len += sprintf(&buf[len], "%s ", argv[i]);
 	}
-	params.mod_events = true;
 	ret = parse_probe_event(buf);
 	free(buf);
 	return ret;
 }
 
-static int opt_add_probe_event(const struct option *opt __maybe_unused,
-			      const char *str, int unset __maybe_unused)
-{
-	if (str) {
-		params.mod_events = true;
-		return parse_probe_event(str);
-	} else
-		return 0;
-}
-
-static int opt_del_probe_event(const struct option *opt __maybe_unused,
-			       const char *str, int unset __maybe_unused)
-{
-	if (str) {
-		params.mod_events = true;
-		if (!params.dellist)
-			params.dellist = strlist__new(true, NULL);
-		strlist__add(params.dellist, str);
-	}
-	return 0;
-}
-
 static int opt_set_target(const struct option *opt, const char *str,
 			int unset __maybe_unused)
 {
@@ -217,8 +210,10 @@ static int opt_set_target(const struct option *opt, const char *str,
 	return ret;
 }
 
+/* Command option callbacks */
+
 #ifdef HAVE_DWARF_SUPPORT
-static int opt_show_lines(const struct option *opt __maybe_unused,
+static int opt_show_lines(const struct option *opt,
 			  const char *str, int unset __maybe_unused)
 {
 	int ret = 0;
@@ -226,19 +221,19 @@ static int opt_show_lines(const struct option *opt __maybe_unused,
 	if (!str)
 		return 0;
 
-	if (params.show_lines) {
+	if (params.command == 'L') {
 		pr_warning("Warning: more than one --line options are"
 			   " detected. Only the first one is valid.\n");
 		return 0;
 	}
 
-	params.show_lines = true;
+	params.command = opt->short_name;
 	ret = parse_line_range_desc(str, &params.line_range);
 
 	return ret;
 }
 
-static int opt_show_vars(const struct option *opt __maybe_unused,
+static int opt_show_vars(const struct option *opt,
 			 const char *str, int unset __maybe_unused)
 {
 	struct perf_probe_event *pev = &params.events[params.nevents];
@@ -252,29 +247,39 @@ static int opt_show_vars(const struct option *opt __maybe_unused,
 		pr_err("  Error: '--vars' doesn't accept arguments.\n");
 		return -EINVAL;
 	}
-	params.show_vars = true;
+	params.command = opt->short_name;
 
 	return ret;
 }
 #endif
+static int opt_add_probe_event(const struct option *opt,
+			      const char *str, int unset __maybe_unused)
+{
+	if (str) {
+		params.command = opt->short_name;
+		return parse_probe_event(str);
+	}
+
+	return 0;
+}
+
+static int opt_set_filter_with_command(const struct option *opt,
+				       const char *str, int unset)
+{
+	if (!unset)
+		params.command = opt->short_name;
+
+	if (str)
+		return params_add_filter(str);
+
+	return 0;
+}
 
 static int opt_set_filter(const struct option *opt __maybe_unused,
 			  const char *str, int unset __maybe_unused)
 {
-	const char *err;
-
-	if (str) {
-		pr_debug2("Set filter: %s\n", str);
-		if (params.filter)
-			strfilter__delete(params.filter);
-		params.filter = strfilter__new(str, &err);
-		if (!params.filter) {
-			pr_err("Filter parse error at %td.\n", err - str + 1);
-			pr_err("Source: \"%s\"\n", str);
-			pr_err("         %*c\n", (int)(err - str + 1), '^');
-			return -EINVAL;
-		}
-	}
+	if (str)
+		return params_add_filter(str);
 
 	return 0;
 }
@@ -290,8 +295,6 @@ static void cleanup_params(void)
 
 	for (i = 0; i < params.nevents; i++)
 		clear_perf_probe_event(params.events + i);
-	if (params.dellist)
-		strlist__delete(params.dellist);
 	line_range__clear(&params.line_range);
 	free(params.target);
 	if (params.filter)
@@ -316,22 +319,24 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
 		"perf probe [<options>] 'PROBEDEF' ['PROBEDEF' ...]",
 		"perf probe [<options>] --add 'PROBEDEF' [--add 'PROBEDEF' ...]",
 		"perf probe [<options>] --del '[GROUP:]EVENT' ...",
-		"perf probe --list",
+		"perf probe --list [GROUP:]EVENT ...",
 #ifdef HAVE_DWARF_SUPPORT
 		"perf probe [<options>] --line 'LINEDESC'",
 		"perf probe [<options>] --vars 'PROBEPOINT'",
 #endif
+		"perf probe [<options>] --funcs",
 		NULL
-};
+	};
 	struct option options[] = {
 	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show parsed arguments, etc)"),
 	OPT_BOOLEAN('q', "quiet", &params.quiet,
 		    "be quiet (do not show any mesages)"),
-	OPT_BOOLEAN('l', "list", &params.list_events,
-		    "list up current probe events"),
+	OPT_CALLBACK_DEFAULT('l', "list", NULL, "[GROUP:]EVENT",
+			     "list up probe events",
+			     opt_set_filter_with_command, DEFAULT_LIST_FILTER),
 	OPT_CALLBACK('d', "del", NULL, "[GROUP:]EVENT", "delete a probe event.",
-		opt_del_probe_event),
+		     opt_set_filter_with_command),
 	OPT_CALLBACK('a', "add", NULL,
 #ifdef HAVE_DWARF_SUPPORT
 		"[EVENT=]FUNC[@SRC][+OFF|%return|:RL|;PT]|SRC:AL|SRC;PT"
@@ -356,7 +361,7 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
 		"\t\tARG:\tProbe argument (kprobe-tracer argument format.)\n",
 #endif
 		opt_add_probe_event),
-	OPT_BOOLEAN('f', "force", &params.force_add, "forcibly add events"
+	OPT_BOOLEAN('f', "force", &probe_conf.force_add, "forcibly add events"
 		    " with existing name"),
 #ifdef HAVE_DWARF_SUPPORT
 	OPT_CALLBACK('L', "line", NULL,
@@ -365,8 +370,10 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
 	OPT_CALLBACK('V', "vars", NULL,
 		     "FUNC[@SRC][+OFF|%return|:RL|;PT]|SRC:AL|SRC;PT",
 		     "Show accessible variables on PROBEDEF", opt_show_vars),
-	OPT_BOOLEAN('\0', "externs", &params.show_ext_vars,
+	OPT_BOOLEAN('\0', "externs", &probe_conf.show_ext_vars,
 		    "Show external variables too (with --vars only)"),
+	OPT_BOOLEAN('\0', "range", &probe_conf.show_location_range,
+		"Show variables location range in scope (with --vars only)"),
 	OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
 		   "file", "vmlinux pathname"),
 	OPT_STRING('s', "source", &symbol_conf.source_prefix,
@@ -374,12 +381,15 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
 	OPT_CALLBACK('m', "module", NULL, "modname|path",
 		"target module name (for online) or path (for offline)",
 		opt_set_target),
+	OPT_BOOLEAN('\0', "no-inlines", &probe_conf.no_inlines,
+		"Don't search inlined functions"),
 #endif
 	OPT__DRY_RUN(&probe_event_dry_run),
-	OPT_INTEGER('\0', "max-probes", &params.max_probe_points,
+	OPT_INTEGER('\0', "max-probes", &probe_conf.max_probes,
 		 "Set how many probe points can be found for a probe."),
-	OPT_BOOLEAN('F', "funcs", &params.show_funcs,
-		    "Show potential probe-able functions."),
+	OPT_CALLBACK_DEFAULT('F', "funcs", NULL, "[FILTER]",
+			     "Show potential probe-able functions.",
+			     opt_set_filter_with_command, DEFAULT_FUNC_FILTER),
 	OPT_CALLBACK('\0', "filter", NULL,
 		     "[!]FILTER", "Set a filter (with --vars/funcs only)\n"
 		     "\t\t\t(default: \"" DEFAULT_VAR_FILTER "\" for --vars,\n"
@@ -402,6 +412,7 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
 	set_option_flag(options, 'L', "line", PARSE_OPT_EXCLUSIVE);
 	set_option_flag(options, 'V', "vars", PARSE_OPT_EXCLUSIVE);
 #endif
+	set_option_flag(options, 'F', "funcs", PARSE_OPT_EXCLUSIVE);
 
 	argc = parse_options(argc, argv, options, probe_usage,
 			     PARSE_OPT_STOP_AT_NON_OPTION);
@@ -410,11 +421,16 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
 			pr_warning("  Error: '-' is not supported.\n");
 			usage_with_options(probe_usage, options);
 		}
+		if (params.command && params.command != 'a') {
+			pr_warning("  Error: another command except --add is set.\n");
+			usage_with_options(probe_usage, options);
+		}
 		ret = parse_probe_event_argv(argc, argv);
 		if (ret < 0) {
 			pr_err_with_code("  Error: Command Parse Error.", ret);
 			return ret;
 		}
+		params.command = 'a';
 	}
 
 	if (params.quiet) {
@@ -425,89 +441,70 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
 		verbose = -1;
 	}
 
-	if (params.max_probe_points == 0)
-		params.max_probe_points = MAX_PROBES;
-
-	if ((!params.nevents && !params.dellist && !params.list_events &&
-	     !params.show_lines && !params.show_funcs))
-		usage_with_options(probe_usage, options);
+	if (probe_conf.max_probes == 0)
+		probe_conf.max_probes = MAX_PROBES;
 
 	/*
 	 * Only consider the user's kernel image path if given.
 	 */
 	symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL);
 
-	if (params.list_events) {
+	switch (params.command) {
+	case 'l':
 		if (params.uprobes) {
 			pr_warning("  Error: Don't use --list with --exec.\n");
 			usage_with_options(probe_usage, options);
 		}
-		ret = show_perf_probe_events();
+		ret = show_perf_probe_events(params.filter);
 		if (ret < 0)
 			pr_err_with_code("  Error: Failed to show event list.", ret);
 		return ret;
-	}
-	if (params.show_funcs) {
-		if (!params.filter)
-			params.filter = strfilter__new(DEFAULT_FUNC_FILTER,
-						       NULL);
+	case 'F':
 		ret = show_available_funcs(params.target, params.filter,
 					params.uprobes);
-		strfilter__delete(params.filter);
-		params.filter = NULL;
 		if (ret < 0)
 			pr_err_with_code("  Error: Failed to show functions.", ret);
 		return ret;
-	}
-
 #ifdef HAVE_DWARF_SUPPORT
-	if (params.show_lines) {
+	case 'L':
 		ret = show_line_range(&params.line_range, params.target,
 				      params.uprobes);
 		if (ret < 0)
 			pr_err_with_code("  Error: Failed to show lines.", ret);
 		return ret;
-	}
-	if (params.show_vars) {
+	case 'V':
 		if (!params.filter)
 			params.filter = strfilter__new(DEFAULT_VAR_FILTER,
 						       NULL);
 
 		ret = show_available_vars(params.events, params.nevents,
-					  params.max_probe_points,
-					  params.target,
-					  params.filter,
-					  params.show_ext_vars);
-		strfilter__delete(params.filter);
-		params.filter = NULL;
+					  params.filter);
 		if (ret < 0)
 			pr_err_with_code("  Error: Failed to show vars.", ret);
 		return ret;
-	}
 #endif
-
-	if (params.dellist) {
-		ret = del_perf_probe_events(params.dellist);
+	case 'd':
+		ret = del_perf_probe_events(params.filter);
 		if (ret < 0) {
 			pr_err_with_code("  Error: Failed to delete events.", ret);
 			return ret;
 		}
-	}
-
-	if (params.nevents) {
+		break;
+	case 'a':
 		/* Ensure the last given target is used */
 		if (params.target && !params.target_used) {
 			pr_warning("  Error: -x/-m must follow the probe definitions.\n");
 			usage_with_options(probe_usage, options);
 		}
 
-		ret = add_perf_probe_events(params.events, params.nevents,
-					    params.max_probe_points,
-					    params.force_add);
+		ret = add_perf_probe_events(params.events, params.nevents);
 		if (ret < 0) {
 			pr_err_with_code("  Error: Failed to add events.", ret);
 			return ret;
 		}
+		break;
+	default:
+		usage_with_options(probe_usage, options);
 	}
 	return 0;
 }
@@ -522,5 +519,5 @@ int cmd_probe(int argc, const char **argv, const char *prefix)
 		cleanup_params();
 	}
 
-	return ret;
+	return ret < 0 ? ret : 0;
 }
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index c3efdfb630b5b6..91aa2a3dcf19eb 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -27,6 +27,8 @@
 #include "util/cpumap.h"
 #include "util/thread_map.h"
 #include "util/data.h"
+#include "util/auxtrace.h"
+#include "util/parse-branch-options.h"
 
 #include <unistd.h>
 #include <sched.h>
@@ -38,6 +40,7 @@ struct record {
 	struct record_opts	opts;
 	u64			bytes_written;
 	struct perf_data_file	file;
+	struct auxtrace_record	*itr;
 	struct perf_evlist	*evlist;
 	struct perf_session	*session;
 	const char		*progname;
@@ -110,9 +113,12 @@ out:
 	return rc;
 }
 
-static volatile int done = 0;
+static volatile int done;
 static volatile int signr = -1;
-static volatile int child_finished = 0;
+static volatile int child_finished;
+static volatile int auxtrace_snapshot_enabled;
+static volatile int auxtrace_snapshot_err;
+static volatile int auxtrace_record__snapshot_started;
 
 static void sig_handler(int sig)
 {
@@ -133,6 +139,133 @@ static void record__sig_exit(void)
 	raise(signr);
 }
 
+#ifdef HAVE_AUXTRACE_SUPPORT
+
+static int record__process_auxtrace(struct perf_tool *tool,
+				    union perf_event *event, void *data1,
+				    size_t len1, void *data2, size_t len2)
+{
+	struct record *rec = container_of(tool, struct record, tool);
+	struct perf_data_file *file = &rec->file;
+	size_t padding;
+	u8 pad[8] = {0};
+
+	if (!perf_data_file__is_pipe(file)) {
+		off_t file_offset;
+		int fd = perf_data_file__fd(file);
+		int err;
+
+		file_offset = lseek(fd, 0, SEEK_CUR);
+		if (file_offset == -1)
+			return -1;
+		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
+						     event, file_offset);
+		if (err)
+			return err;
+	}
+
+	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
+	padding = (len1 + len2) & 7;
+	if (padding)
+		padding = 8 - padding;
+
+	record__write(rec, event, event->header.size);
+	record__write(rec, data1, len1);
+	if (len2)
+		record__write(rec, data2, len2);
+	record__write(rec, &pad, padding);
+
+	return 0;
+}
+
+static int record__auxtrace_mmap_read(struct record *rec,
+				      struct auxtrace_mmap *mm)
+{
+	int ret;
+
+	ret = auxtrace_mmap__read(mm, rec->itr, &rec->tool,
+				  record__process_auxtrace);
+	if (ret < 0)
+		return ret;
+
+	if (ret)
+		rec->samples++;
+
+	return 0;
+}
+
+static int record__auxtrace_mmap_read_snapshot(struct record *rec,
+					       struct auxtrace_mmap *mm)
+{
+	int ret;
+
+	ret = auxtrace_mmap__read_snapshot(mm, rec->itr, &rec->tool,
+					   record__process_auxtrace,
+					   rec->opts.auxtrace_snapshot_size);
+	if (ret < 0)
+		return ret;
+
+	if (ret)
+		rec->samples++;
+
+	return 0;
+}
+
+static int record__auxtrace_read_snapshot_all(struct record *rec)
+{
+	int i;
+	int rc = 0;
+
+	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
+		struct auxtrace_mmap *mm =
+				&rec->evlist->mmap[i].auxtrace_mmap;
+
+		if (!mm->base)
+			continue;
+
+		if (record__auxtrace_mmap_read_snapshot(rec, mm) != 0) {
+			rc = -1;
+			goto out;
+		}
+	}
+out:
+	return rc;
+}
+
+static void record__read_auxtrace_snapshot(struct record *rec)
+{
+	pr_debug("Recording AUX area tracing snapshot\n");
+	if (record__auxtrace_read_snapshot_all(rec) < 0) {
+		auxtrace_snapshot_err = -1;
+	} else {
+		auxtrace_snapshot_err = auxtrace_record__snapshot_finish(rec->itr);
+		if (!auxtrace_snapshot_err)
+			auxtrace_snapshot_enabled = 1;
+	}
+}
+
+#else
+
+static inline
+int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
+			       struct auxtrace_mmap *mm __maybe_unused)
+{
+	return 0;
+}
+
+static inline
+void record__read_auxtrace_snapshot(struct record *rec __maybe_unused)
+{
+}
+
+static inline
+int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
+{
+	return 0;
+}
+
+#endif
+
 static int record__open(struct record *rec)
 {
 	char msg[512];
@@ -169,13 +302,16 @@ try_again:
 		goto out;
 	}
 
-	if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
+	if (perf_evlist__mmap_ex(evlist, opts->mmap_pages, false,
+				 opts->auxtrace_mmap_pages,
+				 opts->auxtrace_snapshot_mode) < 0) {
 		if (errno == EPERM) {
 			pr_err("Permission error mapping pages.\n"
 			       "Consider increasing "
 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
 			       "or try again with a smaller value of -m/--mmap_pages.\n"
-			       "(current value: %u)\n", opts->mmap_pages);
+			       "(current value: %u,%u)\n",
+			       opts->mmap_pages, opts->auxtrace_mmap_pages);
 			rc = -errno;
 		} else {
 			pr_err("failed to mmap with %d (%s)\n", errno,
@@ -270,12 +406,20 @@ static int record__mmap_read_all(struct record *rec)
 	int rc = 0;
 
 	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
+		struct auxtrace_mmap *mm = &rec->evlist->mmap[i].auxtrace_mmap;
+
 		if (rec->evlist->mmap[i].base) {
 			if (record__mmap_read(rec, i) != 0) {
 				rc = -1;
 				goto out;
 			}
 		}
+
+		if (mm->base && !rec->opts.auxtrace_snapshot_mode &&
+		    record__auxtrace_mmap_read(rec, mm) != 0) {
+			rc = -1;
+			goto out;
+		}
 	}
 
 	/*
@@ -305,6 +449,9 @@ static void record__init_features(struct record *rec)
 
 	if (!rec->opts.branch_stack)
 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
+
+	if (!rec->opts.full_auxtrace)
+		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
 }
 
 static volatile int workload_exec_errno;
@@ -323,6 +470,8 @@ static void workload_exec_failed_signal(int signo __maybe_unused,
 	child_finished = 1;
 }
 
+static void snapshot_sig_handler(int sig);
+
 static int __cmd_record(struct record *rec, int argc, const char **argv)
 {
 	int err;
@@ -343,6 +492,10 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 	signal(SIGCHLD, sig_handler);
 	signal(SIGINT, sig_handler);
 	signal(SIGTERM, sig_handler);
+	if (rec->opts.auxtrace_snapshot_mode)
+		signal(SIGUSR2, snapshot_sig_handler);
+	else
+		signal(SIGUSR2, SIG_IGN);
 
 	session = perf_session__new(file, false, tool);
 	if (session == NULL) {
@@ -421,6 +574,13 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 		}
 	}
 
+	if (rec->opts.full_auxtrace) {
+		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
+					session, process_synthesized_event);
+		if (err)
+			goto out_delete_session;
+	}
+
 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
 						 machine);
 	if (err < 0)
@@ -475,14 +635,27 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 		perf_evlist__enable(rec->evlist);
 	}
 
+	auxtrace_snapshot_enabled = 1;
 	for (;;) {
 		int hits = rec->samples;
 
 		if (record__mmap_read_all(rec) < 0) {
+			auxtrace_snapshot_enabled = 0;
 			err = -1;
 			goto out_child;
 		}
 
+		if (auxtrace_record__snapshot_started) {
+			auxtrace_record__snapshot_started = 0;
+			if (!auxtrace_snapshot_err)
+				record__read_auxtrace_snapshot(rec);
+			if (auxtrace_snapshot_err) {
+				pr_err("AUX area tracing snapshot failed\n");
+				err = -1;
+				goto out_child;
+			}
+		}
+
 		if (hits == rec->samples) {
 			if (done || draining)
 				break;
@@ -505,10 +678,12 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 		 * disable events in this case.
 		 */
 		if (done && !disabled && !target__none(&opts->target)) {
+			auxtrace_snapshot_enabled = 0;
 			perf_evlist__disable(rec->evlist);
 			disabled = true;
 		}
 	}
+	auxtrace_snapshot_enabled = 0;
 
 	if (forks && workload_exec_errno) {
 		char msg[STRERR_BUFSIZE];
@@ -545,15 +720,23 @@ out_child:
 	if (!err && !file->is_pipe) {
 		rec->session->header.data_size += rec->bytes_written;
 
-		if (!rec->no_buildid)
+		if (!rec->no_buildid) {
 			process_buildids(rec);
+			/*
+			 * We take all buildids when the file contains
+			 * AUX area tracing data because we do not decode the
+			 * trace because it would take too long.
+			 */
+			if (rec->opts.full_auxtrace)
+				dsos__hit_all(rec->session);
+		}
 		perf_session__write_header(rec->session, rec->evlist, fd, true);
 	}
 
 	if (!err && !quiet) {
 		char samples[128];
 
-		if (rec->samples)
+		if (rec->samples && !rec->opts.full_auxtrace)
 			scnprintf(samples, sizeof(samples),
 				  " (%" PRIu64 " samples)", rec->samples);
 		else
@@ -569,94 +752,6 @@ out_delete_session:
 	return status;
 }
 
-#define BRANCH_OPT(n, m) \
-	{ .name = n, .mode = (m) }
-
-#define BRANCH_END { .name = NULL }
-
-struct branch_mode {
-	const char *name;
-	int mode;
-};
-
-static const struct branch_mode branch_modes[] = {
-	BRANCH_OPT("u", PERF_SAMPLE_BRANCH_USER),
-	BRANCH_OPT("k", PERF_SAMPLE_BRANCH_KERNEL),
-	BRANCH_OPT("hv", PERF_SAMPLE_BRANCH_HV),
-	BRANCH_OPT("any", PERF_SAMPLE_BRANCH_ANY),
-	BRANCH_OPT("any_call", PERF_SAMPLE_BRANCH_ANY_CALL),
-	BRANCH_OPT("any_ret", PERF_SAMPLE_BRANCH_ANY_RETURN),
-	BRANCH_OPT("ind_call", PERF_SAMPLE_BRANCH_IND_CALL),
-	BRANCH_OPT("abort_tx", PERF_SAMPLE_BRANCH_ABORT_TX),
-	BRANCH_OPT("in_tx", PERF_SAMPLE_BRANCH_IN_TX),
-	BRANCH_OPT("no_tx", PERF_SAMPLE_BRANCH_NO_TX),
-	BRANCH_OPT("cond", PERF_SAMPLE_BRANCH_COND),
-	BRANCH_END
-};
-
-static int
-parse_branch_stack(const struct option *opt, const char *str, int unset)
-{
-#define ONLY_PLM \
-	(PERF_SAMPLE_BRANCH_USER	|\
-	 PERF_SAMPLE_BRANCH_KERNEL	|\
-	 PERF_SAMPLE_BRANCH_HV)
-
-	uint64_t *mode = (uint64_t *)opt->value;
-	const struct branch_mode *br;
-	char *s, *os = NULL, *p;
-	int ret = -1;
-
-	if (unset)
-		return 0;
-
-	/*
-	 * cannot set it twice, -b + --branch-filter for instance
-	 */
-	if (*mode)
-		return -1;
-
-	/* str may be NULL in case no arg is passed to -b */
-	if (str) {
-		/* because str is read-only */
-		s = os = strdup(str);
-		if (!s)
-			return -1;
-
-		for (;;) {
-			p = strchr(s, ',');
-			if (p)
-				*p = '\0';
-
-			for (br = branch_modes; br->name; br++) {
-				if (!strcasecmp(s, br->name))
-					break;
-			}
-			if (!br->name) {
-				ui__warning("unknown branch filter %s,"
-					    " check man page\n", s);
-				goto error;
-			}
-
-			*mode |= br->mode;
-
-			if (!p)
-				break;
-
-			s = p + 1;
-		}
-	}
-	ret = 0;
-
-	/* default to any branch */
-	if ((*mode & ~ONLY_PLM) == 0) {
-		*mode = PERF_SAMPLE_BRANCH_ANY;
-	}
-error:
-	free(os);
-	return ret;
-}
-
 static void callchain_debug(void)
 {
 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
@@ -795,6 +890,49 @@ static int parse_clockid(const struct option *opt, const char *str, int unset)
 	return -1;
 }
 
+static int record__parse_mmap_pages(const struct option *opt,
+				    const char *str,
+				    int unset __maybe_unused)
+{
+	struct record_opts *opts = opt->value;
+	char *s, *p;
+	unsigned int mmap_pages;
+	int ret;
+
+	if (!str)
+		return -EINVAL;
+
+	s = strdup(str);
+	if (!s)
+		return -ENOMEM;
+
+	p = strchr(s, ',');
+	if (p)
+		*p = '\0';
+
+	if (*s) {
+		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
+		if (ret)
+			goto out_free;
+		opts->mmap_pages = mmap_pages;
+	}
+
+	if (!p) {
+		ret = 0;
+		goto out_free;
+	}
+
+	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
+	if (ret)
+		goto out_free;
+
+	opts->auxtrace_mmap_pages = mmap_pages;
+
+out_free:
+	free(s);
+	return ret;
+}
+
 static const char * const __record_usage[] = {
 	"perf record [<options>] [<command>]",
 	"perf record [<options>] -- <command> [<options>]",
@@ -875,9 +1013,9 @@ struct option __record_options[] = {
 			&record.opts.no_inherit_set,
 			"child tasks do not inherit counters"),
 	OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
-	OPT_CALLBACK('m', "mmap-pages", &record.opts.mmap_pages, "pages",
-		     "number of mmap data pages",
-		     perf_evlist__parse_mmap_pages),
+	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
+		     "number of mmap data pages and AUX area tracing mmap pages",
+		     record__parse_mmap_pages),
 	OPT_BOOLEAN(0, "group", &record.opts.group,
 		    "put the counters into a counter group"),
 	OPT_CALLBACK_NOOPT('g', NULL, &record.opts,
@@ -929,6 +1067,8 @@ struct option __record_options[] = {
 	OPT_CALLBACK('k', "clockid", &record.opts,
 	"clockid", "clockid to use for events, see clock_gettime()",
 	parse_clockid),
+	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
+			  "opts", "AUX area tracing Snapshot Mode", ""),
 	OPT_END()
 };
 
@@ -936,7 +1076,7 @@ struct option *record_options = __record_options;
 
 int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
 {
-	int err = -ENOMEM;
+	int err;
 	struct record *rec = &record;
 	char errbuf[BUFSIZ];
 
@@ -957,6 +1097,19 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
 		usage_with_options(record_usage, record_options);
 	}
 
+	if (!rec->itr) {
+		rec->itr = auxtrace_record__init(rec->evlist, &err);
+		if (err)
+			return err;
+	}
+
+	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
+					      rec->opts.auxtrace_snapshot_opts);
+	if (err)
+		return err;
+
+	err = -ENOMEM;
+
 	symbol__init(NULL);
 
 	if (symbol_conf.kptr_restrict)
@@ -1002,6 +1155,10 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
 	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
 		usage_with_options(record_usage, record_options);
 
+	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
+	if (err)
+		goto out_symbol_exit;
+
 	if (record_opts__config(&rec->opts)) {
 		err = -EINVAL;
 		goto out_symbol_exit;
@@ -1011,5 +1168,15 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
 out_symbol_exit:
 	perf_evlist__delete(rec->evlist);
 	symbol__exit();
+	auxtrace_record__free(rec->itr);
 	return err;
 }
+
+static void snapshot_sig_handler(int sig __maybe_unused)
+{
+	if (!auxtrace_snapshot_enabled)
+		return;
+	auxtrace_snapshot_enabled = 0;
+	auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr);
+	auxtrace_record__snapshot_started = 1;
+}
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index b63aeda719be0c..56025d90622f22 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -36,6 +36,8 @@
 #include "util/data.h"
 #include "arch/common.h"
 
+#include "util/auxtrace.h"
+
 #include <dlfcn.h>
 #include <linux/bitmap.h>
 
@@ -137,10 +139,12 @@ static int process_sample_event(struct perf_tool *tool,
 	struct report *rep = container_of(tool, struct report, tool);
 	struct addr_location al;
 	struct hist_entry_iter iter = {
-		.hide_unresolved = rep->hide_unresolved,
-		.add_entry_cb = hist_iter__report_callback,
+		.evsel 			= evsel,
+		.sample 		= sample,
+		.hide_unresolved 	= rep->hide_unresolved,
+		.add_entry_cb 		= hist_iter__report_callback,
 	};
-	int ret;
+	int ret = 0;
 
 	if (perf_event__preprocess_sample(event, machine, &al, sample) < 0) {
 		pr_debug("problem processing %d event, skipping it.\n",
@@ -149,10 +153,10 @@ static int process_sample_event(struct perf_tool *tool,
 	}
 
 	if (rep->hide_unresolved && al.sym == NULL)
-		return 0;
+		goto out_put;
 
 	if (rep->cpu_list && !test_bit(sample->cpu, rep->cpu_bitmap))
-		return 0;
+		goto out_put;
 
 	if (sort__mode == SORT_MODE__BRANCH)
 		iter.ops = &hist_iter_branch;
@@ -166,11 +170,11 @@ static int process_sample_event(struct perf_tool *tool,
 	if (al.map != NULL)
 		al.map->dso->hit = 1;
 
-	ret = hist_entry_iter__add(&iter, &al, evsel, sample, rep->max_stack,
-				   rep);
+	ret = hist_entry_iter__add(&iter, &al, rep->max_stack, rep);
 	if (ret < 0)
 		pr_debug("problem adding hist entry, skipping event\n");
-
+out_put:
+	addr_location__put(&al);
 	return ret;
 }
 
@@ -330,15 +334,14 @@ static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist,
 	}
 
 	if (sort_order == NULL &&
-	    parent_pattern == default_parent_pattern) {
+	    parent_pattern == default_parent_pattern)
 		fprintf(stdout, "#\n# (%s)\n#\n", help);
 
-		if (rep->show_threads) {
-			bool style = !strcmp(rep->pretty_printing_style, "raw");
-			perf_read_values_display(stdout, &rep->show_threads_values,
-						 style);
-			perf_read_values_destroy(&rep->show_threads_values);
-		}
+	if (rep->show_threads) {
+		bool style = !strcmp(rep->pretty_printing_style, "raw");
+		perf_read_values_display(stdout, &rep->show_threads_values,
+					 style);
+		perf_read_values_destroy(&rep->show_threads_values);
 	}
 
 	return 0;
@@ -585,6 +588,7 @@ parse_percent_limit(const struct option *opt, const char *str,
 int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
 {
 	struct perf_session *session;
+	struct itrace_synth_opts itrace_synth_opts = { .set = 0, };
 	struct stat st;
 	bool has_br_stack = false;
 	int branch_mode = -1;
@@ -607,6 +611,9 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
 			.attr		 = perf_event__process_attr,
 			.tracing_data	 = perf_event__process_tracing_data,
 			.build_id	 = perf_event__process_build_id,
+			.id_index	 = perf_event__process_id_index,
+			.auxtrace_info	 = perf_event__process_auxtrace_info,
+			.auxtrace	 = perf_event__process_auxtrace,
 			.ordered_events	 = true,
 			.ordering_requires_timestamps = true,
 		},
@@ -717,6 +724,9 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
 		     "Don't show entries under that percent", parse_percent_limit),
 	OPT_CALLBACK(0, "percentage", NULL, "relative|absolute",
 		     "how to display percentage of filtered entries", parse_filter_percentage),
+	OPT_CALLBACK_OPTARG(0, "itrace", &itrace_synth_opts, NULL, "opts",
+			    "Instruction Tracing options",
+			    itrace_parse_synth_opts),
 	OPT_END()
 	};
 	struct perf_data_file file = {
@@ -761,6 +771,8 @@ repeat:
 					       report.queue_size);
 	}
 
+	session->itrace_synth_opts = &itrace_synth_opts;
+
 	report.session = session;
 
 	has_br_stack = perf_header__has_feat(&session->header,
@@ -803,8 +815,8 @@ repeat:
 		goto error;
 	}
 
-	/* Force tty output for header output. */
-	if (report.header || report.header_only)
+	/* Force tty output for header output and per-thread stat. */
+	if (report.header || report.header_only || report.show_threads)
 		use_browser = 0;
 
 	if (strcmp(input_name, "-") != 0)
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 5275bab703138c..33962612a5e903 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -95,6 +95,7 @@ struct work_atoms {
 	u64			total_lat;
 	u64			nb_atoms;
 	u64			total_runtime;
+	int			num_merged;
 };
 
 typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
@@ -168,9 +169,10 @@ struct perf_sched {
 	u64		 all_runtime;
 	u64		 all_count;
 	u64		 cpu_last_switched[MAX_CPUS];
-	struct rb_root	 atom_root, sorted_atom_root;
+	struct rb_root	 atom_root, sorted_atom_root, merged_atom_root;
 	struct list_head sort_list, cmp_pid;
 	bool force;
+	bool skip_merge;
 };
 
 static u64 get_nsecs(void)
@@ -770,7 +772,7 @@ static int replay_fork_event(struct perf_sched *sched,
 	if (child == NULL || parent == NULL) {
 		pr_debug("thread does not exist on fork event: child %p, parent %p\n",
 				 child, parent);
-		return 0;
+		goto out_put;
 	}
 
 	if (verbose) {
@@ -781,6 +783,9 @@ static int replay_fork_event(struct perf_sched *sched,
 
 	register_pid(sched, parent->tid, thread__comm_str(parent));
 	register_pid(sched, child->tid, thread__comm_str(child));
+out_put:
+	thread__put(child);
+	thread__put(parent);
 	return 0;
 }
 
@@ -957,7 +962,7 @@ static int latency_switch_event(struct perf_sched *sched,
 	struct work_atoms *out_events, *in_events;
 	struct thread *sched_out, *sched_in;
 	u64 timestamp0, timestamp = sample->time;
-	int cpu = sample->cpu;
+	int cpu = sample->cpu, err = -1;
 	s64 delta;
 
 	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
@@ -976,15 +981,17 @@ static int latency_switch_event(struct perf_sched *sched,
 
 	sched_out = machine__findnew_thread(machine, -1, prev_pid);
 	sched_in = machine__findnew_thread(machine, -1, next_pid);
+	if (sched_out == NULL || sched_in == NULL)
+		goto out_put;
 
 	out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
 	if (!out_events) {
 		if (thread_atoms_insert(sched, sched_out))
-			return -1;
+			goto out_put;
 		out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
 		if (!out_events) {
 			pr_err("out-event: Internal tree error");
-			return -1;
+			goto out_put;
 		}
 	}
 	if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
@@ -993,22 +1000,25 @@ static int latency_switch_event(struct perf_sched *sched,
 	in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
 	if (!in_events) {
 		if (thread_atoms_insert(sched, sched_in))
-			return -1;
+			goto out_put;
 		in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
 		if (!in_events) {
 			pr_err("in-event: Internal tree error");
-			return -1;
+			goto out_put;
 		}
 		/*
 		 * Take came in we have not heard about yet,
 		 * add in an initial atom in runnable state:
 		 */
 		if (add_sched_out_event(in_events, 'R', timestamp))
-			return -1;
+			goto out_put;
 	}
 	add_sched_in_event(in_events, timestamp);
-
-	return 0;
+	err = 0;
+out_put:
+	thread__put(sched_out);
+	thread__put(sched_in);
+	return err;
 }
 
 static int latency_runtime_event(struct perf_sched *sched,
@@ -1021,23 +1031,29 @@ static int latency_runtime_event(struct perf_sched *sched,
 	struct thread *thread = machine__findnew_thread(machine, -1, pid);
 	struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
 	u64 timestamp = sample->time;
-	int cpu = sample->cpu;
+	int cpu = sample->cpu, err = -1;
+
+	if (thread == NULL)
+		return -1;
 
 	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
 	if (!atoms) {
 		if (thread_atoms_insert(sched, thread))
-			return -1;
+			goto out_put;
 		atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
 		if (!atoms) {
 			pr_err("in-event: Internal tree error");
-			return -1;
+			goto out_put;
 		}
 		if (add_sched_out_event(atoms, 'R', timestamp))
-			return -1;
+			goto out_put;
 	}
 
 	add_runtime_event(atoms, runtime, timestamp);
-	return 0;
+	err = 0;
+out_put:
+	thread__put(thread);
+	return err;
 }
 
 static int latency_wakeup_event(struct perf_sched *sched,
@@ -1050,19 +1066,22 @@ static int latency_wakeup_event(struct perf_sched *sched,
 	struct work_atom *atom;
 	struct thread *wakee;
 	u64 timestamp = sample->time;
+	int err = -1;
 
 	wakee = machine__findnew_thread(machine, -1, pid);
+	if (wakee == NULL)
+		return -1;
 	atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
 	if (!atoms) {
 		if (thread_atoms_insert(sched, wakee))
-			return -1;
+			goto out_put;
 		atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
 		if (!atoms) {
 			pr_err("wakeup-event: Internal tree error");
-			return -1;
+			goto out_put;
 		}
 		if (add_sched_out_event(atoms, 'S', timestamp))
-			return -1;
+			goto out_put;
 	}
 
 	BUG_ON(list_empty(&atoms->work_list));
@@ -1081,17 +1100,21 @@ static int latency_wakeup_event(struct perf_sched *sched,
 	 * skip in this case.
 	 */
 	if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
-		return 0;
+		goto out_ok;
 
 	sched->nr_timestamps++;
 	if (atom->sched_out_time > timestamp) {
 		sched->nr_unordered_timestamps++;
-		return 0;
+		goto out_ok;
 	}
 
 	atom->state = THREAD_WAIT_CPU;
 	atom->wake_up_time = timestamp;
-	return 0;
+out_ok:
+	err = 0;
+out_put:
+	thread__put(wakee);
+	return err;
 }
 
 static int latency_migrate_task_event(struct perf_sched *sched,
@@ -1104,6 +1127,7 @@ static int latency_migrate_task_event(struct perf_sched *sched,
 	struct work_atoms *atoms;
 	struct work_atom *atom;
 	struct thread *migrant;
+	int err = -1;
 
 	/*
 	 * Only need to worry about migration when profiling one CPU.
@@ -1112,18 +1136,20 @@ static int latency_migrate_task_event(struct perf_sched *sched,
 		return 0;
 
 	migrant = machine__findnew_thread(machine, -1, pid);
+	if (migrant == NULL)
+		return -1;
 	atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
 	if (!atoms) {
 		if (thread_atoms_insert(sched, migrant))
-			return -1;
+			goto out_put;
 		register_pid(sched, migrant->tid, thread__comm_str(migrant));
 		atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
 		if (!atoms) {
 			pr_err("migration-event: Internal tree error");
-			return -1;
+			goto out_put;
 		}
 		if (add_sched_out_event(atoms, 'R', timestamp))
-			return -1;
+			goto out_put;
 	}
 
 	BUG_ON(list_empty(&atoms->work_list));
@@ -1135,8 +1161,10 @@ static int latency_migrate_task_event(struct perf_sched *sched,
 
 	if (atom->sched_out_time > timestamp)
 		sched->nr_unordered_timestamps++;
-
-	return 0;
+	err = 0;
+out_put:
+	thread__put(migrant);
+	return err;
 }
 
 static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
@@ -1156,7 +1184,10 @@ static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_
 	sched->all_runtime += work_list->total_runtime;
 	sched->all_count   += work_list->nb_atoms;
 
-	ret = printf("  %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);
+	if (work_list->num_merged > 1)
+		ret = printf("  %s:(%d) ", thread__comm_str(work_list->thread), work_list->num_merged);
+	else
+		ret = printf("  %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);
 
 	for (i = 0; i < 24 - ret; i++)
 		printf(" ");
@@ -1276,17 +1307,22 @@ static int sort_dimension__add(const char *tok, struct list_head *list)
 static void perf_sched__sort_lat(struct perf_sched *sched)
 {
 	struct rb_node *node;
-
+	struct rb_root *root = &sched->atom_root;
+again:
 	for (;;) {
 		struct work_atoms *data;
-		node = rb_first(&sched->atom_root);
+		node = rb_first(root);
 		if (!node)
 			break;
 
-		rb_erase(node, &sched->atom_root);
+		rb_erase(node, root);
 		data = rb_entry(node, struct work_atoms, node);
 		__thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
 	}
+	if (root == &sched->atom_root) {
+		root = &sched->merged_atom_root;
+		goto again;
+	}
 }
 
 static int process_sched_wakeup_event(struct perf_tool *tool,
@@ -1330,8 +1366,10 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
 	}
 
 	sched_in = machine__findnew_thread(machine, -1, next_pid);
+	if (sched_in == NULL)
+		return -1;
 
-	sched->curr_thread[this_cpu] = sched_in;
+	sched->curr_thread[this_cpu] = thread__get(sched_in);
 
 	printf("  ");
 
@@ -1381,6 +1419,8 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
 		printf("\n");
 	}
 
+	thread__put(sched_in);
+
 	return 0;
 }
 
@@ -1542,6 +1582,59 @@ static void print_bad_events(struct perf_sched *sched)
 	}
 }
 
+static void __merge_work_atoms(struct rb_root *root, struct work_atoms *data)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+	struct work_atoms *this;
+	const char *comm = thread__comm_str(data->thread), *this_comm;
+
+	while (*new) {
+		int cmp;
+
+		this = container_of(*new, struct work_atoms, node);
+		parent = *new;
+
+		this_comm = thread__comm_str(this->thread);
+		cmp = strcmp(comm, this_comm);
+		if (cmp > 0) {
+			new = &((*new)->rb_left);
+		} else if (cmp < 0) {
+			new = &((*new)->rb_right);
+		} else {
+			this->num_merged++;
+			this->total_runtime += data->total_runtime;
+			this->nb_atoms += data->nb_atoms;
+			this->total_lat += data->total_lat;
+			list_splice(&data->work_list, &this->work_list);
+			if (this->max_lat < data->max_lat) {
+				this->max_lat = data->max_lat;
+				this->max_lat_at = data->max_lat_at;
+			}
+			zfree(&data);
+			return;
+		}
+	}
+
+	data->num_merged++;
+	rb_link_node(&data->node, parent, new);
+	rb_insert_color(&data->node, root);
+}
+
+static void perf_sched__merge_lat(struct perf_sched *sched)
+{
+	struct work_atoms *data;
+	struct rb_node *node;
+
+	if (sched->skip_merge)
+		return;
+
+	while ((node = rb_first(&sched->atom_root))) {
+		rb_erase(node, &sched->atom_root);
+		data = rb_entry(node, struct work_atoms, node);
+		__merge_work_atoms(&sched->merged_atom_root, data);
+	}
+}
+
 static int perf_sched__lat(struct perf_sched *sched)
 {
 	struct rb_node *next;
@@ -1551,6 +1644,7 @@ static int perf_sched__lat(struct perf_sched *sched)
 	if (perf_sched__read_events(sched))
 		return -1;
 
+	perf_sched__merge_lat(sched);
 	perf_sched__sort_lat(sched);
 
 	printf("\n -----------------------------------------------------------------------------------------------------------------\n");
@@ -1702,6 +1796,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
 		.profile_cpu	      = -1,
 		.next_shortname1      = 'A',
 		.next_shortname2      = '0',
+		.skip_merge           = 0,
 	};
 	const struct option latency_options[] = {
 	OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
@@ -1712,6 +1807,8 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
 		    "CPU to profile on"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
+	OPT_BOOLEAN('p', "pids", &sched.skip_merge,
+		    "latency stats per pid instead of per comm"),
 	OPT_END()
 	};
 	const struct option replay_options[] = {
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 58f10b8e6ff20d..24809787369f5a 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -16,6 +16,7 @@
 #include "util/evsel.h"
 #include "util/sort.h"
 #include "util/data.h"
+#include "util/auxtrace.h"
 #include <linux/bitmap.h>
 
 static char const		*script_name;
@@ -26,6 +27,7 @@ static u64			nr_unordered;
 static bool			no_callchain;
 static bool			latency_format;
 static bool			system_wide;
+static bool			print_flags;
 static const char		*cpu_list;
 static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
 
@@ -146,9 +148,10 @@ static const char *output_field2str(enum perf_output_field field)
 
 #define PRINT_FIELD(x)  (output[attr->type].fields & PERF_OUTPUT_##x)
 
-static int perf_evsel__check_stype(struct perf_evsel *evsel,
-				   u64 sample_type, const char *sample_msg,
-				   enum perf_output_field field)
+static int perf_evsel__do_check_stype(struct perf_evsel *evsel,
+				      u64 sample_type, const char *sample_msg,
+				      enum perf_output_field field,
+				      bool allow_user_set)
 {
 	struct perf_event_attr *attr = &evsel->attr;
 	int type = attr->type;
@@ -158,6 +161,8 @@ static int perf_evsel__check_stype(struct perf_evsel *evsel,
 		return 0;
 
 	if (output[type].user_set) {
+		if (allow_user_set)
+			return 0;
 		evname = perf_evsel__name(evsel);
 		pr_err("Samples for '%s' event do not have %s attribute set. "
 		       "Cannot print '%s' field.\n",
@@ -175,10 +180,22 @@ static int perf_evsel__check_stype(struct perf_evsel *evsel,
 	return 0;
 }
 
+static int perf_evsel__check_stype(struct perf_evsel *evsel,
+				   u64 sample_type, const char *sample_msg,
+				   enum perf_output_field field)
+{
+	return perf_evsel__do_check_stype(evsel, sample_type, sample_msg, field,
+					  false);
+}
+
 static int perf_evsel__check_attr(struct perf_evsel *evsel,
 				  struct perf_session *session)
 {
 	struct perf_event_attr *attr = &evsel->attr;
+	bool allow_user_set;
+
+	allow_user_set = perf_header__has_feat(&session->header,
+					       HEADER_AUXTRACE);
 
 	if (PRINT_FIELD(TRACE) &&
 		!perf_session__has_traces(session, "record -R"))
@@ -191,8 +208,8 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
 	}
 
 	if (PRINT_FIELD(ADDR) &&
-		perf_evsel__check_stype(evsel, PERF_SAMPLE_ADDR, "ADDR",
-					PERF_OUTPUT_ADDR))
+		perf_evsel__do_check_stype(evsel, PERF_SAMPLE_ADDR, "ADDR",
+					   PERF_OUTPUT_ADDR, allow_user_set))
 		return -EINVAL;
 
 	if (PRINT_FIELD(SYM) && !PRINT_FIELD(IP) && !PRINT_FIELD(ADDR)) {
@@ -229,8 +246,8 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
 		return -EINVAL;
 
 	if (PRINT_FIELD(CPU) &&
-		perf_evsel__check_stype(evsel, PERF_SAMPLE_CPU, "CPU",
-					PERF_OUTPUT_CPU))
+		perf_evsel__do_check_stype(evsel, PERF_SAMPLE_CPU, "CPU",
+					   PERF_OUTPUT_CPU, allow_user_set))
 		return -EINVAL;
 
 	if (PRINT_FIELD(PERIOD) &&
@@ -445,6 +462,25 @@ static void print_sample_bts(union perf_event *event,
 	printf("\n");
 }
 
+static void print_sample_flags(u32 flags)
+{
+	const char *chars = PERF_IP_FLAG_CHARS;
+	const int n = strlen(PERF_IP_FLAG_CHARS);
+	char str[33];
+	int i, pos = 0;
+
+	for (i = 0; i < n; i++, flags >>= 1) {
+		if (flags & 1)
+			str[pos++] = chars[i];
+	}
+	for (; i < 32; i++, flags >>= 1) {
+		if (flags & 1)
+			str[pos++] = '?';
+	}
+	str[pos] = 0;
+	printf("  %-4s ", str);
+}
+
 static void process_event(union perf_event *event, struct perf_sample *sample,
 			  struct perf_evsel *evsel, struct addr_location *al)
 {
@@ -464,6 +500,9 @@ static void process_event(union perf_event *event, struct perf_sample *sample,
 		printf("%s: ", evname ? evname : "[unknown]");
 	}
 
+	if (print_flags)
+		print_sample_flags(sample->flags);
+
 	if (is_bts_event(attr)) {
 		print_sample_bts(event, sample, evsel, thread, al);
 		return;
@@ -568,13 +607,14 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused,
 	}
 
 	if (al.filtered)
-		return 0;
+		goto out_put;
 
 	if (cpu_list && !test_bit(sample->cpu, cpu_bitmap))
-		return 0;
+		goto out_put;
 
 	scripting_ops->process_event(event, sample, evsel, &al);
-
+out_put:
+	addr_location__put(&al);
 	return 0;
 }
 
@@ -642,8 +682,8 @@ static int process_comm_event(struct perf_tool *tool,
 	print_sample_start(sample, thread, evsel);
 	perf_event__fprintf(event, stdout);
 	ret = 0;
-
 out:
+	thread__put(thread);
 	return ret;
 }
 
@@ -674,6 +714,7 @@ static int process_fork_event(struct perf_tool *tool,
 	}
 	print_sample_start(sample, thread, evsel);
 	perf_event__fprintf(event, stdout);
+	thread__put(thread);
 
 	return 0;
 }
@@ -682,6 +723,7 @@ static int process_exit_event(struct perf_tool *tool,
 			      struct perf_sample *sample,
 			      struct machine *machine)
 {
+	int err = 0;
 	struct thread *thread;
 	struct perf_script *script = container_of(tool, struct perf_script, tool);
 	struct perf_session *session = script->session;
@@ -703,9 +745,10 @@ static int process_exit_event(struct perf_tool *tool,
 	perf_event__fprintf(event, stdout);
 
 	if (perf_event__process_exit(tool, event, sample, machine) < 0)
-		return -1;
+		err = -1;
 
-	return 0;
+	thread__put(thread);
+	return err;
 }
 
 static int process_mmap_event(struct perf_tool *tool,
@@ -735,7 +778,7 @@ static int process_mmap_event(struct perf_tool *tool,
 	}
 	print_sample_start(sample, thread, evsel);
 	perf_event__fprintf(event, stdout);
-
+	thread__put(thread);
 	return 0;
 }
 
@@ -766,7 +809,7 @@ static int process_mmap2_event(struct perf_tool *tool,
 	}
 	print_sample_start(sample, thread, evsel);
 	perf_event__fprintf(event, stdout);
-
+	thread__put(thread);
 	return 0;
 }
 
@@ -999,12 +1042,15 @@ static int parse_output_fields(const struct option *opt __maybe_unused,
 		}
 	}
 
-	tok = strtok(tok, ",");
-	while (tok) {
+	for (tok = strtok(tok, ","); tok; tok = strtok(NULL, ",")) {
 		for (i = 0; i < imax; ++i) {
 			if (strcmp(tok, all_output_options[i].str) == 0)
 				break;
 		}
+		if (i == imax && strcmp(tok, "flags") == 0) {
+			print_flags = true;
+			continue;
+		}
 		if (i == imax) {
 			fprintf(stderr, "Invalid field requested.\n");
 			rc = -EINVAL;
@@ -1032,8 +1078,6 @@ static int parse_output_fields(const struct option *opt __maybe_unused,
 			}
 			output[type].fields |= all_output_options[i].field;
 		}
-
-		tok = strtok(NULL, ",");
 	}
 
 	if (type >= 0) {
@@ -1497,6 +1541,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 	char *rec_script_path = NULL;
 	char *rep_script_path = NULL;
 	struct perf_session *session;
+	struct itrace_synth_opts itrace_synth_opts = { .set = false, };
 	char *script_path = NULL;
 	const char **__argv;
 	int i, j, err = 0;
@@ -1511,6 +1556,10 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 			.attr		 = process_attr,
 			.tracing_data	 = perf_event__process_tracing_data,
 			.build_id	 = perf_event__process_build_id,
+			.id_index	 = perf_event__process_id_index,
+			.auxtrace_info	 = perf_event__process_auxtrace_info,
+			.auxtrace	 = perf_event__process_auxtrace,
+			.auxtrace_error	 = perf_event__process_auxtrace_error,
 			.ordered_events	 = true,
 			.ordering_requires_timestamps = true,
 		},
@@ -1549,7 +1598,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 		     "comma separated output fields prepend with 'type:'. "
 		     "Valid types: hw,sw,trace,raw. "
 		     "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,"
-		     "addr,symoff,period", parse_output_fields),
+		     "addr,symoff,period,flags", parse_output_fields),
 	OPT_BOOLEAN('a', "all-cpus", &system_wide,
 		    "system-wide collection from all CPUs"),
 	OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]",
@@ -1570,6 +1619,9 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 	OPT_BOOLEAN('\0', "show-mmap-events", &script.show_mmap_events,
 		    "Show the mmap events"),
 	OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
+	OPT_CALLBACK_OPTARG(0, "itrace", &itrace_synth_opts, NULL, "opts",
+			    "Instruction Tracing options",
+			    itrace_parse_synth_opts),
 	OPT_END()
 	};
 	const char * const script_subcommands[] = { "record", "report", NULL };
@@ -1765,6 +1817,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 
 	script.session = session;
 
+	session->itrace_synth_opts = &itrace_synth_opts;
+
 	if (cpu_list) {
 		err = perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap);
 		if (err < 0)
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index f7b8218785f6fa..fd577f725d2314 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -247,21 +247,50 @@ out_free:
 	return -1;
 }
 
+enum {
+	CTX_BIT_USER	= 1 << 0,
+	CTX_BIT_KERNEL	= 1 << 1,
+	CTX_BIT_HV	= 1 << 2,
+	CTX_BIT_HOST	= 1 << 3,
+	CTX_BIT_IDLE	= 1 << 4,
+	CTX_BIT_MAX	= 1 << 5,
+};
+
+#define NUM_CTX CTX_BIT_MAX
+
 static struct stats runtime_nsecs_stats[MAX_NR_CPUS];
-static struct stats runtime_cycles_stats[MAX_NR_CPUS];
-static struct stats runtime_stalled_cycles_front_stats[MAX_NR_CPUS];
-static struct stats runtime_stalled_cycles_back_stats[MAX_NR_CPUS];
-static struct stats runtime_branches_stats[MAX_NR_CPUS];
-static struct stats runtime_cacherefs_stats[MAX_NR_CPUS];
-static struct stats runtime_l1_dcache_stats[MAX_NR_CPUS];
-static struct stats runtime_l1_icache_stats[MAX_NR_CPUS];
-static struct stats runtime_ll_cache_stats[MAX_NR_CPUS];
-static struct stats runtime_itlb_cache_stats[MAX_NR_CPUS];
-static struct stats runtime_dtlb_cache_stats[MAX_NR_CPUS];
-static struct stats runtime_cycles_in_tx_stats[MAX_NR_CPUS];
+static struct stats runtime_cycles_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_stalled_cycles_front_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_stalled_cycles_back_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_branches_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_cacherefs_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_l1_dcache_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_l1_icache_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_ll_cache_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_itlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_dtlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_cycles_in_tx_stats[NUM_CTX][MAX_NR_CPUS];
 static struct stats walltime_nsecs_stats;
-static struct stats runtime_transaction_stats[MAX_NR_CPUS];
-static struct stats runtime_elision_stats[MAX_NR_CPUS];
+static struct stats runtime_transaction_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_elision_stats[NUM_CTX][MAX_NR_CPUS];
+
+static int evsel_context(struct perf_evsel *evsel)
+{
+	int ctx = 0;
+
+	if (evsel->attr.exclude_kernel)
+		ctx |= CTX_BIT_KERNEL;
+	if (evsel->attr.exclude_user)
+		ctx |= CTX_BIT_USER;
+	if (evsel->attr.exclude_hv)
+		ctx |= CTX_BIT_HV;
+	if (evsel->attr.exclude_host)
+		ctx |= CTX_BIT_HOST;
+	if (evsel->attr.exclude_idle)
+		ctx |= CTX_BIT_IDLE;
+
+	return ctx;
+}
 
 static void perf_stat__reset_stats(struct perf_evlist *evlist)
 {
@@ -356,37 +385,39 @@ static struct perf_evsel *nth_evsel(int n)
 static void update_shadow_stats(struct perf_evsel *counter, u64 *count,
 				int cpu)
 {
+	int ctx = evsel_context(counter);
+
 	if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK))
 		update_stats(&runtime_nsecs_stats[cpu], count[0]);
 	else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
-		update_stats(&runtime_cycles_stats[cpu], count[0]);
+		update_stats(&runtime_cycles_stats[ctx][cpu], count[0]);
 	else if (transaction_run &&
 		 perf_evsel__cmp(counter, nth_evsel(T_CYCLES_IN_TX)))
-		update_stats(&runtime_cycles_in_tx_stats[cpu], count[0]);
+		update_stats(&runtime_transaction_stats[ctx][cpu], count[0]);
 	else if (transaction_run &&
 		 perf_evsel__cmp(counter, nth_evsel(T_TRANSACTION_START)))
-		update_stats(&runtime_transaction_stats[cpu], count[0]);
+		update_stats(&runtime_transaction_stats[ctx][cpu], count[0]);
 	else if (transaction_run &&
 		 perf_evsel__cmp(counter, nth_evsel(T_ELISION_START)))
-		update_stats(&runtime_elision_stats[cpu], count[0]);
+		update_stats(&runtime_elision_stats[ctx][cpu], count[0]);
 	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
-		update_stats(&runtime_stalled_cycles_front_stats[cpu], count[0]);
+		update_stats(&runtime_stalled_cycles_front_stats[ctx][cpu], count[0]);
 	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
-		update_stats(&runtime_stalled_cycles_back_stats[cpu], count[0]);
+		update_stats(&runtime_stalled_cycles_back_stats[ctx][cpu], count[0]);
 	else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
-		update_stats(&runtime_branches_stats[cpu], count[0]);
+		update_stats(&runtime_branches_stats[ctx][cpu], count[0]);
 	else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
-		update_stats(&runtime_cacherefs_stats[cpu], count[0]);
+		update_stats(&runtime_cacherefs_stats[ctx][cpu], count[0]);
 	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
-		update_stats(&runtime_l1_dcache_stats[cpu], count[0]);
+		update_stats(&runtime_l1_dcache_stats[ctx][cpu], count[0]);
 	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I))
-		update_stats(&runtime_l1_icache_stats[cpu], count[0]);
+		update_stats(&runtime_ll_cache_stats[ctx][cpu], count[0]);
 	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL))
-		update_stats(&runtime_ll_cache_stats[cpu], count[0]);
+		update_stats(&runtime_ll_cache_stats[ctx][cpu], count[0]);
 	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB))
-		update_stats(&runtime_dtlb_cache_stats[cpu], count[0]);
+		update_stats(&runtime_dtlb_cache_stats[ctx][cpu], count[0]);
 	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
-		update_stats(&runtime_itlb_cache_stats[cpu], count[0]);
+		update_stats(&runtime_itlb_cache_stats[ctx][cpu], count[0]);
 }
 
 static void zero_per_pkg(struct perf_evsel *counter)
@@ -908,8 +939,9 @@ static void print_stalled_cycles_frontend(int cpu,
 {
 	double total, ratio = 0.0;
 	const char *color;
+	int ctx = evsel_context(evsel);
 
-	total = avg_stats(&runtime_cycles_stats[cpu]);
+	total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -927,8 +959,9 @@ static void print_stalled_cycles_backend(int cpu,
 {
 	double total, ratio = 0.0;
 	const char *color;
+	int ctx = evsel_context(evsel);
 
-	total = avg_stats(&runtime_cycles_stats[cpu]);
+	total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -946,8 +979,9 @@ static void print_branch_misses(int cpu,
 {
 	double total, ratio = 0.0;
 	const char *color;
+	int ctx = evsel_context(evsel);
 
-	total = avg_stats(&runtime_branches_stats[cpu]);
+	total = avg_stats(&runtime_branches_stats[ctx][cpu]);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -965,8 +999,9 @@ static void print_l1_dcache_misses(int cpu,
 {
 	double total, ratio = 0.0;
 	const char *color;
+	int ctx = evsel_context(evsel);
 
-	total = avg_stats(&runtime_l1_dcache_stats[cpu]);
+	total = avg_stats(&runtime_l1_dcache_stats[ctx][cpu]);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -984,8 +1019,9 @@ static void print_l1_icache_misses(int cpu,
 {
 	double total, ratio = 0.0;
 	const char *color;
+	int ctx = evsel_context(evsel);
 
-	total = avg_stats(&runtime_l1_icache_stats[cpu]);
+	total = avg_stats(&runtime_l1_icache_stats[ctx][cpu]);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -1003,8 +1039,9 @@ static void print_dtlb_cache_misses(int cpu,
 {
 	double total, ratio = 0.0;
 	const char *color;
+	int ctx = evsel_context(evsel);
 
-	total = avg_stats(&runtime_dtlb_cache_stats[cpu]);
+	total = avg_stats(&runtime_dtlb_cache_stats[ctx][cpu]);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -1022,8 +1059,9 @@ static void print_itlb_cache_misses(int cpu,
 {
 	double total, ratio = 0.0;
 	const char *color;
+	int ctx = evsel_context(evsel);
 
-	total = avg_stats(&runtime_itlb_cache_stats[cpu]);
+	total = avg_stats(&runtime_itlb_cache_stats[ctx][cpu]);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -1041,8 +1079,9 @@ static void print_ll_cache_misses(int cpu,
 {
 	double total, ratio = 0.0;
 	const char *color;
+	int ctx = evsel_context(evsel);
 
-	total = avg_stats(&runtime_ll_cache_stats[cpu]);
+	total = avg_stats(&runtime_ll_cache_stats[ctx][cpu]);
 
 	if (total)
 		ratio = avg / total * 100.0;
@@ -1060,6 +1099,7 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 	double sc =  evsel->scale;
 	const char *fmt;
 	int cpu = cpu_map__id_to_cpu(id);
+	int ctx = evsel_context(evsel);
 
 	if (csv_output) {
 		fmt = sc != 1.0 ?  "%.2f%s" : "%.0f%s";
@@ -1091,15 +1131,15 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 		return;
 
 	if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
-		total = avg_stats(&runtime_cycles_stats[cpu]);
+		total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
 		if (total) {
 			ratio = avg / total;
 			fprintf(output, " #   %5.2f  insns per cycle        ", ratio);
 		} else {
 			fprintf(output, "                                   ");
 		}
-		total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]);
-		total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu]));
+		total = avg_stats(&runtime_stalled_cycles_front_stats[ctx][cpu]);
+		total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[ctx][cpu]));
 
 		if (total && avg) {
 			ratio = total / avg;
@@ -1110,46 +1150,46 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 		}
 
 	} else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) &&
-			runtime_branches_stats[cpu].n != 0) {
+			runtime_branches_stats[ctx][cpu].n != 0) {
 		print_branch_misses(cpu, evsel, avg);
 	} else if (
 		evsel->attr.type == PERF_TYPE_HW_CACHE &&
 		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1D |
 					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-			runtime_l1_dcache_stats[cpu].n != 0) {
+			runtime_l1_dcache_stats[ctx][cpu].n != 0) {
 		print_l1_dcache_misses(cpu, evsel, avg);
 	} else if (
 		evsel->attr.type == PERF_TYPE_HW_CACHE &&
 		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_L1I |
 					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-			runtime_l1_icache_stats[cpu].n != 0) {
+			runtime_l1_icache_stats[ctx][cpu].n != 0) {
 		print_l1_icache_misses(cpu, evsel, avg);
 	} else if (
 		evsel->attr.type == PERF_TYPE_HW_CACHE &&
 		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_DTLB |
 					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-			runtime_dtlb_cache_stats[cpu].n != 0) {
+			runtime_dtlb_cache_stats[ctx][cpu].n != 0) {
 		print_dtlb_cache_misses(cpu, evsel, avg);
 	} else if (
 		evsel->attr.type == PERF_TYPE_HW_CACHE &&
 		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_ITLB |
 					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-			runtime_itlb_cache_stats[cpu].n != 0) {
+			runtime_itlb_cache_stats[ctx][cpu].n != 0) {
 		print_itlb_cache_misses(cpu, evsel, avg);
 	} else if (
 		evsel->attr.type == PERF_TYPE_HW_CACHE &&
 		evsel->attr.config ==  ( PERF_COUNT_HW_CACHE_LL |
 					((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
 					((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
-			runtime_ll_cache_stats[cpu].n != 0) {
+			runtime_ll_cache_stats[ctx][cpu].n != 0) {
 		print_ll_cache_misses(cpu, evsel, avg);
 	} else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) &&
-			runtime_cacherefs_stats[cpu].n != 0) {
-		total = avg_stats(&runtime_cacherefs_stats[cpu]);
+			runtime_cacherefs_stats[ctx][cpu].n != 0) {
+		total = avg_stats(&runtime_cacherefs_stats[ctx][cpu]);
 
 		if (total)
 			ratio = avg * 100 / total;
@@ -1171,15 +1211,15 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 		}
 	} else if (transaction_run &&
 		   perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX))) {
-		total = avg_stats(&runtime_cycles_stats[cpu]);
+		total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
 		if (total)
 			fprintf(output,
 				" #   %5.2f%% transactional cycles   ",
 				100.0 * (avg / total));
 	} else if (transaction_run &&
 		   perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX_CP))) {
-		total = avg_stats(&runtime_cycles_stats[cpu]);
-		total2 = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
+		total = avg_stats(&runtime_cycles_stats[ctx][cpu]);
+		total2 = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
 		if (total2 < avg)
 			total2 = avg;
 		if (total)
@@ -1189,8 +1229,8 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 	} else if (transaction_run &&
 		   perf_evsel__cmp(evsel, nth_evsel(T_TRANSACTION_START)) &&
 		   avg > 0 &&
-		   runtime_cycles_in_tx_stats[cpu].n != 0) {
-		total = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
+		   runtime_cycles_in_tx_stats[ctx][cpu].n != 0) {
+		total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
 
 		if (total)
 			ratio = total / avg;
@@ -1199,8 +1239,8 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 	} else if (transaction_run &&
 		   perf_evsel__cmp(evsel, nth_evsel(T_ELISION_START)) &&
 		   avg > 0 &&
-		   runtime_cycles_in_tx_stats[cpu].n != 0) {
-		total = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
+		   runtime_cycles_in_tx_stats[ctx][cpu].n != 0) {
+		total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]);
 
 		if (total)
 			ratio = total / avg;
@@ -1541,7 +1581,7 @@ static int setup_events(const char * const *attrs, unsigned len)
 	unsigned i;
 
 	for (i = 0; i < len; i++) {
-		if (parse_events(evsel_list, attrs[i]))
+		if (parse_events(evsel_list, attrs[i], NULL))
 			return -1;
 	}
 	return 0;
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index e50fe1187b0ba2..30e59620179dae 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -61,13 +61,13 @@ struct timechart {
 				tasks_only,
 				with_backtrace,
 				topology;
+	bool			force;
 	/* IO related settings */
-	u64			io_events;
 	bool			io_only,
 				skip_eagain;
+	u64			io_events;
 	u64			min_time,
 				merge_dist;
-	bool			force;
 };
 
 struct per_pidcomm;
@@ -523,7 +523,7 @@ static const char *cat_backtrace(union perf_event *event,
 				 * Discard all.
 				 */
 				zfree(&p);
-				goto exit;
+				goto exit_put;
 			}
 			continue;
 		}
@@ -538,7 +538,8 @@ static const char *cat_backtrace(union perf_event *event,
 		else
 			fprintf(f, "..... %016" PRIx64 "\n", ip);
 	}
-
+exit_put:
+	addr_location__put(&al);
 exit:
 	fclose(f);
 
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 6a4d5d41c671d0..6b987424d01526 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -775,7 +775,9 @@ static void perf_event__process_sample(struct perf_tool *tool,
 	if (al.sym == NULL || !al.sym->ignore) {
 		struct hists *hists = evsel__hists(evsel);
 		struct hist_entry_iter iter = {
-			.add_entry_cb = hist_iter__top_callback,
+			.evsel		= evsel,
+			.sample 	= sample,
+			.add_entry_cb 	= hist_iter__top_callback,
 		};
 
 		if (symbol_conf.cumulate_callchain)
@@ -785,15 +787,14 @@ static void perf_event__process_sample(struct perf_tool *tool,
 
 		pthread_mutex_lock(&hists->lock);
 
-		err = hist_entry_iter__add(&iter, &al, evsel, sample,
-					   top->max_stack, top);
+		err = hist_entry_iter__add(&iter, &al, top->max_stack, top);
 		if (err < 0)
 			pr_err("Problem incrementing symbol period, skipping event\n");
 
 		pthread_mutex_unlock(&hists->lock);
 	}
 
-	return;
+	addr_location__put(&al);
 }
 
 static void perf_top__mmap_read_idx(struct perf_top *top, int idx)
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index e122970361f21a..a05490d06374d3 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -16,7 +16,6 @@
 
 #include <libaudit.h>
 #include <stdlib.h>
-#include <sys/eventfd.h>
 #include <sys/mman.h>
 #include <linux/futex.h>
 
@@ -41,6 +40,34 @@
 # define EFD_SEMAPHORE		1
 #endif
 
+#ifndef EFD_NONBLOCK
+# define EFD_NONBLOCK		00004000
+#endif
+
+#ifndef EFD_CLOEXEC
+# define EFD_CLOEXEC		02000000
+#endif
+
+#ifndef O_CLOEXEC
+# define O_CLOEXEC		02000000
+#endif
+
+#ifndef SOCK_DCCP
+# define SOCK_DCCP		6
+#endif
+
+#ifndef SOCK_CLOEXEC
+# define SOCK_CLOEXEC		02000000
+#endif
+
+#ifndef SOCK_NONBLOCK
+# define SOCK_NONBLOCK		00004000
+#endif
+
+#ifndef MSG_CMSG_CLOEXEC
+# define MSG_CMSG_CLOEXEC	0x40000000
+#endif
+
 struct tp_field {
 	int offset;
 	union {
@@ -1712,7 +1739,7 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
 	void *args;
 	size_t printed = 0;
 	struct thread *thread;
-	int id = perf_evsel__sc_tp_uint(evsel, id, sample);
+	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
 	struct thread_trace *ttrace;
 
@@ -1725,14 +1752,14 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
 	ttrace = thread__trace(thread, trace->output);
 	if (ttrace == NULL)
-		return -1;
+		goto out_put;
 
 	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
 
 	if (ttrace->entry_str == NULL) {
 		ttrace->entry_str = malloc(1024);
 		if (!ttrace->entry_str)
-			return -1;
+			goto out_put;
 	}
 
 	if (!trace->summary_only)
@@ -1757,8 +1784,10 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
 		thread__put(trace->current);
 		trace->current = thread__get(thread);
 	}
-
-	return 0;
+	err = 0;
+out_put:
+	thread__put(thread);
+	return err;
 }
 
 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
@@ -1768,7 +1797,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
 	long ret;
 	u64 duration = 0;
 	struct thread *thread;
-	int id = perf_evsel__sc_tp_uint(evsel, id, sample);
+	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
 	struct thread_trace *ttrace;
 
@@ -1781,7 +1810,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
 	ttrace = thread__trace(thread, trace->output);
 	if (ttrace == NULL)
-		return -1;
+		goto out_put;
 
 	if (trace->summary)
 		thread__update_stats(ttrace, id, sample);
@@ -1835,8 +1864,10 @@ signed_print:
 	fputc('\n', trace->output);
 out:
 	ttrace->entry_pending = false;
-
-	return 0;
+	err = 0;
+out_put:
+	thread__put(thread);
+	return err;
 }
 
 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
@@ -1863,6 +1894,7 @@ static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evs
 
 	ttrace->runtime_ms += runtime_ms;
 	trace->runtime_ms += runtime_ms;
+	thread__put(thread);
 	return 0;
 
 out_dump:
@@ -1872,6 +1904,7 @@ out_dump:
 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
 	       runtime,
 	       perf_evsel__intval(evsel, sample, "vruntime"));
+	thread__put(thread);
 	return 0;
 }
 
@@ -1924,11 +1957,12 @@ static int trace__pgfault(struct trace *trace,
 	struct addr_location al;
 	char map_type = 'd';
 	struct thread_trace *ttrace;
+	int err = -1;
 
 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
 	ttrace = thread__trace(thread, trace->output);
 	if (ttrace == NULL)
-		return -1;
+		goto out_put;
 
 	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
 		ttrace->pfmaj++;
@@ -1936,7 +1970,7 @@ static int trace__pgfault(struct trace *trace,
 		ttrace->pfmin++;
 
 	if (trace->summary_only)
-		return 0;
+		goto out;
 
 	thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
 			      sample->ip, &al);
@@ -1967,8 +2001,11 @@ static int trace__pgfault(struct trace *trace,
 	print_location(trace->output, sample, &al, true, false);
 
 	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
-
-	return 0;
+out:
+	err = 0;
+out_put:
+	thread__put(thread);
+	return err;
 }
 
 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
@@ -2666,16 +2703,15 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
 		    "show the thread COMM next to its id"),
 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
-	OPT_STRING('e', "expr", &ev_qualifier_str, "expr",
-		    "list of events to trace"),
+	OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
 		    "trace events on existing process id"),
 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
 		    "trace events on existing thread id"),
-	OPT_CALLBACK(0, "filter-pids", &trace, "float",
-		     "show only events with duration > N.M ms", trace__set_filter_pids),
+	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
+		     "pids to filter (by the kernel)", trace__set_filter_pids),
 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
 		    "system-wide collection from all CPUs"),
 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
@@ -2712,11 +2748,10 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
 	signal(SIGFPE, sighandler_dump_stack);
 
 	trace.evlist = perf_evlist__new();
-	if (trace.evlist == NULL)
-		return -ENOMEM;
 
 	if (trace.evlist == NULL) {
 		pr_err("Not enough memory to run!\n");
+		err = -ENOMEM;
 		goto out;
 	}
 
diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile
index 59a98c64324038..317001c946608b 100644
--- a/tools/perf/config/Makefile
+++ b/tools/perf/config/Makefile
@@ -32,7 +32,7 @@ ifeq ($(ARCH),x86)
     LIBUNWIND_LIBS = -lunwind -lunwind-x86_64
     $(call detected,CONFIG_X86_64)
   else
-    LIBUNWIND_LIBS = -lunwind -lunwind-x86
+    LIBUNWIND_LIBS = -lunwind-x86 -llzma -lunwind
   endif
   NO_PERF_REGS := 0
 endif
@@ -130,6 +130,8 @@ endif
 
 ifeq ($(DEBUG),0)
   CFLAGS += -O6
+else
+  CFLAGS += $(call cc-option,-Og,-O0)
 endif
 
 ifdef PARSER_DEBUG
@@ -268,6 +270,10 @@ else
   endif # libelf support
 endif # NO_LIBELF
 
+ifdef NO_DWARF
+  NO_LIBDW_DWARF_UNWIND := 1
+endif
+
 ifndef NO_LIBELF
   CFLAGS += -DHAVE_LIBELF_SUPPORT
   EXTLIBS += -lelf
@@ -610,6 +616,11 @@ ifdef LIBBABELTRACE
   endif
 endif
 
+ifndef NO_AUXTRACE
+  $(call detected,CONFIG_AUXTRACE)
+  CFLAGS += -DHAVE_AUXTRACE_SUPPORT
+endif
+
 # Among the variables below, these:
 #   perfexecdir
 #   template_dir
diff --git a/tools/perf/config/utilities.mak b/tools/perf/config/utilities.mak
index c16ce833079c0a..0ebef09c0842f8 100644
--- a/tools/perf/config/utilities.mak
+++ b/tools/perf/config/utilities.mak
@@ -177,3 +177,22 @@ $(if $($(1)),$(call _ge_attempt,$($(1)),$(1)),$(call _ge_attempt,$(2)))
 endef
 _ge_attempt = $(if $(get-executable),$(get-executable),$(call _gea_err,$(2)))
 _gea_err  = $(if $(1),$(error Please set '$(1)' appropriately))
+
+# try-run
+# Usage: option = $(call try-run, $(CC)...-o "$$TMP",option-ok,otherwise)
+# Exit code chooses option. "$$TMP" is can be used as temporary file and
+# is automatically cleaned up.
+try-run = $(shell set -e;		\
+	TMP="$(TMPOUT).$$$$.tmp";	\
+	TMPO="$(TMPOUT).$$$$.o";	\
+	if ($(1)) >/dev/null 2>&1;	\
+	then echo "$(2)";		\
+	else echo "$(3)";		\
+	fi;				\
+	rm -f "$$TMP" "$$TMPO")
+
+# cc-option
+# Usage: cflags-y += $(call cc-option,-march=winchip-c6,-march=i586)
+
+cc-option = $(call try-run,\
+	$(CC) $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2))
diff --git a/tools/perf/perf-sys.h b/tools/perf/perf-sys.h
index 6ef68165c9db62..83a25cef82fdd2 100644
--- a/tools/perf/perf-sys.h
+++ b/tools/perf/perf-sys.h
@@ -6,11 +6,9 @@
 #include <sys/syscall.h>
 #include <linux/types.h>
 #include <linux/perf_event.h>
+#include <asm/barrier.h>
 
 #if defined(__i386__)
-#define mb()		asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
-#define wmb()		asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
-#define rmb()		asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
 #define cpu_relax()	asm volatile("rep; nop" ::: "memory");
 #define CPUINFO_PROC	{"model name"}
 #ifndef __NR_perf_event_open
@@ -25,9 +23,6 @@
 #endif
 
 #if defined(__x86_64__)
-#define mb()		asm volatile("mfence" ::: "memory")
-#define wmb()		asm volatile("sfence" ::: "memory")
-#define rmb()		asm volatile("lfence" ::: "memory")
 #define cpu_relax()	asm volatile("rep; nop" ::: "memory");
 #define CPUINFO_PROC	{"model name"}
 #ifndef __NR_perf_event_open
@@ -43,129 +38,63 @@
 
 #ifdef __powerpc__
 #include "../../arch/powerpc/include/uapi/asm/unistd.h"
-#define mb()		asm volatile ("sync" ::: "memory")
-#define wmb()		asm volatile ("sync" ::: "memory")
-#define rmb()		asm volatile ("sync" ::: "memory")
 #define CPUINFO_PROC	{"cpu"}
 #endif
 
 #ifdef __s390__
-#define mb()		asm volatile("bcr 15,0" ::: "memory")
-#define wmb()		asm volatile("bcr 15,0" ::: "memory")
-#define rmb()		asm volatile("bcr 15,0" ::: "memory")
 #define CPUINFO_PROC	{"vendor_id"}
 #endif
 
 #ifdef __sh__
-#if defined(__SH4A__) || defined(__SH5__)
-# define mb()		asm volatile("synco" ::: "memory")
-# define wmb()		asm volatile("synco" ::: "memory")
-# define rmb()		asm volatile("synco" ::: "memory")
-#else
-# define mb()		asm volatile("" ::: "memory")
-# define wmb()		asm volatile("" ::: "memory")
-# define rmb()		asm volatile("" ::: "memory")
-#endif
 #define CPUINFO_PROC	{"cpu type"}
 #endif
 
 #ifdef __hppa__
-#define mb()		asm volatile("" ::: "memory")
-#define wmb()		asm volatile("" ::: "memory")
-#define rmb()		asm volatile("" ::: "memory")
 #define CPUINFO_PROC	{"cpu"}
 #endif
 
 #ifdef __sparc__
-#ifdef __LP64__
-#define mb()		asm volatile("ba,pt %%xcc, 1f\n"	\
-				     "membar #StoreLoad\n"	\
-				     "1:\n":::"memory")
-#else
-#define mb()		asm volatile("":::"memory")
-#endif
-#define wmb()		asm volatile("":::"memory")
-#define rmb()		asm volatile("":::"memory")
 #define CPUINFO_PROC	{"cpu"}
 #endif
 
 #ifdef __alpha__
-#define mb()		asm volatile("mb" ::: "memory")
-#define wmb()		asm volatile("wmb" ::: "memory")
-#define rmb()		asm volatile("mb" ::: "memory")
 #define CPUINFO_PROC	{"cpu model"}
 #endif
 
 #ifdef __ia64__
-#define mb()		asm volatile ("mf" ::: "memory")
-#define wmb()		asm volatile ("mf" ::: "memory")
-#define rmb()		asm volatile ("mf" ::: "memory")
 #define cpu_relax()	asm volatile ("hint @pause" ::: "memory")
 #define CPUINFO_PROC	{"model name"}
 #endif
 
 #ifdef __arm__
-/*
- * Use the __kuser_memory_barrier helper in the CPU helper page. See
- * arch/arm/kernel/entry-armv.S in the kernel source for details.
- */
-#define mb()		((void(*)(void))0xffff0fa0)()
-#define wmb()		((void(*)(void))0xffff0fa0)()
-#define rmb()		((void(*)(void))0xffff0fa0)()
 #define CPUINFO_PROC	{"model name", "Processor"}
 #endif
 
 #ifdef __aarch64__
-#define mb()		asm volatile("dmb ish" ::: "memory")
-#define wmb()		asm volatile("dmb ishst" ::: "memory")
-#define rmb()		asm volatile("dmb ishld" ::: "memory")
 #define cpu_relax()	asm volatile("yield" ::: "memory")
 #endif
 
 #ifdef __mips__
-#define mb()		asm volatile(					\
-				".set	mips2\n\t"			\
-				"sync\n\t"				\
-				".set	mips0"				\
-				: /* no output */			\
-				: /* no input */			\
-				: "memory")
-#define wmb()	mb()
-#define rmb()	mb()
 #define CPUINFO_PROC	{"cpu model"}
 #endif
 
 #ifdef __arc__
-#define mb()		asm volatile("" ::: "memory")
-#define wmb()		asm volatile("" ::: "memory")
-#define rmb()		asm volatile("" ::: "memory")
 #define CPUINFO_PROC	{"Processor"}
 #endif
 
 #ifdef __metag__
-#define mb()		asm volatile("" ::: "memory")
-#define wmb()		asm volatile("" ::: "memory")
-#define rmb()		asm volatile("" ::: "memory")
 #define CPUINFO_PROC	{"CPU"}
 #endif
 
 #ifdef __xtensa__
-#define mb()		asm volatile("memw" ::: "memory")
-#define wmb()		asm volatile("memw" ::: "memory")
-#define rmb()		asm volatile("" ::: "memory")
 #define CPUINFO_PROC	{"core ID"}
 #endif
 
 #ifdef __tile__
-#define mb()		asm volatile ("mf" ::: "memory")
-#define wmb()		asm volatile ("mf" ::: "memory")
-#define rmb()		asm volatile ("mf" ::: "memory")
 #define cpu_relax()	asm volatile ("mfspr zero, PASS" ::: "memory")
 #define CPUINFO_PROC    {"model name"}
 #endif
 
-#define barrier() asm volatile ("" ::: "memory")
-
 #ifndef cpu_relax
 #define cpu_relax() barrier()
 #endif
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index e14bb637255cc3..aa79fb8a16d403 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -54,12 +54,17 @@ struct record_opts {
 	bool	     period;
 	bool	     sample_intr_regs;
 	bool	     running_time;
+	bool	     full_auxtrace;
+	bool	     auxtrace_snapshot_mode;
 	unsigned int freq;
 	unsigned int mmap_pages;
+	unsigned int auxtrace_mmap_pages;
 	unsigned int user_freq;
 	u64          branch_stack;
 	u64	     default_interval;
 	u64	     user_interval;
+	size_t	     auxtrace_snapshot_size;
+	const char   *auxtrace_snapshot_opts;
 	bool	     sample_transaction;
 	unsigned     initial_delay;
 	bool         use_clockid;
diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build
index 6a8801b3201701..ee41e705b2eba7 100644
--- a/tools/perf/tests/Build
+++ b/tools/perf/tests/Build
@@ -3,9 +3,9 @@ perf-y += parse-events.o
 perf-y += dso-data.o
 perf-y += attr.o
 perf-y += vmlinux-kallsyms.o
-perf-y += open-syscall.o
-perf-y += open-syscall-all-cpus.o
-perf-y += open-syscall-tp-fields.o
+perf-y += openat-syscall.o
+perf-y += openat-syscall-all-cpus.o
+perf-y += openat-syscall-tp-fields.o
 perf-y += mmap-basic.o
 perf-y += perf-record.o
 perf-y += rdpmc.o
@@ -34,7 +34,7 @@ perf-y += kmod-path.o
 
 perf-$(CONFIG_X86) += perf-time-to-tsc.o
 
-ifeq ($(ARCH),$(filter $(ARCH),x86 arm))
+ifeq ($(ARCH),$(filter $(ARCH),x86 arm arm64))
 perf-$(CONFIG_DWARF_UNWIND) += dwarf-unwind.o
 endif
 
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index 4f409816711249..87b9961646e4a5 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -23,12 +23,12 @@ static struct test {
 		.func = test__vmlinux_matches_kallsyms,
 	},
 	{
-		.desc = "detect open syscall event",
-		.func = test__open_syscall_event,
+		.desc = "detect openat syscall event",
+		.func = test__openat_syscall_event,
 	},
 	{
-		.desc = "detect open syscall event on all cpus",
-		.func = test__open_syscall_event_on_all_cpus,
+		.desc = "detect openat syscall event on all cpus",
+		.func = test__openat_syscall_event_on_all_cpus,
 	},
 	{
 		.desc = "read samples using the mmap interface",
@@ -73,8 +73,8 @@ static struct test {
 		.func = test__perf_evsel__tp_sched_test,
 	},
 	{
-		.desc = "Generate and check syscalls:sys_enter_open event fields",
-		.func = test__syscall_open_tp_fields,
+		.desc = "Generate and check syscalls:sys_enter_openat event fields",
+		.func = test__syscall_openat_tp_fields,
 	},
 	{
 		.desc = "struct perf_event_attr setup",
@@ -126,7 +126,7 @@ static struct test {
 		.desc = "Test parsing with no sample_id_all bit set",
 		.func = test__parse_no_sample_id_all,
 	},
-#if defined(__x86_64__) || defined(__i386__) || defined(__arm__)
+#if defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__)
 #ifdef HAVE_DWARF_UNWIND_SUPPORT
 	{
 		.desc = "Test dwarf unwind",
@@ -219,7 +219,7 @@ static int run_test(struct test *test)
 	wait(&status);
 
 	if (WIFEXITED(status)) {
-		err = WEXITSTATUS(status);
+		err = (signed char)WEXITSTATUS(status);
 		pr_debug("test child finished with %d\n", err);
 	} else if (WIFSIGNALED(status)) {
 		err = -1;
diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c
index f671ec37a7c40c..e2a432b67d5208 100644
--- a/tools/perf/tests/code-reading.c
+++ b/tools/perf/tests/code-reading.c
@@ -248,6 +248,7 @@ static int process_sample_event(struct machine *machine,
 	struct perf_sample sample;
 	struct thread *thread;
 	u8 cpumode;
+	int ret;
 
 	if (perf_evlist__parse_sample(evlist, event, &sample)) {
 		pr_debug("perf_evlist__parse_sample failed\n");
@@ -262,7 +263,9 @@ static int process_sample_event(struct machine *machine,
 
 	cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
 
-	return read_object_code(sample.ip, READLEN, cpumode, thread, state);
+	ret = read_object_code(sample.ip, READLEN, cpumode, thread, state);
+	thread__put(thread);
+	return ret;
 }
 
 static int process_event(struct machine *machine, struct perf_evlist *evlist,
@@ -457,13 +460,13 @@ static int do_test_code_reading(bool try_kcore)
 	thread = machine__findnew_thread(machine, pid, pid);
 	if (!thread) {
 		pr_debug("machine__findnew_thread failed\n");
-		goto out_err;
+		goto out_put;
 	}
 
 	cpus = cpu_map__new(NULL);
 	if (!cpus) {
 		pr_debug("cpu_map__new failed\n");
-		goto out_err;
+		goto out_put;
 	}
 
 	while (1) {
@@ -472,7 +475,7 @@ static int do_test_code_reading(bool try_kcore)
 		evlist = perf_evlist__new();
 		if (!evlist) {
 			pr_debug("perf_evlist__new failed\n");
-			goto out_err;
+			goto out_put;
 		}
 
 		perf_evlist__set_maps(evlist, cpus, threads);
@@ -482,10 +485,10 @@ static int do_test_code_reading(bool try_kcore)
 		else
 			str = "cycles";
 		pr_debug("Parsing event '%s'\n", str);
-		ret = parse_events(evlist, str);
+		ret = parse_events(evlist, str, NULL);
 		if (ret < 0) {
 			pr_debug("parse_events failed\n");
-			goto out_err;
+			goto out_put;
 		}
 
 		perf_evlist__config(evlist, &opts);
@@ -506,7 +509,7 @@ static int do_test_code_reading(bool try_kcore)
 				continue;
 			}
 			pr_debug("perf_evlist__open failed\n");
-			goto out_err;
+			goto out_put;
 		}
 		break;
 	}
@@ -514,7 +517,7 @@ static int do_test_code_reading(bool try_kcore)
 	ret = perf_evlist__mmap(evlist, UINT_MAX, false);
 	if (ret < 0) {
 		pr_debug("perf_evlist__mmap failed\n");
-		goto out_err;
+		goto out_put;
 	}
 
 	perf_evlist__enable(evlist);
@@ -525,7 +528,7 @@ static int do_test_code_reading(bool try_kcore)
 
 	ret = process_events(machine, evlist, &state);
 	if (ret < 0)
-		goto out_err;
+		goto out_put;
 
 	if (!have_vmlinux && !have_kcore && !try_kcore)
 		err = TEST_CODE_READING_NO_KERNEL_OBJ;
@@ -535,7 +538,10 @@ static int do_test_code_reading(bool try_kcore)
 		err = TEST_CODE_READING_NO_ACCESS;
 	else
 		err = TEST_CODE_READING_OK;
+out_put:
+	thread__put(thread);
 out_err:
+
 	if (evlist) {
 		perf_evlist__delete(evlist);
 	} else {
diff --git a/tools/perf/tests/dso-data.c b/tools/perf/tests/dso-data.c
index 513e5febbe5a50..3e41c61bd86104 100644
--- a/tools/perf/tests/dso-data.c
+++ b/tools/perf/tests/dso-data.c
@@ -99,6 +99,17 @@ struct test_data_offset offsets[] = {
 	},
 };
 
+/* move it from util/dso.c for compatibility */
+static int dso__data_fd(struct dso *dso, struct machine *machine)
+{
+	int fd = dso__data_get_fd(dso, machine);
+
+	if (fd >= 0)
+		dso__data_put_fd(dso);
+
+	return fd;
+}
+
 int test__dso_data(void)
 {
 	struct machine machine;
diff --git a/tools/perf/tests/dwarf-unwind.c b/tools/perf/tests/dwarf-unwind.c
index 0bf06bec68c7e9..9b748e1ad46ed2 100644
--- a/tools/perf/tests/dwarf-unwind.c
+++ b/tools/perf/tests/dwarf-unwind.c
@@ -170,6 +170,7 @@ int test__dwarf_unwind(void)
 	}
 
 	err = krava_1(thread);
+	thread__put(thread);
 
  out:
 	machine__delete_threads(machine);
diff --git a/tools/perf/tests/evsel-roundtrip-name.c b/tools/perf/tests/evsel-roundtrip-name.c
index b8d8341b383e7b..3fa715987a5ec2 100644
--- a/tools/perf/tests/evsel-roundtrip-name.c
+++ b/tools/perf/tests/evsel-roundtrip-name.c
@@ -23,7 +23,7 @@ static int perf_evsel__roundtrip_cache_name_test(void)
 			for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) {
 				__perf_evsel__hw_cache_type_op_res_name(type, op, i,
 									name, sizeof(name));
-				err = parse_events(evlist, name);
+				err = parse_events(evlist, name, NULL);
 				if (err)
 					ret = err;
 			}
@@ -71,7 +71,7 @@ static int __perf_evsel__name_array_test(const char *names[], int nr_names)
                 return -ENOMEM;
 
 	for (i = 0; i < nr_names; ++i) {
-		err = parse_events(evlist, names[i]);
+		err = parse_events(evlist, names[i], NULL);
 		if (err) {
 			pr_debug("failed to parse event '%s', err %d\n",
 				 names[i], err);
diff --git a/tools/perf/tests/hists_common.c b/tools/perf/tests/hists_common.c
index a62c091345163f..915f60af6a0ebe 100644
--- a/tools/perf/tests/hists_common.c
+++ b/tools/perf/tests/hists_common.c
@@ -96,6 +96,7 @@ struct machine *setup_fake_machine(struct machines *machines)
 			goto out;
 
 		thread__set_comm(thread, fake_threads[i].comm, 0);
+		thread__put(thread);
 	}
 
 	for (i = 0; i < ARRAY_SIZE(fake_mmap_info); i++) {
@@ -120,8 +121,7 @@ struct machine *setup_fake_machine(struct machines *machines)
 		size_t k;
 		struct dso *dso;
 
-		dso = __dsos__findnew(&machine->user_dsos,
-				      fake_symbols[i].dso_name);
+		dso = machine__findnew_dso(machine, fake_symbols[i].dso_name);
 		if (dso == NULL)
 			goto out;
 
diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c
index 18619966454c57..7d82c8be5e360d 100644
--- a/tools/perf/tests/hists_cumulate.c
+++ b/tools/perf/tests/hists_cumulate.c
@@ -87,6 +87,8 @@ static int add_hist_entries(struct hists *hists, struct machine *machine)
 			},
 		};
 		struct hist_entry_iter iter = {
+			.evsel = evsel,
+			.sample	= &sample,
 			.hide_unresolved = false,
 		};
 
@@ -104,9 +106,11 @@ static int add_hist_entries(struct hists *hists, struct machine *machine)
 						  &sample) < 0)
 			goto out;
 
-		if (hist_entry_iter__add(&iter, &al, evsel, &sample,
-					 PERF_MAX_STACK_DEPTH, NULL) < 0)
+		if (hist_entry_iter__add(&iter, &al, PERF_MAX_STACK_DEPTH,
+					 NULL) < 0) {
+			addr_location__put(&al);
 			goto out;
+		}
 
 		fake_samples[i].thread = al.thread;
 		fake_samples[i].map = al.map;
@@ -695,7 +699,7 @@ int test__hists_cumulate(void)
 
 	TEST_ASSERT_VAL("No memory", evlist);
 
-	err = parse_events(evlist, "cpu-clock");
+	err = parse_events(evlist, "cpu-clock", NULL);
 	if (err)
 		goto out;
 
diff --git a/tools/perf/tests/hists_filter.c b/tools/perf/tests/hists_filter.c
index 59e53db7914c0a..ce48775e6ada13 100644
--- a/tools/perf/tests/hists_filter.c
+++ b/tools/perf/tests/hists_filter.c
@@ -63,6 +63,8 @@ static int add_hist_entries(struct perf_evlist *evlist,
 				},
 			};
 			struct hist_entry_iter iter = {
+				.evsel = evsel,
+				.sample = &sample,
 				.ops = &hist_iter_normal,
 				.hide_unresolved = false,
 			};
@@ -81,9 +83,11 @@ static int add_hist_entries(struct perf_evlist *evlist,
 							  &sample) < 0)
 				goto out;
 
-			if (hist_entry_iter__add(&iter, &al, evsel, &sample,
-						 PERF_MAX_STACK_DEPTH, NULL) < 0)
+			if (hist_entry_iter__add(&iter, &al,
+						 PERF_MAX_STACK_DEPTH, NULL) < 0) {
+				addr_location__put(&al);
 				goto out;
+			}
 
 			fake_samples[i].thread = al.thread;
 			fake_samples[i].map = al.map;
@@ -108,10 +112,10 @@ int test__hists_filter(void)
 
 	TEST_ASSERT_VAL("No memory", evlist);
 
-	err = parse_events(evlist, "cpu-clock");
+	err = parse_events(evlist, "cpu-clock", NULL);
 	if (err)
 		goto out;
-	err = parse_events(evlist, "task-clock");
+	err = parse_events(evlist, "task-clock", NULL);
 	if (err)
 		goto out;
 
diff --git a/tools/perf/tests/hists_link.c b/tools/perf/tests/hists_link.c
index 278ba8344c236d..8c102b01142497 100644
--- a/tools/perf/tests/hists_link.c
+++ b/tools/perf/tests/hists_link.c
@@ -91,8 +91,10 @@ static int add_hist_entries(struct perf_evlist *evlist, struct machine *machine)
 
 			he = __hists__add_entry(hists, &al, NULL,
 						NULL, NULL, 1, 1, 0, true);
-			if (he == NULL)
+			if (he == NULL) {
+				addr_location__put(&al);
 				goto out;
+			}
 
 			fake_common_samples[k].thread = al.thread;
 			fake_common_samples[k].map = al.map;
@@ -115,8 +117,10 @@ static int add_hist_entries(struct perf_evlist *evlist, struct machine *machine)
 
 			he = __hists__add_entry(hists, &al, NULL,
 						NULL, NULL, 1, 1, 0, true);
-			if (he == NULL)
+			if (he == NULL) {
+				addr_location__put(&al);
 				goto out;
+			}
 
 			fake_samples[i][k].thread = al.thread;
 			fake_samples[i][k].map = al.map;
@@ -282,10 +286,10 @@ int test__hists_link(void)
 	if (evlist == NULL)
                 return -ENOMEM;
 
-	err = parse_events(evlist, "cpu-clock");
+	err = parse_events(evlist, "cpu-clock", NULL);
 	if (err)
 		goto out;
-	err = parse_events(evlist, "task-clock");
+	err = parse_events(evlist, "task-clock", NULL);
 	if (err)
 		goto out;
 
diff --git a/tools/perf/tests/hists_output.c b/tools/perf/tests/hists_output.c
index b52c9faea22450..adbebc852cc8b5 100644
--- a/tools/perf/tests/hists_output.c
+++ b/tools/perf/tests/hists_output.c
@@ -57,6 +57,8 @@ static int add_hist_entries(struct hists *hists, struct machine *machine)
 			},
 		};
 		struct hist_entry_iter iter = {
+			.evsel = evsel,
+			.sample = &sample,
 			.ops = &hist_iter_normal,
 			.hide_unresolved = false,
 		};
@@ -70,9 +72,11 @@ static int add_hist_entries(struct hists *hists, struct machine *machine)
 						  &sample) < 0)
 			goto out;
 
-		if (hist_entry_iter__add(&iter, &al, evsel, &sample,
-					 PERF_MAX_STACK_DEPTH, NULL) < 0)
+		if (hist_entry_iter__add(&iter, &al, PERF_MAX_STACK_DEPTH,
+					 NULL) < 0) {
+			addr_location__put(&al);
 			goto out;
+		}
 
 		fake_samples[i].thread = al.thread;
 		fake_samples[i].map = al.map;
@@ -590,7 +594,7 @@ int test__hists_output(void)
 
 	TEST_ASSERT_VAL("No memory", evlist);
 
-	err = parse_events(evlist, "cpu-clock");
+	err = parse_events(evlist, "cpu-clock", NULL);
 	if (err)
 		goto out;
 
diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c
index 7a5ab7b0b8f698..5b171d1e338bdd 100644
--- a/tools/perf/tests/keep-tracking.c
+++ b/tools/perf/tests/keep-tracking.c
@@ -78,8 +78,8 @@ int test__keep_tracking(void)
 
 	perf_evlist__set_maps(evlist, cpus, threads);
 
-	CHECK__(parse_events(evlist, "dummy:u"));
-	CHECK__(parse_events(evlist, "cycles:u"));
+	CHECK__(parse_events(evlist, "dummy:u", NULL));
+	CHECK__(parse_events(evlist, "cycles:u", NULL));
 
 	perf_evlist__config(evlist, &opts);
 
diff --git a/tools/perf/tests/make b/tools/perf/tests/make
index bff85324f799bd..65280d28662e4c 100644
--- a/tools/perf/tests/make
+++ b/tools/perf/tests/make
@@ -32,6 +32,7 @@ make_no_backtrace   := NO_BACKTRACE=1
 make_no_libnuma     := NO_LIBNUMA=1
 make_no_libaudit    := NO_LIBAUDIT=1
 make_no_libbionic   := NO_LIBBIONIC=1
+make_no_auxtrace    := NO_AUXTRACE=1
 make_tags           := tags
 make_cscope         := cscope
 make_help           := help
@@ -52,7 +53,7 @@ make_static         := LDFLAGS=-static
 make_minimal        := NO_LIBPERL=1 NO_LIBPYTHON=1 NO_NEWT=1 NO_GTK2=1
 make_minimal        += NO_DEMANGLE=1 NO_LIBELF=1 NO_LIBUNWIND=1 NO_BACKTRACE=1
 make_minimal        += NO_LIBNUMA=1 NO_LIBAUDIT=1 NO_LIBBIONIC=1
-make_minimal        += NO_LIBDW_DWARF_UNWIND=1
+make_minimal        += NO_LIBDW_DWARF_UNWIND=1 NO_AUXTRACE=1
 
 # $(run) contains all available tests
 run := make_pure
@@ -74,6 +75,7 @@ run += make_no_backtrace
 run += make_no_libnuma
 run += make_no_libaudit
 run += make_no_libbionic
+run += make_no_auxtrace
 run += make_help
 run += make_doc
 run += make_perf_o
@@ -223,7 +225,19 @@ tarpkg:
 	echo "- $@: $$cmd" && echo $$cmd > $@ && \
 	( eval $$cmd ) >> $@ 2>&1
 
-all: $(run) $(run_O) tarpkg
+make_kernelsrc:
+	@echo " - make -C <kernelsrc> tools/perf"
+	$(call clean); \
+	(make -C ../.. tools/perf) > $@ 2>&1 && \
+	test -x perf && rm -f $@ || (cat $@ ; false)
+
+make_kernelsrc_tools:
+	@echo " - make -C <kernelsrc>/tools perf"
+	$(call clean); \
+	(make -C ../../tools perf) > $@ 2>&1 && \
+	test -x perf && rm -f $@ || (cat $@ ; false)
+
+all: $(run) $(run_O) tarpkg make_kernelsrc make_kernelsrc_tools
 	@echo OK
 
 out: $(run_O)
diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c
index 9b9622a33932da..5855cf47121003 100644
--- a/tools/perf/tests/mmap-basic.c
+++ b/tools/perf/tests/mmap-basic.c
@@ -23,10 +23,8 @@ int test__basic_mmap(void)
 	struct cpu_map *cpus;
 	struct perf_evlist *evlist;
 	cpu_set_t cpu_set;
-	const char *syscall_names[] = { "getsid", "getppid", "getpgrp",
-					"getpgid", };
-	pid_t (*syscalls[])(void) = { (void *)getsid, getppid, getpgrp,
-				      (void*)getpgid };
+	const char *syscall_names[] = { "getsid", "getppid", "getpgid", };
+	pid_t (*syscalls[])(void) = { (void *)getsid, getppid, (void*)getpgid };
 #define nsyscalls ARRAY_SIZE(syscall_names)
 	unsigned int nr_events[nsyscalls],
 		     expected_nr_events[nsyscalls], i, j;
diff --git a/tools/perf/tests/mmap-thread-lookup.c b/tools/perf/tests/mmap-thread-lookup.c
index 2113f1c8611fb5..264e215c0d36a9 100644
--- a/tools/perf/tests/mmap-thread-lookup.c
+++ b/tools/perf/tests/mmap-thread-lookup.c
@@ -191,6 +191,8 @@ static int mmap_events(synth_cb synth)
 				      PERF_RECORD_MISC_USER, MAP__FUNCTION,
 				      (unsigned long) (td->map + 1), &al);
 
+		thread__put(thread);
+
 		if (!al.map) {
 			pr_debug("failed, couldn't find map\n");
 			err = -1;
diff --git a/tools/perf/tests/open-syscall-all-cpus.c b/tools/perf/tests/openat-syscall-all-cpus.c
index 3ec885c48f8fbc..e34dfdf96b5acc 100644
--- a/tools/perf/tests/open-syscall-all-cpus.c
+++ b/tools/perf/tests/openat-syscall-all-cpus.c
@@ -4,12 +4,12 @@
 #include "cpumap.h"
 #include "debug.h"
 
-int test__open_syscall_event_on_all_cpus(void)
+int test__openat_syscall_event_on_all_cpus(void)
 {
 	int err = -1, fd, cpu;
 	struct cpu_map *cpus;
 	struct perf_evsel *evsel;
-	unsigned int nr_open_calls = 111, i;
+	unsigned int nr_openat_calls = 111, i;
 	cpu_set_t cpu_set;
 	struct thread_map *threads = thread_map__new(-1, getpid(), UINT_MAX);
 	char sbuf[STRERR_BUFSIZE];
@@ -27,7 +27,7 @@ int test__open_syscall_event_on_all_cpus(void)
 
 	CPU_ZERO(&cpu_set);
 
-	evsel = perf_evsel__newtp("syscalls", "sys_enter_open");
+	evsel = perf_evsel__newtp("syscalls", "sys_enter_openat");
 	if (evsel == NULL) {
 		if (tracefs_configured())
 			pr_debug("is tracefs mounted on /sys/kernel/tracing?\n");
@@ -46,7 +46,7 @@ int test__open_syscall_event_on_all_cpus(void)
 	}
 
 	for (cpu = 0; cpu < cpus->nr; ++cpu) {
-		unsigned int ncalls = nr_open_calls + cpu;
+		unsigned int ncalls = nr_openat_calls + cpu;
 		/*
 		 * XXX eventually lift this restriction in a way that
 		 * keeps perf building on older glibc installations
@@ -66,7 +66,7 @@ int test__open_syscall_event_on_all_cpus(void)
 			goto out_close_fd;
 		}
 		for (i = 0; i < ncalls; ++i) {
-			fd = open("/etc/passwd", O_RDONLY);
+			fd = openat(0, "/etc/passwd", O_RDONLY);
 			close(fd);
 		}
 		CPU_CLR(cpus->map[cpu], &cpu_set);
@@ -96,7 +96,7 @@ int test__open_syscall_event_on_all_cpus(void)
 			break;
 		}
 
-		expected = nr_open_calls + cpu;
+		expected = nr_openat_calls + cpu;
 		if (evsel->counts->cpu[cpu].val != expected) {
 			pr_debug("perf_evsel__read_on_cpu: expected to intercept %d calls on cpu %d, got %" PRIu64 "\n",
 				 expected, cpus->map[cpu], evsel->counts->cpu[cpu].val);
diff --git a/tools/perf/tests/open-syscall-tp-fields.c b/tools/perf/tests/openat-syscall-tp-fields.c
index 127dcae0b76033..6245221479d766 100644
--- a/tools/perf/tests/open-syscall-tp-fields.c
+++ b/tools/perf/tests/openat-syscall-tp-fields.c
@@ -5,7 +5,7 @@
 #include "tests.h"
 #include "debug.h"
 
-int test__syscall_open_tp_fields(void)
+int test__syscall_openat_tp_fields(void)
 {
 	struct record_opts opts = {
 		.target = {
@@ -29,7 +29,7 @@ int test__syscall_open_tp_fields(void)
 		goto out;
 	}
 
-	evsel = perf_evsel__newtp("syscalls", "sys_enter_open");
+	evsel = perf_evsel__newtp("syscalls", "sys_enter_openat");
 	if (evsel == NULL) {
 		pr_debug("%s: perf_evsel__newtp\n", __func__);
 		goto out_delete_evlist;
@@ -66,7 +66,7 @@ int test__syscall_open_tp_fields(void)
 	/*
 	 * Generate the event:
 	 */
-	open(filename, flags);
+	openat(AT_FDCWD, filename, flags);
 
 	while (1) {
 		int before = nr_events;
diff --git a/tools/perf/tests/open-syscall.c b/tools/perf/tests/openat-syscall.c
index 07aa319bf334f3..9f9491bb8e4897 100644
--- a/tools/perf/tests/open-syscall.c
+++ b/tools/perf/tests/openat-syscall.c
@@ -3,11 +3,11 @@
 #include "debug.h"
 #include "tests.h"
 
-int test__open_syscall_event(void)
+int test__openat_syscall_event(void)
 {
 	int err = -1, fd;
 	struct perf_evsel *evsel;
-	unsigned int nr_open_calls = 111, i;
+	unsigned int nr_openat_calls = 111, i;
 	struct thread_map *threads = thread_map__new(-1, getpid(), UINT_MAX);
 	char sbuf[STRERR_BUFSIZE];
 
@@ -16,7 +16,7 @@ int test__open_syscall_event(void)
 		return -1;
 	}
 
-	evsel = perf_evsel__newtp("syscalls", "sys_enter_open");
+	evsel = perf_evsel__newtp("syscalls", "sys_enter_openat");
 	if (evsel == NULL) {
 		if (tracefs_configured())
 			pr_debug("is tracefs mounted on /sys/kernel/tracing?\n");
@@ -34,8 +34,8 @@ int test__open_syscall_event(void)
 		goto out_evsel_delete;
 	}
 
-	for (i = 0; i < nr_open_calls; ++i) {
-		fd = open("/etc/passwd", O_RDONLY);
+	for (i = 0; i < nr_openat_calls; ++i) {
+		fd = openat(0, "/etc/passwd", O_RDONLY);
 		close(fd);
 	}
 
@@ -44,9 +44,9 @@ int test__open_syscall_event(void)
 		goto out_close_fd;
 	}
 
-	if (evsel->counts->cpu[0].val != nr_open_calls) {
+	if (evsel->counts->cpu[0].val != nr_openat_calls) {
 		pr_debug("perf_evsel__read_on_cpu: expected to intercept %d calls, got %" PRIu64 "\n",
-			 nr_open_calls, evsel->counts->cpu[0].val);
+			 nr_openat_calls, evsel->counts->cpu[0].val);
 		goto out_close_fd;
 	}
 
diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c
index 3de744961739c2..d76963f7ad3d4a 100644
--- a/tools/perf/tests/parse-events.c
+++ b/tools/perf/tests/parse-events.c
@@ -427,7 +427,7 @@ static int test__checkevent_list(struct perf_evlist *evlist)
 	TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv);
 	TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
 
-	/* syscalls:sys_enter_open:k */
+	/* syscalls:sys_enter_openat:k */
 	evsel = perf_evsel__next(evsel);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_TRACEPOINT == evsel->attr.type);
 	TEST_ASSERT_VAL("wrong sample_type",
@@ -665,7 +665,7 @@ static int test__group3(struct perf_evlist *evlist __maybe_unused)
 	TEST_ASSERT_VAL("wrong number of entries", 5 == evlist->nr_entries);
 	TEST_ASSERT_VAL("wrong number of groups", 2 == evlist->nr_groups);
 
-	/* group1 syscalls:sys_enter_open:H */
+	/* group1 syscalls:sys_enter_openat:H */
 	evsel = leader = perf_evlist__first(evlist);
 	TEST_ASSERT_VAL("wrong type", PERF_TYPE_TRACEPOINT == evsel->attr.type);
 	TEST_ASSERT_VAL("wrong sample_type",
@@ -1293,7 +1293,7 @@ struct evlist_test {
 
 static struct evlist_test test__events[] = {
 	{
-		.name  = "syscalls:sys_enter_open",
+		.name  = "syscalls:sys_enter_openat",
 		.check = test__checkevent_tracepoint,
 		.id    = 0,
 	},
@@ -1353,7 +1353,7 @@ static struct evlist_test test__events[] = {
 		.id    = 11,
 	},
 	{
-		.name  = "syscalls:sys_enter_open:k",
+		.name  = "syscalls:sys_enter_openat:k",
 		.check = test__checkevent_tracepoint_modifier,
 		.id    = 12,
 	},
@@ -1408,7 +1408,7 @@ static struct evlist_test test__events[] = {
 		.id    = 22,
 	},
 	{
-		.name  = "r1,syscalls:sys_enter_open:k,1:1:hp",
+		.name  = "r1,syscalls:sys_enter_openat:k,1:1:hp",
 		.check = test__checkevent_list,
 		.id    = 23,
 	},
@@ -1443,7 +1443,7 @@ static struct evlist_test test__events[] = {
 		.id    = 29,
 	},
 	{
-		.name  = "group1{syscalls:sys_enter_open:H,cycles:kppp},group2{cycles,1:3}:G,instructions:u",
+		.name  = "group1{syscalls:sys_enter_openat:H,cycles:kppp},group2{cycles,1:3}:G,instructions:u",
 		.check = test__group3,
 		.id    = 30,
 	},
@@ -1571,7 +1571,7 @@ static int test_event(struct evlist_test *e)
 	if (evlist == NULL)
 		return -ENOMEM;
 
-	ret = parse_events(evlist, e->name);
+	ret = parse_events(evlist, e->name, NULL);
 	if (ret) {
 		pr_debug("failed to parse event '%s', err %d\n",
 			 e->name, ret);
diff --git a/tools/perf/tests/perf-time-to-tsc.c b/tools/perf/tests/perf-time-to-tsc.c
index f238442b238a29..5f49484f1abc03 100644
--- a/tools/perf/tests/perf-time-to-tsc.c
+++ b/tools/perf/tests/perf-time-to-tsc.c
@@ -68,7 +68,7 @@ int test__perf_time_to_tsc(void)
 
 	perf_evlist__set_maps(evlist, cpus, threads);
 
-	CHECK__(parse_events(evlist, "cycles:u"));
+	CHECK__(parse_events(evlist, "cycles:u", NULL));
 
 	perf_evlist__config(evlist, &opts);
 
diff --git a/tools/perf/tests/pmu.c b/tools/perf/tests/pmu.c
index eeb68bb1972d44..faa04e9d5d5fc7 100644
--- a/tools/perf/tests/pmu.c
+++ b/tools/perf/tests/pmu.c
@@ -152,7 +152,8 @@ int test__pmu(void)
 		if (ret)
 			break;
 
-		ret = perf_pmu__config_terms(&formats, &attr, terms, false);
+		ret = perf_pmu__config_terms(&formats, &attr, terms,
+					     false, NULL);
 		if (ret)
 			break;
 
diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c
index cc68648c7c5552..0d31403ea593c7 100644
--- a/tools/perf/tests/switch-tracking.c
+++ b/tools/perf/tests/switch-tracking.c
@@ -347,7 +347,7 @@ int test__switch_tracking(void)
 	perf_evlist__set_maps(evlist, cpus, threads);
 
 	/* First event */
-	err = parse_events(evlist, "cpu-clock:u");
+	err = parse_events(evlist, "cpu-clock:u", NULL);
 	if (err) {
 		pr_debug("Failed to parse event dummy:u\n");
 		goto out_err;
@@ -356,7 +356,7 @@ int test__switch_tracking(void)
 	cpu_clocks_evsel = perf_evlist__last(evlist);
 
 	/* Second event */
-	err = parse_events(evlist, "cycles:u");
+	err = parse_events(evlist, "cycles:u", NULL);
 	if (err) {
 		pr_debug("Failed to parse event cycles:u\n");
 		goto out_err;
@@ -371,7 +371,7 @@ int test__switch_tracking(void)
 		goto out;
 	}
 
-	err = parse_events(evlist, sched_switch);
+	err = parse_events(evlist, sched_switch, NULL);
 	if (err) {
 		pr_debug("Failed to parse event %s\n", sched_switch);
 		goto out_err;
@@ -401,7 +401,7 @@ int test__switch_tracking(void)
 	perf_evsel__set_sample_bit(cycles_evsel, TIME);
 
 	/* Fourth event */
-	err = parse_events(evlist, "dummy:u");
+	err = parse_events(evlist, "dummy:u", NULL);
 	if (err) {
 		pr_debug("Failed to parse event dummy:u\n");
 		goto out_err;
diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h
index 52758a33f64c56..8e5038b48ba8df 100644
--- a/tools/perf/tests/tests.h
+++ b/tools/perf/tests/tests.h
@@ -9,6 +9,15 @@ do {									 \
 	}								 \
 } while (0)
 
+#define TEST_ASSERT_EQUAL(text, val, expected)				 \
+do {									 \
+	if (val != expected) {						 \
+		pr_debug("FAILED %s:%d %s (%d != %d)\n",		 \
+			 __FILE__, __LINE__, text, val, expected);	 \
+		return -1;						 \
+	}								 \
+} while (0)
+
 enum {
 	TEST_OK   =  0,
 	TEST_FAIL = -1,
@@ -17,14 +26,14 @@ enum {
 
 /* Tests */
 int test__vmlinux_matches_kallsyms(void);
-int test__open_syscall_event(void);
-int test__open_syscall_event_on_all_cpus(void);
+int test__openat_syscall_event(void);
+int test__openat_syscall_event_on_all_cpus(void);
 int test__basic_mmap(void);
 int test__PERF_RECORD(void);
 int test__rdpmc(void);
 int test__perf_evsel__roundtrip_name_test(void);
 int test__perf_evsel__tp_sched_test(void);
-int test__syscall_open_tp_fields(void);
+int test__syscall_openat_tp_fields(void);
 int test__pmu(void);
 int test__attr(void);
 int test__dso_data(void);
@@ -53,7 +62,7 @@ int test__fdarray__filter(void);
 int test__fdarray__add(void);
 int test__kmod_path__parse(void);
 
-#if defined(__x86_64__) || defined(__i386__) || defined(__arm__)
+#if defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__)
 #ifdef HAVE_DWARF_UNWIND_SUPPORT
 struct thread;
 struct perf_sample;
diff --git a/tools/perf/tests/thread-mg-share.c b/tools/perf/tests/thread-mg-share.c
index b028499dd3cf0f..01fabb19d74607 100644
--- a/tools/perf/tests/thread-mg-share.c
+++ b/tools/perf/tests/thread-mg-share.c
@@ -43,7 +43,7 @@ int test__thread_mg_share(void)
 			leader && t1 && t2 && t3 && other);
 
 	mg = leader->mg;
-	TEST_ASSERT_VAL("wrong refcnt", mg->refcnt == 4);
+	TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&mg->refcnt), 4);
 
 	/* test the map groups pointer is shared */
 	TEST_ASSERT_VAL("map groups don't match", mg == t1->mg);
@@ -58,33 +58,40 @@ int test__thread_mg_share(void)
 	other_leader = machine__find_thread(machine, 4, 4);
 	TEST_ASSERT_VAL("failed to find other leader", other_leader);
 
+	/*
+	 * Ok, now that all the rbtree related operations were done,
+	 * lets remove all of them from there so that we can do the
+	 * refcounting tests.
+	 */
+	machine__remove_thread(machine, leader);
+	machine__remove_thread(machine, t1);
+	machine__remove_thread(machine, t2);
+	machine__remove_thread(machine, t3);
+	machine__remove_thread(machine, other);
+	machine__remove_thread(machine, other_leader);
+
 	other_mg = other->mg;
-	TEST_ASSERT_VAL("wrong refcnt", other_mg->refcnt == 2);
+	TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&other_mg->refcnt), 2);
 
 	TEST_ASSERT_VAL("map groups don't match", other_mg == other_leader->mg);
 
 	/* release thread group */
-	thread__delete(leader);
-	TEST_ASSERT_VAL("wrong refcnt", mg->refcnt == 3);
+	thread__put(leader);
+	TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&mg->refcnt), 3);
 
-	thread__delete(t1);
-	TEST_ASSERT_VAL("wrong refcnt", mg->refcnt == 2);
+	thread__put(t1);
+	TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&mg->refcnt), 2);
 
-	thread__delete(t2);
-	TEST_ASSERT_VAL("wrong refcnt", mg->refcnt == 1);
+	thread__put(t2);
+	TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&mg->refcnt), 1);
 
-	thread__delete(t3);
+	thread__put(t3);
 
 	/* release other group  */
-	thread__delete(other_leader);
-	TEST_ASSERT_VAL("wrong refcnt", other_mg->refcnt == 1);
+	thread__put(other_leader);
+	TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&other_mg->refcnt), 1);
 
-	thread__delete(other);
-
-	/*
-	 * Cannot call machine__delete_threads(machine) now,
-	 * because we've already released all the threads.
-	 */
+	thread__put(other);
 
 	machines__exit(&machines);
 	return 0;
diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c
index 3d9088003a5b6d..b34c5fc829ae2b 100644
--- a/tools/perf/tests/vmlinux-kallsyms.c
+++ b/tools/perf/tests/vmlinux-kallsyms.c
@@ -23,9 +23,10 @@ int test__vmlinux_matches_kallsyms(void)
 	int err = -1;
 	struct rb_node *nd;
 	struct symbol *sym;
-	struct map *kallsyms_map, *vmlinux_map;
+	struct map *kallsyms_map, *vmlinux_map, *map;
 	struct machine kallsyms, vmlinux;
 	enum map_type type = MAP__FUNCTION;
+	struct maps *maps = &vmlinux.kmaps.maps[type];
 	u64 mem_start, mem_end;
 
 	/*
@@ -184,8 +185,8 @@ detour:
 
 	pr_info("Maps only in vmlinux:\n");
 
-	for (nd = rb_first(&vmlinux.kmaps.maps[type]); nd; nd = rb_next(nd)) {
-		struct map *pos = rb_entry(nd, struct map, rb_node), *pair;
+	for (map = maps__first(maps); map; map = map__next(map)) {
+		struct map *
 		/*
 		 * If it is the kernel, kallsyms is always "[kernel.kallsyms]", while
 		 * the kernel will have the path for the vmlinux file being used,
@@ -193,22 +194,22 @@ detour:
 		 * both cases.
 		 */
 		pair = map_groups__find_by_name(&kallsyms.kmaps, type,
-						(pos->dso->kernel ?
-							pos->dso->short_name :
-							pos->dso->name));
+						(map->dso->kernel ?
+							map->dso->short_name :
+							map->dso->name));
 		if (pair)
 			pair->priv = 1;
 		else
-			map__fprintf(pos, stderr);
+			map__fprintf(map, stderr);
 	}
 
 	pr_info("Maps in vmlinux with a different name in kallsyms:\n");
 
-	for (nd = rb_first(&vmlinux.kmaps.maps[type]); nd; nd = rb_next(nd)) {
-		struct map *pos = rb_entry(nd, struct map, rb_node), *pair;
+	for (map = maps__first(maps); map; map = map__next(map)) {
+		struct map *pair;
 
-		mem_start = vmlinux_map->unmap_ip(vmlinux_map, pos->start);
-		mem_end = vmlinux_map->unmap_ip(vmlinux_map, pos->end);
+		mem_start = vmlinux_map->unmap_ip(vmlinux_map, map->start);
+		mem_end = vmlinux_map->unmap_ip(vmlinux_map, map->end);
 
 		pair = map_groups__find(&kallsyms.kmaps, type, mem_start);
 		if (pair == NULL || pair->priv)
@@ -217,7 +218,7 @@ detour:
 		if (pair->start == mem_start) {
 			pair->priv = 1;
 			pr_info(" %" PRIx64 "-%" PRIx64 " %" PRIx64 " %s in kallsyms as",
-				pos->start, pos->end, pos->pgoff, pos->dso->name);
+				map->start, map->end, map->pgoff, map->dso->name);
 			if (mem_end != pair->end)
 				pr_info(":\n*%" PRIx64 "-%" PRIx64 " %" PRIx64,
 					pair->start, pair->end, pair->pgoff);
@@ -228,12 +229,11 @@ detour:
 
 	pr_info("Maps only in kallsyms:\n");
 
-	for (nd = rb_first(&kallsyms.kmaps.maps[type]);
-	     nd; nd = rb_next(nd)) {
-		struct map *pos = rb_entry(nd, struct map, rb_node);
+	maps = &kallsyms.kmaps.maps[type];
 
-		if (!pos->priv)
-			map__fprintf(pos, stderr);
+	for (map = maps__first(maps); map; map = map__next(map)) {
+		if (!map->priv)
+			map__fprintf(map, stderr);
 	}
 out:
 	machine__exit(&kallsyms);
diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c
index e5250eb2dd5786..acb0e23b138e98 100644
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -838,6 +838,10 @@ int map_symbol__tui_annotate(struct map_symbol *ms, struct perf_evsel *evsel,
 int hist_entry__tui_annotate(struct hist_entry *he, struct perf_evsel *evsel,
 			     struct hist_browser_timer *hbt)
 {
+	/* reset abort key so that it can get Ctrl-C as a key */
+	SLang_reset_tty();
+	SLang_init_tty(0, 0, 0);
+
 	return map_symbol__tui_annotate(&he->ms, evsel, hbt);
 }
 
diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c
index 995b7a8596b142..e64893f2fd7fa5 100644
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c
@@ -25,6 +25,9 @@ struct hist_browser {
 	struct hists	    *hists;
 	struct hist_entry   *he_selection;
 	struct map_symbol   *selection;
+	struct hist_browser_timer *hbt;
+	struct pstack	    *pstack;
+	struct perf_session_env *env;
 	int		     print_seq;
 	bool		     show_dso;
 	bool		     show_headers;
@@ -60,7 +63,7 @@ static int hist_browser__get_folding(struct hist_browser *browser)
 		struct hist_entry *he =
 			rb_entry(nd, struct hist_entry, rb_node);
 
-		if (he->ms.unfolded)
+		if (he->unfolded)
 			unfolded_rows += he->nr_rows;
 	}
 	return unfolded_rows;
@@ -136,24 +139,19 @@ static char tree__folded_sign(bool unfolded)
 	return unfolded ? '-' : '+';
 }
 
-static char map_symbol__folded(const struct map_symbol *ms)
-{
-	return ms->has_children ? tree__folded_sign(ms->unfolded) : ' ';
-}
-
 static char hist_entry__folded(const struct hist_entry *he)
 {
-	return map_symbol__folded(&he->ms);
+	return he->has_children ? tree__folded_sign(he->unfolded) : ' ';
 }
 
 static char callchain_list__folded(const struct callchain_list *cl)
 {
-	return map_symbol__folded(&cl->ms);
+	return cl->has_children ? tree__folded_sign(cl->unfolded) : ' ';
 }
 
-static void map_symbol__set_folding(struct map_symbol *ms, bool unfold)
+static void callchain_list__set_folding(struct callchain_list *cl, bool unfold)
 {
-	ms->unfolded = unfold ? ms->has_children : false;
+	cl->unfolded = unfold ? cl->has_children : false;
 }
 
 static int callchain_node__count_rows_rb_tree(struct callchain_node *node)
@@ -189,7 +187,7 @@ static int callchain_node__count_rows(struct callchain_node *node)
 
 	list_for_each_entry(chain, &node->val, list) {
 		++n;
-		unfolded = chain->ms.unfolded;
+		unfolded = chain->unfolded;
 	}
 
 	if (unfolded)
@@ -211,15 +209,27 @@ static int callchain__count_rows(struct rb_root *chain)
 	return n;
 }
 
-static bool map_symbol__toggle_fold(struct map_symbol *ms)
+static bool hist_entry__toggle_fold(struct hist_entry *he)
 {
-	if (!ms)
+	if (!he)
 		return false;
 
-	if (!ms->has_children)
+	if (!he->has_children)
 		return false;
 
-	ms->unfolded = !ms->unfolded;
+	he->unfolded = !he->unfolded;
+	return true;
+}
+
+static bool callchain_list__toggle_fold(struct callchain_list *cl)
+{
+	if (!cl)
+		return false;
+
+	if (!cl->has_children)
+		return false;
+
+	cl->unfolded = !cl->unfolded;
 	return true;
 }
 
@@ -235,10 +245,10 @@ static void callchain_node__init_have_children_rb_tree(struct callchain_node *no
 		list_for_each_entry(chain, &child->val, list) {
 			if (first) {
 				first = false;
-				chain->ms.has_children = chain->list.next != &child->val ||
+				chain->has_children = chain->list.next != &child->val ||
 							 !RB_EMPTY_ROOT(&child->rb_root);
 			} else
-				chain->ms.has_children = chain->list.next == &child->val &&
+				chain->has_children = chain->list.next == &child->val &&
 							 !RB_EMPTY_ROOT(&child->rb_root);
 		}
 
@@ -252,11 +262,11 @@ static void callchain_node__init_have_children(struct callchain_node *node,
 	struct callchain_list *chain;
 
 	chain = list_entry(node->val.next, struct callchain_list, list);
-	chain->ms.has_children = has_sibling;
+	chain->has_children = has_sibling;
 
 	if (!list_empty(&node->val)) {
 		chain = list_entry(node->val.prev, struct callchain_list, list);
-		chain->ms.has_children = !RB_EMPTY_ROOT(&node->rb_root);
+		chain->has_children = !RB_EMPTY_ROOT(&node->rb_root);
 	}
 
 	callchain_node__init_have_children_rb_tree(node);
@@ -276,7 +286,7 @@ static void callchain__init_have_children(struct rb_root *root)
 static void hist_entry__init_have_children(struct hist_entry *he)
 {
 	if (!he->init_have_children) {
-		he->ms.has_children = !RB_EMPTY_ROOT(&he->sorted_chain);
+		he->has_children = !RB_EMPTY_ROOT(&he->sorted_chain);
 		callchain__init_have_children(&he->sorted_chain);
 		he->init_have_children = true;
 	}
@@ -284,14 +294,22 @@ static void hist_entry__init_have_children(struct hist_entry *he)
 
 static bool hist_browser__toggle_fold(struct hist_browser *browser)
 {
-	if (map_symbol__toggle_fold(browser->selection)) {
-		struct hist_entry *he = browser->he_selection;
+	struct hist_entry *he = browser->he_selection;
+	struct map_symbol *ms = browser->selection;
+	struct callchain_list *cl = container_of(ms, struct callchain_list, ms);
+	bool has_children;
 
+	if (ms == &he->ms)
+		has_children = hist_entry__toggle_fold(he);
+	else
+		has_children = callchain_list__toggle_fold(cl);
+
+	if (has_children) {
 		hist_entry__init_have_children(he);
 		browser->b.nr_entries -= he->nr_rows;
 		browser->nr_callchain_rows -= he->nr_rows;
 
-		if (he->ms.unfolded)
+		if (he->unfolded)
 			he->nr_rows = callchain__count_rows(&he->sorted_chain);
 		else
 			he->nr_rows = 0;
@@ -318,8 +336,8 @@ static int callchain_node__set_folding_rb_tree(struct callchain_node *node, bool
 
 		list_for_each_entry(chain, &child->val, list) {
 			++n;
-			map_symbol__set_folding(&chain->ms, unfold);
-			has_children = chain->ms.has_children;
+			callchain_list__set_folding(chain, unfold);
+			has_children = chain->has_children;
 		}
 
 		if (has_children)
@@ -337,8 +355,8 @@ static int callchain_node__set_folding(struct callchain_node *node, bool unfold)
 
 	list_for_each_entry(chain, &node->val, list) {
 		++n;
-		map_symbol__set_folding(&chain->ms, unfold);
-		has_children = chain->ms.has_children;
+		callchain_list__set_folding(chain, unfold);
+		has_children = chain->has_children;
 	}
 
 	if (has_children)
@@ -363,9 +381,9 @@ static int callchain__set_folding(struct rb_root *chain, bool unfold)
 static void hist_entry__set_folding(struct hist_entry *he, bool unfold)
 {
 	hist_entry__init_have_children(he);
-	map_symbol__set_folding(&he->ms, unfold);
+	he->unfolded = unfold ? he->has_children : false;
 
-	if (he->ms.has_children) {
+	if (he->has_children) {
 		int n = callchain__set_folding(&he->sorted_chain, unfold);
 		he->nr_rows = unfold ? n : 0;
 	} else
@@ -406,11 +424,11 @@ static void ui_browser__warn_lost_events(struct ui_browser *browser)
 		"Or reduce the sampling frequency.");
 }
 
-static int hist_browser__run(struct hist_browser *browser,
-			     struct hist_browser_timer *hbt)
+static int hist_browser__run(struct hist_browser *browser)
 {
 	int key;
 	char title[160];
+	struct hist_browser_timer *hbt = browser->hbt;
 	int delay_secs = hbt ? hbt->refresh : 0;
 
 	browser->b.entries = &browser->hists->entries;
@@ -1016,7 +1034,7 @@ do_offset:
 	if (offset > 0) {
 		do {
 			h = rb_entry(nd, struct hist_entry, rb_node);
-			if (h->ms.unfolded) {
+			if (h->unfolded) {
 				u16 remaining = h->nr_rows - h->row_offset;
 				if (offset > remaining) {
 					offset -= remaining;
@@ -1037,7 +1055,7 @@ do_offset:
 	} else if (offset < 0) {
 		while (1) {
 			h = rb_entry(nd, struct hist_entry, rb_node);
-			if (h->ms.unfolded) {
+			if (h->unfolded) {
 				if (first) {
 					if (-offset > h->row_offset) {
 						offset += h->row_offset;
@@ -1074,7 +1092,7 @@ do_offset:
 				 * row_offset at its last entry.
 				 */
 				h = rb_entry(nd, struct hist_entry, rb_node);
-				if (h->ms.unfolded)
+				if (h->unfolded)
 					h->row_offset = h->nr_rows;
 				break;
 			}
@@ -1195,7 +1213,9 @@ static int hist_browser__dump(struct hist_browser *browser)
 	return 0;
 }
 
-static struct hist_browser *hist_browser__new(struct hists *hists)
+static struct hist_browser *hist_browser__new(struct hists *hists,
+					      struct hist_browser_timer *hbt,
+					      struct perf_session_env *env)
 {
 	struct hist_browser *browser = zalloc(sizeof(*browser));
 
@@ -1206,6 +1226,8 @@ static struct hist_browser *hist_browser__new(struct hists *hists)
 		browser->b.seek = ui_browser__hists_seek;
 		browser->b.use_navkeypressed = true;
 		browser->show_headers = symbol_conf.show_hist_headers;
+		browser->hbt = hbt;
+		browser->env = env;
 	}
 
 	return browser;
@@ -1395,6 +1417,257 @@ close_file_and_continue:
 	return ret;
 }
 
+struct popup_action {
+	struct thread 		*thread;
+	struct dso		*dso;
+	struct map_symbol 	ms;
+
+	int (*fn)(struct hist_browser *browser, struct popup_action *act);
+};
+
+static int
+do_annotate(struct hist_browser *browser, struct popup_action *act)
+{
+	struct perf_evsel *evsel;
+	struct annotation *notes;
+	struct hist_entry *he;
+	int err;
+
+	if (!objdump_path && perf_session_env__lookup_objdump(browser->env))
+		return 0;
+
+	notes = symbol__annotation(act->ms.sym);
+	if (!notes->src)
+		return 0;
+
+	evsel = hists_to_evsel(browser->hists);
+	err = map_symbol__tui_annotate(&act->ms, evsel, browser->hbt);
+	he = hist_browser__selected_entry(browser);
+	/*
+	 * offer option to annotate the other branch source or target
+	 * (if they exists) when returning from annotate
+	 */
+	if ((err == 'q' || err == CTRL('c')) && he->branch_info)
+		return 1;
+
+	ui_browser__update_nr_entries(&browser->b, browser->hists->nr_entries);
+	if (err)
+		ui_browser__handle_resize(&browser->b);
+	return 0;
+}
+
+static int
+add_annotate_opt(struct hist_browser *browser __maybe_unused,
+		 struct popup_action *act, char **optstr,
+		 struct map *map, struct symbol *sym)
+{
+	if (sym == NULL || map->dso->annotate_warned)
+		return 0;
+
+	if (asprintf(optstr, "Annotate %s", sym->name) < 0)
+		return 0;
+
+	act->ms.map = map;
+	act->ms.sym = sym;
+	act->fn = do_annotate;
+	return 1;
+}
+
+static int
+do_zoom_thread(struct hist_browser *browser, struct popup_action *act)
+{
+	struct thread *thread = act->thread;
+
+	if (browser->hists->thread_filter) {
+		pstack__remove(browser->pstack, &browser->hists->thread_filter);
+		perf_hpp__set_elide(HISTC_THREAD, false);
+		thread__zput(browser->hists->thread_filter);
+		ui_helpline__pop();
+	} else {
+		ui_helpline__fpush("To zoom out press <- or -> + \"Zoom out of %s(%d) thread\"",
+				   thread->comm_set ? thread__comm_str(thread) : "",
+				   thread->tid);
+		browser->hists->thread_filter = thread__get(thread);
+		perf_hpp__set_elide(HISTC_THREAD, false);
+		pstack__push(browser->pstack, &browser->hists->thread_filter);
+	}
+
+	hists__filter_by_thread(browser->hists);
+	hist_browser__reset(browser);
+	return 0;
+}
+
+static int
+add_thread_opt(struct hist_browser *browser, struct popup_action *act,
+	       char **optstr, struct thread *thread)
+{
+	if (thread == NULL)
+		return 0;
+
+	if (asprintf(optstr, "Zoom %s %s(%d) thread",
+		     browser->hists->thread_filter ? "out of" : "into",
+		     thread->comm_set ? thread__comm_str(thread) : "",
+		     thread->tid) < 0)
+		return 0;
+
+	act->thread = thread;
+	act->fn = do_zoom_thread;
+	return 1;
+}
+
+static int
+do_zoom_dso(struct hist_browser *browser, struct popup_action *act)
+{
+	struct dso *dso = act->dso;
+
+	if (browser->hists->dso_filter) {
+		pstack__remove(browser->pstack, &browser->hists->dso_filter);
+		perf_hpp__set_elide(HISTC_DSO, false);
+		browser->hists->dso_filter = NULL;
+		ui_helpline__pop();
+	} else {
+		if (dso == NULL)
+			return 0;
+		ui_helpline__fpush("To zoom out press <- or -> + \"Zoom out of %s DSO\"",
+				   dso->kernel ? "the Kernel" : dso->short_name);
+		browser->hists->dso_filter = dso;
+		perf_hpp__set_elide(HISTC_DSO, true);
+		pstack__push(browser->pstack, &browser->hists->dso_filter);
+	}
+
+	hists__filter_by_dso(browser->hists);
+	hist_browser__reset(browser);
+	return 0;
+}
+
+static int
+add_dso_opt(struct hist_browser *browser, struct popup_action *act,
+	    char **optstr, struct dso *dso)
+{
+	if (dso == NULL)
+		return 0;
+
+	if (asprintf(optstr, "Zoom %s %s DSO",
+		     browser->hists->dso_filter ? "out of" : "into",
+		     dso->kernel ? "the Kernel" : dso->short_name) < 0)
+		return 0;
+
+	act->dso = dso;
+	act->fn = do_zoom_dso;
+	return 1;
+}
+
+static int
+do_browse_map(struct hist_browser *browser __maybe_unused,
+	      struct popup_action *act)
+{
+	map__browse(act->ms.map);
+	return 0;
+}
+
+static int
+add_map_opt(struct hist_browser *browser __maybe_unused,
+	    struct popup_action *act, char **optstr, struct map *map)
+{
+	if (map == NULL)
+		return 0;
+
+	if (asprintf(optstr, "Browse map details") < 0)
+		return 0;
+
+	act->ms.map = map;
+	act->fn = do_browse_map;
+	return 1;
+}
+
+static int
+do_run_script(struct hist_browser *browser __maybe_unused,
+	      struct popup_action *act)
+{
+	char script_opt[64];
+	memset(script_opt, 0, sizeof(script_opt));
+
+	if (act->thread) {
+		scnprintf(script_opt, sizeof(script_opt), " -c %s ",
+			  thread__comm_str(act->thread));
+	} else if (act->ms.sym) {
+		scnprintf(script_opt, sizeof(script_opt), " -S %s ",
+			  act->ms.sym->name);
+	}
+
+	script_browse(script_opt);
+	return 0;
+}
+
+static int
+add_script_opt(struct hist_browser *browser __maybe_unused,
+	       struct popup_action *act, char **optstr,
+	       struct thread *thread, struct symbol *sym)
+{
+	if (thread) {
+		if (asprintf(optstr, "Run scripts for samples of thread [%s]",
+			     thread__comm_str(thread)) < 0)
+			return 0;
+	} else if (sym) {
+		if (asprintf(optstr, "Run scripts for samples of symbol [%s]",
+			     sym->name) < 0)
+			return 0;
+	} else {
+		if (asprintf(optstr, "Run scripts for all samples") < 0)
+			return 0;
+	}
+
+	act->thread = thread;
+	act->ms.sym = sym;
+	act->fn = do_run_script;
+	return 1;
+}
+
+static int
+do_switch_data(struct hist_browser *browser __maybe_unused,
+	       struct popup_action *act __maybe_unused)
+{
+	if (switch_data_file()) {
+		ui__warning("Won't switch the data files due to\n"
+			    "no valid data file get selected!\n");
+		return 0;
+	}
+
+	return K_SWITCH_INPUT_DATA;
+}
+
+static int
+add_switch_opt(struct hist_browser *browser,
+	       struct popup_action *act, char **optstr)
+{
+	if (!is_report_browser(browser->hbt))
+		return 0;
+
+	if (asprintf(optstr, "Switch to another data file in PWD") < 0)
+		return 0;
+
+	act->fn = do_switch_data;
+	return 1;
+}
+
+static int
+do_exit_browser(struct hist_browser *browser __maybe_unused,
+		struct popup_action *act __maybe_unused)
+{
+	return 0;
+}
+
+static int
+add_exit_opt(struct hist_browser *browser __maybe_unused,
+	     struct popup_action *act, char **optstr)
+{
+	if (asprintf(optstr, "Exit") < 0)
+		return 0;
+
+	act->fn = do_exit_browser;
+	return 1;
+}
+
 static void hist_browser__update_nr_entries(struct hist_browser *hb)
 {
 	u64 nr_entries = 0;
@@ -1421,14 +1694,14 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 				    struct perf_session_env *env)
 {
 	struct hists *hists = evsel__hists(evsel);
-	struct hist_browser *browser = hist_browser__new(hists);
+	struct hist_browser *browser = hist_browser__new(hists, hbt, env);
 	struct branch_info *bi;
-	struct pstack *fstack;
-	char *options[16];
+#define MAX_OPTIONS  16
+	char *options[MAX_OPTIONS];
+	struct popup_action actions[MAX_OPTIONS];
 	int nr_options = 0;
 	int key = -1;
 	char buf[64];
-	char script_opt[64];
 	int delay_secs = hbt ? hbt->refresh : 0;
 	struct perf_hpp_fmt *fmt;
 
@@ -1468,18 +1741,23 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 	if (browser == NULL)
 		return -1;
 
+	/* reset abort key so that it can get Ctrl-C as a key */
+	SLang_reset_tty();
+	SLang_init_tty(0, 0, 0);
+
 	if (min_pcnt) {
 		browser->min_pcnt = min_pcnt;
 		hist_browser__update_nr_entries(browser);
 	}
 
-	fstack = pstack__new(2);
-	if (fstack == NULL)
+	browser->pstack = pstack__new(2);
+	if (browser->pstack == NULL)
 		goto out;
 
 	ui_helpline__push(helpline);
 
 	memset(options, 0, sizeof(options));
+	memset(actions, 0, sizeof(actions));
 
 	perf_hpp__for_each_format(fmt)
 		perf_hpp__reset_width(fmt, hists);
@@ -1489,16 +1767,12 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 
 	while (1) {
 		struct thread *thread = NULL;
-		const struct dso *dso = NULL;
-		int choice = 0,
-		    annotate = -2, zoom_dso = -2, zoom_thread = -2,
-		    annotate_f = -2, annotate_t = -2, browse_map = -2;
-		int scripts_comm = -2, scripts_symbol = -2,
-		    scripts_all = -2, switch_data = -2;
+		struct dso *dso = NULL;
+		int choice = 0;
 
 		nr_options = 0;
 
-		key = hist_browser__run(browser, hbt);
+		key = hist_browser__run(browser);
 
 		if (browser->he_selection != NULL) {
 			thread = hist_browser__selected_thread(browser);
@@ -1526,17 +1800,25 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 			    browser->selection->sym == NULL ||
 			    browser->selection->map->dso->annotate_warned)
 				continue;
-			goto do_annotate;
+
+			actions->ms.map = browser->selection->map;
+			actions->ms.sym = browser->selection->sym;
+			do_annotate(browser, actions);
+			continue;
 		case 'P':
 			hist_browser__dump(browser);
 			continue;
 		case 'd':
-			goto zoom_dso;
+			actions->dso = dso;
+			do_zoom_dso(browser, actions);
+			continue;
 		case 'V':
 			browser->show_dso = !browser->show_dso;
 			continue;
 		case 't':
-			goto zoom_thread;
+			actions->thread = thread;
+			do_zoom_thread(browser, actions);
+			continue;
 		case '/':
 			if (ui_browser__input_window("Symbol to show",
 					"Please enter the name of symbol you want to see",
@@ -1548,12 +1830,18 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 			}
 			continue;
 		case 'r':
-			if (is_report_browser(hbt))
-				goto do_scripts;
+			if (is_report_browser(hbt)) {
+				actions->thread = NULL;
+				actions->ms.sym = NULL;
+				do_run_script(browser, actions);
+			}
 			continue;
 		case 's':
-			if (is_report_browser(hbt))
-				goto do_data_switch;
+			if (is_report_browser(hbt)) {
+				key = do_switch_data(browser, actions);
+				if (key == K_SWITCH_INPUT_DATA)
+					goto out_free_stack;
+			}
 			continue;
 		case 'i':
 			/* env->arch is NULL for live-mode (i.e. perf top) */
@@ -1583,7 +1871,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 		case K_LEFT: {
 			const void *top;
 
-			if (pstack__empty(fstack)) {
+			if (pstack__empty(browser->pstack)) {
 				/*
 				 * Go back to the perf_evsel_menu__run or other user
 				 */
@@ -1591,11 +1879,17 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 					goto out_free_stack;
 				continue;
 			}
-			top = pstack__pop(fstack);
-			if (top == &browser->hists->dso_filter)
-				goto zoom_out_dso;
+			top = pstack__peek(browser->pstack);
+			if (top == &browser->hists->dso_filter) {
+				/*
+				 * No need to set actions->dso here since
+				 * it's just to remove the current filter.
+				 * Ditto for thread below.
+				 */
+				do_zoom_dso(browser, actions);
+			}
 			if (top == &browser->hists->thread_filter)
-				goto zoom_out_thread;
+				do_zoom_thread(browser, actions);
 			continue;
 		}
 		case K_ESC:
@@ -1623,196 +1917,71 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 			if (bi == NULL)
 				goto skip_annotation;
 
-			if (bi->from.sym != NULL &&
-			    !bi->from.map->dso->annotate_warned &&
-			    asprintf(&options[nr_options], "Annotate %s", bi->from.sym->name) > 0) {
-				annotate_f = nr_options++;
-			}
-
-			if (bi->to.sym != NULL &&
-			    !bi->to.map->dso->annotate_warned &&
-			    (bi->to.sym != bi->from.sym ||
-			     bi->to.map->dso != bi->from.map->dso) &&
-			    asprintf(&options[nr_options], "Annotate %s", bi->to.sym->name) > 0) {
-				annotate_t = nr_options++;
-			}
+			nr_options += add_annotate_opt(browser,
+						       &actions[nr_options],
+						       &options[nr_options],
+						       bi->from.map,
+						       bi->from.sym);
+			if (bi->to.sym != bi->from.sym)
+				nr_options += add_annotate_opt(browser,
+							&actions[nr_options],
+							&options[nr_options],
+							bi->to.map,
+							bi->to.sym);
 		} else {
-			if (browser->selection->sym != NULL &&
-			    !browser->selection->map->dso->annotate_warned) {
-				struct annotation *notes;
-
-				notes = symbol__annotation(browser->selection->sym);
-
-				if (notes->src &&
-				    asprintf(&options[nr_options], "Annotate %s",
-						 browser->selection->sym->name) > 0) {
-					annotate = nr_options++;
-				}
-			}
+			nr_options += add_annotate_opt(browser,
+						       &actions[nr_options],
+						       &options[nr_options],
+						       browser->selection->map,
+						       browser->selection->sym);
 		}
 skip_annotation:
-		if (thread != NULL &&
-		    asprintf(&options[nr_options], "Zoom %s %s(%d) thread",
-			     (browser->hists->thread_filter ? "out of" : "into"),
-			     (thread->comm_set ? thread__comm_str(thread) : ""),
-			     thread->tid) > 0)
-			zoom_thread = nr_options++;
-
-		if (dso != NULL &&
-		    asprintf(&options[nr_options], "Zoom %s %s DSO",
-			     (browser->hists->dso_filter ? "out of" : "into"),
-			     (dso->kernel ? "the Kernel" : dso->short_name)) > 0)
-			zoom_dso = nr_options++;
-
-		if (browser->selection != NULL &&
-		    browser->selection->map != NULL &&
-		    asprintf(&options[nr_options], "Browse map details") > 0)
-			browse_map = nr_options++;
+		nr_options += add_thread_opt(browser, &actions[nr_options],
+					     &options[nr_options], thread);
+		nr_options += add_dso_opt(browser, &actions[nr_options],
+					  &options[nr_options], dso);
+		nr_options += add_map_opt(browser, &actions[nr_options],
+					  &options[nr_options],
+					  browser->selection->map);
 
 		/* perf script support */
 		if (browser->he_selection) {
-			struct symbol *sym;
-
-			if (asprintf(&options[nr_options], "Run scripts for samples of thread [%s]",
-				     thread__comm_str(browser->he_selection->thread)) > 0)
-				scripts_comm = nr_options++;
-
-			sym = browser->he_selection->ms.sym;
-			if (sym && sym->namelen &&
-				asprintf(&options[nr_options], "Run scripts for samples of symbol [%s]",
-						sym->name) > 0)
-				scripts_symbol = nr_options++;
+			nr_options += add_script_opt(browser,
+						     &actions[nr_options],
+						     &options[nr_options],
+						     thread, NULL);
+			nr_options += add_script_opt(browser,
+						     &actions[nr_options],
+						     &options[nr_options],
+						     NULL, browser->selection->sym);
 		}
-
-		if (asprintf(&options[nr_options], "Run scripts for all samples") > 0)
-			scripts_all = nr_options++;
-
-		if (is_report_browser(hbt) && asprintf(&options[nr_options],
-				"Switch to another data file in PWD") > 0)
-			switch_data = nr_options++;
+		nr_options += add_script_opt(browser, &actions[nr_options],
+					     &options[nr_options], NULL, NULL);
+		nr_options += add_switch_opt(browser, &actions[nr_options],
+					     &options[nr_options]);
 add_exit_option:
-		options[nr_options++] = (char *)"Exit";
-retry_popup_menu:
-		choice = ui__popup_menu(nr_options, options);
-
-		if (choice == nr_options - 1)
-			break;
+		nr_options += add_exit_opt(browser, &actions[nr_options],
+					   &options[nr_options]);
 
-		if (choice == -1) {
-			free_popup_options(options, nr_options - 1);
-			continue;
-		}
-
-		if (choice == annotate || choice == annotate_t || choice == annotate_f) {
-			struct hist_entry *he;
-			struct annotation *notes;
-			struct map_symbol ms;
-			int err;
-do_annotate:
-			if (!objdump_path && perf_session_env__lookup_objdump(env))
-				continue;
-
-			he = hist_browser__selected_entry(browser);
-			if (he == NULL)
-				continue;
-
-			if (choice == annotate_f) {
-				ms.map = he->branch_info->from.map;
-				ms.sym = he->branch_info->from.sym;
-			} else if (choice == annotate_t) {
-				ms.map = he->branch_info->to.map;
-				ms.sym = he->branch_info->to.sym;
-			} else {
-				ms = *browser->selection;
-			}
-
-			notes = symbol__annotation(ms.sym);
-			if (!notes->src)
-				continue;
-
-			err = map_symbol__tui_annotate(&ms, evsel, hbt);
-			/*
-			 * offer option to annotate the other branch source or target
-			 * (if they exists) when returning from annotate
-			 */
-			if ((err == 'q' || err == CTRL('c'))
-			    && annotate_t != -2 && annotate_f != -2)
-				goto retry_popup_menu;
-
-			ui_browser__update_nr_entries(&browser->b, browser->hists->nr_entries);
-			if (err)
-				ui_browser__handle_resize(&browser->b);
-
-		} else if (choice == browse_map)
-			map__browse(browser->selection->map);
-		else if (choice == zoom_dso) {
-zoom_dso:
-			if (browser->hists->dso_filter) {
-				pstack__remove(fstack, &browser->hists->dso_filter);
-zoom_out_dso:
-				ui_helpline__pop();
-				browser->hists->dso_filter = NULL;
-				perf_hpp__set_elide(HISTC_DSO, false);
-			} else {
-				if (dso == NULL)
-					continue;
-				ui_helpline__fpush("To zoom out press <- or -> + \"Zoom out of %s DSO\"",
-						   dso->kernel ? "the Kernel" : dso->short_name);
-				browser->hists->dso_filter = dso;
-				perf_hpp__set_elide(HISTC_DSO, true);
-				pstack__push(fstack, &browser->hists->dso_filter);
-			}
-			hists__filter_by_dso(hists);
-			hist_browser__reset(browser);
-		} else if (choice == zoom_thread) {
-zoom_thread:
-			if (browser->hists->thread_filter) {
-				pstack__remove(fstack, &browser->hists->thread_filter);
-zoom_out_thread:
-				ui_helpline__pop();
-				thread__zput(browser->hists->thread_filter);
-				perf_hpp__set_elide(HISTC_THREAD, false);
-			} else {
-				ui_helpline__fpush("To zoom out press <- or -> + \"Zoom out of %s(%d) thread\"",
-						   thread->comm_set ? thread__comm_str(thread) : "",
-						   thread->tid);
-				browser->hists->thread_filter = thread__get(thread);
-				perf_hpp__set_elide(HISTC_THREAD, false);
-				pstack__push(fstack, &browser->hists->thread_filter);
-			}
-			hists__filter_by_thread(hists);
-			hist_browser__reset(browser);
-		}
-		/* perf scripts support */
-		else if (choice == scripts_all || choice == scripts_comm ||
-				choice == scripts_symbol) {
-do_scripts:
-			memset(script_opt, 0, 64);
+		do {
+			struct popup_action *act;
 
-			if (choice == scripts_comm)
-				sprintf(script_opt, " -c %s ", thread__comm_str(browser->he_selection->thread));
+			choice = ui__popup_menu(nr_options, options);
+			if (choice == -1 || choice >= nr_options)
+				break;
 
-			if (choice == scripts_symbol)
-				sprintf(script_opt, " -S %s ", browser->he_selection->ms.sym->name);
+			act = &actions[choice];
+			key = act->fn(browser, act);
+		} while (key == 1);
 
-			script_browse(script_opt);
-		}
-		/* Switch to another data file */
-		else if (choice == switch_data) {
-do_data_switch:
-			if (!switch_data_file()) {
-				key = K_SWITCH_INPUT_DATA;
-				break;
-			} else
-				ui__warning("Won't switch the data files due to\n"
-					"no valid data file get selected!\n");
-		}
+		if (key == K_SWITCH_INPUT_DATA)
+			break;
 	}
 out_free_stack:
-	pstack__delete(fstack);
+	pstack__delete(browser->pstack);
 out:
 	hist_browser__delete(browser);
-	free_popup_options(options, nr_options - 1);
+	free_popup_options(options, MAX_OPTIONS);
 	return key;
 }
 
diff --git a/tools/perf/ui/tui/setup.c b/tools/perf/ui/tui/setup.c
index b77e1d7713637c..60d1f29b4b50a9 100644
--- a/tools/perf/ui/tui/setup.c
+++ b/tools/perf/ui/tui/setup.c
@@ -129,7 +129,7 @@ int ui__init(void)
 	err = SLsmg_init_smg();
 	if (err < 0)
 		goto out;
-	err = SLang_init_tty(0, 0, 0);
+	err = SLang_init_tty(-1, 0, 0);
 	if (err < 0)
 		goto out;
 
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 797490a4007560..e4b676de2f6435 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -74,6 +74,8 @@ libperf-y += data.o
 libperf-$(CONFIG_X86) += tsc.o
 libperf-y += cloexec.o
 libperf-y += thread-stack.o
+libperf-$(CONFIG_AUXTRACE) += auxtrace.o
+libperf-y += parse-branch-options.o
 
 libperf-$(CONFIG_LIBELF) += symbol-elf.o
 libperf-$(CONFIG_LIBELF) += probe-event.o
@@ -101,23 +103,23 @@ CFLAGS_exec_cmd.o += -DPERF_EXEC_PATH="BUILD_STR($(perfexecdir_SQ))" -DPREFIX="B
 
 $(OUTPUT)util/parse-events-flex.c: util/parse-events.l $(OUTPUT)util/parse-events-bison.c
 	$(call rule_mkdir)
-	@$(call echo-cmd,flex)$(FLEX) -o $@ --header-file=$(OUTPUT)util/parse-events-flex.h $(PARSER_DEBUG_FLEX) util/parse-events.l
+	$(Q)$(call echo-cmd,flex)$(FLEX) -o $@ --header-file=$(OUTPUT)util/parse-events-flex.h $(PARSER_DEBUG_FLEX) util/parse-events.l
 
 $(OUTPUT)util/parse-events-bison.c: util/parse-events.y
 	$(call rule_mkdir)
-	@$(call echo-cmd,bison)$(BISON) -v util/parse-events.y -d $(PARSER_DEBUG_BISON) -o $@ -p parse_events_
+	$(Q)$(call echo-cmd,bison)$(BISON) -v util/parse-events.y -d $(PARSER_DEBUG_BISON) -o $@ -p parse_events_
 
 $(OUTPUT)util/pmu-flex.c: util/pmu.l $(OUTPUT)util/pmu-bison.c
 	$(call rule_mkdir)
-	@$(call echo-cmd,flex)$(FLEX) -o $@ --header-file=$(OUTPUT)util/pmu-flex.h util/pmu.l
+	$(Q)$(call echo-cmd,flex)$(FLEX) -o $@ --header-file=$(OUTPUT)util/pmu-flex.h util/pmu.l
 
 $(OUTPUT)util/pmu-bison.c: util/pmu.y
 	$(call rule_mkdir)
-	@$(call echo-cmd,bison)$(BISON) -v util/pmu.y -d -o $@ -p perf_pmu_
+	$(Q)$(call echo-cmd,bison)$(BISON) -v util/pmu.y -d -o $@ -p perf_pmu_
 
 CFLAGS_parse-events-flex.o  += -w
 CFLAGS_pmu-flex.o           += -w
-CFLAGS_parse-events-bison.o += -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w
+CFLAGS_parse-events-bison.o += -DYYENABLE_NLS=0 -w
 CFLAGS_pmu-bison.o          += -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w
 
 $(OUTPUT)util/parse-events.o: $(OUTPUT)util/parse-events-flex.c $(OUTPUT)util/parse-events-bison.c
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 7f5bdfc9bc87d1..bf804300990987 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -506,6 +506,17 @@ static int __symbol__inc_addr_samples(struct symbol *sym, struct map *map,
 	return 0;
 }
 
+static struct annotation *symbol__get_annotation(struct symbol *sym)
+{
+	struct annotation *notes = symbol__annotation(sym);
+
+	if (notes->src == NULL) {
+		if (symbol__alloc_hist(sym) < 0)
+			return NULL;
+	}
+	return notes;
+}
+
 static int symbol__inc_addr_samples(struct symbol *sym, struct map *map,
 				    int evidx, u64 addr)
 {
@@ -513,13 +524,9 @@ static int symbol__inc_addr_samples(struct symbol *sym, struct map *map,
 
 	if (sym == NULL)
 		return 0;
-
-	notes = symbol__annotation(sym);
-	if (notes->src == NULL) {
-		if (symbol__alloc_hist(sym) < 0)
-			return -ENOMEM;
-	}
-
+	notes = symbol__get_annotation(sym);
+	if (notes == NULL)
+		return -ENOMEM;
 	return __symbol__inc_addr_samples(sym, map, notes, evidx, addr);
 }
 
diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c
new file mode 100644
index 00000000000000..df66966cfde7ab
--- /dev/null
+++ b/tools/perf/util/auxtrace.c
@@ -0,0 +1,1352 @@
+/*
+ * auxtrace.c: AUX area trace support
+ * Copyright (c) 2013-2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <stdbool.h>
+
+#include <linux/kernel.h>
+#include <linux/perf_event.h>
+#include <linux/types.h>
+#include <linux/bitops.h>
+#include <linux/log2.h>
+#include <linux/string.h>
+
+#include <sys/param.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+#include <errno.h>
+#include <linux/list.h>
+
+#include "../perf.h"
+#include "util.h"
+#include "evlist.h"
+#include "cpumap.h"
+#include "thread_map.h"
+#include "asm/bug.h"
+#include "auxtrace.h"
+
+#include <linux/hash.h>
+
+#include "event.h"
+#include "session.h"
+#include "debug.h"
+#include "parse-options.h"
+
+int auxtrace_mmap__mmap(struct auxtrace_mmap *mm,
+			struct auxtrace_mmap_params *mp,
+			void *userpg, int fd)
+{
+	struct perf_event_mmap_page *pc = userpg;
+
+#if BITS_PER_LONG != 64 && !defined(HAVE_SYNC_COMPARE_AND_SWAP_SUPPORT)
+	pr_err("Cannot use AUX area tracing mmaps\n");
+	return -1;
+#endif
+
+	WARN_ONCE(mm->base, "Uninitialized auxtrace_mmap\n");
+
+	mm->userpg = userpg;
+	mm->mask = mp->mask;
+	mm->len = mp->len;
+	mm->prev = 0;
+	mm->idx = mp->idx;
+	mm->tid = mp->tid;
+	mm->cpu = mp->cpu;
+
+	if (!mp->len) {
+		mm->base = NULL;
+		return 0;
+	}
+
+	pc->aux_offset = mp->offset;
+	pc->aux_size = mp->len;
+
+	mm->base = mmap(NULL, mp->len, mp->prot, MAP_SHARED, fd, mp->offset);
+	if (mm->base == MAP_FAILED) {
+		pr_debug2("failed to mmap AUX area\n");
+		mm->base = NULL;
+		return -1;
+	}
+
+	return 0;
+}
+
+void auxtrace_mmap__munmap(struct auxtrace_mmap *mm)
+{
+	if (mm->base) {
+		munmap(mm->base, mm->len);
+		mm->base = NULL;
+	}
+}
+
+void auxtrace_mmap_params__init(struct auxtrace_mmap_params *mp,
+				off_t auxtrace_offset,
+				unsigned int auxtrace_pages,
+				bool auxtrace_overwrite)
+{
+	if (auxtrace_pages) {
+		mp->offset = auxtrace_offset;
+		mp->len = auxtrace_pages * (size_t)page_size;
+		mp->mask = is_power_of_2(mp->len) ? mp->len - 1 : 0;
+		mp->prot = PROT_READ | (auxtrace_overwrite ? 0 : PROT_WRITE);
+		pr_debug2("AUX area mmap length %zu\n", mp->len);
+	} else {
+		mp->len = 0;
+	}
+}
+
+void auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp,
+				   struct perf_evlist *evlist, int idx,
+				   bool per_cpu)
+{
+	mp->idx = idx;
+
+	if (per_cpu) {
+		mp->cpu = evlist->cpus->map[idx];
+		if (evlist->threads)
+			mp->tid = evlist->threads->map[0];
+		else
+			mp->tid = -1;
+	} else {
+		mp->cpu = -1;
+		mp->tid = evlist->threads->map[idx];
+	}
+}
+
+#define AUXTRACE_INIT_NR_QUEUES	32
+
+static struct auxtrace_queue *auxtrace_alloc_queue_array(unsigned int nr_queues)
+{
+	struct auxtrace_queue *queue_array;
+	unsigned int max_nr_queues, i;
+
+	max_nr_queues = UINT_MAX / sizeof(struct auxtrace_queue);
+	if (nr_queues > max_nr_queues)
+		return NULL;
+
+	queue_array = calloc(nr_queues, sizeof(struct auxtrace_queue));
+	if (!queue_array)
+		return NULL;
+
+	for (i = 0; i < nr_queues; i++) {
+		INIT_LIST_HEAD(&queue_array[i].head);
+		queue_array[i].priv = NULL;
+	}
+
+	return queue_array;
+}
+
+int auxtrace_queues__init(struct auxtrace_queues *queues)
+{
+	queues->nr_queues = AUXTRACE_INIT_NR_QUEUES;
+	queues->queue_array = auxtrace_alloc_queue_array(queues->nr_queues);
+	if (!queues->queue_array)
+		return -ENOMEM;
+	return 0;
+}
+
+static int auxtrace_queues__grow(struct auxtrace_queues *queues,
+				 unsigned int new_nr_queues)
+{
+	unsigned int nr_queues = queues->nr_queues;
+	struct auxtrace_queue *queue_array;
+	unsigned int i;
+
+	if (!nr_queues)
+		nr_queues = AUXTRACE_INIT_NR_QUEUES;
+
+	while (nr_queues && nr_queues < new_nr_queues)
+		nr_queues <<= 1;
+
+	if (nr_queues < queues->nr_queues || nr_queues < new_nr_queues)
+		return -EINVAL;
+
+	queue_array = auxtrace_alloc_queue_array(nr_queues);
+	if (!queue_array)
+		return -ENOMEM;
+
+	for (i = 0; i < queues->nr_queues; i++) {
+		list_splice_tail(&queues->queue_array[i].head,
+				 &queue_array[i].head);
+		queue_array[i].priv = queues->queue_array[i].priv;
+	}
+
+	queues->nr_queues = nr_queues;
+	queues->queue_array = queue_array;
+
+	return 0;
+}
+
+static void *auxtrace_copy_data(u64 size, struct perf_session *session)
+{
+	int fd = perf_data_file__fd(session->file);
+	void *p;
+	ssize_t ret;
+
+	if (size > SSIZE_MAX)
+		return NULL;
+
+	p = malloc(size);
+	if (!p)
+		return NULL;
+
+	ret = readn(fd, p, size);
+	if (ret != (ssize_t)size) {
+		free(p);
+		return NULL;
+	}
+
+	return p;
+}
+
+static int auxtrace_queues__add_buffer(struct auxtrace_queues *queues,
+				       unsigned int idx,
+				       struct auxtrace_buffer *buffer)
+{
+	struct auxtrace_queue *queue;
+	int err;
+
+	if (idx >= queues->nr_queues) {
+		err = auxtrace_queues__grow(queues, idx + 1);
+		if (err)
+			return err;
+	}
+
+	queue = &queues->queue_array[idx];
+
+	if (!queue->set) {
+		queue->set = true;
+		queue->tid = buffer->tid;
+		queue->cpu = buffer->cpu;
+	} else if (buffer->cpu != queue->cpu || buffer->tid != queue->tid) {
+		pr_err("auxtrace queue conflict: cpu %d, tid %d vs cpu %d, tid %d\n",
+		       queue->cpu, queue->tid, buffer->cpu, buffer->tid);
+		return -EINVAL;
+	}
+
+	buffer->buffer_nr = queues->next_buffer_nr++;
+
+	list_add_tail(&buffer->list, &queue->head);
+
+	queues->new_data = true;
+	queues->populated = true;
+
+	return 0;
+}
+
+/* Limit buffers to 32MiB on 32-bit */
+#define BUFFER_LIMIT_FOR_32_BIT (32 * 1024 * 1024)
+
+static int auxtrace_queues__split_buffer(struct auxtrace_queues *queues,
+					 unsigned int idx,
+					 struct auxtrace_buffer *buffer)
+{
+	u64 sz = buffer->size;
+	bool consecutive = false;
+	struct auxtrace_buffer *b;
+	int err;
+
+	while (sz > BUFFER_LIMIT_FOR_32_BIT) {
+		b = memdup(buffer, sizeof(struct auxtrace_buffer));
+		if (!b)
+			return -ENOMEM;
+		b->size = BUFFER_LIMIT_FOR_32_BIT;
+		b->consecutive = consecutive;
+		err = auxtrace_queues__add_buffer(queues, idx, b);
+		if (err) {
+			auxtrace_buffer__free(b);
+			return err;
+		}
+		buffer->data_offset += BUFFER_LIMIT_FOR_32_BIT;
+		sz -= BUFFER_LIMIT_FOR_32_BIT;
+		consecutive = true;
+	}
+
+	buffer->size = sz;
+	buffer->consecutive = consecutive;
+
+	return 0;
+}
+
+static int auxtrace_queues__add_event_buffer(struct auxtrace_queues *queues,
+					     struct perf_session *session,
+					     unsigned int idx,
+					     struct auxtrace_buffer *buffer)
+{
+	if (session->one_mmap) {
+		buffer->data = buffer->data_offset - session->one_mmap_offset +
+			       session->one_mmap_addr;
+	} else if (perf_data_file__is_pipe(session->file)) {
+		buffer->data = auxtrace_copy_data(buffer->size, session);
+		if (!buffer->data)
+			return -ENOMEM;
+		buffer->data_needs_freeing = true;
+	} else if (BITS_PER_LONG == 32 &&
+		   buffer->size > BUFFER_LIMIT_FOR_32_BIT) {
+		int err;
+
+		err = auxtrace_queues__split_buffer(queues, idx, buffer);
+		if (err)
+			return err;
+	}
+
+	return auxtrace_queues__add_buffer(queues, idx, buffer);
+}
+
+int auxtrace_queues__add_event(struct auxtrace_queues *queues,
+			       struct perf_session *session,
+			       union perf_event *event, off_t data_offset,
+			       struct auxtrace_buffer **buffer_ptr)
+{
+	struct auxtrace_buffer *buffer;
+	unsigned int idx;
+	int err;
+
+	buffer = zalloc(sizeof(struct auxtrace_buffer));
+	if (!buffer)
+		return -ENOMEM;
+
+	buffer->pid = -1;
+	buffer->tid = event->auxtrace.tid;
+	buffer->cpu = event->auxtrace.cpu;
+	buffer->data_offset = data_offset;
+	buffer->offset = event->auxtrace.offset;
+	buffer->reference = event->auxtrace.reference;
+	buffer->size = event->auxtrace.size;
+	idx = event->auxtrace.idx;
+
+	err = auxtrace_queues__add_event_buffer(queues, session, idx, buffer);
+	if (err)
+		goto out_err;
+
+	if (buffer_ptr)
+		*buffer_ptr = buffer;
+
+	return 0;
+
+out_err:
+	auxtrace_buffer__free(buffer);
+	return err;
+}
+
+static int auxtrace_queues__add_indexed_event(struct auxtrace_queues *queues,
+					      struct perf_session *session,
+					      off_t file_offset, size_t sz)
+{
+	union perf_event *event;
+	int err;
+	char buf[PERF_SAMPLE_MAX_SIZE];
+
+	err = perf_session__peek_event(session, file_offset, buf,
+				       PERF_SAMPLE_MAX_SIZE, &event, NULL);
+	if (err)
+		return err;
+
+	if (event->header.type == PERF_RECORD_AUXTRACE) {
+		if (event->header.size < sizeof(struct auxtrace_event) ||
+		    event->header.size != sz) {
+			err = -EINVAL;
+			goto out;
+		}
+		file_offset += event->header.size;
+		err = auxtrace_queues__add_event(queues, session, event,
+						 file_offset, NULL);
+	}
+out:
+	return err;
+}
+
+void auxtrace_queues__free(struct auxtrace_queues *queues)
+{
+	unsigned int i;
+
+	for (i = 0; i < queues->nr_queues; i++) {
+		while (!list_empty(&queues->queue_array[i].head)) {
+			struct auxtrace_buffer *buffer;
+
+			buffer = list_entry(queues->queue_array[i].head.next,
+					    struct auxtrace_buffer, list);
+			list_del(&buffer->list);
+			auxtrace_buffer__free(buffer);
+		}
+	}
+
+	zfree(&queues->queue_array);
+	queues->nr_queues = 0;
+}
+
+static void auxtrace_heapify(struct auxtrace_heap_item *heap_array,
+			     unsigned int pos, unsigned int queue_nr,
+			     u64 ordinal)
+{
+	unsigned int parent;
+
+	while (pos) {
+		parent = (pos - 1) >> 1;
+		if (heap_array[parent].ordinal <= ordinal)
+			break;
+		heap_array[pos] = heap_array[parent];
+		pos = parent;
+	}
+	heap_array[pos].queue_nr = queue_nr;
+	heap_array[pos].ordinal = ordinal;
+}
+
+int auxtrace_heap__add(struct auxtrace_heap *heap, unsigned int queue_nr,
+		       u64 ordinal)
+{
+	struct auxtrace_heap_item *heap_array;
+
+	if (queue_nr >= heap->heap_sz) {
+		unsigned int heap_sz = AUXTRACE_INIT_NR_QUEUES;
+
+		while (heap_sz <= queue_nr)
+			heap_sz <<= 1;
+		heap_array = realloc(heap->heap_array,
+				     heap_sz * sizeof(struct auxtrace_heap_item));
+		if (!heap_array)
+			return -ENOMEM;
+		heap->heap_array = heap_array;
+		heap->heap_sz = heap_sz;
+	}
+
+	auxtrace_heapify(heap->heap_array, heap->heap_cnt++, queue_nr, ordinal);
+
+	return 0;
+}
+
+void auxtrace_heap__free(struct auxtrace_heap *heap)
+{
+	zfree(&heap->heap_array);
+	heap->heap_cnt = 0;
+	heap->heap_sz = 0;
+}
+
+void auxtrace_heap__pop(struct auxtrace_heap *heap)
+{
+	unsigned int pos, last, heap_cnt = heap->heap_cnt;
+	struct auxtrace_heap_item *heap_array;
+
+	if (!heap_cnt)
+		return;
+
+	heap->heap_cnt -= 1;
+
+	heap_array = heap->heap_array;
+
+	pos = 0;
+	while (1) {
+		unsigned int left, right;
+
+		left = (pos << 1) + 1;
+		if (left >= heap_cnt)
+			break;
+		right = left + 1;
+		if (right >= heap_cnt) {
+			heap_array[pos] = heap_array[left];
+			return;
+		}
+		if (heap_array[left].ordinal < heap_array[right].ordinal) {
+			heap_array[pos] = heap_array[left];
+			pos = left;
+		} else {
+			heap_array[pos] = heap_array[right];
+			pos = right;
+		}
+	}
+
+	last = heap_cnt - 1;
+	auxtrace_heapify(heap_array, pos, heap_array[last].queue_nr,
+			 heap_array[last].ordinal);
+}
+
+size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr)
+{
+	if (itr)
+		return itr->info_priv_size(itr);
+	return 0;
+}
+
+static int auxtrace_not_supported(void)
+{
+	pr_err("AUX area tracing is not supported on this architecture\n");
+	return -EINVAL;
+}
+
+int auxtrace_record__info_fill(struct auxtrace_record *itr,
+			       struct perf_session *session,
+			       struct auxtrace_info_event *auxtrace_info,
+			       size_t priv_size)
+{
+	if (itr)
+		return itr->info_fill(itr, session, auxtrace_info, priv_size);
+	return auxtrace_not_supported();
+}
+
+void auxtrace_record__free(struct auxtrace_record *itr)
+{
+	if (itr)
+		itr->free(itr);
+}
+
+int auxtrace_record__snapshot_start(struct auxtrace_record *itr)
+{
+	if (itr && itr->snapshot_start)
+		return itr->snapshot_start(itr);
+	return 0;
+}
+
+int auxtrace_record__snapshot_finish(struct auxtrace_record *itr)
+{
+	if (itr && itr->snapshot_finish)
+		return itr->snapshot_finish(itr);
+	return 0;
+}
+
+int auxtrace_record__find_snapshot(struct auxtrace_record *itr, int idx,
+				   struct auxtrace_mmap *mm,
+				   unsigned char *data, u64 *head, u64 *old)
+{
+	if (itr && itr->find_snapshot)
+		return itr->find_snapshot(itr, idx, mm, data, head, old);
+	return 0;
+}
+
+int auxtrace_record__options(struct auxtrace_record *itr,
+			     struct perf_evlist *evlist,
+			     struct record_opts *opts)
+{
+	if (itr)
+		return itr->recording_options(itr, evlist, opts);
+	return 0;
+}
+
+u64 auxtrace_record__reference(struct auxtrace_record *itr)
+{
+	if (itr)
+		return itr->reference(itr);
+	return 0;
+}
+
+int auxtrace_parse_snapshot_options(struct auxtrace_record *itr,
+				    struct record_opts *opts, const char *str)
+{
+	if (!str)
+		return 0;
+
+	if (itr)
+		return itr->parse_snapshot_options(itr, opts, str);
+
+	pr_err("No AUX area tracing to snapshot\n");
+	return -EINVAL;
+}
+
+struct auxtrace_record *__weak
+auxtrace_record__init(struct perf_evlist *evlist __maybe_unused, int *err)
+{
+	*err = 0;
+	return NULL;
+}
+
+static int auxtrace_index__alloc(struct list_head *head)
+{
+	struct auxtrace_index *auxtrace_index;
+
+	auxtrace_index = malloc(sizeof(struct auxtrace_index));
+	if (!auxtrace_index)
+		return -ENOMEM;
+
+	auxtrace_index->nr = 0;
+	INIT_LIST_HEAD(&auxtrace_index->list);
+
+	list_add_tail(&auxtrace_index->list, head);
+
+	return 0;
+}
+
+void auxtrace_index__free(struct list_head *head)
+{
+	struct auxtrace_index *auxtrace_index, *n;
+
+	list_for_each_entry_safe(auxtrace_index, n, head, list) {
+		list_del(&auxtrace_index->list);
+		free(auxtrace_index);
+	}
+}
+
+static struct auxtrace_index *auxtrace_index__last(struct list_head *head)
+{
+	struct auxtrace_index *auxtrace_index;
+	int err;
+
+	if (list_empty(head)) {
+		err = auxtrace_index__alloc(head);
+		if (err)
+			return NULL;
+	}
+
+	auxtrace_index = list_entry(head->prev, struct auxtrace_index, list);
+
+	if (auxtrace_index->nr >= PERF_AUXTRACE_INDEX_ENTRY_COUNT) {
+		err = auxtrace_index__alloc(head);
+		if (err)
+			return NULL;
+		auxtrace_index = list_entry(head->prev, struct auxtrace_index,
+					    list);
+	}
+
+	return auxtrace_index;
+}
+
+int auxtrace_index__auxtrace_event(struct list_head *head,
+				   union perf_event *event, off_t file_offset)
+{
+	struct auxtrace_index *auxtrace_index;
+	size_t nr;
+
+	auxtrace_index = auxtrace_index__last(head);
+	if (!auxtrace_index)
+		return -ENOMEM;
+
+	nr = auxtrace_index->nr;
+	auxtrace_index->entries[nr].file_offset = file_offset;
+	auxtrace_index->entries[nr].sz = event->header.size;
+	auxtrace_index->nr += 1;
+
+	return 0;
+}
+
+static int auxtrace_index__do_write(int fd,
+				    struct auxtrace_index *auxtrace_index)
+{
+	struct auxtrace_index_entry ent;
+	size_t i;
+
+	for (i = 0; i < auxtrace_index->nr; i++) {
+		ent.file_offset = auxtrace_index->entries[i].file_offset;
+		ent.sz = auxtrace_index->entries[i].sz;
+		if (writen(fd, &ent, sizeof(ent)) != sizeof(ent))
+			return -errno;
+	}
+	return 0;
+}
+
+int auxtrace_index__write(int fd, struct list_head *head)
+{
+	struct auxtrace_index *auxtrace_index;
+	u64 total = 0;
+	int err;
+
+	list_for_each_entry(auxtrace_index, head, list)
+		total += auxtrace_index->nr;
+
+	if (writen(fd, &total, sizeof(total)) != sizeof(total))
+		return -errno;
+
+	list_for_each_entry(auxtrace_index, head, list) {
+		err = auxtrace_index__do_write(fd, auxtrace_index);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int auxtrace_index__process_entry(int fd, struct list_head *head,
+					 bool needs_swap)
+{
+	struct auxtrace_index *auxtrace_index;
+	struct auxtrace_index_entry ent;
+	size_t nr;
+
+	if (readn(fd, &ent, sizeof(ent)) != sizeof(ent))
+		return -1;
+
+	auxtrace_index = auxtrace_index__last(head);
+	if (!auxtrace_index)
+		return -1;
+
+	nr = auxtrace_index->nr;
+	if (needs_swap) {
+		auxtrace_index->entries[nr].file_offset =
+						bswap_64(ent.file_offset);
+		auxtrace_index->entries[nr].sz = bswap_64(ent.sz);
+	} else {
+		auxtrace_index->entries[nr].file_offset = ent.file_offset;
+		auxtrace_index->entries[nr].sz = ent.sz;
+	}
+
+	auxtrace_index->nr = nr + 1;
+
+	return 0;
+}
+
+int auxtrace_index__process(int fd, u64 size, struct perf_session *session,
+			    bool needs_swap)
+{
+	struct list_head *head = &session->auxtrace_index;
+	u64 nr;
+
+	if (readn(fd, &nr, sizeof(u64)) != sizeof(u64))
+		return -1;
+
+	if (needs_swap)
+		nr = bswap_64(nr);
+
+	if (sizeof(u64) + nr * sizeof(struct auxtrace_index_entry) > size)
+		return -1;
+
+	while (nr--) {
+		int err;
+
+		err = auxtrace_index__process_entry(fd, head, needs_swap);
+		if (err)
+			return -1;
+	}
+
+	return 0;
+}
+
+static int auxtrace_queues__process_index_entry(struct auxtrace_queues *queues,
+						struct perf_session *session,
+						struct auxtrace_index_entry *ent)
+{
+	return auxtrace_queues__add_indexed_event(queues, session,
+						  ent->file_offset, ent->sz);
+}
+
+int auxtrace_queues__process_index(struct auxtrace_queues *queues,
+				   struct perf_session *session)
+{
+	struct auxtrace_index *auxtrace_index;
+	struct auxtrace_index_entry *ent;
+	size_t i;
+	int err;
+
+	list_for_each_entry(auxtrace_index, &session->auxtrace_index, list) {
+		for (i = 0; i < auxtrace_index->nr; i++) {
+			ent = &auxtrace_index->entries[i];
+			err = auxtrace_queues__process_index_entry(queues,
+								   session,
+								   ent);
+			if (err)
+				return err;
+		}
+	}
+	return 0;
+}
+
+struct auxtrace_buffer *auxtrace_buffer__next(struct auxtrace_queue *queue,
+					      struct auxtrace_buffer *buffer)
+{
+	if (buffer) {
+		if (list_is_last(&buffer->list, &queue->head))
+			return NULL;
+		return list_entry(buffer->list.next, struct auxtrace_buffer,
+				  list);
+	} else {
+		if (list_empty(&queue->head))
+			return NULL;
+		return list_entry(queue->head.next, struct auxtrace_buffer,
+				  list);
+	}
+}
+
+void *auxtrace_buffer__get_data(struct auxtrace_buffer *buffer, int fd)
+{
+	size_t adj = buffer->data_offset & (page_size - 1);
+	size_t size = buffer->size + adj;
+	off_t file_offset = buffer->data_offset - adj;
+	void *addr;
+
+	if (buffer->data)
+		return buffer->data;
+
+	addr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, file_offset);
+	if (addr == MAP_FAILED)
+		return NULL;
+
+	buffer->mmap_addr = addr;
+	buffer->mmap_size = size;
+
+	buffer->data = addr + adj;
+
+	return buffer->data;
+}
+
+void auxtrace_buffer__put_data(struct auxtrace_buffer *buffer)
+{
+	if (!buffer->data || !buffer->mmap_addr)
+		return;
+	munmap(buffer->mmap_addr, buffer->mmap_size);
+	buffer->mmap_addr = NULL;
+	buffer->mmap_size = 0;
+	buffer->data = NULL;
+	buffer->use_data = NULL;
+}
+
+void auxtrace_buffer__drop_data(struct auxtrace_buffer *buffer)
+{
+	auxtrace_buffer__put_data(buffer);
+	if (buffer->data_needs_freeing) {
+		buffer->data_needs_freeing = false;
+		zfree(&buffer->data);
+		buffer->use_data = NULL;
+		buffer->size = 0;
+	}
+}
+
+void auxtrace_buffer__free(struct auxtrace_buffer *buffer)
+{
+	auxtrace_buffer__drop_data(buffer);
+	free(buffer);
+}
+
+void auxtrace_synth_error(struct auxtrace_error_event *auxtrace_error, int type,
+			  int code, int cpu, pid_t pid, pid_t tid, u64 ip,
+			  const char *msg)
+{
+	size_t size;
+
+	memset(auxtrace_error, 0, sizeof(struct auxtrace_error_event));
+
+	auxtrace_error->header.type = PERF_RECORD_AUXTRACE_ERROR;
+	auxtrace_error->type = type;
+	auxtrace_error->code = code;
+	auxtrace_error->cpu = cpu;
+	auxtrace_error->pid = pid;
+	auxtrace_error->tid = tid;
+	auxtrace_error->ip = ip;
+	strlcpy(auxtrace_error->msg, msg, MAX_AUXTRACE_ERROR_MSG);
+
+	size = (void *)auxtrace_error->msg - (void *)auxtrace_error +
+	       strlen(auxtrace_error->msg) + 1;
+	auxtrace_error->header.size = PERF_ALIGN(size, sizeof(u64));
+}
+
+int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr,
+					 struct perf_tool *tool,
+					 struct perf_session *session,
+					 perf_event__handler_t process)
+{
+	union perf_event *ev;
+	size_t priv_size;
+	int err;
+
+	pr_debug2("Synthesizing auxtrace information\n");
+	priv_size = auxtrace_record__info_priv_size(itr);
+	ev = zalloc(sizeof(struct auxtrace_info_event) + priv_size);
+	if (!ev)
+		return -ENOMEM;
+
+	ev->auxtrace_info.header.type = PERF_RECORD_AUXTRACE_INFO;
+	ev->auxtrace_info.header.size = sizeof(struct auxtrace_info_event) +
+					priv_size;
+	err = auxtrace_record__info_fill(itr, session, &ev->auxtrace_info,
+					 priv_size);
+	if (err)
+		goto out_free;
+
+	err = process(tool, ev, NULL, NULL);
+out_free:
+	free(ev);
+	return err;
+}
+
+static bool auxtrace__dont_decode(struct perf_session *session)
+{
+	return !session->itrace_synth_opts ||
+	       session->itrace_synth_opts->dont_decode;
+}
+
+int perf_event__process_auxtrace_info(struct perf_tool *tool __maybe_unused,
+				      union perf_event *event,
+				      struct perf_session *session __maybe_unused)
+{
+	enum auxtrace_type type = event->auxtrace_info.type;
+
+	if (dump_trace)
+		fprintf(stdout, " type: %u\n", type);
+
+	switch (type) {
+	case PERF_AUXTRACE_UNKNOWN:
+	default:
+		return -EINVAL;
+	}
+}
+
+s64 perf_event__process_auxtrace(struct perf_tool *tool,
+				 union perf_event *event,
+				 struct perf_session *session)
+{
+	s64 err;
+
+	if (dump_trace)
+		fprintf(stdout, " size: %#"PRIx64"  offset: %#"PRIx64"  ref: %#"PRIx64"  idx: %u  tid: %d  cpu: %d\n",
+			event->auxtrace.size, event->auxtrace.offset,
+			event->auxtrace.reference, event->auxtrace.idx,
+			event->auxtrace.tid, event->auxtrace.cpu);
+
+	if (auxtrace__dont_decode(session))
+		return event->auxtrace.size;
+
+	if (!session->auxtrace || event->header.type != PERF_RECORD_AUXTRACE)
+		return -EINVAL;
+
+	err = session->auxtrace->process_auxtrace_event(session, event, tool);
+	if (err < 0)
+		return err;
+
+	return event->auxtrace.size;
+}
+
+#define PERF_ITRACE_DEFAULT_PERIOD_TYPE		PERF_ITRACE_PERIOD_NANOSECS
+#define PERF_ITRACE_DEFAULT_PERIOD		100000
+#define PERF_ITRACE_DEFAULT_CALLCHAIN_SZ	16
+#define PERF_ITRACE_MAX_CALLCHAIN_SZ		1024
+
+void itrace_synth_opts__set_default(struct itrace_synth_opts *synth_opts)
+{
+	synth_opts->instructions = true;
+	synth_opts->branches = true;
+	synth_opts->transactions = true;
+	synth_opts->errors = true;
+	synth_opts->period_type = PERF_ITRACE_DEFAULT_PERIOD_TYPE;
+	synth_opts->period = PERF_ITRACE_DEFAULT_PERIOD;
+	synth_opts->callchain_sz = PERF_ITRACE_DEFAULT_CALLCHAIN_SZ;
+}
+
+/*
+ * Please check tools/perf/Documentation/perf-script.txt for information
+ * about the options parsed here, which is introduced after this cset,
+ * when support in 'perf script' for these options is introduced.
+ */
+int itrace_parse_synth_opts(const struct option *opt, const char *str,
+			    int unset)
+{
+	struct itrace_synth_opts *synth_opts = opt->value;
+	const char *p;
+	char *endptr;
+
+	synth_opts->set = true;
+
+	if (unset) {
+		synth_opts->dont_decode = true;
+		return 0;
+	}
+
+	if (!str) {
+		itrace_synth_opts__set_default(synth_opts);
+		return 0;
+	}
+
+	for (p = str; *p;) {
+		switch (*p++) {
+		case 'i':
+			synth_opts->instructions = true;
+			while (*p == ' ' || *p == ',')
+				p += 1;
+			if (isdigit(*p)) {
+				synth_opts->period = strtoull(p, &endptr, 10);
+				p = endptr;
+				while (*p == ' ' || *p == ',')
+					p += 1;
+				switch (*p++) {
+				case 'i':
+					synth_opts->period_type =
+						PERF_ITRACE_PERIOD_INSTRUCTIONS;
+					break;
+				case 't':
+					synth_opts->period_type =
+						PERF_ITRACE_PERIOD_TICKS;
+					break;
+				case 'm':
+					synth_opts->period *= 1000;
+					/* Fall through */
+				case 'u':
+					synth_opts->period *= 1000;
+					/* Fall through */
+				case 'n':
+					if (*p++ != 's')
+						goto out_err;
+					synth_opts->period_type =
+						PERF_ITRACE_PERIOD_NANOSECS;
+					break;
+				case '\0':
+					goto out;
+				default:
+					goto out_err;
+				}
+			}
+			break;
+		case 'b':
+			synth_opts->branches = true;
+			break;
+		case 'x':
+			synth_opts->transactions = true;
+			break;
+		case 'e':
+			synth_opts->errors = true;
+			break;
+		case 'd':
+			synth_opts->log = true;
+			break;
+		case 'c':
+			synth_opts->branches = true;
+			synth_opts->calls = true;
+			break;
+		case 'r':
+			synth_opts->branches = true;
+			synth_opts->returns = true;
+			break;
+		case 'g':
+			synth_opts->callchain = true;
+			synth_opts->callchain_sz =
+					PERF_ITRACE_DEFAULT_CALLCHAIN_SZ;
+			while (*p == ' ' || *p == ',')
+				p += 1;
+			if (isdigit(*p)) {
+				unsigned int val;
+
+				val = strtoul(p, &endptr, 10);
+				p = endptr;
+				if (!val || val > PERF_ITRACE_MAX_CALLCHAIN_SZ)
+					goto out_err;
+				synth_opts->callchain_sz = val;
+			}
+			break;
+		case ' ':
+		case ',':
+			break;
+		default:
+			goto out_err;
+		}
+	}
+out:
+	if (synth_opts->instructions) {
+		if (!synth_opts->period_type)
+			synth_opts->period_type =
+					PERF_ITRACE_DEFAULT_PERIOD_TYPE;
+		if (!synth_opts->period)
+			synth_opts->period = PERF_ITRACE_DEFAULT_PERIOD;
+	}
+
+	return 0;
+
+out_err:
+	pr_err("Bad Instruction Tracing options '%s'\n", str);
+	return -EINVAL;
+}
+
+static const char * const auxtrace_error_type_name[] = {
+	[PERF_AUXTRACE_ERROR_ITRACE] = "instruction trace",
+};
+
+static const char *auxtrace_error_name(int type)
+{
+	const char *error_type_name = NULL;
+
+	if (type < PERF_AUXTRACE_ERROR_MAX)
+		error_type_name = auxtrace_error_type_name[type];
+	if (!error_type_name)
+		error_type_name = "unknown AUX";
+	return error_type_name;
+}
+
+size_t perf_event__fprintf_auxtrace_error(union perf_event *event, FILE *fp)
+{
+	struct auxtrace_error_event *e = &event->auxtrace_error;
+	int ret;
+
+	ret = fprintf(fp, " %s error type %u",
+		      auxtrace_error_name(e->type), e->type);
+	ret += fprintf(fp, " cpu %d pid %d tid %d ip %#"PRIx64" code %u: %s\n",
+		       e->cpu, e->pid, e->tid, e->ip, e->code, e->msg);
+	return ret;
+}
+
+void perf_session__auxtrace_error_inc(struct perf_session *session,
+				      union perf_event *event)
+{
+	struct auxtrace_error_event *e = &event->auxtrace_error;
+
+	if (e->type < PERF_AUXTRACE_ERROR_MAX)
+		session->evlist->stats.nr_auxtrace_errors[e->type] += 1;
+}
+
+void events_stats__auxtrace_error_warn(const struct events_stats *stats)
+{
+	int i;
+
+	for (i = 0; i < PERF_AUXTRACE_ERROR_MAX; i++) {
+		if (!stats->nr_auxtrace_errors[i])
+			continue;
+		ui__warning("%u %s errors\n",
+			    stats->nr_auxtrace_errors[i],
+			    auxtrace_error_name(i));
+	}
+}
+
+int perf_event__process_auxtrace_error(struct perf_tool *tool __maybe_unused,
+				       union perf_event *event,
+				       struct perf_session *session)
+{
+	if (auxtrace__dont_decode(session))
+		return 0;
+
+	perf_event__fprintf_auxtrace_error(event, stdout);
+	return 0;
+}
+
+static int __auxtrace_mmap__read(struct auxtrace_mmap *mm,
+				 struct auxtrace_record *itr,
+				 struct perf_tool *tool, process_auxtrace_t fn,
+				 bool snapshot, size_t snapshot_size)
+{
+	u64 head, old = mm->prev, offset, ref;
+	unsigned char *data = mm->base;
+	size_t size, head_off, old_off, len1, len2, padding;
+	union perf_event ev;
+	void *data1, *data2;
+
+	if (snapshot) {
+		head = auxtrace_mmap__read_snapshot_head(mm);
+		if (auxtrace_record__find_snapshot(itr, mm->idx, mm, data,
+						   &head, &old))
+			return -1;
+	} else {
+		head = auxtrace_mmap__read_head(mm);
+	}
+
+	if (old == head)
+		return 0;
+
+	pr_debug3("auxtrace idx %d old %#"PRIx64" head %#"PRIx64" diff %#"PRIx64"\n",
+		  mm->idx, old, head, head - old);
+
+	if (mm->mask) {
+		head_off = head & mm->mask;
+		old_off = old & mm->mask;
+	} else {
+		head_off = head % mm->len;
+		old_off = old % mm->len;
+	}
+
+	if (head_off > old_off)
+		size = head_off - old_off;
+	else
+		size = mm->len - (old_off - head_off);
+
+	if (snapshot && size > snapshot_size)
+		size = snapshot_size;
+
+	ref = auxtrace_record__reference(itr);
+
+	if (head > old || size <= head || mm->mask) {
+		offset = head - size;
+	} else {
+		/*
+		 * When the buffer size is not a power of 2, 'head' wraps at the
+		 * highest multiple of the buffer size, so we have to subtract
+		 * the remainder here.
+		 */
+		u64 rem = (0ULL - mm->len) % mm->len;
+
+		offset = head - size - rem;
+	}
+
+	if (size > head_off) {
+		len1 = size - head_off;
+		data1 = &data[mm->len - len1];
+		len2 = head_off;
+		data2 = &data[0];
+	} else {
+		len1 = size;
+		data1 = &data[head_off - len1];
+		len2 = 0;
+		data2 = NULL;
+	}
+
+	/* padding must be written by fn() e.g. record__process_auxtrace() */
+	padding = size & 7;
+	if (padding)
+		padding = 8 - padding;
+
+	memset(&ev, 0, sizeof(ev));
+	ev.auxtrace.header.type = PERF_RECORD_AUXTRACE;
+	ev.auxtrace.header.size = sizeof(ev.auxtrace);
+	ev.auxtrace.size = size + padding;
+	ev.auxtrace.offset = offset;
+	ev.auxtrace.reference = ref;
+	ev.auxtrace.idx = mm->idx;
+	ev.auxtrace.tid = mm->tid;
+	ev.auxtrace.cpu = mm->cpu;
+
+	if (fn(tool, &ev, data1, len1, data2, len2))
+		return -1;
+
+	mm->prev = head;
+
+	if (!snapshot) {
+		auxtrace_mmap__write_tail(mm, head);
+		if (itr->read_finish) {
+			int err;
+
+			err = itr->read_finish(itr, mm->idx);
+			if (err < 0)
+				return err;
+		}
+	}
+
+	return 1;
+}
+
+int auxtrace_mmap__read(struct auxtrace_mmap *mm, struct auxtrace_record *itr,
+			struct perf_tool *tool, process_auxtrace_t fn)
+{
+	return __auxtrace_mmap__read(mm, itr, tool, fn, false, 0);
+}
+
+int auxtrace_mmap__read_snapshot(struct auxtrace_mmap *mm,
+				 struct auxtrace_record *itr,
+				 struct perf_tool *tool, process_auxtrace_t fn,
+				 size_t snapshot_size)
+{
+	return __auxtrace_mmap__read(mm, itr, tool, fn, true, snapshot_size);
+}
+
+/**
+ * struct auxtrace_cache - hash table to implement a cache
+ * @hashtable: the hashtable
+ * @sz: hashtable size (number of hlists)
+ * @entry_size: size of an entry
+ * @limit: limit the number of entries to this maximum, when reached the cache
+ *         is dropped and caching begins again with an empty cache
+ * @cnt: current number of entries
+ * @bits: hashtable size (@sz = 2^@bits)
+ */
+struct auxtrace_cache {
+	struct hlist_head *hashtable;
+	size_t sz;
+	size_t entry_size;
+	size_t limit;
+	size_t cnt;
+	unsigned int bits;
+};
+
+struct auxtrace_cache *auxtrace_cache__new(unsigned int bits, size_t entry_size,
+					   unsigned int limit_percent)
+{
+	struct auxtrace_cache *c;
+	struct hlist_head *ht;
+	size_t sz, i;
+
+	c = zalloc(sizeof(struct auxtrace_cache));
+	if (!c)
+		return NULL;
+
+	sz = 1UL << bits;
+
+	ht = calloc(sz, sizeof(struct hlist_head));
+	if (!ht)
+		goto out_free;
+
+	for (i = 0; i < sz; i++)
+		INIT_HLIST_HEAD(&ht[i]);
+
+	c->hashtable = ht;
+	c->sz = sz;
+	c->entry_size = entry_size;
+	c->limit = (c->sz * limit_percent) / 100;
+	c->bits = bits;
+
+	return c;
+
+out_free:
+	free(c);
+	return NULL;
+}
+
+static void auxtrace_cache__drop(struct auxtrace_cache *c)
+{
+	struct auxtrace_cache_entry *entry;
+	struct hlist_node *tmp;
+	size_t i;
+
+	if (!c)
+		return;
+
+	for (i = 0; i < c->sz; i++) {
+		hlist_for_each_entry_safe(entry, tmp, &c->hashtable[i], hash) {
+			hlist_del(&entry->hash);
+			auxtrace_cache__free_entry(c, entry);
+		}
+	}
+
+	c->cnt = 0;
+}
+
+void auxtrace_cache__free(struct auxtrace_cache *c)
+{
+	if (!c)
+		return;
+
+	auxtrace_cache__drop(c);
+	free(c->hashtable);
+	free(c);
+}
+
+void *auxtrace_cache__alloc_entry(struct auxtrace_cache *c)
+{
+	return malloc(c->entry_size);
+}
+
+void auxtrace_cache__free_entry(struct auxtrace_cache *c __maybe_unused,
+				void *entry)
+{
+	free(entry);
+}
+
+int auxtrace_cache__add(struct auxtrace_cache *c, u32 key,
+			struct auxtrace_cache_entry *entry)
+{
+	if (c->limit && ++c->cnt > c->limit)
+		auxtrace_cache__drop(c);
+
+	entry->key = key;
+	hlist_add_head(&entry->hash, &c->hashtable[hash_32(key, c->bits)]);
+
+	return 0;
+}
+
+void *auxtrace_cache__lookup(struct auxtrace_cache *c, u32 key)
+{
+	struct auxtrace_cache_entry *entry;
+	struct hlist_head *hlist;
+
+	if (!c)
+		return NULL;
+
+	hlist = &c->hashtable[hash_32(key, c->bits)];
+	hlist_for_each_entry(entry, hlist, hash) {
+		if (entry->key == key)
+			return entry;
+	}
+
+	return NULL;
+}
diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h
new file mode 100644
index 00000000000000..a171abbe730146
--- /dev/null
+++ b/tools/perf/util/auxtrace.h
@@ -0,0 +1,643 @@
+/*
+ * auxtrace.h: AUX area trace support
+ * Copyright (c) 2013-2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#ifndef __PERF_AUXTRACE_H
+#define __PERF_AUXTRACE_H
+
+#include <sys/types.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <linux/list.h>
+#include <linux/perf_event.h>
+#include <linux/types.h>
+
+#include "../perf.h"
+#include "event.h"
+#include "session.h"
+#include "debug.h"
+
+union perf_event;
+struct perf_session;
+struct perf_evlist;
+struct perf_tool;
+struct option;
+struct record_opts;
+struct auxtrace_info_event;
+struct events_stats;
+
+enum auxtrace_type {
+	PERF_AUXTRACE_UNKNOWN,
+};
+
+enum itrace_period_type {
+	PERF_ITRACE_PERIOD_INSTRUCTIONS,
+	PERF_ITRACE_PERIOD_TICKS,
+	PERF_ITRACE_PERIOD_NANOSECS,
+};
+
+/**
+ * struct itrace_synth_opts - AUX area tracing synthesis options.
+ * @set: indicates whether or not options have been set
+ * @inject: indicates the event (not just the sample) must be fully synthesized
+ *          because 'perf inject' will write it out
+ * @instructions: whether to synthesize 'instructions' events
+ * @branches: whether to synthesize 'branches' events
+ * @transactions: whether to synthesize events for transactions
+ * @errors: whether to synthesize decoder error events
+ * @dont_decode: whether to skip decoding entirely
+ * @log: write a decoding log
+ * @calls: limit branch samples to calls (can be combined with @returns)
+ * @returns: limit branch samples to returns (can be combined with @calls)
+ * @callchain: add callchain to 'instructions' events
+ * @callchain_sz: maximum callchain size
+ * @period: 'instructions' events period
+ * @period_type: 'instructions' events period type
+ */
+struct itrace_synth_opts {
+	bool			set;
+	bool			inject;
+	bool			instructions;
+	bool			branches;
+	bool			transactions;
+	bool			errors;
+	bool			dont_decode;
+	bool			log;
+	bool			calls;
+	bool			returns;
+	bool			callchain;
+	unsigned int		callchain_sz;
+	unsigned long long	period;
+	enum itrace_period_type	period_type;
+};
+
+/**
+ * struct auxtrace_index_entry - indexes a AUX area tracing event within a
+ *                               perf.data file.
+ * @file_offset: offset within the perf.data file
+ * @sz: size of the event
+ */
+struct auxtrace_index_entry {
+	u64			file_offset;
+	u64			sz;
+};
+
+#define PERF_AUXTRACE_INDEX_ENTRY_COUNT 256
+
+/**
+ * struct auxtrace_index - index of AUX area tracing events within a perf.data
+ *                         file.
+ * @list: linking a number of arrays of entries
+ * @nr: number of entries
+ * @entries: array of entries
+ */
+struct auxtrace_index {
+	struct list_head	list;
+	size_t			nr;
+	struct auxtrace_index_entry entries[PERF_AUXTRACE_INDEX_ENTRY_COUNT];
+};
+
+/**
+ * struct auxtrace - session callbacks to allow AUX area data decoding.
+ * @process_event: lets the decoder see all session events
+ * @flush_events: process any remaining data
+ * @free_events: free resources associated with event processing
+ * @free: free resources associated with the session
+ */
+struct auxtrace {
+	int (*process_event)(struct perf_session *session,
+			     union perf_event *event,
+			     struct perf_sample *sample,
+			     struct perf_tool *tool);
+	int (*process_auxtrace_event)(struct perf_session *session,
+				      union perf_event *event,
+				      struct perf_tool *tool);
+	int (*flush_events)(struct perf_session *session,
+			    struct perf_tool *tool);
+	void (*free_events)(struct perf_session *session);
+	void (*free)(struct perf_session *session);
+};
+
+/**
+ * struct auxtrace_buffer - a buffer containing AUX area tracing data.
+ * @list: buffers are queued in a list held by struct auxtrace_queue
+ * @size: size of the buffer in bytes
+ * @pid: in per-thread mode, the pid this buffer is associated with
+ * @tid: in per-thread mode, the tid this buffer is associated with
+ * @cpu: in per-cpu mode, the cpu this buffer is associated with
+ * @data: actual buffer data (can be null if the data has not been loaded)
+ * @data_offset: file offset at which the buffer can be read
+ * @mmap_addr: mmap address at which the buffer can be read
+ * @mmap_size: size of the mmap at @mmap_addr
+ * @data_needs_freeing: @data was malloc'd so free it when it is no longer
+ *                      needed
+ * @consecutive: the original data was split up and this buffer is consecutive
+ *               to the previous buffer
+ * @offset: offset as determined by aux_head / aux_tail members of struct
+ *          perf_event_mmap_page
+ * @reference: an implementation-specific reference determined when the data is
+ *             recorded
+ * @buffer_nr: used to number each buffer
+ * @use_size: implementation actually only uses this number of bytes
+ * @use_data: implementation actually only uses data starting at this address
+ */
+struct auxtrace_buffer {
+	struct list_head	list;
+	size_t			size;
+	pid_t			pid;
+	pid_t			tid;
+	int			cpu;
+	void			*data;
+	off_t			data_offset;
+	void			*mmap_addr;
+	size_t			mmap_size;
+	bool			data_needs_freeing;
+	bool			consecutive;
+	u64			offset;
+	u64			reference;
+	u64			buffer_nr;
+	size_t			use_size;
+	void			*use_data;
+};
+
+/**
+ * struct auxtrace_queue - a queue of AUX area tracing data buffers.
+ * @head: head of buffer list
+ * @tid: in per-thread mode, the tid this queue is associated with
+ * @cpu: in per-cpu mode, the cpu this queue is associated with
+ * @set: %true once this queue has been dedicated to a specific thread or cpu
+ * @priv: implementation-specific data
+ */
+struct auxtrace_queue {
+	struct list_head	head;
+	pid_t			tid;
+	int			cpu;
+	bool			set;
+	void			*priv;
+};
+
+/**
+ * struct auxtrace_queues - an array of AUX area tracing queues.
+ * @queue_array: array of queues
+ * @nr_queues: number of queues
+ * @new_data: set whenever new data is queued
+ * @populated: queues have been fully populated using the auxtrace_index
+ * @next_buffer_nr: used to number each buffer
+ */
+struct auxtrace_queues {
+	struct auxtrace_queue	*queue_array;
+	unsigned int		nr_queues;
+	bool			new_data;
+	bool			populated;
+	u64			next_buffer_nr;
+};
+
+/**
+ * struct auxtrace_heap_item - element of struct auxtrace_heap.
+ * @queue_nr: queue number
+ * @ordinal: value used for sorting (lowest ordinal is top of the heap) expected
+ *           to be a timestamp
+ */
+struct auxtrace_heap_item {
+	unsigned int		queue_nr;
+	u64			ordinal;
+};
+
+/**
+ * struct auxtrace_heap - a heap suitable for sorting AUX area tracing queues.
+ * @heap_array: the heap
+ * @heap_cnt: the number of elements in the heap
+ * @heap_sz: maximum number of elements (grows as needed)
+ */
+struct auxtrace_heap {
+	struct auxtrace_heap_item	*heap_array;
+	unsigned int		heap_cnt;
+	unsigned int		heap_sz;
+};
+
+/**
+ * struct auxtrace_mmap - records an mmap of the auxtrace buffer.
+ * @base: address of mapped area
+ * @userpg: pointer to buffer's perf_event_mmap_page
+ * @mask: %0 if @len is not a power of two, otherwise (@len - %1)
+ * @len: size of mapped area
+ * @prev: previous aux_head
+ * @idx: index of this mmap
+ * @tid: tid for a per-thread mmap (also set if there is only 1 tid on a per-cpu
+ *       mmap) otherwise %0
+ * @cpu: cpu number for a per-cpu mmap otherwise %-1
+ */
+struct auxtrace_mmap {
+	void		*base;
+	void		*userpg;
+	size_t		mask;
+	size_t		len;
+	u64		prev;
+	int		idx;
+	pid_t		tid;
+	int		cpu;
+};
+
+/**
+ * struct auxtrace_mmap_params - parameters to set up struct auxtrace_mmap.
+ * @mask: %0 if @len is not a power of two, otherwise (@len - %1)
+ * @offset: file offset of mapped area
+ * @len: size of mapped area
+ * @prot: mmap memory protection
+ * @idx: index of this mmap
+ * @tid: tid for a per-thread mmap (also set if there is only 1 tid on a per-cpu
+ *       mmap) otherwise %0
+ * @cpu: cpu number for a per-cpu mmap otherwise %-1
+ */
+struct auxtrace_mmap_params {
+	size_t		mask;
+	off_t		offset;
+	size_t		len;
+	int		prot;
+	int		idx;
+	pid_t		tid;
+	int		cpu;
+};
+
+/**
+ * struct auxtrace_record - callbacks for recording AUX area data.
+ * @recording_options: validate and process recording options
+ * @info_priv_size: return the size of the private data in auxtrace_info_event
+ * @info_fill: fill-in the private data in auxtrace_info_event
+ * @free: free this auxtrace record structure
+ * @snapshot_start: starting a snapshot
+ * @snapshot_finish: finishing a snapshot
+ * @find_snapshot: find data to snapshot within auxtrace mmap
+ * @parse_snapshot_options: parse snapshot options
+ * @reference: provide a 64-bit reference number for auxtrace_event
+ * @read_finish: called after reading from an auxtrace mmap
+ */
+struct auxtrace_record {
+	int (*recording_options)(struct auxtrace_record *itr,
+				 struct perf_evlist *evlist,
+				 struct record_opts *opts);
+	size_t (*info_priv_size)(struct auxtrace_record *itr);
+	int (*info_fill)(struct auxtrace_record *itr,
+			 struct perf_session *session,
+			 struct auxtrace_info_event *auxtrace_info,
+			 size_t priv_size);
+	void (*free)(struct auxtrace_record *itr);
+	int (*snapshot_start)(struct auxtrace_record *itr);
+	int (*snapshot_finish)(struct auxtrace_record *itr);
+	int (*find_snapshot)(struct auxtrace_record *itr, int idx,
+			     struct auxtrace_mmap *mm, unsigned char *data,
+			     u64 *head, u64 *old);
+	int (*parse_snapshot_options)(struct auxtrace_record *itr,
+				      struct record_opts *opts,
+				      const char *str);
+	u64 (*reference)(struct auxtrace_record *itr);
+	int (*read_finish)(struct auxtrace_record *itr, int idx);
+};
+
+#ifdef HAVE_AUXTRACE_SUPPORT
+
+/*
+ * In snapshot mode the mmapped page is read-only which makes using
+ * __sync_val_compare_and_swap() problematic.  However, snapshot mode expects
+ * the buffer is not updated while the snapshot is made (e.g. Intel PT disables
+ * the event) so there is not a race anyway.
+ */
+static inline u64 auxtrace_mmap__read_snapshot_head(struct auxtrace_mmap *mm)
+{
+	struct perf_event_mmap_page *pc = mm->userpg;
+	u64 head = ACCESS_ONCE(pc->aux_head);
+
+	/* Ensure all reads are done after we read the head */
+	rmb();
+	return head;
+}
+
+static inline u64 auxtrace_mmap__read_head(struct auxtrace_mmap *mm)
+{
+	struct perf_event_mmap_page *pc = mm->userpg;
+#if BITS_PER_LONG == 64 || !defined(HAVE_SYNC_COMPARE_AND_SWAP_SUPPORT)
+	u64 head = ACCESS_ONCE(pc->aux_head);
+#else
+	u64 head = __sync_val_compare_and_swap(&pc->aux_head, 0, 0);
+#endif
+
+	/* Ensure all reads are done after we read the head */
+	rmb();
+	return head;
+}
+
+static inline void auxtrace_mmap__write_tail(struct auxtrace_mmap *mm, u64 tail)
+{
+	struct perf_event_mmap_page *pc = mm->userpg;
+#if BITS_PER_LONG != 64 && defined(HAVE_SYNC_COMPARE_AND_SWAP_SUPPORT)
+	u64 old_tail;
+#endif
+
+	/* Ensure all reads are done before we write the tail out */
+	mb();
+#if BITS_PER_LONG == 64 || !defined(HAVE_SYNC_COMPARE_AND_SWAP_SUPPORT)
+	pc->aux_tail = tail;
+#else
+	do {
+		old_tail = __sync_val_compare_and_swap(&pc->aux_tail, 0, 0);
+	} while (!__sync_bool_compare_and_swap(&pc->aux_tail, old_tail, tail));
+#endif
+}
+
+int auxtrace_mmap__mmap(struct auxtrace_mmap *mm,
+			struct auxtrace_mmap_params *mp,
+			void *userpg, int fd);
+void auxtrace_mmap__munmap(struct auxtrace_mmap *mm);
+void auxtrace_mmap_params__init(struct auxtrace_mmap_params *mp,
+				off_t auxtrace_offset,
+				unsigned int auxtrace_pages,
+				bool auxtrace_overwrite);
+void auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp,
+				   struct perf_evlist *evlist, int idx,
+				   bool per_cpu);
+
+typedef int (*process_auxtrace_t)(struct perf_tool *tool,
+				  union perf_event *event, void *data1,
+				  size_t len1, void *data2, size_t len2);
+
+int auxtrace_mmap__read(struct auxtrace_mmap *mm, struct auxtrace_record *itr,
+			struct perf_tool *tool, process_auxtrace_t fn);
+
+int auxtrace_mmap__read_snapshot(struct auxtrace_mmap *mm,
+				 struct auxtrace_record *itr,
+				 struct perf_tool *tool, process_auxtrace_t fn,
+				 size_t snapshot_size);
+
+int auxtrace_queues__init(struct auxtrace_queues *queues);
+int auxtrace_queues__add_event(struct auxtrace_queues *queues,
+			       struct perf_session *session,
+			       union perf_event *event, off_t data_offset,
+			       struct auxtrace_buffer **buffer_ptr);
+void auxtrace_queues__free(struct auxtrace_queues *queues);
+int auxtrace_queues__process_index(struct auxtrace_queues *queues,
+				   struct perf_session *session);
+struct auxtrace_buffer *auxtrace_buffer__next(struct auxtrace_queue *queue,
+					      struct auxtrace_buffer *buffer);
+void *auxtrace_buffer__get_data(struct auxtrace_buffer *buffer, int fd);
+void auxtrace_buffer__put_data(struct auxtrace_buffer *buffer);
+void auxtrace_buffer__drop_data(struct auxtrace_buffer *buffer);
+void auxtrace_buffer__free(struct auxtrace_buffer *buffer);
+
+int auxtrace_heap__add(struct auxtrace_heap *heap, unsigned int queue_nr,
+		       u64 ordinal);
+void auxtrace_heap__pop(struct auxtrace_heap *heap);
+void auxtrace_heap__free(struct auxtrace_heap *heap);
+
+struct auxtrace_cache_entry {
+	struct hlist_node hash;
+	u32 key;
+};
+
+struct auxtrace_cache *auxtrace_cache__new(unsigned int bits, size_t entry_size,
+					   unsigned int limit_percent);
+void auxtrace_cache__free(struct auxtrace_cache *auxtrace_cache);
+void *auxtrace_cache__alloc_entry(struct auxtrace_cache *c);
+void auxtrace_cache__free_entry(struct auxtrace_cache *c, void *entry);
+int auxtrace_cache__add(struct auxtrace_cache *c, u32 key,
+			struct auxtrace_cache_entry *entry);
+void *auxtrace_cache__lookup(struct auxtrace_cache *c, u32 key);
+
+struct auxtrace_record *auxtrace_record__init(struct perf_evlist *evlist,
+					      int *err);
+
+int auxtrace_parse_snapshot_options(struct auxtrace_record *itr,
+				    struct record_opts *opts,
+				    const char *str);
+int auxtrace_record__options(struct auxtrace_record *itr,
+			     struct perf_evlist *evlist,
+			     struct record_opts *opts);
+size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr);
+int auxtrace_record__info_fill(struct auxtrace_record *itr,
+			       struct perf_session *session,
+			       struct auxtrace_info_event *auxtrace_info,
+			       size_t priv_size);
+void auxtrace_record__free(struct auxtrace_record *itr);
+int auxtrace_record__snapshot_start(struct auxtrace_record *itr);
+int auxtrace_record__snapshot_finish(struct auxtrace_record *itr);
+int auxtrace_record__find_snapshot(struct auxtrace_record *itr, int idx,
+				   struct auxtrace_mmap *mm,
+				   unsigned char *data, u64 *head, u64 *old);
+u64 auxtrace_record__reference(struct auxtrace_record *itr);
+
+int auxtrace_index__auxtrace_event(struct list_head *head, union perf_event *event,
+				   off_t file_offset);
+int auxtrace_index__write(int fd, struct list_head *head);
+int auxtrace_index__process(int fd, u64 size, struct perf_session *session,
+			    bool needs_swap);
+void auxtrace_index__free(struct list_head *head);
+
+void auxtrace_synth_error(struct auxtrace_error_event *auxtrace_error, int type,
+			  int code, int cpu, pid_t pid, pid_t tid, u64 ip,
+			  const char *msg);
+
+int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr,
+					 struct perf_tool *tool,
+					 struct perf_session *session,
+					 perf_event__handler_t process);
+int perf_event__process_auxtrace_info(struct perf_tool *tool,
+				      union perf_event *event,
+				      struct perf_session *session);
+s64 perf_event__process_auxtrace(struct perf_tool *tool,
+				 union perf_event *event,
+				 struct perf_session *session);
+int perf_event__process_auxtrace_error(struct perf_tool *tool,
+				       union perf_event *event,
+				       struct perf_session *session);
+int itrace_parse_synth_opts(const struct option *opt, const char *str,
+			    int unset);
+void itrace_synth_opts__set_default(struct itrace_synth_opts *synth_opts);
+
+size_t perf_event__fprintf_auxtrace_error(union perf_event *event, FILE *fp);
+void perf_session__auxtrace_error_inc(struct perf_session *session,
+				      union perf_event *event);
+void events_stats__auxtrace_error_warn(const struct events_stats *stats);
+
+static inline int auxtrace__process_event(struct perf_session *session,
+					  union perf_event *event,
+					  struct perf_sample *sample,
+					  struct perf_tool *tool)
+{
+	if (!session->auxtrace)
+		return 0;
+
+	return session->auxtrace->process_event(session, event, sample, tool);
+}
+
+static inline int auxtrace__flush_events(struct perf_session *session,
+					 struct perf_tool *tool)
+{
+	if (!session->auxtrace)
+		return 0;
+
+	return session->auxtrace->flush_events(session, tool);
+}
+
+static inline void auxtrace__free_events(struct perf_session *session)
+{
+	if (!session->auxtrace)
+		return;
+
+	return session->auxtrace->free_events(session);
+}
+
+static inline void auxtrace__free(struct perf_session *session)
+{
+	if (!session->auxtrace)
+		return;
+
+	return session->auxtrace->free(session);
+}
+
+#else
+
+static inline struct auxtrace_record *
+auxtrace_record__init(struct perf_evlist *evlist __maybe_unused,
+		      int *err __maybe_unused)
+{
+	*err = 0;
+	return NULL;
+}
+
+static inline
+void auxtrace_record__free(struct auxtrace_record *itr __maybe_unused)
+{
+}
+
+static inline int
+perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr __maybe_unused,
+				     struct perf_tool *tool __maybe_unused,
+				     struct perf_session *session __maybe_unused,
+				     perf_event__handler_t process __maybe_unused)
+{
+	return -EINVAL;
+}
+
+static inline
+int auxtrace_record__options(struct auxtrace_record *itr __maybe_unused,
+			     struct perf_evlist *evlist __maybe_unused,
+			     struct record_opts *opts __maybe_unused)
+{
+	return 0;
+}
+
+#define perf_event__process_auxtrace_info		0
+#define perf_event__process_auxtrace			0
+#define perf_event__process_auxtrace_error		0
+
+static inline
+void perf_session__auxtrace_error_inc(struct perf_session *session
+				      __maybe_unused,
+				      union perf_event *event
+				      __maybe_unused)
+{
+}
+
+static inline
+void events_stats__auxtrace_error_warn(const struct events_stats *stats
+				       __maybe_unused)
+{
+}
+
+static inline
+int itrace_parse_synth_opts(const struct option *opt __maybe_unused,
+			    const char *str __maybe_unused,
+			    int unset __maybe_unused)
+{
+	pr_err("AUX area tracing not supported\n");
+	return -EINVAL;
+}
+
+static inline
+int auxtrace_parse_snapshot_options(struct auxtrace_record *itr __maybe_unused,
+				    struct record_opts *opts __maybe_unused,
+				    const char *str)
+{
+	if (!str)
+		return 0;
+	pr_err("AUX area tracing not supported\n");
+	return -EINVAL;
+}
+
+static inline
+int auxtrace__process_event(struct perf_session *session __maybe_unused,
+			    union perf_event *event __maybe_unused,
+			    struct perf_sample *sample __maybe_unused,
+			    struct perf_tool *tool __maybe_unused)
+{
+	return 0;
+}
+
+static inline
+int auxtrace__flush_events(struct perf_session *session __maybe_unused,
+			   struct perf_tool *tool __maybe_unused)
+{
+	return 0;
+}
+
+static inline
+void auxtrace__free_events(struct perf_session *session __maybe_unused)
+{
+}
+
+static inline
+void auxtrace_cache__free(struct auxtrace_cache *auxtrace_cache __maybe_unused)
+{
+}
+
+static inline
+void auxtrace__free(struct perf_session *session __maybe_unused)
+{
+}
+
+static inline
+int auxtrace_index__write(int fd __maybe_unused,
+			  struct list_head *head __maybe_unused)
+{
+	return -EINVAL;
+}
+
+static inline
+int auxtrace_index__process(int fd __maybe_unused,
+			    u64 size __maybe_unused,
+			    struct perf_session *session __maybe_unused,
+			    bool needs_swap __maybe_unused)
+{
+	return -EINVAL;
+}
+
+static inline
+void auxtrace_index__free(struct list_head *head __maybe_unused)
+{
+}
+
+int auxtrace_mmap__mmap(struct auxtrace_mmap *mm,
+			struct auxtrace_mmap_params *mp,
+			void *userpg, int fd);
+void auxtrace_mmap__munmap(struct auxtrace_mmap *mm);
+void auxtrace_mmap_params__init(struct auxtrace_mmap_params *mp,
+				off_t auxtrace_offset,
+				unsigned int auxtrace_pages,
+				bool auxtrace_overwrite);
+void auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp,
+				   struct perf_evlist *evlist, int idx,
+				   bool per_cpu);
+
+#endif
+
+#endif
diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c
index 61867dff5d5aa6..1f6fc2323ef97d 100644
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -43,6 +43,7 @@ int build_id__mark_dso_hit(struct perf_tool *tool __maybe_unused,
 	if (al.map != NULL)
 		al.map->dso->hit = 1;
 
+	thread__put(thread);
 	return 0;
 }
 
@@ -59,8 +60,10 @@ static int perf_event__exit_del_thread(struct perf_tool *tool __maybe_unused,
 	dump_printf("(%d:%d):(%d:%d)\n", event->fork.pid, event->fork.tid,
 		    event->fork.ppid, event->fork.ptid);
 
-	if (thread)
+	if (thread) {
 		machine__remove_thread(machine, thread);
+		thread__put(thread);
+	}
 
 	return 0;
 }
@@ -159,15 +162,20 @@ static int write_buildid(const char *name, size_t name_len, u8 *build_id,
 	return write_padded(fd, name, name_len + 1, len);
 }
 
-static int __dsos__write_buildid_table(struct list_head *head,
-				       struct machine *machine,
-				       pid_t pid, u16 misc, int fd)
+static int machine__write_buildid_table(struct machine *machine, int fd)
 {
+	int err = 0;
 	char nm[PATH_MAX];
 	struct dso *pos;
+	u16 kmisc = PERF_RECORD_MISC_KERNEL,
+	    umisc = PERF_RECORD_MISC_USER;
 
-	dsos__for_each_with_build_id(pos, head) {
-		int err;
+	if (!machine__is_host(machine)) {
+		kmisc = PERF_RECORD_MISC_GUEST_KERNEL;
+		umisc = PERF_RECORD_MISC_GUEST_USER;
+	}
+
+	dsos__for_each_with_build_id(pos, &machine->dsos.head) {
 		const char *name;
 		size_t name_len;
 
@@ -186,32 +194,12 @@ static int __dsos__write_buildid_table(struct list_head *head,
 			name_len = pos->long_name_len + 1;
 		}
 
-		err = write_buildid(name, name_len, pos->build_id,
-				    pid, misc, fd);
+		err = write_buildid(name, name_len, pos->build_id, machine->pid,
+				    pos->kernel ? kmisc : umisc, fd);
 		if (err)
-			return err;
-	}
-
-	return 0;
-}
-
-static int machine__write_buildid_table(struct machine *machine, int fd)
-{
-	int err;
-	u16 kmisc = PERF_RECORD_MISC_KERNEL,
-	    umisc = PERF_RECORD_MISC_USER;
-
-	if (!machine__is_host(machine)) {
-		kmisc = PERF_RECORD_MISC_GUEST_KERNEL;
-		umisc = PERF_RECORD_MISC_GUEST_USER;
+			break;
 	}
 
-	err = __dsos__write_buildid_table(&machine->kernel_dsos.head, machine,
-					  machine->pid, kmisc, fd);
-	if (err == 0)
-		err = __dsos__write_buildid_table(&machine->user_dsos.head,
-						  machine, machine->pid, umisc,
-						  fd);
 	return err;
 }
 
@@ -244,13 +232,7 @@ static int __dsos__hit_all(struct list_head *head)
 
 static int machine__hit_all_dsos(struct machine *machine)
 {
-	int err;
-
-	err = __dsos__hit_all(&machine->kernel_dsos.head);
-	if (err)
-		return err;
-
-	return __dsos__hit_all(&machine->user_dsos.head);
+	return __dsos__hit_all(&machine->dsos.head);
 }
 
 int dsos__hit_all(struct perf_session *session)
@@ -490,9 +472,7 @@ static int __dsos__cache_build_ids(struct list_head *head,
 
 static int machine__cache_build_ids(struct machine *machine)
 {
-	int ret = __dsos__cache_build_ids(&machine->kernel_dsos.head, machine);
-	ret |= __dsos__cache_build_ids(&machine->user_dsos.head, machine);
-	return ret;
+	return __dsos__cache_build_ids(&machine->dsos.head, machine);
 }
 
 int perf_session__cache_build_ids(struct perf_session *session)
@@ -517,11 +497,7 @@ int perf_session__cache_build_ids(struct perf_session *session)
 
 static bool machine__read_build_ids(struct machine *machine, bool with_hits)
 {
-	bool ret;
-
-	ret  = __dsos__read_build_ids(&machine->kernel_dsos.head, with_hits);
-	ret |= __dsos__read_build_ids(&machine->user_dsos.head, with_hits);
-	return ret;
+	return __dsos__read_build_ids(&machine->dsos.head, with_hits);
 }
 
 bool perf_session__read_build_ids(struct perf_session *session, bool with_hits)
diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h
index fbcca21d66ab9b..c861373aaed33d 100644
--- a/tools/perf/util/cache.h
+++ b/tools/perf/util/cache.h
@@ -30,7 +30,6 @@ extern const char *perf_config_dirname(const char *, const char *);
 
 /* pager.c */
 extern void setup_pager(void);
-extern const char *pager_program;
 extern int pager_in_use(void);
 extern int pager_use_color;
 
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 6033a0a212ca5c..679c2c6d8ade7d 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -72,6 +72,10 @@ extern struct callchain_param callchain_param;
 struct callchain_list {
 	u64			ip;
 	struct map_symbol	ms;
+	struct /* for TUI */ {
+		bool		unfolded;
+		bool		has_children;
+	};
 	char		       *srcline;
 	struct list_head	list;
 };
diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c
index 88f7be3994321f..32e12ecfe9c576 100644
--- a/tools/perf/util/cgroup.c
+++ b/tools/perf/util/cgroup.c
@@ -115,23 +115,19 @@ static int add_cgroup(struct perf_evlist *evlist, char *str)
 			goto found;
 		n++;
 	}
-	if (cgrp->refcnt == 0)
+	if (atomic_read(&cgrp->refcnt) == 0)
 		free(cgrp);
 
 	return -1;
 found:
-	cgrp->refcnt++;
+	atomic_inc(&cgrp->refcnt);
 	counter->cgrp = cgrp;
 	return 0;
 }
 
 void close_cgroup(struct cgroup_sel *cgrp)
 {
-	if (!cgrp)
-		return;
-
-	/* XXX: not reentrant */
-	if (--cgrp->refcnt == 0) {
+	if (cgrp && atomic_dec_and_test(&cgrp->refcnt)) {
 		close(cgrp->fd);
 		zfree(&cgrp->name);
 		free(cgrp);
diff --git a/tools/perf/util/cgroup.h b/tools/perf/util/cgroup.h
index 89acd6debdc5fc..b4b8cb42fe5e04 100644
--- a/tools/perf/util/cgroup.h
+++ b/tools/perf/util/cgroup.h
@@ -1,12 +1,14 @@
 #ifndef __CGROUP_H__
 #define __CGROUP_H__
 
+#include <linux/atomic.h>
+
 struct option;
 
 struct cgroup_sel {
 	char *name;
 	int fd;
-	int refcnt;
+	atomic_t refcnt;
 };
 
 
diff --git a/tools/perf/util/comm.c b/tools/perf/util/comm.c
index b2bb59df65e10c..21b7ff382c3f0d 100644
--- a/tools/perf/util/comm.c
+++ b/tools/perf/util/comm.c
@@ -2,24 +2,27 @@
 #include "util.h"
 #include <stdlib.h>
 #include <stdio.h>
+#include <linux/atomic.h>
 
 struct comm_str {
 	char *str;
 	struct rb_node rb_node;
-	int ref;
+	atomic_t refcnt;
 };
 
 /* Should perhaps be moved to struct machine */
 static struct rb_root comm_str_root;
 
-static void comm_str__get(struct comm_str *cs)
+static struct comm_str *comm_str__get(struct comm_str *cs)
 {
-	cs->ref++;
+	if (cs)
+		atomic_inc(&cs->refcnt);
+	return cs;
 }
 
 static void comm_str__put(struct comm_str *cs)
 {
-	if (!--cs->ref) {
+	if (cs && atomic_dec_and_test(&cs->refcnt)) {
 		rb_erase(&cs->rb_node, &comm_str_root);
 		zfree(&cs->str);
 		free(cs);
@@ -40,6 +43,8 @@ static struct comm_str *comm_str__alloc(const char *str)
 		return NULL;
 	}
 
+	atomic_set(&cs->refcnt, 0);
+
 	return cs;
 }
 
diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c
index dd17c9a32fbcfc..5bfc1198ab465c 100644
--- a/tools/perf/util/data-convert-bt.c
+++ b/tools/perf/util/data-convert-bt.c
@@ -14,6 +14,7 @@
 #include <babeltrace/ctf-writer/event.h>
 #include <babeltrace/ctf-writer/event-types.h>
 #include <babeltrace/ctf-writer/event-fields.h>
+#include <babeltrace/ctf-ir/utils.h>
 #include <babeltrace/ctf/events.h>
 #include <traceevent/event-parse.h>
 #include "asm/bug.h"
@@ -38,12 +39,21 @@ struct evsel_priv {
 	struct bt_ctf_event_class *event_class;
 };
 
+#define MAX_CPUS	4096
+
+struct ctf_stream {
+	struct bt_ctf_stream *stream;
+	int cpu;
+	u32 count;
+};
+
 struct ctf_writer {
 	/* writer primitives */
-	struct bt_ctf_writer		*writer;
-	struct bt_ctf_stream		*stream;
-	struct bt_ctf_stream_class	*stream_class;
-	struct bt_ctf_clock		*clock;
+	struct bt_ctf_writer		 *writer;
+	struct ctf_stream		**stream;
+	int				  stream_cnt;
+	struct bt_ctf_stream_class	 *stream_class;
+	struct bt_ctf_clock		 *clock;
 
 	/* data types */
 	union {
@@ -65,6 +75,9 @@ struct convert {
 
 	u64			events_size;
 	u64			events_count;
+
+	/* Ordered events configured queue size. */
+	u64			queue_size;
 };
 
 static int value_set(struct bt_ctf_field_type *type,
@@ -153,6 +166,43 @@ get_tracepoint_field_type(struct ctf_writer *cw, struct format_field *field)
 		return cw->data.u32;
 }
 
+static unsigned long long adjust_signedness(unsigned long long value_int, int size)
+{
+	unsigned long long value_mask;
+
+	/*
+	 * value_mask = (1 << (size * 8 - 1)) - 1.
+	 * Directly set value_mask for code readers.
+	 */
+	switch (size) {
+	case 1:
+		value_mask = 0x7fULL;
+		break;
+	case 2:
+		value_mask = 0x7fffULL;
+		break;
+	case 4:
+		value_mask = 0x7fffffffULL;
+		break;
+	case 8:
+		/*
+		 * For 64 bit value, return it self. There is no need
+		 * to fill high bit.
+		 */
+		/* Fall through */
+	default:
+		/* BUG! */
+		return value_int;
+	}
+
+	/* If it is a positive value, don't adjust. */
+	if ((value_int & (~0ULL - value_mask)) == 0)
+		return value_int;
+
+	/* Fill upper part of value_int with 1 to make it a negative long long. */
+	return (value_int & value_mask) | ~value_mask;
+}
+
 static int add_tracepoint_field_value(struct ctf_writer *cw,
 				      struct bt_ctf_event_class *event_class,
 				      struct bt_ctf_event *event,
@@ -164,7 +214,6 @@ static int add_tracepoint_field_value(struct ctf_writer *cw,
 	struct bt_ctf_field *field;
 	const char *name = fmtf->name;
 	void *data = sample->raw_data;
-	unsigned long long value_int;
 	unsigned long flags = fmtf->flags;
 	unsigned int n_items;
 	unsigned int i;
@@ -172,6 +221,7 @@ static int add_tracepoint_field_value(struct ctf_writer *cw,
 	unsigned int len;
 	int ret;
 
+	name = fmtf->alias;
 	offset = fmtf->offset;
 	len = fmtf->size;
 	if (flags & FIELD_IS_STRING)
@@ -208,11 +258,6 @@ static int add_tracepoint_field_value(struct ctf_writer *cw,
 	type = get_tracepoint_field_type(cw, fmtf);
 
 	for (i = 0; i < n_items; i++) {
-		if (!(flags & FIELD_IS_STRING))
-			value_int = pevent_read_number(
-					fmtf->event->pevent,
-					data + offset + i * len, len);
-
 		if (flags & FIELD_IS_ARRAY)
 			field = bt_ctf_field_array_get_field(array_field, i);
 		else
@@ -226,12 +271,21 @@ static int add_tracepoint_field_value(struct ctf_writer *cw,
 		if (flags & FIELD_IS_STRING)
 			ret = bt_ctf_field_string_set_value(field,
 					data + offset + i * len);
-		else if (!(flags & FIELD_IS_SIGNED))
-			ret = bt_ctf_field_unsigned_integer_set_value(
-					field, value_int);
-		else
-			ret = bt_ctf_field_signed_integer_set_value(
-					field, value_int);
+		else {
+			unsigned long long value_int;
+
+			value_int = pevent_read_number(
+					fmtf->event->pevent,
+					data + offset + i * len, len);
+
+			if (!(flags & FIELD_IS_SIGNED))
+				ret = bt_ctf_field_unsigned_integer_set_value(
+						field, value_int);
+			else
+				ret = bt_ctf_field_signed_integer_set_value(
+						field, adjust_signedness(value_int, len));
+		}
+
 		if (ret) {
 			pr_err("failed to set file value %s\n", name);
 			goto err_put_field;
@@ -346,12 +400,6 @@ static int add_generic_values(struct ctf_writer *cw,
 			return -1;
 	}
 
-	if (type & PERF_SAMPLE_CPU) {
-		ret = value_set_u32(cw, event, "perf_cpu", sample->cpu);
-		if (ret)
-			return -1;
-	}
-
 	if (type & PERF_SAMPLE_PERIOD) {
 		ret = value_set_u64(cw, event, "perf_period", sample->period);
 		if (ret)
@@ -381,6 +429,129 @@ static int add_generic_values(struct ctf_writer *cw,
 	return 0;
 }
 
+static int ctf_stream__flush(struct ctf_stream *cs)
+{
+	int err = 0;
+
+	if (cs) {
+		err = bt_ctf_stream_flush(cs->stream);
+		if (err)
+			pr_err("CTF stream %d flush failed\n", cs->cpu);
+
+		pr("Flush stream for cpu %d (%u samples)\n",
+		   cs->cpu, cs->count);
+
+		cs->count = 0;
+	}
+
+	return err;
+}
+
+static struct ctf_stream *ctf_stream__create(struct ctf_writer *cw, int cpu)
+{
+	struct ctf_stream *cs;
+	struct bt_ctf_field *pkt_ctx   = NULL;
+	struct bt_ctf_field *cpu_field = NULL;
+	struct bt_ctf_stream *stream   = NULL;
+	int ret;
+
+	cs = zalloc(sizeof(*cs));
+	if (!cs) {
+		pr_err("Failed to allocate ctf stream\n");
+		return NULL;
+	}
+
+	stream = bt_ctf_writer_create_stream(cw->writer, cw->stream_class);
+	if (!stream) {
+		pr_err("Failed to create CTF stream\n");
+		goto out;
+	}
+
+	pkt_ctx = bt_ctf_stream_get_packet_context(stream);
+	if (!pkt_ctx) {
+		pr_err("Failed to obtain packet context\n");
+		goto out;
+	}
+
+	cpu_field = bt_ctf_field_structure_get_field(pkt_ctx, "cpu_id");
+	bt_ctf_field_put(pkt_ctx);
+	if (!cpu_field) {
+		pr_err("Failed to obtain cpu field\n");
+		goto out;
+	}
+
+	ret = bt_ctf_field_unsigned_integer_set_value(cpu_field, (u32) cpu);
+	if (ret) {
+		pr_err("Failed to update CPU number\n");
+		goto out;
+	}
+
+	bt_ctf_field_put(cpu_field);
+
+	cs->cpu    = cpu;
+	cs->stream = stream;
+	return cs;
+
+out:
+	if (cpu_field)
+		bt_ctf_field_put(cpu_field);
+	if (stream)
+		bt_ctf_stream_put(stream);
+
+	free(cs);
+	return NULL;
+}
+
+static void ctf_stream__delete(struct ctf_stream *cs)
+{
+	if (cs) {
+		bt_ctf_stream_put(cs->stream);
+		free(cs);
+	}
+}
+
+static struct ctf_stream *ctf_stream(struct ctf_writer *cw, int cpu)
+{
+	struct ctf_stream *cs = cw->stream[cpu];
+
+	if (!cs) {
+		cs = ctf_stream__create(cw, cpu);
+		cw->stream[cpu] = cs;
+	}
+
+	return cs;
+}
+
+static int get_sample_cpu(struct ctf_writer *cw, struct perf_sample *sample,
+			  struct perf_evsel *evsel)
+{
+	int cpu = 0;
+
+	if (evsel->attr.sample_type & PERF_SAMPLE_CPU)
+		cpu = sample->cpu;
+
+	if (cpu > cw->stream_cnt) {
+		pr_err("Event was recorded for CPU %d, limit is at %d.\n",
+			cpu, cw->stream_cnt);
+		cpu = 0;
+	}
+
+	return cpu;
+}
+
+#define STREAM_FLUSH_COUNT 100000
+
+/*
+ * Currently we have no other way to determine the
+ * time for the stream flush other than keep track
+ * of the number of events and check it against
+ * threshold.
+ */
+static bool is_flush_needed(struct ctf_stream *cs)
+{
+	return cs->count >= STREAM_FLUSH_COUNT;
+}
+
 static int process_sample_event(struct perf_tool *tool,
 				union perf_event *_event __maybe_unused,
 				struct perf_sample *sample,
@@ -390,6 +561,7 @@ static int process_sample_event(struct perf_tool *tool,
 	struct convert *c = container_of(tool, struct convert, tool);
 	struct evsel_priv *priv = evsel->priv;
 	struct ctf_writer *cw = &c->writer;
+	struct ctf_stream *cs;
 	struct bt_ctf_event_class *event_class;
 	struct bt_ctf_event *event;
 	int ret;
@@ -424,9 +596,93 @@ static int process_sample_event(struct perf_tool *tool,
 			return -1;
 	}
 
-	bt_ctf_stream_append_event(cw->stream, event);
+	cs = ctf_stream(cw, get_sample_cpu(cw, sample, evsel));
+	if (cs) {
+		if (is_flush_needed(cs))
+			ctf_stream__flush(cs);
+
+		cs->count++;
+		bt_ctf_stream_append_event(cs->stream, event);
+	}
+
 	bt_ctf_event_put(event);
-	return 0;
+	return cs ? 0 : -1;
+}
+
+/* If dup < 0, add a prefix. Else, add _dupl_X suffix. */
+static char *change_name(char *name, char *orig_name, int dup)
+{
+	char *new_name = NULL;
+	size_t len;
+
+	if (!name)
+		name = orig_name;
+
+	if (dup >= 10)
+		goto out;
+	/*
+	 * Add '_' prefix to potential keywork.  According to
+	 * Mathieu Desnoyers (https://lkml.org/lkml/2015/1/23/652),
+	 * futher CTF spec updating may require us to use '$'.
+	 */
+	if (dup < 0)
+		len = strlen(name) + sizeof("_");
+	else
+		len = strlen(orig_name) + sizeof("_dupl_X");
+
+	new_name = malloc(len);
+	if (!new_name)
+		goto out;
+
+	if (dup < 0)
+		snprintf(new_name, len, "_%s", name);
+	else
+		snprintf(new_name, len, "%s_dupl_%d", orig_name, dup);
+
+out:
+	if (name != orig_name)
+		free(name);
+	return new_name;
+}
+
+static int event_class_add_field(struct bt_ctf_event_class *event_class,
+		struct bt_ctf_field_type *type,
+		struct format_field *field)
+{
+	struct bt_ctf_field_type *t = NULL;
+	char *name;
+	int dup = 1;
+	int ret;
+
+	/* alias was already assigned */
+	if (field->alias != field->name)
+		return bt_ctf_event_class_add_field(event_class, type,
+				(char *)field->alias);
+
+	name = field->name;
+
+	/* If 'name' is a keywork, add prefix. */
+	if (bt_ctf_validate_identifier(name))
+		name = change_name(name, field->name, -1);
+
+	if (!name) {
+		pr_err("Failed to fix invalid identifier.");
+		return -1;
+	}
+	while ((t = bt_ctf_event_class_get_field_by_name(event_class, name))) {
+		bt_ctf_field_type_put(t);
+		name = change_name(name, field->name, dup++);
+		if (!name) {
+			pr_err("Failed to create dup name for '%s'\n", field->name);
+			return -1;
+		}
+	}
+
+	ret = bt_ctf_event_class_add_field(event_class, type, name);
+	if (!ret)
+		field->alias = name;
+
+	return ret;
 }
 
 static int add_tracepoint_fields_types(struct ctf_writer *cw,
@@ -457,14 +713,14 @@ static int add_tracepoint_fields_types(struct ctf_writer *cw,
 		if (flags & FIELD_IS_ARRAY)
 			type = bt_ctf_field_type_array_create(type, field->arraylen);
 
-		ret = bt_ctf_event_class_add_field(event_class, type,
-				field->name);
+		ret = event_class_add_field(event_class, type, field);
 
 		if (flags & FIELD_IS_ARRAY)
 			bt_ctf_field_type_put(type);
 
 		if (ret) {
-			pr_err("Failed to add field '%s\n", field->name);
+			pr_err("Failed to add field '%s': %d\n",
+					field->name, ret);
 			return -1;
 		}
 	}
@@ -508,7 +764,7 @@ static int add_generic_types(struct ctf_writer *cw, struct perf_evsel *evsel,
 	do {								\
 		pr2("  field '%s'\n", n);				\
 		if (bt_ctf_event_class_add_field(cl, t, n)) {		\
-			pr_err("Failed to add field '%s;\n", n);	\
+			pr_err("Failed to add field '%s';\n", n);	\
 			return -1;					\
 		}							\
 	} while (0)
@@ -528,9 +784,6 @@ static int add_generic_types(struct ctf_writer *cw, struct perf_evsel *evsel,
 	if (type & PERF_SAMPLE_STREAM_ID)
 		ADD_FIELD(event_class, cw->data.u64, "perf_stream_id");
 
-	if (type & PERF_SAMPLE_CPU)
-		ADD_FIELD(event_class, cw->data.u32, "perf_cpu");
-
 	if (type & PERF_SAMPLE_PERIOD)
 		ADD_FIELD(event_class, cw->data.u64, "perf_period");
 
@@ -604,6 +857,39 @@ static int setup_events(struct ctf_writer *cw, struct perf_session *session)
 	return 0;
 }
 
+static int setup_streams(struct ctf_writer *cw, struct perf_session *session)
+{
+	struct ctf_stream **stream;
+	struct perf_header *ph = &session->header;
+	int ncpus;
+
+	/*
+	 * Try to get the number of cpus used in the data file,
+	 * if not present fallback to the MAX_CPUS.
+	 */
+	ncpus = ph->env.nr_cpus_avail ?: MAX_CPUS;
+
+	stream = zalloc(sizeof(*stream) * ncpus);
+	if (!stream) {
+		pr_err("Failed to allocate streams.\n");
+		return -ENOMEM;
+	}
+
+	cw->stream     = stream;
+	cw->stream_cnt = ncpus;
+	return 0;
+}
+
+static void free_streams(struct ctf_writer *cw)
+{
+	int cpu;
+
+	for (cpu = 0; cpu < cw->stream_cnt; cpu++)
+		ctf_stream__delete(cw->stream[cpu]);
+
+	free(cw->stream);
+}
+
 static int ctf_writer__setup_env(struct ctf_writer *cw,
 				 struct perf_session *session)
 {
@@ -713,7 +999,7 @@ static void ctf_writer__cleanup(struct ctf_writer *cw)
 	ctf_writer__cleanup_data(cw);
 
 	bt_ctf_clock_put(cw->clock);
-	bt_ctf_stream_put(cw->stream);
+	free_streams(cw);
 	bt_ctf_stream_class_put(cw->stream_class);
 	bt_ctf_writer_put(cw->writer);
 
@@ -725,8 +1011,9 @@ static int ctf_writer__init(struct ctf_writer *cw, const char *path)
 {
 	struct bt_ctf_writer		*writer;
 	struct bt_ctf_stream_class	*stream_class;
-	struct bt_ctf_stream		*stream;
 	struct bt_ctf_clock		*clock;
+	struct bt_ctf_field_type	*pkt_ctx_type;
+	int				ret;
 
 	/* CTF writer */
 	writer = bt_ctf_writer_create(path);
@@ -767,14 +1054,15 @@ static int ctf_writer__init(struct ctf_writer *cw, const char *path)
 	if (ctf_writer__init_data(cw))
 		goto err_cleanup;
 
-	/* CTF stream instance */
-	stream = bt_ctf_writer_create_stream(writer, stream_class);
-	if (!stream) {
-		pr("Failed to create CTF stream.\n");
+	/* Add cpu_id for packet context */
+	pkt_ctx_type = bt_ctf_stream_class_get_packet_context_type(stream_class);
+	if (!pkt_ctx_type)
 		goto err_cleanup;
-	}
 
-	cw->stream = stream;
+	ret = bt_ctf_field_type_structure_add_field(pkt_ctx_type, cw->data.u32, "cpu_id");
+	bt_ctf_field_type_put(pkt_ctx_type);
+	if (ret)
+		goto err_cleanup;
 
 	/* CTF clock writer setup */
 	if (bt_ctf_writer_add_clock(writer, clock)) {
@@ -791,6 +1079,28 @@ err:
 	return -1;
 }
 
+static int ctf_writer__flush_streams(struct ctf_writer *cw)
+{
+	int cpu, ret = 0;
+
+	for (cpu = 0; cpu < cw->stream_cnt && !ret; cpu++)
+		ret = ctf_stream__flush(cw->stream[cpu]);
+
+	return ret;
+}
+
+static int convert__config(const char *var, const char *value, void *cb)
+{
+	struct convert *c = cb;
+
+	if (!strcmp(var, "convert.queue-size")) {
+		c->queue_size = perf_config_u64(var, value);
+		return 0;
+	}
+
+	return perf_default_config(var, value, cb);
+}
+
 int bt_convert__perf2ctf(const char *input, const char *path, bool force)
 {
 	struct perf_session *session;
@@ -817,6 +1127,8 @@ int bt_convert__perf2ctf(const char *input, const char *path, bool force)
 	struct ctf_writer *cw = &c.writer;
 	int err = -1;
 
+	perf_config(convert__config, &c);
+
 	/* CTF writer */
 	if (ctf_writer__init(cw, path))
 		return -1;
@@ -826,6 +1138,11 @@ int bt_convert__perf2ctf(const char *input, const char *path, bool force)
 	if (!session)
 		goto free_writer;
 
+	if (c.queue_size) {
+		ordered_events__set_alloc_size(&session->ordered_events,
+					       c.queue_size);
+	}
+
 	/* CTF writer env/clock setup  */
 	if (ctf_writer__setup_env(cw, session))
 		goto free_session;
@@ -834,9 +1151,14 @@ int bt_convert__perf2ctf(const char *input, const char *path, bool force)
 	if (setup_events(cw, session))
 		goto free_session;
 
+	if (setup_streams(cw, session))
+		goto free_session;
+
 	err = perf_session__process_events(session);
 	if (!err)
-		err = bt_ctf_stream_flush(cw->stream);
+		err = ctf_writer__flush_streams(cw);
+	else
+		pr_err("Error during conversion.\n");
 
 	fprintf(stderr,
 		"[ perf data convert: Converted '%s' into CTF data '%s' ]\n",
@@ -847,11 +1169,15 @@ int bt_convert__perf2ctf(const char *input, const char *path, bool force)
 		(double) c.events_size / 1024.0 / 1024.0,
 		c.events_count);
 
-	/* its all good */
-free_session:
 	perf_session__delete(session);
+	ctf_writer__cleanup(cw);
+
+	return err;
 
+free_session:
+	perf_session__delete(session);
 free_writer:
 	ctf_writer__cleanup(cw);
+	pr_err("Error during conversion setup.\n");
 	return err;
 }
diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c
index bb39a3ffc70b39..1c9689e4cc179a 100644
--- a/tools/perf/util/db-export.c
+++ b/tools/perf/util/db-export.c
@@ -122,6 +122,7 @@ int db_export__machine(struct db_export *dbe, struct machine *machine)
 int db_export__thread(struct db_export *dbe, struct thread *thread,
 		      struct machine *machine, struct comm *comm)
 {
+	struct thread *main_thread;
 	u64 main_thread_db_id = 0;
 	int err;
 
@@ -131,8 +132,6 @@ int db_export__thread(struct db_export *dbe, struct thread *thread,
 	thread->db_id = ++dbe->thread_last_db_id;
 
 	if (thread->pid_ != -1) {
-		struct thread *main_thread;
-
 		if (thread->pid_ == thread->tid) {
 			main_thread = thread;
 		} else {
@@ -144,14 +143,16 @@ int db_export__thread(struct db_export *dbe, struct thread *thread,
 			err = db_export__thread(dbe, main_thread, machine,
 						comm);
 			if (err)
-				return err;
+				goto out_put;
 			if (comm) {
 				err = db_export__comm_thread(dbe, comm, thread);
 				if (err)
-					return err;
+					goto out_put;
 			}
 		}
 		main_thread_db_id = main_thread->db_id;
+		if (main_thread != thread)
+			thread__put(main_thread);
 	}
 
 	if (dbe->export_thread)
@@ -159,6 +160,10 @@ int db_export__thread(struct db_export *dbe, struct thread *thread,
 					  machine);
 
 	return 0;
+
+out_put:
+	thread__put(main_thread);
+	return err;
 }
 
 int db_export__comm(struct db_export *dbe, struct comm *comm,
@@ -229,7 +234,7 @@ int db_export__symbol(struct db_export *dbe, struct symbol *sym,
 static struct thread *get_main_thread(struct machine *machine, struct thread *thread)
 {
 	if (thread->pid_ == thread->tid)
-		return thread;
+		return thread__get(thread);
 
 	if (thread->pid_ == -1)
 		return NULL;
@@ -309,12 +314,12 @@ int db_export__sample(struct db_export *dbe, union perf_event *event,
 
 	err = db_export__thread(dbe, thread, al->machine, comm);
 	if (err)
-		return err;
+		goto out_put;
 
 	if (comm) {
 		err = db_export__comm(dbe, comm, main_thread);
 		if (err)
-			return err;
+			goto out_put;
 		es.comm_db_id = comm->db_id;
 	}
 
@@ -322,7 +327,7 @@ int db_export__sample(struct db_export *dbe, union perf_event *event,
 
 	err = db_ids_from_al(dbe, al, &es.dso_db_id, &es.sym_db_id, &es.offset);
 	if (err)
-		return err;
+		goto out_put;
 
 	if ((evsel->attr.sample_type & PERF_SAMPLE_ADDR) &&
 	    sample_addr_correlates_sym(&evsel->attr)) {
@@ -332,20 +337,22 @@ int db_export__sample(struct db_export *dbe, union perf_event *event,
 		err = db_ids_from_al(dbe, &addr_al, &es.addr_dso_db_id,
 				     &es.addr_sym_db_id, &es.addr_offset);
 		if (err)
-			return err;
+			goto out_put;
 		if (dbe->crp) {
 			err = thread_stack__process(thread, comm, sample, al,
 						    &addr_al, es.db_id,
 						    dbe->crp);
 			if (err)
-				return err;
+				goto out_put;
 		}
 	}
 
 	if (dbe->export_sample)
-		return dbe->export_sample(dbe, &es);
+		err = dbe->export_sample(dbe, &es);
 
-	return 0;
+out_put:
+	thread__put(main_thread);
+	return err;
 }
 
 static struct {
diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index fc0ddd5792a97f..b335db3532a2b4 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -4,6 +4,7 @@
 #include "symbol.h"
 #include "dso.h"
 #include "machine.h"
+#include "auxtrace.h"
 #include "util.h"
 #include "debug.h"
 
@@ -264,6 +265,7 @@ int __kmod_path__parse(struct kmod_path *m, const char *path,
  */
 static LIST_HEAD(dso__data_open);
 static long dso__data_open_cnt;
+static pthread_mutex_t dso__data_open_lock = PTHREAD_MUTEX_INITIALIZER;
 
 static void dso__list_add(struct dso *dso)
 {
@@ -433,18 +435,12 @@ static void check_data_close(void)
  */
 void dso__data_close(struct dso *dso)
 {
+	pthread_mutex_lock(&dso__data_open_lock);
 	close_dso(dso);
+	pthread_mutex_unlock(&dso__data_open_lock);
 }
 
-/**
- * dso__data_fd - Get dso's data file descriptor
- * @dso: dso object
- * @machine: machine object
- *
- * External interface to find dso's file, open it and
- * returns file descriptor.
- */
-int dso__data_fd(struct dso *dso, struct machine *machine)
+static void try_to_open_dso(struct dso *dso, struct machine *machine)
 {
 	enum dso_binary_type binary_type_data[] = {
 		DSO_BINARY_TYPE__BUILD_ID_CACHE,
@@ -453,11 +449,8 @@ int dso__data_fd(struct dso *dso, struct machine *machine)
 	};
 	int i = 0;
 
-	if (dso->data.status == DSO_DATA_STATUS_ERROR)
-		return -1;
-
 	if (dso->data.fd >= 0)
-		goto out;
+		return;
 
 	if (dso->binary_type != DSO_BINARY_TYPE__NOT_FOUND) {
 		dso->data.fd = open_dso(dso, machine);
@@ -477,10 +470,38 @@ out:
 		dso->data.status = DSO_DATA_STATUS_OK;
 	else
 		dso->data.status = DSO_DATA_STATUS_ERROR;
+}
+
+/**
+ * dso__data_get_fd - Get dso's data file descriptor
+ * @dso: dso object
+ * @machine: machine object
+ *
+ * External interface to find dso's file, open it and
+ * returns file descriptor.  It should be paired with
+ * dso__data_put_fd() if it returns non-negative value.
+ */
+int dso__data_get_fd(struct dso *dso, struct machine *machine)
+{
+	if (dso->data.status == DSO_DATA_STATUS_ERROR)
+		return -1;
+
+	if (pthread_mutex_lock(&dso__data_open_lock) < 0)
+		return -1;
+
+	try_to_open_dso(dso, machine);
+
+	if (dso->data.fd < 0)
+		pthread_mutex_unlock(&dso__data_open_lock);
 
 	return dso->data.fd;
 }
 
+void dso__data_put_fd(struct dso *dso __maybe_unused)
+{
+	pthread_mutex_unlock(&dso__data_open_lock);
+}
+
 bool dso__data_status_seen(struct dso *dso, enum dso_data_status_seen by)
 {
 	u32 flag = 1 << by;
@@ -494,10 +515,12 @@ bool dso__data_status_seen(struct dso *dso, enum dso_data_status_seen by)
 }
 
 static void
-dso_cache__free(struct rb_root *root)
+dso_cache__free(struct dso *dso)
 {
+	struct rb_root *root = &dso->data.cache;
 	struct rb_node *next = rb_first(root);
 
+	pthread_mutex_lock(&dso->lock);
 	while (next) {
 		struct dso_cache *cache;
 
@@ -506,10 +529,12 @@ dso_cache__free(struct rb_root *root)
 		rb_erase(&cache->rb_node, root);
 		free(cache);
 	}
+	pthread_mutex_unlock(&dso->lock);
 }
 
-static struct dso_cache *dso_cache__find(const struct rb_root *root, u64 offset)
+static struct dso_cache *dso_cache__find(struct dso *dso, u64 offset)
 {
+	const struct rb_root *root = &dso->data.cache;
 	struct rb_node * const *p = &root->rb_node;
 	const struct rb_node *parent = NULL;
 	struct dso_cache *cache;
@@ -528,17 +553,20 @@ static struct dso_cache *dso_cache__find(const struct rb_root *root, u64 offset)
 		else
 			return cache;
 	}
+
 	return NULL;
 }
 
-static void
-dso_cache__insert(struct rb_root *root, struct dso_cache *new)
+static struct dso_cache *
+dso_cache__insert(struct dso *dso, struct dso_cache *new)
 {
+	struct rb_root *root = &dso->data.cache;
 	struct rb_node **p = &root->rb_node;
 	struct rb_node *parent = NULL;
 	struct dso_cache *cache;
 	u64 offset = new->offset;
 
+	pthread_mutex_lock(&dso->lock);
 	while (*p != NULL) {
 		u64 end;
 
@@ -550,10 +578,17 @@ dso_cache__insert(struct rb_root *root, struct dso_cache *new)
 			p = &(*p)->rb_left;
 		else if (offset >= end)
 			p = &(*p)->rb_right;
+		else
+			goto out;
 	}
 
 	rb_link_node(&new->rb_node, parent, p);
 	rb_insert_color(&new->rb_node, root);
+
+	cache = NULL;
+out:
+	pthread_mutex_unlock(&dso->lock);
+	return cache;
 }
 
 static ssize_t
@@ -568,19 +603,33 @@ dso_cache__memcpy(struct dso_cache *cache, u64 offset,
 }
 
 static ssize_t
-dso_cache__read(struct dso *dso, u64 offset, u8 *data, ssize_t size)
+dso_cache__read(struct dso *dso, struct machine *machine,
+		u64 offset, u8 *data, ssize_t size)
 {
 	struct dso_cache *cache;
+	struct dso_cache *old;
 	ssize_t ret;
 
 	do {
 		u64 cache_offset;
 
-		ret = -ENOMEM;
-
 		cache = zalloc(sizeof(*cache) + DSO__DATA_CACHE_SIZE);
 		if (!cache)
+			return -ENOMEM;
+
+		pthread_mutex_lock(&dso__data_open_lock);
+
+		/*
+		 * dso->data.fd might be closed if other thread opened another
+		 * file (dso) due to open file limit (RLIMIT_NOFILE).
+		 */
+		try_to_open_dso(dso, machine);
+
+		if (dso->data.fd < 0) {
+			ret = -errno;
+			dso->data.status = DSO_DATA_STATUS_ERROR;
 			break;
+		}
 
 		cache_offset = offset & DSO__DATA_CACHE_MASK;
 
@@ -590,11 +639,20 @@ dso_cache__read(struct dso *dso, u64 offset, u8 *data, ssize_t size)
 
 		cache->offset = cache_offset;
 		cache->size   = ret;
-		dso_cache__insert(&dso->data.cache, cache);
+	} while (0);
 
-		ret = dso_cache__memcpy(cache, offset, data, size);
+	pthread_mutex_unlock(&dso__data_open_lock);
 
-	} while (0);
+	if (ret > 0) {
+		old = dso_cache__insert(dso, cache);
+		if (old) {
+			/* we lose the race */
+			free(cache);
+			cache = old;
+		}
+
+		ret = dso_cache__memcpy(cache, offset, data, size);
+	}
 
 	if (ret <= 0)
 		free(cache);
@@ -602,16 +660,16 @@ dso_cache__read(struct dso *dso, u64 offset, u8 *data, ssize_t size)
 	return ret;
 }
 
-static ssize_t dso_cache_read(struct dso *dso, u64 offset,
-			      u8 *data, ssize_t size)
+static ssize_t dso_cache_read(struct dso *dso, struct machine *machine,
+			      u64 offset, u8 *data, ssize_t size)
 {
 	struct dso_cache *cache;
 
-	cache = dso_cache__find(&dso->data.cache, offset);
+	cache = dso_cache__find(dso, offset);
 	if (cache)
 		return dso_cache__memcpy(cache, offset, data, size);
 	else
-		return dso_cache__read(dso, offset, data, size);
+		return dso_cache__read(dso, machine, offset, data, size);
 }
 
 /*
@@ -619,7 +677,8 @@ static ssize_t dso_cache_read(struct dso *dso, u64 offset,
  * in the rb_tree. Any read to already cached data is served
  * by cached data.
  */
-static ssize_t cached_read(struct dso *dso, u64 offset, u8 *data, ssize_t size)
+static ssize_t cached_read(struct dso *dso, struct machine *machine,
+			   u64 offset, u8 *data, ssize_t size)
 {
 	ssize_t r = 0;
 	u8 *p = data;
@@ -627,7 +686,7 @@ static ssize_t cached_read(struct dso *dso, u64 offset, u8 *data, ssize_t size)
 	do {
 		ssize_t ret;
 
-		ret = dso_cache_read(dso, offset, p, size);
+		ret = dso_cache_read(dso, machine, offset, p, size);
 		if (ret < 0)
 			return ret;
 
@@ -647,21 +706,44 @@ static ssize_t cached_read(struct dso *dso, u64 offset, u8 *data, ssize_t size)
 	return r;
 }
 
-static int data_file_size(struct dso *dso)
+static int data_file_size(struct dso *dso, struct machine *machine)
 {
+	int ret = 0;
 	struct stat st;
 	char sbuf[STRERR_BUFSIZE];
 
-	if (!dso->data.file_size) {
-		if (fstat(dso->data.fd, &st)) {
-			pr_err("dso mmap failed, fstat: %s\n",
-				strerror_r(errno, sbuf, sizeof(sbuf)));
-			return -1;
-		}
-		dso->data.file_size = st.st_size;
+	if (dso->data.file_size)
+		return 0;
+
+	if (dso->data.status == DSO_DATA_STATUS_ERROR)
+		return -1;
+
+	pthread_mutex_lock(&dso__data_open_lock);
+
+	/*
+	 * dso->data.fd might be closed if other thread opened another
+	 * file (dso) due to open file limit (RLIMIT_NOFILE).
+	 */
+	try_to_open_dso(dso, machine);
+
+	if (dso->data.fd < 0) {
+		ret = -errno;
+		dso->data.status = DSO_DATA_STATUS_ERROR;
+		goto out;
 	}
 
-	return 0;
+	if (fstat(dso->data.fd, &st) < 0) {
+		ret = -errno;
+		pr_err("dso cache fstat failed: %s\n",
+		       strerror_r(errno, sbuf, sizeof(sbuf)));
+		dso->data.status = DSO_DATA_STATUS_ERROR;
+		goto out;
+	}
+	dso->data.file_size = st.st_size;
+
+out:
+	pthread_mutex_unlock(&dso__data_open_lock);
+	return ret;
 }
 
 /**
@@ -673,23 +755,17 @@ static int data_file_size(struct dso *dso)
  */
 off_t dso__data_size(struct dso *dso, struct machine *machine)
 {
-	int fd;
-
-	fd = dso__data_fd(dso, machine);
-	if (fd < 0)
-		return fd;
-
-	if (data_file_size(dso))
+	if (data_file_size(dso, machine))
 		return -1;
 
 	/* For now just estimate dso data size is close to file size */
 	return dso->data.file_size;
 }
 
-static ssize_t data_read_offset(struct dso *dso, u64 offset,
-				u8 *data, ssize_t size)
+static ssize_t data_read_offset(struct dso *dso, struct machine *machine,
+				u64 offset, u8 *data, ssize_t size)
 {
-	if (data_file_size(dso))
+	if (data_file_size(dso, machine))
 		return -1;
 
 	/* Check the offset sanity. */
@@ -699,7 +775,7 @@ static ssize_t data_read_offset(struct dso *dso, u64 offset,
 	if (offset + size < offset)
 		return -1;
 
-	return cached_read(dso, offset, data, size);
+	return cached_read(dso, machine, offset, data, size);
 }
 
 /**
@@ -716,10 +792,10 @@ static ssize_t data_read_offset(struct dso *dso, u64 offset,
 ssize_t dso__data_read_offset(struct dso *dso, struct machine *machine,
 			      u64 offset, u8 *data, ssize_t size)
 {
-	if (dso__data_fd(dso, machine) < 0)
+	if (dso->data.status == DSO_DATA_STATUS_ERROR)
 		return -1;
 
-	return data_read_offset(dso, offset, data, size);
+	return data_read_offset(dso, machine, offset, data, size);
 }
 
 /**
@@ -751,13 +827,13 @@ struct map *dso__new_map(const char *name)
 	return map;
 }
 
-struct dso *dso__kernel_findnew(struct machine *machine, const char *name,
-		    const char *short_name, int dso_type)
+struct dso *machine__findnew_kernel(struct machine *machine, const char *name,
+				    const char *short_name, int dso_type)
 {
 	/*
 	 * The kernel dso could be created by build_id processing.
 	 */
-	struct dso *dso = __dsos__findnew(&machine->kernel_dsos, name);
+	struct dso *dso = machine__findnew_dso(machine, name);
 
 	/*
 	 * We need to run this in all cases, since during the build_id
@@ -935,6 +1011,7 @@ struct dso *dso__new(const char *name)
 		RB_CLEAR_NODE(&dso->rb_node);
 		INIT_LIST_HEAD(&dso->node);
 		INIT_LIST_HEAD(&dso->data.open_entry);
+		pthread_mutex_init(&dso->lock, NULL);
 	}
 
 	return dso;
@@ -961,9 +1038,11 @@ void dso__delete(struct dso *dso)
 	}
 
 	dso__data_close(dso);
-	dso_cache__free(&dso->data.cache);
+	auxtrace_cache__free(dso->auxtrace_cache);
+	dso_cache__free(dso);
 	dso__free_a2l(dso);
 	zfree(&dso->symsrc_filename);
+	pthread_mutex_destroy(&dso->lock);
 	free(dso);
 }
 
@@ -1130,12 +1209,15 @@ size_t dso__fprintf(struct dso *dso, enum map_type type, FILE *fp)
 enum dso_type dso__type(struct dso *dso, struct machine *machine)
 {
 	int fd;
+	enum dso_type type = DSO__TYPE_UNKNOWN;
 
-	fd = dso__data_fd(dso, machine);
-	if (fd < 0)
-		return DSO__TYPE_UNKNOWN;
+	fd = dso__data_get_fd(dso, machine);
+	if (fd >= 0) {
+		type = dso__type_fd(fd);
+		dso__data_put_fd(dso);
+	}
 
-	return dso__type_fd(fd);
+	return type;
 }
 
 int dso__strerror_load(struct dso *dso, char *buf, size_t buflen)
diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h
index e0901b4ed8de0d..24a507a5414764 100644
--- a/tools/perf/util/dso.h
+++ b/tools/perf/util/dso.h
@@ -126,7 +126,10 @@ struct dsos {
 	struct rb_root	 root;	/* rbtree root sorted by long name */
 };
 
+struct auxtrace_cache;
+
 struct dso {
+	pthread_mutex_t	 lock;
 	struct list_head node;
 	struct rb_node	 rb_node;	/* rbtree node sorted by long name */
 	struct rb_root	 symbols[MAP__NR_TYPES];
@@ -156,6 +159,7 @@ struct dso {
 	u16		 long_name_len;
 	u16		 short_name_len;
 	void		*dwfl;			/* DWARF debug info */
+	struct auxtrace_cache *auxtrace_cache;
 
 	/* dso data file */
 	struct {
@@ -236,7 +240,8 @@ int __kmod_path__parse(struct kmod_path *m, const char *path,
 
 /*
  * The dso__data_* external interface provides following functions:
- *   dso__data_fd
+ *   dso__data_get_fd
+ *   dso__data_put_fd
  *   dso__data_close
  *   dso__data_size
  *   dso__data_read_offset
@@ -253,8 +258,11 @@ int __kmod_path__parse(struct kmod_path *m, const char *path,
  * The current usage of the dso__data_* interface is as follows:
  *
  * Get DSO's fd:
- *   int fd = dso__data_fd(dso, machine);
- *   USE 'fd' SOMEHOW
+ *   int fd = dso__data_get_fd(dso, machine);
+ *   if (fd >= 0) {
+ *       USE 'fd' SOMEHOW
+ *       dso__data_put_fd(dso);
+ *   }
  *
  * Read DSO's data:
  *   n = dso__data_read_offset(dso_0, &machine, 0, buf, BUFSIZE);
@@ -273,7 +281,8 @@ int __kmod_path__parse(struct kmod_path *m, const char *path,
  *
  * TODO
 */
-int dso__data_fd(struct dso *dso, struct machine *machine);
+int dso__data_get_fd(struct dso *dso, struct machine *machine);
+void dso__data_put_fd(struct dso *dso __maybe_unused);
 void dso__data_close(struct dso *dso);
 
 off_t dso__data_size(struct dso *dso, struct machine *machine);
@@ -285,8 +294,8 @@ ssize_t dso__data_read_addr(struct dso *dso, struct map *map,
 bool dso__data_status_seen(struct dso *dso, enum dso_data_status_seen by);
 
 struct map *dso__new_map(const char *name);
-struct dso *dso__kernel_findnew(struct machine *machine, const char *name,
-				const char *short_name, int dso_type);
+struct dso *machine__findnew_kernel(struct machine *machine, const char *name,
+				    const char *short_name, int dso_type);
 
 void dsos__add(struct dsos *dsos, struct dso *dso);
 struct dso *dsos__addnew(struct dsos *dsos, const char *name);
diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c
index c34e024020c7c5..57f3ef41c2bc3e 100644
--- a/tools/perf/util/dwarf-aux.c
+++ b/tools/perf/util/dwarf-aux.c
@@ -139,11 +139,27 @@ int cu_walk_functions_at(Dwarf_Die *cu_die, Dwarf_Addr addr,
 bool die_compare_name(Dwarf_Die *dw_die, const char *tname)
 {
 	const char *name;
+
 	name = dwarf_diename(dw_die);
 	return name ? (strcmp(tname, name) == 0) : false;
 }
 
 /**
+ * die_match_name - Match diename and glob
+ * @dw_die: a DIE
+ * @glob: a string of target glob pattern
+ *
+ * Glob matching the name of @dw_die and @glob. Return false if matching fail.
+ */
+bool die_match_name(Dwarf_Die *dw_die, const char *glob)
+{
+	const char *name;
+
+	name = dwarf_diename(dw_die);
+	return name ? strglobmatch(name, glob) : false;
+}
+
+/**
  * die_get_call_lineno - Get callsite line number of inline-function instance
  * @in_die: a DIE of an inlined function instance
  *
@@ -417,6 +433,43 @@ struct __addr_die_search_param {
 	Dwarf_Die	*die_mem;
 };
 
+static int __die_search_func_tail_cb(Dwarf_Die *fn_die, void *data)
+{
+	struct __addr_die_search_param *ad = data;
+	Dwarf_Addr addr = 0;
+
+	if (dwarf_tag(fn_die) == DW_TAG_subprogram &&
+	    !dwarf_highpc(fn_die, &addr) &&
+	    addr == ad->addr) {
+		memcpy(ad->die_mem, fn_die, sizeof(Dwarf_Die));
+		return DWARF_CB_ABORT;
+	}
+	return DWARF_CB_OK;
+}
+
+/**
+ * die_find_tailfunc - Search for a non-inlined function with tail call at
+ * given address
+ * @cu_die: a CU DIE which including @addr
+ * @addr: target address
+ * @die_mem: a buffer for result DIE
+ *
+ * Search for a non-inlined function DIE with tail call at @addr. Stores the
+ * DIE to @die_mem and returns it if found. Returns NULL if failed.
+ */
+Dwarf_Die *die_find_tailfunc(Dwarf_Die *cu_die, Dwarf_Addr addr,
+				    Dwarf_Die *die_mem)
+{
+	struct __addr_die_search_param ad;
+	ad.addr = addr;
+	ad.die_mem = die_mem;
+	/* dwarf_getscopes can't find subprogram. */
+	if (!dwarf_getfuncs(cu_die, __die_search_func_tail_cb, &ad, 0))
+		return NULL;
+	else
+		return die_mem;
+}
+
 /* die_find callback for non-inlined function search */
 static int __die_search_func_cb(Dwarf_Die *fn_die, void *data)
 {
@@ -832,19 +885,17 @@ Dwarf_Die *die_find_member(Dwarf_Die *st_die, const char *name,
 /**
  * die_get_typename - Get the name of given variable DIE
  * @vr_die: a variable DIE
- * @buf: a buffer for result type name
- * @len: a max-length of @buf
+ * @buf: a strbuf for result type name
  *
- * Get the name of @vr_die and stores it to @buf. Return the actual length
- * of type name if succeeded. Return -E2BIG if @len is not enough long, and
- * Return -ENOENT if failed to find type name.
+ * Get the name of @vr_die and stores it to @buf. Return 0 if succeeded.
+ * and Return -ENOENT if failed to find type name.
  * Note that the result will stores typedef name if possible, and stores
  * "*(function_type)" if the type is a function pointer.
  */
-int die_get_typename(Dwarf_Die *vr_die, char *buf, int len)
+int die_get_typename(Dwarf_Die *vr_die, struct strbuf *buf)
 {
 	Dwarf_Die type;
-	int tag, ret, ret2;
+	int tag, ret;
 	const char *tmp = "";
 
 	if (__die_get_real_type(vr_die, &type) == NULL)
@@ -855,8 +906,8 @@ int die_get_typename(Dwarf_Die *vr_die, char *buf, int len)
 		tmp = "*";
 	else if (tag == DW_TAG_subroutine_type) {
 		/* Function pointer */
-		ret = snprintf(buf, len, "(function_type)");
-		return (ret >= len) ? -E2BIG : ret;
+		strbuf_addf(buf, "(function_type)");
+		return 0;
 	} else {
 		if (!dwarf_diename(&type))
 			return -ENOENT;
@@ -867,39 +918,156 @@ int die_get_typename(Dwarf_Die *vr_die, char *buf, int len)
 		else if (tag == DW_TAG_enumeration_type)
 			tmp = "enum ";
 		/* Write a base name */
-		ret = snprintf(buf, len, "%s%s", tmp, dwarf_diename(&type));
-		return (ret >= len) ? -E2BIG : ret;
-	}
-	ret = die_get_typename(&type, buf, len);
-	if (ret > 0) {
-		ret2 = snprintf(buf + ret, len - ret, "%s", tmp);
-		ret = (ret2 >= len - ret) ? -E2BIG : ret2 + ret;
+		strbuf_addf(buf, "%s%s", tmp, dwarf_diename(&type));
+		return 0;
 	}
+	ret = die_get_typename(&type, buf);
+	if (ret == 0)
+		strbuf_addf(buf, "%s", tmp);
+
 	return ret;
 }
 
 /**
  * die_get_varname - Get the name and type of given variable DIE
  * @vr_die: a variable DIE
- * @buf: a buffer for type and variable name
- * @len: the max-length of @buf
+ * @buf: a strbuf for type and variable name
  *
  * Get the name and type of @vr_die and stores it in @buf as "type\tname".
  */
-int die_get_varname(Dwarf_Die *vr_die, char *buf, int len)
+int die_get_varname(Dwarf_Die *vr_die, struct strbuf *buf)
 {
-	int ret, ret2;
+	int ret;
 
-	ret = die_get_typename(vr_die, buf, len);
+	ret = die_get_typename(vr_die, buf);
 	if (ret < 0) {
 		pr_debug("Failed to get type, make it unknown.\n");
-		ret = snprintf(buf, len, "(unknown_type)");
+		strbuf_addf(buf, "(unknown_type)");
 	}
-	if (ret > 0) {
-		ret2 = snprintf(buf + ret, len - ret, "\t%s",
-				dwarf_diename(vr_die));
-		ret = (ret2 >= len - ret) ? -E2BIG : ret2 + ret;
+
+	strbuf_addf(buf, "\t%s", dwarf_diename(vr_die));
+
+	return 0;
+}
+
+/**
+ * die_get_var_innermost_scope - Get innermost scope range of given variable DIE
+ * @sp_die: a subprogram DIE
+ * @vr_die: a variable DIE
+ * @buf: a strbuf for variable byte offset range
+ *
+ * Get the innermost scope range of @vr_die and stores it in @buf as
+ * "@<function_name+[NN-NN,NN-NN]>".
+ */
+static int die_get_var_innermost_scope(Dwarf_Die *sp_die, Dwarf_Die *vr_die,
+				struct strbuf *buf)
+{
+	Dwarf_Die *scopes;
+	int count;
+	size_t offset = 0;
+	Dwarf_Addr base;
+	Dwarf_Addr start, end;
+	Dwarf_Addr entry;
+	int ret;
+	bool first = true;
+	const char *name;
+
+	ret = dwarf_entrypc(sp_die, &entry);
+	if (ret)
+		return ret;
+
+	name = dwarf_diename(sp_die);
+	if (!name)
+		return -ENOENT;
+
+	count = dwarf_getscopes_die(vr_die, &scopes);
+
+	/* (*SCOPES)[1] is the DIE for the scope containing that scope */
+	if (count <= 1) {
+		ret = -EINVAL;
+		goto out;
 	}
+
+	while ((offset = dwarf_ranges(&scopes[1], offset, &base,
+				&start, &end)) > 0) {
+		start -= entry;
+		end -= entry;
+
+		if (first) {
+			strbuf_addf(buf, "@<%s+[%" PRIu64 "-%" PRIu64,
+				name, start, end);
+			first = false;
+		} else {
+			strbuf_addf(buf, ",%" PRIu64 "-%" PRIu64,
+				start, end);
+		}
+	}
+
+	if (!first)
+		strbuf_addf(buf, "]>");
+
+out:
+	free(scopes);
 	return ret;
 }
 
+/**
+ * die_get_var_range - Get byte offset range of given variable DIE
+ * @sp_die: a subprogram DIE
+ * @vr_die: a variable DIE
+ * @buf: a strbuf for type and variable name and byte offset range
+ *
+ * Get the byte offset range of @vr_die and stores it in @buf as
+ * "@<function_name+[NN-NN,NN-NN]>".
+ */
+int die_get_var_range(Dwarf_Die *sp_die, Dwarf_Die *vr_die, struct strbuf *buf)
+{
+	int ret = 0;
+	Dwarf_Addr base;
+	Dwarf_Addr start, end;
+	Dwarf_Addr entry;
+	Dwarf_Op *op;
+	size_t nops;
+	size_t offset = 0;
+	Dwarf_Attribute attr;
+	bool first = true;
+	const char *name;
+
+	ret = dwarf_entrypc(sp_die, &entry);
+	if (ret)
+		return ret;
+
+	name = dwarf_diename(sp_die);
+	if (!name)
+		return -ENOENT;
+
+	if (dwarf_attr(vr_die, DW_AT_location, &attr) == NULL)
+		return -EINVAL;
+
+	while ((offset = dwarf_getlocations(
+				&attr, offset, &base,
+				&start, &end, &op, &nops)) > 0) {
+		if (start == 0) {
+			/* Single Location Descriptions */
+			ret = die_get_var_innermost_scope(sp_die, vr_die, buf);
+			return ret;
+		}
+
+		/* Location Lists */
+		start -= entry;
+		end -= entry;
+		if (first) {
+			strbuf_addf(buf, "@<%s+[%" PRIu64 "-%" PRIu64,
+				name, start, end);
+			first = false;
+		} else {
+			strbuf_addf(buf, ",%" PRIu64 "-%" PRIu64,
+				start, end);
+		}
+	}
+
+	if (!first)
+		strbuf_addf(buf, "]>");
+
+	return ret;
+}
diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h
index af7dbcd5f92994..c42ec366f2a72f 100644
--- a/tools/perf/util/dwarf-aux.h
+++ b/tools/perf/util/dwarf-aux.h
@@ -47,6 +47,9 @@ extern bool die_is_func_instance(Dwarf_Die *dw_die);
 /* Compare diename and tname */
 extern bool die_compare_name(Dwarf_Die *dw_die, const char *tname);
 
+/* Matching diename with glob pattern */
+extern bool die_match_name(Dwarf_Die *dw_die, const char *glob);
+
 /* Get callsite line number of inline-function instance */
 extern int die_get_call_lineno(Dwarf_Die *in_die);
 
@@ -82,6 +85,10 @@ extern Dwarf_Die *die_find_child(Dwarf_Die *rt_die,
 extern Dwarf_Die *die_find_realfunc(Dwarf_Die *cu_die, Dwarf_Addr addr,
 				    Dwarf_Die *die_mem);
 
+/* Search a non-inlined function with tail call at given address */
+Dwarf_Die *die_find_tailfunc(Dwarf_Die *cu_die, Dwarf_Addr addr,
+				    Dwarf_Die *die_mem);
+
 /* Search the top inlined function including given address */
 extern Dwarf_Die *die_find_top_inlinefunc(Dwarf_Die *sp_die, Dwarf_Addr addr,
 					  Dwarf_Die *die_mem);
@@ -114,8 +121,10 @@ extern Dwarf_Die *die_find_member(Dwarf_Die *st_die, const char *name,
 				  Dwarf_Die *die_mem);
 
 /* Get the name of given variable DIE */
-extern int die_get_typename(Dwarf_Die *vr_die, char *buf, int len);
+extern int die_get_typename(Dwarf_Die *vr_die, struct strbuf *buf);
 
 /* Get the name and type of given variable DIE, stored as "type\tname" */
-extern int die_get_varname(Dwarf_Die *vr_die, char *buf, int len);
+extern int die_get_varname(Dwarf_Die *vr_die, struct strbuf *buf);
+extern int die_get_var_range(Dwarf_Die *sp_die, Dwarf_Die *vr_die,
+			struct strbuf *buf);
 #endif
diff --git a/tools/perf/util/environment.c b/tools/perf/util/environment.c
index 275b0ee345f5ea..7405123692f149 100644
--- a/tools/perf/util/environment.c
+++ b/tools/perf/util/environment.c
@@ -5,5 +5,4 @@
  */
 #include "cache.h"
 
-const char *pager_program;
 int pager_use_color = 1;
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index ff866c4d2e2f09..c1925968a8afe1 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -23,12 +23,17 @@ static const char *perf_event__names[] = {
 	[PERF_RECORD_FORK]			= "FORK",
 	[PERF_RECORD_READ]			= "READ",
 	[PERF_RECORD_SAMPLE]			= "SAMPLE",
+	[PERF_RECORD_AUX]			= "AUX",
+	[PERF_RECORD_ITRACE_START]		= "ITRACE_START",
 	[PERF_RECORD_HEADER_ATTR]		= "ATTR",
 	[PERF_RECORD_HEADER_EVENT_TYPE]		= "EVENT_TYPE",
 	[PERF_RECORD_HEADER_TRACING_DATA]	= "TRACING_DATA",
 	[PERF_RECORD_HEADER_BUILD_ID]		= "BUILD_ID",
 	[PERF_RECORD_FINISHED_ROUND]		= "FINISHED_ROUND",
 	[PERF_RECORD_ID_INDEX]			= "ID_INDEX",
+	[PERF_RECORD_AUXTRACE_INFO]		= "AUXTRACE_INFO",
+	[PERF_RECORD_AUXTRACE]			= "AUXTRACE",
+	[PERF_RECORD_AUXTRACE_ERROR]		= "AUXTRACE_ERROR",
 };
 
 const char *perf_event__name(unsigned int id)
@@ -324,8 +329,9 @@ int perf_event__synthesize_modules(struct perf_tool *tool,
 				   struct machine *machine)
 {
 	int rc = 0;
-	struct rb_node *nd;
+	struct map *pos;
 	struct map_groups *kmaps = &machine->kmaps;
+	struct maps *maps = &kmaps->maps[MAP__FUNCTION];
 	union perf_event *event = zalloc((sizeof(event->mmap) +
 					  machine->id_hdr_size));
 	if (event == NULL) {
@@ -345,10 +351,8 @@ int perf_event__synthesize_modules(struct perf_tool *tool,
 	else
 		event->header.misc = PERF_RECORD_MISC_GUEST_KERNEL;
 
-	for (nd = rb_first(&kmaps->maps[MAP__FUNCTION]);
-	     nd; nd = rb_next(nd)) {
+	for (pos = maps__first(maps); pos; pos = map__next(pos)) {
 		size_t size;
-		struct map *pos = rb_entry(nd, struct map, rb_node);
 
 		if (pos->dso->kernel)
 			continue;
@@ -692,6 +696,22 @@ int perf_event__process_lost(struct perf_tool *tool __maybe_unused,
 	return machine__process_lost_event(machine, event, sample);
 }
 
+int perf_event__process_aux(struct perf_tool *tool __maybe_unused,
+			    union perf_event *event,
+			    struct perf_sample *sample __maybe_unused,
+			    struct machine *machine)
+{
+	return machine__process_aux_event(machine, event);
+}
+
+int perf_event__process_itrace_start(struct perf_tool *tool __maybe_unused,
+				     union perf_event *event,
+				     struct perf_sample *sample __maybe_unused,
+				     struct machine *machine)
+{
+	return machine__process_itrace_start_event(machine, event);
+}
+
 size_t perf_event__fprintf_mmap(union perf_event *event, FILE *fp)
 {
 	return fprintf(fp, " %d/%d: [%#" PRIx64 "(%#" PRIx64 ") @ %#" PRIx64 "]: %c %s\n",
@@ -755,6 +775,21 @@ int perf_event__process_exit(struct perf_tool *tool __maybe_unused,
 	return machine__process_exit_event(machine, event, sample);
 }
 
+size_t perf_event__fprintf_aux(union perf_event *event, FILE *fp)
+{
+	return fprintf(fp, " offset: %#"PRIx64" size: %#"PRIx64" flags: %#"PRIx64" [%s%s]\n",
+		       event->aux.aux_offset, event->aux.aux_size,
+		       event->aux.flags,
+		       event->aux.flags & PERF_AUX_FLAG_TRUNCATED ? "T" : "",
+		       event->aux.flags & PERF_AUX_FLAG_OVERWRITE ? "O" : "");
+}
+
+size_t perf_event__fprintf_itrace_start(union perf_event *event, FILE *fp)
+{
+	return fprintf(fp, " pid: %u tid: %u\n",
+		       event->itrace_start.pid, event->itrace_start.tid);
+}
+
 size_t perf_event__fprintf(union perf_event *event, FILE *fp)
 {
 	size_t ret = fprintf(fp, "PERF_RECORD_%s",
@@ -774,6 +809,12 @@ size_t perf_event__fprintf(union perf_event *event, FILE *fp)
 	case PERF_RECORD_MMAP2:
 		ret += perf_event__fprintf_mmap2(event, fp);
 		break;
+	case PERF_RECORD_AUX:
+		ret += perf_event__fprintf_aux(event, fp);
+		break;
+	case PERF_RECORD_ITRACE_START:
+		ret += perf_event__fprintf_itrace_start(event, fp);
+		break;
 	default:
 		ret += fprintf(fp, "\n");
 	}
@@ -877,6 +918,10 @@ void thread__find_addr_location(struct thread *thread,
 		al->sym = NULL;
 }
 
+/*
+ * Callers need to drop the reference to al->thread, obtained in
+ * machine__findnew_thread()
+ */
 int perf_event__preprocess_sample(const union perf_event *event,
 				  struct machine *machine,
 				  struct addr_location *al,
@@ -937,6 +982,17 @@ int perf_event__preprocess_sample(const union perf_event *event,
 	return 0;
 }
 
+/*
+ * The preprocess_sample method will return with reference counts for the
+ * in it, when done using (and perhaps getting ref counts if needing to
+ * keep a pointer to one of those entries) it must be paired with
+ * addr_location__put(), so that the refcounts can be decremented.
+ */
+void addr_location__put(struct addr_location *al)
+{
+	thread__zput(al->thread);
+}
+
 bool is_bts_event(struct perf_event_attr *attr)
 {
 	return attr->type == PERF_TYPE_HARDWARE &&
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 09b9e8d3fcf7fa..97179abc80a123 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -157,6 +157,8 @@ enum {
 	PERF_IP_FLAG_IN_TX		= 1ULL << 10,
 };
 
+#define PERF_IP_FLAG_CHARS "bcrosyiABEx"
+
 #define PERF_BRANCH_MASK		(\
 	PERF_IP_FLAG_BRANCH		|\
 	PERF_IP_FLAG_CALL		|\
@@ -215,9 +217,17 @@ enum perf_user_event_type { /* above any possible kernel type */
 	PERF_RECORD_HEADER_BUILD_ID		= 67,
 	PERF_RECORD_FINISHED_ROUND		= 68,
 	PERF_RECORD_ID_INDEX			= 69,
+	PERF_RECORD_AUXTRACE_INFO		= 70,
+	PERF_RECORD_AUXTRACE			= 71,
+	PERF_RECORD_AUXTRACE_ERROR		= 72,
 	PERF_RECORD_HEADER_MAX
 };
 
+enum auxtrace_error_type {
+	PERF_AUXTRACE_ERROR_ITRACE  = 1,
+	PERF_AUXTRACE_ERROR_MAX
+};
+
 /*
  * The kernel collects the number of events it couldn't send in a stretch and
  * when possible sends this number in a PERF_RECORD_LOST event. The number of
@@ -242,6 +252,7 @@ struct events_stats {
 	u32 nr_invalid_chains;
 	u32 nr_unknown_id;
 	u32 nr_unprocessable_samples;
+	u32 nr_auxtrace_errors[PERF_AUXTRACE_ERROR_MAX];
 };
 
 struct attr_event {
@@ -280,6 +291,50 @@ struct id_index_event {
 	struct id_index_entry entries[0];
 };
 
+struct auxtrace_info_event {
+	struct perf_event_header header;
+	u32 type;
+	u32 reserved__; /* For alignment */
+	u64 priv[];
+};
+
+struct auxtrace_event {
+	struct perf_event_header header;
+	u64 size;
+	u64 offset;
+	u64 reference;
+	u32 idx;
+	u32 tid;
+	u32 cpu;
+	u32 reserved__; /* For alignment */
+};
+
+#define MAX_AUXTRACE_ERROR_MSG 64
+
+struct auxtrace_error_event {
+	struct perf_event_header header;
+	u32 type;
+	u32 code;
+	u32 cpu;
+	u32 pid;
+	u32 tid;
+	u32 reserved__; /* For alignment */
+	u64 ip;
+	char msg[MAX_AUXTRACE_ERROR_MSG];
+};
+
+struct aux_event {
+	struct perf_event_header header;
+	u64	aux_offset;
+	u64	aux_size;
+	u64	flags;
+};
+
+struct itrace_start_event {
+	struct perf_event_header header;
+	u32 pid, tid;
+};
+
 union perf_event {
 	struct perf_event_header	header;
 	struct mmap_event		mmap;
@@ -295,6 +350,11 @@ union perf_event {
 	struct tracing_data_event	tracing_data;
 	struct build_id_event		build_id;
 	struct id_index_event		id_index;
+	struct auxtrace_info_event	auxtrace_info;
+	struct auxtrace_event		auxtrace;
+	struct auxtrace_error_event	auxtrace_error;
+	struct aux_event		aux;
+	struct itrace_start_event	itrace_start;
 };
 
 void perf_event__print_totals(void);
@@ -330,6 +390,14 @@ int perf_event__process_lost(struct perf_tool *tool,
 			     union perf_event *event,
 			     struct perf_sample *sample,
 			     struct machine *machine);
+int perf_event__process_aux(struct perf_tool *tool,
+			    union perf_event *event,
+			    struct perf_sample *sample,
+			    struct machine *machine);
+int perf_event__process_itrace_start(struct perf_tool *tool,
+				     union perf_event *event,
+				     struct perf_sample *sample,
+				     struct machine *machine);
 int perf_event__process_mmap(struct perf_tool *tool,
 			     union perf_event *event,
 			     struct perf_sample *sample,
@@ -358,6 +426,8 @@ int perf_event__preprocess_sample(const union perf_event *event,
 				  struct addr_location *al,
 				  struct perf_sample *sample);
 
+void addr_location__put(struct addr_location *al);
+
 struct thread;
 
 bool is_bts_event(struct perf_event_attr *attr);
@@ -387,6 +457,8 @@ size_t perf_event__fprintf_comm(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_mmap(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_mmap2(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_task(union perf_event *event, FILE *fp);
+size_t perf_event__fprintf_aux(union perf_event *event, FILE *fp);
+size_t perf_event__fprintf_itrace_start(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf(union perf_event *event, FILE *fp);
 
 u64 kallsyms__get_function_start(const char *kallsyms_filename,
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 080be93eea969f..dc1dc2c181ef22 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -695,19 +695,19 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx)
 
 static bool perf_mmap__empty(struct perf_mmap *md)
 {
-	return perf_mmap__read_head(md) == md->prev;
+	return perf_mmap__read_head(md) == md->prev && !md->auxtrace_mmap.base;
 }
 
 static void perf_evlist__mmap_get(struct perf_evlist *evlist, int idx)
 {
-	++evlist->mmap[idx].refcnt;
+	atomic_inc(&evlist->mmap[idx].refcnt);
 }
 
 static void perf_evlist__mmap_put(struct perf_evlist *evlist, int idx)
 {
-	BUG_ON(evlist->mmap[idx].refcnt == 0);
+	BUG_ON(atomic_read(&evlist->mmap[idx].refcnt) == 0);
 
-	if (--evlist->mmap[idx].refcnt == 0)
+	if (atomic_dec_and_test(&evlist->mmap[idx].refcnt))
 		__perf_evlist__munmap(evlist, idx);
 }
 
@@ -721,17 +721,46 @@ void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx)
 		perf_mmap__write_tail(md, old);
 	}
 
-	if (md->refcnt == 1 && perf_mmap__empty(md))
+	if (atomic_read(&md->refcnt) == 1 && perf_mmap__empty(md))
 		perf_evlist__mmap_put(evlist, idx);
 }
 
+int __weak auxtrace_mmap__mmap(struct auxtrace_mmap *mm __maybe_unused,
+			       struct auxtrace_mmap_params *mp __maybe_unused,
+			       void *userpg __maybe_unused,
+			       int fd __maybe_unused)
+{
+	return 0;
+}
+
+void __weak auxtrace_mmap__munmap(struct auxtrace_mmap *mm __maybe_unused)
+{
+}
+
+void __weak auxtrace_mmap_params__init(
+			struct auxtrace_mmap_params *mp __maybe_unused,
+			off_t auxtrace_offset __maybe_unused,
+			unsigned int auxtrace_pages __maybe_unused,
+			bool auxtrace_overwrite __maybe_unused)
+{
+}
+
+void __weak auxtrace_mmap_params__set_idx(
+			struct auxtrace_mmap_params *mp __maybe_unused,
+			struct perf_evlist *evlist __maybe_unused,
+			int idx __maybe_unused,
+			bool per_cpu __maybe_unused)
+{
+}
+
 static void __perf_evlist__munmap(struct perf_evlist *evlist, int idx)
 {
 	if (evlist->mmap[idx].base != NULL) {
 		munmap(evlist->mmap[idx].base, evlist->mmap_len);
 		evlist->mmap[idx].base = NULL;
-		evlist->mmap[idx].refcnt = 0;
+		atomic_set(&evlist->mmap[idx].refcnt, 0);
 	}
+	auxtrace_mmap__munmap(&evlist->mmap[idx].auxtrace_mmap);
 }
 
 void perf_evlist__munmap(struct perf_evlist *evlist)
@@ -759,6 +788,7 @@ static int perf_evlist__alloc_mmap(struct perf_evlist *evlist)
 struct mmap_params {
 	int prot;
 	int mask;
+	struct auxtrace_mmap_params auxtrace_mp;
 };
 
 static int __perf_evlist__mmap(struct perf_evlist *evlist, int idx,
@@ -777,7 +807,7 @@ static int __perf_evlist__mmap(struct perf_evlist *evlist, int idx,
 	 * evlist layer can't just drop it when filtering events in
 	 * perf_evlist__filter_pollfd().
 	 */
-	evlist->mmap[idx].refcnt = 2;
+	atomic_set(&evlist->mmap[idx].refcnt, 2);
 	evlist->mmap[idx].prev = 0;
 	evlist->mmap[idx].mask = mp->mask;
 	evlist->mmap[idx].base = mmap(NULL, evlist->mmap_len, mp->prot,
@@ -789,6 +819,10 @@ static int __perf_evlist__mmap(struct perf_evlist *evlist, int idx,
 		return -1;
 	}
 
+	if (auxtrace_mmap__mmap(&evlist->mmap[idx].auxtrace_mmap,
+				&mp->auxtrace_mp, evlist->mmap[idx].base, fd))
+		return -1;
+
 	return 0;
 }
 
@@ -853,6 +887,9 @@ static int perf_evlist__mmap_per_cpu(struct perf_evlist *evlist,
 	for (cpu = 0; cpu < nr_cpus; cpu++) {
 		int output = -1;
 
+		auxtrace_mmap_params__set_idx(&mp->auxtrace_mp, evlist, cpu,
+					      true);
+
 		for (thread = 0; thread < nr_threads; thread++) {
 			if (perf_evlist__mmap_per_evsel(evlist, cpu, mp, cpu,
 							thread, &output))
@@ -878,6 +915,9 @@ static int perf_evlist__mmap_per_thread(struct perf_evlist *evlist,
 	for (thread = 0; thread < nr_threads; thread++) {
 		int output = -1;
 
+		auxtrace_mmap_params__set_idx(&mp->auxtrace_mp, evlist, thread,
+					      false);
+
 		if (perf_evlist__mmap_per_evsel(evlist, thread, mp, 0, thread,
 						&output))
 			goto out_unmap;
@@ -960,10 +1000,8 @@ static long parse_pages_arg(const char *str, unsigned long min,
 	return pages;
 }
 
-int perf_evlist__parse_mmap_pages(const struct option *opt, const char *str,
-				  int unset __maybe_unused)
+int __perf_evlist__parse_mmap_pages(unsigned int *mmap_pages, const char *str)
 {
-	unsigned int *mmap_pages = opt->value;
 	unsigned long max = UINT_MAX;
 	long pages;
 
@@ -980,20 +1018,32 @@ int perf_evlist__parse_mmap_pages(const struct option *opt, const char *str,
 	return 0;
 }
 
+int perf_evlist__parse_mmap_pages(const struct option *opt, const char *str,
+				  int unset __maybe_unused)
+{
+	return __perf_evlist__parse_mmap_pages(opt->value, str);
+}
+
 /**
- * perf_evlist__mmap - Create mmaps to receive events.
+ * perf_evlist__mmap_ex - Create mmaps to receive events.
  * @evlist: list of events
  * @pages: map length in pages
  * @overwrite: overwrite older events?
+ * @auxtrace_pages - auxtrace map length in pages
+ * @auxtrace_overwrite - overwrite older auxtrace data?
  *
  * If @overwrite is %false the user needs to signal event consumption using
  * perf_mmap__write_tail().  Using perf_evlist__mmap_read() does this
  * automatically.
  *
+ * Similarly, if @auxtrace_overwrite is %false the user needs to signal data
+ * consumption using auxtrace_mmap__write_tail().
+ *
  * Return: %0 on success, negative error code otherwise.
  */
-int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
-		      bool overwrite)
+int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
+			 bool overwrite, unsigned int auxtrace_pages,
+			 bool auxtrace_overwrite)
 {
 	struct perf_evsel *evsel;
 	const struct cpu_map *cpus = evlist->cpus;
@@ -1013,6 +1063,9 @@ int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
 	pr_debug("mmap size %zuB\n", evlist->mmap_len);
 	mp.mask = evlist->mmap_len - page_size - 1;
 
+	auxtrace_mmap_params__init(&mp.auxtrace_mp, evlist->mmap_len,
+				   auxtrace_pages, auxtrace_overwrite);
+
 	evlist__for_each(evlist, evsel) {
 		if ((evsel->attr.read_format & PERF_FORMAT_ID) &&
 		    evsel->sample_id == NULL &&
@@ -1026,6 +1079,12 @@ int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
 	return perf_evlist__mmap_per_cpu(evlist, &mp);
 }
 
+int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
+		      bool overwrite)
+{
+	return perf_evlist__mmap_ex(evlist, pages, overwrite, 0, false);
+}
+
 int perf_evlist__create_maps(struct perf_evlist *evlist, struct target *target)
 {
 	evlist->threads = thread_map__new_str(target->pid, target->tid,
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index b5cce95d644e0c..955bf31b7dd3f2 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -1,6 +1,7 @@
 #ifndef __PERF_EVLIST_H
 #define __PERF_EVLIST_H 1
 
+#include <linux/atomic.h>
 #include <linux/list.h>
 #include <api/fd/array.h>
 #include <stdio.h>
@@ -8,6 +9,7 @@
 #include "event.h"
 #include "evsel.h"
 #include "util.h"
+#include "auxtrace.h"
 #include <unistd.h>
 
 struct pollfd;
@@ -26,8 +28,9 @@ struct record_opts;
 struct perf_mmap {
 	void		 *base;
 	int		 mask;
-	int		 refcnt;
+	atomic_t	 refcnt;
 	u64		 prev;
+	struct auxtrace_mmap auxtrace_mmap;
 	char		 event_copy[PERF_SAMPLE_MAX_SIZE] __attribute__((aligned(8)));
 };
 
@@ -37,6 +40,7 @@ struct perf_evlist {
 	int		 nr_entries;
 	int		 nr_groups;
 	int		 nr_mmaps;
+	bool		 overwrite;
 	size_t		 mmap_len;
 	int		 id_pos;
 	int		 is_pos;
@@ -45,7 +49,6 @@ struct perf_evlist {
 		int	cork_fd;
 		pid_t	pid;
 	} workload;
-	bool		 overwrite;
 	struct fdarray	 pollfd;
 	struct perf_mmap *mmap;
 	struct thread_map *threads;
@@ -122,10 +125,14 @@ int perf_evlist__start_workload(struct perf_evlist *evlist);
 
 struct option;
 
+int __perf_evlist__parse_mmap_pages(unsigned int *mmap_pages, const char *str);
 int perf_evlist__parse_mmap_pages(const struct option *opt,
 				  const char *str,
 				  int unset);
 
+int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
+			 bool overwrite, unsigned int auxtrace_pages,
+			 bool auxtrace_overwrite);
 int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
 		      bool overwrite);
 void perf_evlist__munmap(struct perf_evlist *evlist);
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 33e3fd8c2e682d..a3e36fc634dc40 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1121,6 +1121,7 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
 	PRINT_ATTRf(sample_stack_user, p_unsigned);
 	PRINT_ATTRf(clockid, p_signed);
 	PRINT_ATTRf(sample_regs_intr, p_hex);
+	PRINT_ATTRf(aux_watermark, p_unsigned);
 
 	return ret;
 }
@@ -2148,7 +2149,9 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target,
 	case EMFILE:
 		return scnprintf(msg, size, "%s",
 			 "Too many events are opened.\n"
-			 "Try again after reducing the number of events.");
+			 "Probably the maximum number of open file descriptors has been reached.\n"
+			 "Hint: Try again after reducing the number of events.\n"
+			 "Hint: Try increasing the limit with 'ulimit -n <limit>'");
 	case ENODEV:
 		if (target->cpu_list)
 			return scnprintf(msg, size, "%s",
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index e486151b030897..21ec08247d4778 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -73,7 +73,6 @@ struct perf_evsel {
 	char			*name;
 	double			scale;
 	const char		*unit;
-	bool			snapshot;
 	struct event_format	*tp_format;
 	union {
 		void		*priv;
@@ -86,6 +85,7 @@ struct perf_evsel {
 	unsigned int		sample_size;
 	int			id_pos;
 	int			is_pos;
+	bool			snapshot;
 	bool 			supported;
 	bool 			needs_swap;
 	bool			no_aux_samples;
@@ -93,11 +93,11 @@ struct perf_evsel {
 	bool			system_wide;
 	bool			tracking;
 	bool			per_pkg;
-	unsigned long		*per_pkg_mask;
 	/* parse modifier helper */
 	int			exclude_GH;
 	int			nr_members;
 	int			sample_read;
+	unsigned long		*per_pkg_mask;
 	struct perf_evsel	*leader;
 	char			*group_name;
 };
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 918fd8ae2d80bc..851143a7988d30 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -869,6 +869,20 @@ static int write_branch_stack(int fd __maybe_unused,
 	return 0;
 }
 
+static int write_auxtrace(int fd, struct perf_header *h,
+			  struct perf_evlist *evlist __maybe_unused)
+{
+	struct perf_session *session;
+	int err;
+
+	session = container_of(h, struct perf_session, header);
+
+	err = auxtrace_index__write(fd, &session->auxtrace_index);
+	if (err < 0)
+		pr_err("Failed to write auxtrace index\n");
+	return err;
+}
+
 static void print_hostname(struct perf_header *ph, int fd __maybe_unused,
 			   FILE *fp)
 {
@@ -1151,6 +1165,12 @@ static void print_branch_stack(struct perf_header *ph __maybe_unused,
 	fprintf(fp, "# contains samples with branch stack\n");
 }
 
+static void print_auxtrace(struct perf_header *ph __maybe_unused,
+			   int fd __maybe_unused, FILE *fp)
+{
+	fprintf(fp, "# contains AUX area data (e.g. instruction trace)\n");
+}
+
 static void print_pmu_mappings(struct perf_header *ph, int fd __maybe_unused,
 			       FILE *fp)
 {
@@ -1218,7 +1238,6 @@ static int __event_process_build_id(struct build_id_event *bev,
 				    struct perf_session *session)
 {
 	int err = -1;
-	struct dsos *dsos;
 	struct machine *machine;
 	u16 misc;
 	struct dso *dso;
@@ -1233,22 +1252,19 @@ static int __event_process_build_id(struct build_id_event *bev,
 	switch (misc) {
 	case PERF_RECORD_MISC_KERNEL:
 		dso_type = DSO_TYPE_KERNEL;
-		dsos = &machine->kernel_dsos;
 		break;
 	case PERF_RECORD_MISC_GUEST_KERNEL:
 		dso_type = DSO_TYPE_GUEST_KERNEL;
-		dsos = &machine->kernel_dsos;
 		break;
 	case PERF_RECORD_MISC_USER:
 	case PERF_RECORD_MISC_GUEST_USER:
 		dso_type = DSO_TYPE_USER;
-		dsos = &machine->user_dsos;
 		break;
 	default:
 		goto out;
 	}
 
-	dso = __dsos__findnew(dsos, filename);
+	dso = machine__findnew_dso(machine, filename);
 	if (dso != NULL) {
 		char sbuild_id[BUILD_ID_SIZE * 2 + 1];
 
@@ -1821,6 +1837,22 @@ out_free:
 	return ret;
 }
 
+static int process_auxtrace(struct perf_file_section *section,
+			    struct perf_header *ph, int fd,
+			    void *data __maybe_unused)
+{
+	struct perf_session *session;
+	int err;
+
+	session = container_of(ph, struct perf_session, header);
+
+	err = auxtrace_index__process(fd, section->size, session,
+				      ph->needs_swap);
+	if (err < 0)
+		pr_err("Failed to process auxtrace index\n");
+	return err;
+}
+
 struct feature_ops {
 	int (*write)(int fd, struct perf_header *h, struct perf_evlist *evlist);
 	void (*print)(struct perf_header *h, int fd, FILE *fp);
@@ -1861,6 +1893,7 @@ static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = {
 	FEAT_OPA(HEADER_BRANCH_STACK,	branch_stack),
 	FEAT_OPP(HEADER_PMU_MAPPINGS,	pmu_mappings),
 	FEAT_OPP(HEADER_GROUP_DESC,	group_desc),
+	FEAT_OPP(HEADER_AUXTRACE,	auxtrace),
 };
 
 struct header_print_data {
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 3bb90ac172a1ba..d4d57962c59129 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -30,6 +30,7 @@ enum {
 	HEADER_BRANCH_STACK,
 	HEADER_PMU_MAPPINGS,
 	HEADER_GROUP_DESC,
+	HEADER_AUXTRACE,
 	HEADER_LAST_FEATURE,
 	HEADER_FEAT_BITS	= 256,
 };
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index cc22b9158b93c4..f53d017c7c22ea 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -362,10 +362,10 @@ static u8 symbol__parent_filter(const struct symbol *parent)
 	return 0;
 }
 
-static struct hist_entry *add_hist_entry(struct hists *hists,
-					 struct hist_entry *entry,
-					 struct addr_location *al,
-					 bool sample_self)
+static struct hist_entry *hists__findnew_entry(struct hists *hists,
+					       struct hist_entry *entry,
+					       struct addr_location *al,
+					       bool sample_self)
 {
 	struct rb_node **p;
 	struct rb_node *parent = NULL;
@@ -468,7 +468,7 @@ struct hist_entry *__hists__add_entry(struct hists *hists,
 		.transaction = transaction,
 	};
 
-	return add_hist_entry(hists, &entry, al, sample_self);
+	return hists__findnew_entry(hists, &entry, al, sample_self);
 }
 
 static int
@@ -548,9 +548,9 @@ iter_finish_mem_entry(struct hist_entry_iter *iter,
 
 out:
 	/*
-	 * We don't need to free iter->priv (mem_info) here since
-	 * the mem info was either already freed in add_hist_entry() or
-	 * passed to a new hist entry by hist_entry__new().
+	 * We don't need to free iter->priv (mem_info) here since the mem info
+	 * was either already freed in hists__findnew_entry() or passed to a
+	 * new hist entry by hist_entry__new().
 	 */
 	iter->priv = NULL;
 
@@ -851,19 +851,15 @@ const struct hist_iter_ops hist_iter_cumulative = {
 };
 
 int hist_entry_iter__add(struct hist_entry_iter *iter, struct addr_location *al,
-			 struct perf_evsel *evsel, struct perf_sample *sample,
 			 int max_stack_depth, void *arg)
 {
 	int err, err2;
 
-	err = sample__resolve_callchain(sample, &iter->parent, evsel, al,
-					max_stack_depth);
+	err = sample__resolve_callchain(iter->sample, &iter->parent,
+					iter->evsel, al, max_stack_depth);
 	if (err)
 		return err;
 
-	iter->evsel = evsel;
-	iter->sample = sample;
-
 	err = iter->ops->prepare_entry(iter, al);
 	if (err)
 		goto out;
@@ -1163,7 +1159,7 @@ static void hists__remove_entry_filter(struct hists *hists, struct hist_entry *h
 		return;
 
 	/* force fold unfiltered entry for simplicity */
-	h->ms.unfolded = false;
+	h->unfolded = false;
 	h->row_offset = 0;
 	h->nr_rows = 0;
 
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 9f31b89a527a2e..5ed8d9c229814d 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -111,7 +111,6 @@ struct hist_entry *__hists__add_entry(struct hists *hists,
 				      u64 weight, u64 transaction,
 				      bool sample_self);
 int hist_entry_iter__add(struct hist_entry_iter *iter, struct addr_location *al,
-			 struct perf_evsel *evsel, struct perf_sample *sample,
 			 int max_stack_depth, void *arg);
 
 int64_t hist_entry__cmp(struct hist_entry *left, struct hist_entry *right);
diff --git a/tools/perf/util/include/linux/rbtree.h b/tools/perf/util/include/linux/rbtree.h
index 2a030c5af3aa20..f06d89f0b8678d 100644
--- a/tools/perf/util/include/linux/rbtree.h
+++ b/tools/perf/util/include/linux/rbtree.h
@@ -1,2 +1,16 @@
+#ifndef __TOOLS_LINUX_PERF_RBTREE_H
+#define __TOOLS_LINUX_PERF_RBTREE_H
 #include <stdbool.h>
 #include "../../../../include/linux/rbtree.h"
+
+/*
+ * Handy for checking that we are not deleting an entry that is
+ * already in a list, found in block/{blk-throttle,cfq-iosched}.c,
+ * probably should be moved to lib/rbtree.c...
+ */
+static inline void rb_erase_init(struct rb_node *n, struct rb_root *root)
+{
+	rb_erase(n, root);
+	RB_CLEAR_NODE(n);
+}
+#endif /* __TOOLS_LINUX_PERF_RBTREE_H */
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 527e032e24f6e6..2ed61f59d415ec 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -14,6 +14,8 @@
 #include "unwind.h"
 #include "linux/hash.h"
 
+static void __machine__remove_thread(struct machine *machine, struct thread *th, bool lock);
+
 static void dsos__init(struct dsos *dsos)
 {
 	INIT_LIST_HEAD(&dsos->head);
@@ -24,10 +26,10 @@ int machine__init(struct machine *machine, const char *root_dir, pid_t pid)
 {
 	map_groups__init(&machine->kmaps, machine);
 	RB_CLEAR_NODE(&machine->rb_node);
-	dsos__init(&machine->user_dsos);
-	dsos__init(&machine->kernel_dsos);
+	dsos__init(&machine->dsos);
 
 	machine->threads = RB_ROOT;
+	pthread_rwlock_init(&machine->threads_lock, NULL);
 	INIT_LIST_HEAD(&machine->dead_threads);
 	machine->last_match = NULL;
 
@@ -54,6 +56,7 @@ int machine__init(struct machine *machine, const char *root_dir, pid_t pid)
 
 		snprintf(comm, sizeof(comm), "[guest/%d]", pid);
 		thread__set_comm(thread, comm, 0);
+		thread__put(thread);
 	}
 
 	machine->current_tid = NULL;
@@ -91,24 +94,27 @@ static void dsos__delete(struct dsos *dsos)
 
 void machine__delete_threads(struct machine *machine)
 {
-	struct rb_node *nd = rb_first(&machine->threads);
+	struct rb_node *nd;
 
+	pthread_rwlock_wrlock(&machine->threads_lock);
+	nd = rb_first(&machine->threads);
 	while (nd) {
 		struct thread *t = rb_entry(nd, struct thread, rb_node);
 
 		nd = rb_next(nd);
-		machine__remove_thread(machine, t);
+		__machine__remove_thread(machine, t, false);
 	}
+	pthread_rwlock_unlock(&machine->threads_lock);
 }
 
 void machine__exit(struct machine *machine)
 {
 	map_groups__exit(&machine->kmaps);
-	dsos__delete(&machine->user_dsos);
-	dsos__delete(&machine->kernel_dsos);
-	vdso__exit(machine);
+	dsos__delete(&machine->dsos);
+	machine__exit_vdso(machine);
 	zfree(&machine->root_dir);
 	zfree(&machine->current_tid);
+	pthread_rwlock_destroy(&machine->threads_lock);
 }
 
 void machine__delete(struct machine *machine)
@@ -303,7 +309,7 @@ static void machine__update_thread_pid(struct machine *machine,
 	if (th->pid_ == th->tid)
 		return;
 
-	leader = machine__findnew_thread(machine, th->pid_, th->pid_);
+	leader = __machine__findnew_thread(machine, th->pid_, th->pid_);
 	if (!leader)
 		goto out_err;
 
@@ -325,7 +331,7 @@ static void machine__update_thread_pid(struct machine *machine,
 		if (!map_groups__empty(th->mg))
 			pr_err("Discarding thread maps for %d:%d\n",
 			       th->pid_, th->tid);
-		map_groups__delete(th->mg);
+		map_groups__put(th->mg);
 	}
 
 	th->mg = map_groups__get(leader->mg);
@@ -336,9 +342,9 @@ out_err:
 	pr_err("Failed to join map groups for %d:%d\n", th->pid_, th->tid);
 }
 
-static struct thread *__machine__findnew_thread(struct machine *machine,
-						pid_t pid, pid_t tid,
-						bool create)
+static struct thread *____machine__findnew_thread(struct machine *machine,
+						  pid_t pid, pid_t tid,
+						  bool create)
 {
 	struct rb_node **p = &machine->threads.rb_node;
 	struct rb_node *parent = NULL;
@@ -356,7 +362,7 @@ static struct thread *__machine__findnew_thread(struct machine *machine,
 			return th;
 		}
 
-		thread__zput(machine->last_match);
+		machine->last_match = NULL;
 	}
 
 	while (*p != NULL) {
@@ -364,7 +370,7 @@ static struct thread *__machine__findnew_thread(struct machine *machine,
 		th = rb_entry(parent, struct thread, rb_node);
 
 		if (th->tid == tid) {
-			machine->last_match = thread__get(th);
+			machine->last_match = th;
 			machine__update_thread_pid(machine, th, pid);
 			return th;
 		}
@@ -392,7 +398,8 @@ static struct thread *__machine__findnew_thread(struct machine *machine,
 		 * leader and that would screwed the rb tree.
 		 */
 		if (thread__init_map_groups(th, machine)) {
-			rb_erase(&th->rb_node, &machine->threads);
+			rb_erase_init(&th->rb_node, &machine->threads);
+			RB_CLEAR_NODE(&th->rb_node);
 			thread__delete(th);
 			return NULL;
 		}
@@ -400,22 +407,36 @@ static struct thread *__machine__findnew_thread(struct machine *machine,
 		 * It is now in the rbtree, get a ref
 		 */
 		thread__get(th);
-		machine->last_match = thread__get(th);
+		machine->last_match = th;
 	}
 
 	return th;
 }
 
+struct thread *__machine__findnew_thread(struct machine *machine, pid_t pid, pid_t tid)
+{
+	return ____machine__findnew_thread(machine, pid, tid, true);
+}
+
 struct thread *machine__findnew_thread(struct machine *machine, pid_t pid,
 				       pid_t tid)
 {
-	return __machine__findnew_thread(machine, pid, tid, true);
+	struct thread *th;
+
+	pthread_rwlock_wrlock(&machine->threads_lock);
+	th = thread__get(__machine__findnew_thread(machine, pid, tid));
+	pthread_rwlock_unlock(&machine->threads_lock);
+	return th;
 }
 
 struct thread *machine__find_thread(struct machine *machine, pid_t pid,
 				    pid_t tid)
 {
-	return __machine__findnew_thread(machine, pid, tid, false);
+	struct thread *th;
+	pthread_rwlock_rdlock(&machine->threads_lock);
+	th =  thread__get(____machine__findnew_thread(machine, pid, tid, false));
+	pthread_rwlock_unlock(&machine->threads_lock);
+	return th;
 }
 
 struct comm *machine__thread_exec_comm(struct machine *machine,
@@ -434,6 +455,7 @@ int machine__process_comm_event(struct machine *machine, union perf_event *event
 							event->comm.pid,
 							event->comm.tid);
 	bool exec = event->header.misc & PERF_RECORD_MISC_COMM_EXEC;
+	int err = 0;
 
 	if (exec)
 		machine->comm_exec = true;
@@ -444,10 +466,12 @@ int machine__process_comm_event(struct machine *machine, union perf_event *event
 	if (thread == NULL ||
 	    __thread__set_comm(thread, event->comm.comm, sample->time, exec)) {
 		dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
-		return -1;
+		err = -1;
 	}
 
-	return 0;
+	thread__put(thread);
+
+	return err;
 }
 
 int machine__process_lost_event(struct machine *machine __maybe_unused,
@@ -464,9 +488,9 @@ machine__module_dso(struct machine *machine, struct kmod_path *m,
 {
 	struct dso *dso;
 
-	dso = dsos__find(&machine->kernel_dsos, m->name, true);
+	dso = dsos__find(&machine->dsos, m->name, true);
 	if (!dso) {
-		dso = dsos__addnew(&machine->kernel_dsos, m->name);
+		dso = dsos__addnew(&machine->dsos, m->name);
 		if (dso == NULL)
 			return NULL;
 
@@ -486,6 +510,22 @@ machine__module_dso(struct machine *machine, struct kmod_path *m,
 	return dso;
 }
 
+int machine__process_aux_event(struct machine *machine __maybe_unused,
+			       union perf_event *event)
+{
+	if (dump_trace)
+		perf_event__fprintf_aux(event, stdout);
+	return 0;
+}
+
+int machine__process_itrace_start_event(struct machine *machine __maybe_unused,
+					union perf_event *event)
+{
+	if (dump_trace)
+		perf_event__fprintf_itrace_start(event, stdout);
+	return 0;
+}
+
 struct map *machine__new_module(struct machine *machine, u64 start,
 				const char *filename)
 {
@@ -519,13 +559,11 @@ out:
 size_t machines__fprintf_dsos(struct machines *machines, FILE *fp)
 {
 	struct rb_node *nd;
-	size_t ret = __dsos__fprintf(&machines->host.kernel_dsos.head, fp) +
-		     __dsos__fprintf(&machines->host.user_dsos.head, fp);
+	size_t ret = __dsos__fprintf(&machines->host.dsos.head, fp);
 
 	for (nd = rb_first(&machines->guests); nd; nd = rb_next(nd)) {
 		struct machine *pos = rb_entry(nd, struct machine, rb_node);
-		ret += __dsos__fprintf(&pos->kernel_dsos.head, fp);
-		ret += __dsos__fprintf(&pos->user_dsos.head, fp);
+		ret += __dsos__fprintf(&pos->dsos.head, fp);
 	}
 
 	return ret;
@@ -534,8 +572,7 @@ size_t machines__fprintf_dsos(struct machines *machines, FILE *fp)
 size_t machine__fprintf_dsos_buildid(struct machine *m, FILE *fp,
 				     bool (skip)(struct dso *dso, int parm), int parm)
 {
-	return __dsos__fprintf_buildid(&m->kernel_dsos.head, fp, skip, parm) +
-	       __dsos__fprintf_buildid(&m->user_dsos.head, fp, skip, parm);
+	return __dsos__fprintf_buildid(&m->dsos.head, fp, skip, parm);
 }
 
 size_t machines__fprintf_dsos_buildid(struct machines *machines, FILE *fp,
@@ -575,12 +612,16 @@ size_t machine__fprintf(struct machine *machine, FILE *fp)
 	size_t ret = 0;
 	struct rb_node *nd;
 
+	pthread_rwlock_rdlock(&machine->threads_lock);
+
 	for (nd = rb_first(&machine->threads); nd; nd = rb_next(nd)) {
 		struct thread *pos = rb_entry(nd, struct thread, rb_node);
 
 		ret += thread__fprintf(pos, fp);
 	}
 
+	pthread_rwlock_unlock(&machine->threads_lock);
+
 	return ret;
 }
 
@@ -594,9 +635,8 @@ static struct dso *machine__get_kernel(struct machine *machine)
 		if (!vmlinux_name)
 			vmlinux_name = "[kernel.kallsyms]";
 
-		kernel = dso__kernel_findnew(machine, vmlinux_name,
-					     "[kernel]",
-					     DSO_TYPE_KERNEL);
+		kernel = machine__findnew_kernel(machine, vmlinux_name,
+						 "[kernel]", DSO_TYPE_KERNEL);
 	} else {
 		char bf[PATH_MAX];
 
@@ -606,9 +646,9 @@ static struct dso *machine__get_kernel(struct machine *machine)
 			vmlinux_name = machine__mmap_name(machine, bf,
 							  sizeof(bf));
 
-		kernel = dso__kernel_findnew(machine, vmlinux_name,
-					     "[guest.kernel]",
-					     DSO_TYPE_GUEST_KERNEL);
+		kernel = machine__findnew_kernel(machine, vmlinux_name,
+						 "[guest.kernel]",
+						 DSO_TYPE_GUEST_KERNEL);
 	}
 
 	if (kernel != NULL && (!kernel->has_build_id))
@@ -713,7 +753,6 @@ void machine__destroy_kernel_maps(struct machine *machine)
 				kmap->ref_reloc_sym = NULL;
 		}
 
-		map__delete(machine->vmlinux_maps[type]);
 		machine->vmlinux_maps[type] = NULL;
 	}
 }
@@ -1062,7 +1101,7 @@ static bool machine__uses_kcore(struct machine *machine)
 {
 	struct dso *dso;
 
-	list_for_each_entry(dso, &machine->kernel_dsos.head, node) {
+	list_for_each_entry(dso, &machine->dsos.head, node) {
 		if (dso__is_kcore(dso))
 			return true;
 	}
@@ -1109,8 +1148,8 @@ static int machine__process_kernel_mmap_event(struct machine *machine,
 		struct dso *kernel = NULL;
 		struct dso *dso;
 
-		list_for_each_entry(dso, &machine->kernel_dsos.head, node) {
-			if (is_kernel_module(dso->long_name))
+		list_for_each_entry(dso, &machine->dsos.head, node) {
+			if (dso->kernel && is_kernel_module(dso->long_name))
 				continue;
 
 			kernel = dso;
@@ -1118,8 +1157,7 @@ static int machine__process_kernel_mmap_event(struct machine *machine,
 		}
 
 		if (kernel == NULL)
-			kernel = __dsos__findnew(&machine->kernel_dsos,
-						 kmmap_prefix);
+			kernel = machine__findnew_dso(machine, kmmap_prefix);
 		if (kernel == NULL)
 			goto out_problem;
 
@@ -1197,11 +1235,15 @@ int machine__process_mmap2_event(struct machine *machine,
 			event->mmap2.filename, type, thread);
 
 	if (map == NULL)
-		goto out_problem;
+		goto out_problem_map;
 
 	thread__insert_map(thread, map);
+	thread__put(thread);
+	map__put(map);
 	return 0;
 
+out_problem_map:
+	thread__put(thread);
 out_problem:
 	dump_printf("problem processing PERF_RECORD_MMAP2, skipping event.\n");
 	return 0;
@@ -1244,31 +1286,46 @@ int machine__process_mmap_event(struct machine *machine, union perf_event *event
 			type, thread);
 
 	if (map == NULL)
-		goto out_problem;
+		goto out_problem_map;
 
 	thread__insert_map(thread, map);
+	thread__put(thread);
+	map__put(map);
 	return 0;
 
+out_problem_map:
+	thread__put(thread);
 out_problem:
 	dump_printf("problem processing PERF_RECORD_MMAP, skipping event.\n");
 	return 0;
 }
 
-void machine__remove_thread(struct machine *machine, struct thread *th)
+static void __machine__remove_thread(struct machine *machine, struct thread *th, bool lock)
 {
 	if (machine->last_match == th)
-		thread__zput(machine->last_match);
+		machine->last_match = NULL;
 
-	rb_erase(&th->rb_node, &machine->threads);
+	BUG_ON(atomic_read(&th->refcnt) == 0);
+	if (lock)
+		pthread_rwlock_wrlock(&machine->threads_lock);
+	rb_erase_init(&th->rb_node, &machine->threads);
+	RB_CLEAR_NODE(&th->rb_node);
 	/*
 	 * Move it first to the dead_threads list, then drop the reference,
 	 * if this is the last reference, then the thread__delete destructor
 	 * will be called and we will remove it from the dead_threads list.
 	 */
 	list_add_tail(&th->node, &machine->dead_threads);
+	if (lock)
+		pthread_rwlock_unlock(&machine->threads_lock);
 	thread__put(th);
 }
 
+void machine__remove_thread(struct machine *machine, struct thread *th)
+{
+	return __machine__remove_thread(machine, th, true);
+}
+
 int machine__process_fork_event(struct machine *machine, union perf_event *event,
 				struct perf_sample *sample)
 {
@@ -1278,10 +1335,13 @@ int machine__process_fork_event(struct machine *machine, union perf_event *event
 	struct thread *parent = machine__findnew_thread(machine,
 							event->fork.ppid,
 							event->fork.ptid);
+	int err = 0;
 
 	/* if a thread currently exists for the thread id remove it */
-	if (thread != NULL)
+	if (thread != NULL) {
 		machine__remove_thread(machine, thread);
+		thread__put(thread);
+	}
 
 	thread = machine__findnew_thread(machine, event->fork.pid,
 					 event->fork.tid);
@@ -1291,10 +1351,12 @@ int machine__process_fork_event(struct machine *machine, union perf_event *event
 	if (thread == NULL || parent == NULL ||
 	    thread__fork(thread, parent, sample->time) < 0) {
 		dump_printf("problem processing PERF_RECORD_FORK, skipping event.\n");
-		return -1;
+		err = -1;
 	}
+	thread__put(thread);
+	thread__put(parent);
 
-	return 0;
+	return err;
 }
 
 int machine__process_exit_event(struct machine *machine, union perf_event *event,
@@ -1307,8 +1369,10 @@ int machine__process_exit_event(struct machine *machine, union perf_event *event
 	if (dump_trace)
 		perf_event__fprintf_task(event, stdout);
 
-	if (thread != NULL)
+	if (thread != NULL) {
 		thread__exited(thread);
+		thread__put(thread);
+	}
 
 	return 0;
 }
@@ -1331,6 +1395,11 @@ int machine__process_event(struct machine *machine, union perf_event *event,
 		ret = machine__process_exit_event(machine, event, sample); break;
 	case PERF_RECORD_LOST:
 		ret = machine__process_lost_event(machine, event, sample); break;
+	case PERF_RECORD_AUX:
+		ret = machine__process_aux_event(machine, event); break;
+	case PERF_RECORD_ITRACE_START:
+		ret = machine__process_itrace_start_event(machine, event);
+		break;
 	default:
 		ret = -1;
 		break;
@@ -1820,6 +1889,7 @@ int machine__set_current_tid(struct machine *machine, int cpu, pid_t pid,
 		return -ENOMEM;
 
 	thread->cpu = cpu;
+	thread__put(thread);
 
 	return 0;
 }
@@ -1845,3 +1915,8 @@ int machine__get_kernel_start(struct machine *machine)
 	}
 	return err;
 }
+
+struct dso *machine__findnew_dso(struct machine *machine, const char *filename)
+{
+	return __dsos__findnew(&machine->dsos, filename);
+}
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index 6d64cedb9d1e8f..39a0ca06cbd804 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -30,11 +30,11 @@ struct machine {
 	bool		  comm_exec;
 	char		  *root_dir;
 	struct rb_root	  threads;
+	pthread_rwlock_t  threads_lock;
 	struct list_head  dead_threads;
 	struct thread	  *last_match;
 	struct vdso_info  *vdso_info;
-	struct dsos	  user_dsos;
-	struct dsos	  kernel_dsos;
+	struct dsos	  dsos;
 	struct map_groups kmaps;
 	struct map	  *vmlinux_maps[MAP__NR_TYPES];
 	u64		  kernel_start;
@@ -81,6 +81,10 @@ int machine__process_fork_event(struct machine *machine, union perf_event *event
 				struct perf_sample *sample);
 int machine__process_lost_event(struct machine *machine, union perf_event *event,
 				struct perf_sample *sample);
+int machine__process_aux_event(struct machine *machine,
+			       union perf_event *event);
+int machine__process_itrace_start_event(struct machine *machine,
+					union perf_event *event);
 int machine__process_mmap_event(struct machine *machine, union perf_event *event,
 				struct perf_sample *sample);
 int machine__process_mmap2_event(struct machine *machine, union perf_event *event,
@@ -147,8 +151,10 @@ static inline bool machine__is_host(struct machine *machine)
 	return machine ? machine->pid == HOST_KERNEL_ID : false;
 }
 
-struct thread *machine__findnew_thread(struct machine *machine, pid_t pid,
-				       pid_t tid);
+struct thread *__machine__findnew_thread(struct machine *machine, pid_t pid, pid_t tid);
+struct thread *machine__findnew_thread(struct machine *machine, pid_t pid, pid_t tid);
+
+struct dso *machine__findnew_dso(struct machine *machine, const char *filename);
 
 size_t machine__fprintf(struct machine *machine, FILE *fp);
 
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index a14f08f4168639..365011c233a688 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -16,6 +16,8 @@
 #include "machine.h"
 #include <linux/string.h>
 
+static void __maps__insert(struct maps *maps, struct map *map);
+
 const char *map_type__name[MAP__NR_TYPES] = {
 	[MAP__FUNCTION] = "Functions",
 	[MAP__VARIABLE] = "Variables",
@@ -137,6 +139,7 @@ void map__init(struct map *map, enum map_type type,
 	map->groups   = NULL;
 	map->referenced = false;
 	map->erange_warned = false;
+	atomic_set(&map->refcnt, 1);
 }
 
 struct map *map__new(struct machine *machine, u64 start, u64 len,
@@ -175,9 +178,9 @@ struct map *map__new(struct machine *machine, u64 start, u64 len,
 
 		if (vdso) {
 			pgoff = 0;
-			dso = vdso__dso_findnew(machine, thread);
+			dso = machine__findnew_vdso(machine, thread);
 		} else
-			dso = __dsos__findnew(&machine->user_dsos, filename);
+			dso = machine__findnew_dso(machine, filename);
 
 		if (dso == NULL)
 			goto out_delete;
@@ -223,9 +226,16 @@ struct map *map__new2(u64 start, struct dso *dso, enum map_type type)
 
 void map__delete(struct map *map)
 {
+	BUG_ON(!RB_EMPTY_NODE(&map->rb_node));
 	free(map);
 }
 
+void map__put(struct map *map)
+{
+	if (map && atomic_dec_and_test(&map->refcnt))
+		map__delete(map);
+}
+
 void map__fixup_start(struct map *map)
 {
 	struct rb_root *symbols = &map->dso->symbols[map->type];
@@ -292,6 +302,11 @@ int map__load(struct map *map, symbol_filter_t filter)
 	return 0;
 }
 
+int __weak arch__compare_symbol_names(const char *namea, const char *nameb)
+{
+	return strcmp(namea, nameb);
+}
+
 struct symbol *map__find_symbol(struct map *map, u64 addr,
 				symbol_filter_t filter)
 {
@@ -413,48 +428,61 @@ u64 map__objdump_2mem(struct map *map, u64 ip)
 	return ip + map->reloc;
 }
 
+static void maps__init(struct maps *maps)
+{
+	maps->entries = RB_ROOT;
+	pthread_rwlock_init(&maps->lock, NULL);
+	INIT_LIST_HEAD(&maps->removed_maps);
+}
+
 void map_groups__init(struct map_groups *mg, struct machine *machine)
 {
 	int i;
 	for (i = 0; i < MAP__NR_TYPES; ++i) {
-		mg->maps[i] = RB_ROOT;
-		INIT_LIST_HEAD(&mg->removed_maps[i]);
+		maps__init(&mg->maps[i]);
 	}
 	mg->machine = machine;
-	mg->refcnt = 1;
+	atomic_set(&mg->refcnt, 1);
 }
 
-static void maps__delete(struct rb_root *maps)
+static void __maps__purge(struct maps *maps)
 {
-	struct rb_node *next = rb_first(maps);
+	struct rb_root *root = &maps->entries;
+	struct rb_node *next = rb_first(root);
 
 	while (next) {
 		struct map *pos = rb_entry(next, struct map, rb_node);
 
 		next = rb_next(&pos->rb_node);
-		rb_erase(&pos->rb_node, maps);
-		map__delete(pos);
+		rb_erase_init(&pos->rb_node, root);
+		map__put(pos);
 	}
 }
 
-static void maps__delete_removed(struct list_head *maps)
+static void __maps__purge_removed_maps(struct maps *maps)
 {
 	struct map *pos, *n;
 
-	list_for_each_entry_safe(pos, n, maps, node) {
-		list_del(&pos->node);
-		map__delete(pos);
+	list_for_each_entry_safe(pos, n, &maps->removed_maps, node) {
+		list_del_init(&pos->node);
+		map__put(pos);
 	}
 }
 
+static void maps__exit(struct maps *maps)
+{
+	pthread_rwlock_wrlock(&maps->lock);
+	__maps__purge(maps);
+	__maps__purge_removed_maps(maps);
+	pthread_rwlock_unlock(&maps->lock);
+}
+
 void map_groups__exit(struct map_groups *mg)
 {
 	int i;
 
-	for (i = 0; i < MAP__NR_TYPES; ++i) {
-		maps__delete(&mg->maps[i]);
-		maps__delete_removed(&mg->removed_maps[i]);
-	}
+	for (i = 0; i < MAP__NR_TYPES; ++i)
+		maps__exit(&mg->maps[i]);
 }
 
 bool map_groups__empty(struct map_groups *mg)
@@ -464,7 +492,7 @@ bool map_groups__empty(struct map_groups *mg)
 	for (i = 0; i < MAP__NR_TYPES; ++i) {
 		if (maps__first(&mg->maps[i]))
 			return false;
-		if (!list_empty(&mg->removed_maps[i]))
+		if (!list_empty(&mg->maps[i].removed_maps))
 			return false;
 	}
 
@@ -489,32 +517,10 @@ void map_groups__delete(struct map_groups *mg)
 
 void map_groups__put(struct map_groups *mg)
 {
-	if (--mg->refcnt == 0)
+	if (mg && atomic_dec_and_test(&mg->refcnt))
 		map_groups__delete(mg);
 }
 
-void map_groups__flush(struct map_groups *mg)
-{
-	int type;
-
-	for (type = 0; type < MAP__NR_TYPES; type++) {
-		struct rb_root *root = &mg->maps[type];
-		struct rb_node *next = rb_first(root);
-
-		while (next) {
-			struct map *pos = rb_entry(next, struct map, rb_node);
-			next = rb_next(&pos->rb_node);
-			rb_erase(&pos->rb_node, root);
-			/*
-			 * We may have references to this map, for
-			 * instance in some hist_entry instances, so
-			 * just move them to a separate list.
-			 */
-			list_add_tail(&pos->node, &mg->removed_maps[pos->type]);
-		}
-	}
-}
-
 struct symbol *map_groups__find_symbol(struct map_groups *mg,
 				       enum map_type type, u64 addr,
 				       struct map **mapp,
@@ -538,20 +544,28 @@ struct symbol *map_groups__find_symbol_by_name(struct map_groups *mg,
 					       struct map **mapp,
 					       symbol_filter_t filter)
 {
+	struct maps *maps = &mg->maps[type];
+	struct symbol *sym;
 	struct rb_node *nd;
 
-	for (nd = rb_first(&mg->maps[type]); nd; nd = rb_next(nd)) {
+	pthread_rwlock_rdlock(&maps->lock);
+
+	for (nd = rb_first(&maps->entries); nd; nd = rb_next(nd)) {
 		struct map *pos = rb_entry(nd, struct map, rb_node);
-		struct symbol *sym = map__find_symbol_by_name(pos, name, filter);
+
+		sym = map__find_symbol_by_name(pos, name, filter);
 
 		if (sym == NULL)
 			continue;
 		if (mapp != NULL)
 			*mapp = pos;
-		return sym;
+		goto out;
 	}
 
-	return NULL;
+	sym = NULL;
+out:
+	pthread_rwlock_unlock(&maps->lock);
+	return sym;
 }
 
 int map_groups__find_ams(struct addr_map_symbol *ams, symbol_filter_t filter)
@@ -571,25 +585,35 @@ int map_groups__find_ams(struct addr_map_symbol *ams, symbol_filter_t filter)
 	return ams->sym ? 0 : -1;
 }
 
-size_t __map_groups__fprintf_maps(struct map_groups *mg, enum map_type type,
-				  FILE *fp)
+static size_t maps__fprintf(struct maps *maps, FILE *fp)
 {
-	size_t printed = fprintf(fp, "%s:\n", map_type__name[type]);
+	size_t printed = 0;
 	struct rb_node *nd;
 
-	for (nd = rb_first(&mg->maps[type]); nd; nd = rb_next(nd)) {
+	pthread_rwlock_rdlock(&maps->lock);
+
+	for (nd = rb_first(&maps->entries); nd; nd = rb_next(nd)) {
 		struct map *pos = rb_entry(nd, struct map, rb_node);
 		printed += fprintf(fp, "Map:");
 		printed += map__fprintf(pos, fp);
 		if (verbose > 2) {
-			printed += dso__fprintf(pos->dso, type, fp);
+			printed += dso__fprintf(pos->dso, pos->type, fp);
 			printed += fprintf(fp, "--\n");
 		}
 	}
 
+	pthread_rwlock_unlock(&maps->lock);
+
 	return printed;
 }
 
+size_t __map_groups__fprintf_maps(struct map_groups *mg, enum map_type type,
+				  FILE *fp)
+{
+	size_t printed = fprintf(fp, "%s:\n", map_type__name[type]);
+	return printed += maps__fprintf(&mg->maps[type], fp);
+}
+
 static size_t map_groups__fprintf_maps(struct map_groups *mg, FILE *fp)
 {
 	size_t printed = 0, i;
@@ -604,7 +628,7 @@ static size_t __map_groups__fprintf_removed_maps(struct map_groups *mg,
 	struct map *pos;
 	size_t printed = 0;
 
-	list_for_each_entry(pos, &mg->removed_maps[type], node) {
+	list_for_each_entry(pos, &mg->maps[type].removed_maps, node) {
 		printed += fprintf(fp, "Map:");
 		printed += map__fprintf(pos, fp);
 		if (verbose > 1) {
@@ -631,13 +655,17 @@ size_t map_groups__fprintf(struct map_groups *mg, FILE *fp)
 	return printed + map_groups__fprintf_removed_maps(mg, fp);
 }
 
-int map_groups__fixup_overlappings(struct map_groups *mg, struct map *map,
-				   FILE *fp)
+static int maps__fixup_overlappings(struct maps *maps, struct map *map, FILE *fp)
 {
-	struct rb_root *root = &mg->maps[map->type];
-	struct rb_node *next = rb_first(root);
+	struct rb_root *root;
+	struct rb_node *next;
 	int err = 0;
 
+	pthread_rwlock_wrlock(&maps->lock);
+
+	root = &maps->entries;
+	next = rb_first(root);
+
 	while (next) {
 		struct map *pos = rb_entry(next, struct map, rb_node);
 		next = rb_next(&pos->rb_node);
@@ -651,7 +679,7 @@ int map_groups__fixup_overlappings(struct map_groups *mg, struct map *map,
 			map__fprintf(pos, fp);
 		}
 
-		rb_erase(&pos->rb_node, root);
+		rb_erase_init(&pos->rb_node, root);
 		/*
 		 * Now check if we need to create new maps for areas not
 		 * overlapped by the new map:
@@ -661,11 +689,11 @@ int map_groups__fixup_overlappings(struct map_groups *mg, struct map *map,
 
 			if (before == NULL) {
 				err = -ENOMEM;
-				goto move_map;
+				goto put_map;
 			}
 
 			before->end = map->start;
-			map_groups__insert(mg, before);
+			__maps__insert(maps, before);
 			if (verbose >= 2)
 				map__fprintf(before, fp);
 		}
@@ -675,28 +703,37 @@ int map_groups__fixup_overlappings(struct map_groups *mg, struct map *map,
 
 			if (after == NULL) {
 				err = -ENOMEM;
-				goto move_map;
+				goto put_map;
 			}
 
 			after->start = map->end;
-			map_groups__insert(mg, after);
+			__maps__insert(maps, after);
 			if (verbose >= 2)
 				map__fprintf(after, fp);
 		}
-move_map:
+put_map:
 		/*
 		 * If we have references, just move them to a separate list.
 		 */
 		if (pos->referenced)
-			list_add_tail(&pos->node, &mg->removed_maps[map->type]);
+			list_add_tail(&pos->node, &maps->removed_maps);
 		else
-			map__delete(pos);
+			map__put(pos);
 
 		if (err)
-			return err;
+			goto out;
 	}
 
-	return 0;
+	err = 0;
+out:
+	pthread_rwlock_unlock(&maps->lock);
+	return err;
+}
+
+int map_groups__fixup_overlappings(struct map_groups *mg, struct map *map,
+				   FILE *fp)
+{
+	return maps__fixup_overlappings(&mg->maps[map->type], map, fp);
 }
 
 /*
@@ -705,20 +742,28 @@ move_map:
 int map_groups__clone(struct map_groups *mg,
 		      struct map_groups *parent, enum map_type type)
 {
-	struct rb_node *nd;
-	for (nd = rb_first(&parent->maps[type]); nd; nd = rb_next(nd)) {
-		struct map *map = rb_entry(nd, struct map, rb_node);
+	int err = -ENOMEM;
+	struct map *map;
+	struct maps *maps = &parent->maps[type];
+
+	pthread_rwlock_rdlock(&maps->lock);
+
+	for (map = maps__first(maps); map; map = map__next(map)) {
 		struct map *new = map__clone(map);
 		if (new == NULL)
-			return -ENOMEM;
+			goto out_unlock;
 		map_groups__insert(mg, new);
 	}
-	return 0;
+
+	err = 0;
+out_unlock:
+	pthread_rwlock_unlock(&maps->lock);
+	return err;
 }
 
-void maps__insert(struct rb_root *maps, struct map *map)
+static void __maps__insert(struct maps *maps, struct map *map)
 {
-	struct rb_node **p = &maps->rb_node;
+	struct rb_node **p = &maps->entries.rb_node;
 	struct rb_node *parent = NULL;
 	const u64 ip = map->start;
 	struct map *m;
@@ -733,20 +778,38 @@ void maps__insert(struct rb_root *maps, struct map *map)
 	}
 
 	rb_link_node(&map->rb_node, parent, p);
-	rb_insert_color(&map->rb_node, maps);
+	rb_insert_color(&map->rb_node, &maps->entries);
+	map__get(map);
 }
 
-void maps__remove(struct rb_root *maps, struct map *map)
+void maps__insert(struct maps *maps, struct map *map)
 {
-	rb_erase(&map->rb_node, maps);
+	pthread_rwlock_wrlock(&maps->lock);
+	__maps__insert(maps, map);
+	pthread_rwlock_unlock(&maps->lock);
 }
 
-struct map *maps__find(struct rb_root *maps, u64 ip)
+static void __maps__remove(struct maps *maps, struct map *map)
 {
-	struct rb_node **p = &maps->rb_node;
-	struct rb_node *parent = NULL;
+	rb_erase_init(&map->rb_node, &maps->entries);
+	map__put(map);
+}
+
+void maps__remove(struct maps *maps, struct map *map)
+{
+	pthread_rwlock_wrlock(&maps->lock);
+	__maps__remove(maps, map);
+	pthread_rwlock_unlock(&maps->lock);
+}
+
+struct map *maps__find(struct maps *maps, u64 ip)
+{
+	struct rb_node **p, *parent = NULL;
 	struct map *m;
 
+	pthread_rwlock_rdlock(&maps->lock);
+
+	p = &maps->entries.rb_node;
 	while (*p != NULL) {
 		parent = *p;
 		m = rb_entry(parent, struct map, rb_node);
@@ -755,22 +818,25 @@ struct map *maps__find(struct rb_root *maps, u64 ip)
 		else if (ip >= m->end)
 			p = &(*p)->rb_right;
 		else
-			return m;
+			goto out;
 	}
 
-	return NULL;
+	m = NULL;
+out:
+	pthread_rwlock_unlock(&maps->lock);
+	return m;
 }
 
-struct map *maps__first(struct rb_root *maps)
+struct map *maps__first(struct maps *maps)
 {
-	struct rb_node *first = rb_first(maps);
+	struct rb_node *first = rb_first(&maps->entries);
 
 	if (first)
 		return rb_entry(first, struct map, rb_node);
 	return NULL;
 }
 
-struct map *maps__next(struct map *map)
+struct map *map__next(struct map *map)
 {
 	struct rb_node *next = rb_next(&map->rb_node);
 
diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h
index ec19c59ca38e07..b8df09d94aca50 100644
--- a/tools/perf/util/map.h
+++ b/tools/perf/util/map.h
@@ -1,9 +1,11 @@
 #ifndef __PERF_MAP_H
 #define __PERF_MAP_H
 
+#include <linux/atomic.h>
 #include <linux/compiler.h>
 #include <linux/list.h>
 #include <linux/rbtree.h>
+#include <pthread.h>
 #include <stdio.h>
 #include <stdbool.h>
 #include <linux/types.h>
@@ -50,6 +52,7 @@ struct map {
 
 	struct dso		*dso;
 	struct map_groups	*groups;
+	atomic_t		refcnt;
 };
 
 struct kmap {
@@ -57,11 +60,16 @@ struct kmap {
 	struct map_groups	*kmaps;
 };
 
+struct maps {
+	struct rb_root	 entries;
+	pthread_rwlock_t lock;
+	struct list_head removed_maps;
+};
+
 struct map_groups {
-	struct rb_root	 maps[MAP__NR_TYPES];
-	struct list_head removed_maps[MAP__NR_TYPES];
+	struct maps	 maps[MAP__NR_TYPES];
 	struct machine	 *machine;
-	int		 refcnt;
+	atomic_t	 refcnt;
 };
 
 struct map_groups *map_groups__new(struct machine *machine);
@@ -70,7 +78,8 @@ bool map_groups__empty(struct map_groups *mg);
 
 static inline struct map_groups *map_groups__get(struct map_groups *mg)
 {
-	++mg->refcnt;
+	if (mg)
+		atomic_inc(&mg->refcnt);
 	return mg;
 }
 
@@ -124,7 +133,7 @@ struct thread;
  */
 #define __map__for_each_symbol_by_name(map, sym_name, pos, filter)	\
 	for (pos = map__find_symbol_by_name(map, sym_name, filter);	\
-	     pos && strcmp(pos->name, sym_name) == 0;		\
+	     pos && arch__compare_symbol_names(pos->name, sym_name) == 0;	\
 	     pos = symbol__next_by_name(pos))
 
 #define map__for_each_symbol_by_name(map, sym_name, pos)		\
@@ -132,6 +141,7 @@ struct thread;
 
 typedef int (*symbol_filter_t)(struct map *map, struct symbol *sym);
 
+int arch__compare_symbol_names(const char *namea, const char *nameb);
 void map__init(struct map *map, enum map_type type,
 	       u64 start, u64 end, u64 pgoff, struct dso *dso);
 struct map *map__new(struct machine *machine, u64 start, u64 len,
@@ -141,6 +151,16 @@ struct map *map__new(struct machine *machine, u64 start, u64 len,
 struct map *map__new2(u64 start, struct dso *dso, enum map_type type);
 void map__delete(struct map *map);
 struct map *map__clone(struct map *map);
+
+static inline struct map *map__get(struct map *map)
+{
+	if (map)
+		atomic_inc(&map->refcnt);
+	return map;
+}
+
+void map__put(struct map *map);
+
 int map__overlap(struct map *l, struct map *r);
 size_t map__fprintf(struct map *map, FILE *fp);
 size_t map__fprintf_dsoname(struct map *map, FILE *fp);
@@ -159,11 +179,11 @@ void map__reloc_vmlinux(struct map *map);
 
 size_t __map_groups__fprintf_maps(struct map_groups *mg, enum map_type type,
 				  FILE *fp);
-void maps__insert(struct rb_root *maps, struct map *map);
-void maps__remove(struct rb_root *maps, struct map *map);
-struct map *maps__find(struct rb_root *maps, u64 addr);
-struct map *maps__first(struct rb_root *maps);
-struct map *maps__next(struct map *map);
+void maps__insert(struct maps *maps, struct map *map);
+void maps__remove(struct maps *maps, struct map *map);
+struct map *maps__find(struct maps *maps, u64 addr);
+struct map *maps__first(struct maps *maps);
+struct map *map__next(struct map *map);
 void map_groups__init(struct map_groups *mg, struct machine *machine);
 void map_groups__exit(struct map_groups *mg);
 int map_groups__clone(struct map_groups *mg,
@@ -198,7 +218,7 @@ static inline struct map *map_groups__first(struct map_groups *mg,
 
 static inline struct map *map_groups__next(struct map *map)
 {
-	return maps__next(map);
+	return map__next(map);
 }
 
 struct symbol *map_groups__find_symbol(struct map_groups *mg,
@@ -230,6 +250,4 @@ int map_groups__fixup_overlappings(struct map_groups *mg, struct map *map,
 struct map *map_groups__find_by_name(struct map_groups *mg,
 				     enum map_type type, const char *name);
 
-void map_groups__flush(struct map_groups *mg);
-
 #endif /* __PERF_MAP_H */
diff --git a/tools/perf/util/pager.c b/tools/perf/util/pager.c
index 31ee02d4e988a7..53ef006a951c3f 100644
--- a/tools/perf/util/pager.c
+++ b/tools/perf/util/pager.c
@@ -50,11 +50,6 @@ void setup_pager(void)
 
 	if (!isatty(1))
 		return;
-	if (!pager) {
-		if (!pager_program)
-			perf_config(perf_default_config, NULL);
-		pager = pager_program;
-	}
 	if (!pager)
 		pager = getenv("PAGER");
 	if (!(pager || access("/usr/bin/pager", X_OK)))
diff --git a/tools/perf/util/parse-branch-options.c b/tools/perf/util/parse-branch-options.c
new file mode 100644
index 00000000000000..9d999436658fd4
--- /dev/null
+++ b/tools/perf/util/parse-branch-options.c
@@ -0,0 +1,93 @@
+#include "perf.h"
+#include "util/util.h"
+#include "util/debug.h"
+#include "util/parse-options.h"
+#include "util/parse-branch-options.h"
+
+#define BRANCH_OPT(n, m) \
+	{ .name = n, .mode = (m) }
+
+#define BRANCH_END { .name = NULL }
+
+struct branch_mode {
+	const char *name;
+	int mode;
+};
+
+static const struct branch_mode branch_modes[] = {
+	BRANCH_OPT("u", PERF_SAMPLE_BRANCH_USER),
+	BRANCH_OPT("k", PERF_SAMPLE_BRANCH_KERNEL),
+	BRANCH_OPT("hv", PERF_SAMPLE_BRANCH_HV),
+	BRANCH_OPT("any", PERF_SAMPLE_BRANCH_ANY),
+	BRANCH_OPT("any_call", PERF_SAMPLE_BRANCH_ANY_CALL),
+	BRANCH_OPT("any_ret", PERF_SAMPLE_BRANCH_ANY_RETURN),
+	BRANCH_OPT("ind_call", PERF_SAMPLE_BRANCH_IND_CALL),
+	BRANCH_OPT("abort_tx", PERF_SAMPLE_BRANCH_ABORT_TX),
+	BRANCH_OPT("in_tx", PERF_SAMPLE_BRANCH_IN_TX),
+	BRANCH_OPT("no_tx", PERF_SAMPLE_BRANCH_NO_TX),
+	BRANCH_OPT("cond", PERF_SAMPLE_BRANCH_COND),
+	BRANCH_END
+};
+
+int
+parse_branch_stack(const struct option *opt, const char *str, int unset)
+{
+#define ONLY_PLM \
+	(PERF_SAMPLE_BRANCH_USER	|\
+	 PERF_SAMPLE_BRANCH_KERNEL	|\
+	 PERF_SAMPLE_BRANCH_HV)
+
+	uint64_t *mode = (uint64_t *)opt->value;
+	const struct branch_mode *br;
+	char *s, *os = NULL, *p;
+	int ret = -1;
+
+	if (unset)
+		return 0;
+
+	/*
+	 * cannot set it twice, -b + --branch-filter for instance
+	 */
+	if (*mode)
+		return -1;
+
+	/* str may be NULL in case no arg is passed to -b */
+	if (str) {
+		/* because str is read-only */
+		s = os = strdup(str);
+		if (!s)
+			return -1;
+
+		for (;;) {
+			p = strchr(s, ',');
+			if (p)
+				*p = '\0';
+
+			for (br = branch_modes; br->name; br++) {
+				if (!strcasecmp(s, br->name))
+					break;
+			}
+			if (!br->name) {
+				ui__warning("unknown branch filter %s,"
+					    " check man page\n", s);
+				goto error;
+			}
+
+			*mode |= br->mode;
+
+			if (!p)
+				break;
+
+			s = p + 1;
+		}
+	}
+	ret = 0;
+
+	/* default to any branch */
+	if ((*mode & ~ONLY_PLM) == 0) {
+		*mode = PERF_SAMPLE_BRANCH_ANY;
+	}
+error:
+	free(os);
+	return ret;
+}
diff --git a/tools/perf/util/parse-branch-options.h b/tools/perf/util/parse-branch-options.h
new file mode 100644
index 00000000000000..b9d9470c2e82d3
--- /dev/null
+++ b/tools/perf/util/parse-branch-options.h
@@ -0,0 +1,5 @@
+#ifndef _PERF_PARSE_BRANCH_OPTIONS_H
+#define _PERF_PARSE_BRANCH_OPTIONS_H 1
+struct option;
+int parse_branch_stack(const struct option *opt, const char *str, int unset);
+#endif /* _PERF_PARSE_BRANCH_OPTIONS_H */
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index be0655388b38e4..2a4d1ec0284647 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -17,6 +17,7 @@
 #include "parse-events-flex.h"
 #include "pmu.h"
 #include "thread_map.h"
+#include "asm/bug.h"
 
 #define MAX_NAME_LEN 100
 
@@ -538,16 +539,40 @@ int parse_events_add_breakpoint(struct list_head *list, int *idx,
 	return add_event(list, idx, &attr, NULL);
 }
 
+static int check_type_val(struct parse_events_term *term,
+			  struct parse_events_error *err,
+			  int type)
+{
+	if (type == term->type_val)
+		return 0;
+
+	if (err) {
+		err->idx = term->err_val;
+		if (type == PARSE_EVENTS__TERM_TYPE_NUM)
+			err->str = strdup("expected numeric value");
+		else
+			err->str = strdup("expected string value");
+	}
+	return -EINVAL;
+}
+
 static int config_term(struct perf_event_attr *attr,
-		       struct parse_events_term *term)
+		       struct parse_events_term *term,
+		       struct parse_events_error *err)
 {
-#define CHECK_TYPE_VAL(type)					\
-do {								\
-	if (PARSE_EVENTS__TERM_TYPE_ ## type != term->type_val)	\
-		return -EINVAL;					\
+#define CHECK_TYPE_VAL(type)						   \
+do {									   \
+	if (check_type_val(term, err, PARSE_EVENTS__TERM_TYPE_ ## type)) \
+		return -EINVAL;						   \
 } while (0)
 
 	switch (term->type_term) {
+	case PARSE_EVENTS__TERM_TYPE_USER:
+		/*
+		 * Always succeed for sysfs terms, as we dont know
+		 * at this point what type they need to have.
+		 */
+		return 0;
 	case PARSE_EVENTS__TERM_TYPE_CONFIG:
 		CHECK_TYPE_VAL(NUM);
 		attr->config = term->val.num;
@@ -582,18 +607,20 @@ do {								\
 }
 
 static int config_attr(struct perf_event_attr *attr,
-		       struct list_head *head, int fail)
+		       struct list_head *head,
+		       struct parse_events_error *err)
 {
 	struct parse_events_term *term;
 
 	list_for_each_entry(term, head, list)
-		if (config_term(attr, term) && fail)
+		if (config_term(attr, term, err))
 			return -EINVAL;
 
 	return 0;
 }
 
-int parse_events_add_numeric(struct list_head *list, int *idx,
+int parse_events_add_numeric(struct parse_events_evlist *data,
+			     struct list_head *list,
 			     u32 type, u64 config,
 			     struct list_head *head_config)
 {
@@ -604,10 +631,10 @@ int parse_events_add_numeric(struct list_head *list, int *idx,
 	attr.config = config;
 
 	if (head_config &&
-	    config_attr(&attr, head_config, 1))
+	    config_attr(&attr, head_config, data->error))
 		return -EINVAL;
 
-	return add_event(list, idx, &attr, NULL);
+	return add_event(list, &data->idx, &attr, NULL);
 }
 
 static int parse_events__is_name_term(struct parse_events_term *term)
@@ -626,8 +653,9 @@ static char *pmu_event_name(struct list_head *head_terms)
 	return NULL;
 }
 
-int parse_events_add_pmu(struct list_head *list, int *idx,
-			 char *name, struct list_head *head_config)
+int parse_events_add_pmu(struct parse_events_evlist *data,
+			 struct list_head *list, char *name,
+			 struct list_head *head_config)
 {
 	struct perf_event_attr attr;
 	struct perf_pmu_info info;
@@ -647,7 +675,7 @@ int parse_events_add_pmu(struct list_head *list, int *idx,
 
 	if (!head_config) {
 		attr.type = pmu->type;
-		evsel = __add_event(list, idx, &attr, NULL, pmu->cpus);
+		evsel = __add_event(list, &data->idx, &attr, NULL, pmu->cpus);
 		return evsel ? 0 : -ENOMEM;
 	}
 
@@ -658,13 +686,14 @@ int parse_events_add_pmu(struct list_head *list, int *idx,
 	 * Configure hardcoded terms first, no need to check
 	 * return value when called with fail == 0 ;)
 	 */
-	config_attr(&attr, head_config, 0);
+	if (config_attr(&attr, head_config, data->error))
+		return -EINVAL;
 
-	if (perf_pmu__config(pmu, &attr, head_config))
+	if (perf_pmu__config(pmu, &attr, head_config, data->error))
 		return -EINVAL;
 
-	evsel = __add_event(list, idx, &attr, pmu_event_name(head_config),
-			    pmu->cpus);
+	evsel = __add_event(list, &data->idx, &attr,
+			    pmu_event_name(head_config), pmu->cpus);
 	if (evsel) {
 		evsel->unit = info.unit;
 		evsel->scale = info.scale;
@@ -1019,11 +1048,13 @@ int parse_events_terms(struct list_head *terms, const char *str)
 	return ret;
 }
 
-int parse_events(struct perf_evlist *evlist, const char *str)
+int parse_events(struct perf_evlist *evlist, const char *str,
+		 struct parse_events_error *err)
 {
 	struct parse_events_evlist data = {
-		.list = LIST_HEAD_INIT(data.list),
-		.idx  = evlist->nr_entries,
+		.list  = LIST_HEAD_INIT(data.list),
+		.idx   = evlist->nr_entries,
+		.error = err,
 	};
 	int ret;
 
@@ -1044,16 +1075,87 @@ int parse_events(struct perf_evlist *evlist, const char *str)
 	return ret;
 }
 
+#define MAX_WIDTH 1000
+static int get_term_width(void)
+{
+	struct winsize ws;
+
+	get_term_dimensions(&ws);
+	return ws.ws_col > MAX_WIDTH ? MAX_WIDTH : ws.ws_col;
+}
+
+static void parse_events_print_error(struct parse_events_error *err,
+				     const char *event)
+{
+	const char *str = "invalid or unsupported event: ";
+	char _buf[MAX_WIDTH];
+	char *buf = (char *) event;
+	int idx = 0;
+
+	if (err->str) {
+		/* -2 for extra '' in the final fprintf */
+		int width       = get_term_width() - 2;
+		int len_event   = strlen(event);
+		int len_str, max_len, cut = 0;
+
+		/*
+		 * Maximum error index indent, we will cut
+		 * the event string if it's bigger.
+		 */
+		int max_err_idx = 10;
+
+		/*
+		 * Let's be specific with the message when
+		 * we have the precise error.
+		 */
+		str     = "event syntax error: ";
+		len_str = strlen(str);
+		max_len = width - len_str;
+
+		buf = _buf;
+
+		/* We're cutting from the beggining. */
+		if (err->idx > max_err_idx)
+			cut = err->idx - max_err_idx;
+
+		strncpy(buf, event + cut, max_len);
+
+		/* Mark cut parts with '..' on both sides. */
+		if (cut)
+			buf[0] = buf[1] = '.';
+
+		if ((len_event - cut) > max_len) {
+			buf[max_len - 1] = buf[max_len - 2] = '.';
+			buf[max_len] = 0;
+		}
+
+		idx = len_str + err->idx - cut;
+	}
+
+	fprintf(stderr, "%s'%s'\n", str, buf);
+	if (idx) {
+		fprintf(stderr, "%*s\\___ %s\n", idx + 1, "", err->str);
+		if (err->help)
+			fprintf(stderr, "\n%s\n", err->help);
+		free(err->str);
+		free(err->help);
+	}
+
+	fprintf(stderr, "Run 'perf list' for a list of valid events\n");
+}
+
+#undef MAX_WIDTH
+
 int parse_events_option(const struct option *opt, const char *str,
 			int unset __maybe_unused)
 {
 	struct perf_evlist *evlist = *(struct perf_evlist **)opt->value;
-	int ret = parse_events(evlist, str);
+	struct parse_events_error err = { .idx = 0, };
+	int ret = parse_events(evlist, str, &err);
+
+	if (ret)
+		parse_events_print_error(&err, str);
 
-	if (ret) {
-		fprintf(stderr, "invalid or unsupported event: '%s'\n", str);
-		fprintf(stderr, "Run 'perf list' for a list of valid events\n");
-	}
 	return ret;
 }
 
@@ -1460,7 +1562,7 @@ int parse_events__is_hardcoded_term(struct parse_events_term *term)
 
 static int new_term(struct parse_events_term **_term, int type_val,
 		    int type_term, char *config,
-		    char *str, u64 num)
+		    char *str, u64 num, int err_term, int err_val)
 {
 	struct parse_events_term *term;
 
@@ -1472,6 +1574,8 @@ static int new_term(struct parse_events_term **_term, int type_val,
 	term->type_val  = type_val;
 	term->type_term = type_term;
 	term->config = config;
+	term->err_term = err_term;
+	term->err_val  = err_val;
 
 	switch (type_val) {
 	case PARSE_EVENTS__TERM_TYPE_NUM:
@@ -1490,17 +1594,29 @@ static int new_term(struct parse_events_term **_term, int type_val,
 }
 
 int parse_events_term__num(struct parse_events_term **term,
-			   int type_term, char *config, u64 num)
+			   int type_term, char *config, u64 num,
+			   void *loc_term_, void *loc_val_)
 {
+	YYLTYPE *loc_term = loc_term_;
+	YYLTYPE *loc_val = loc_val_;
+
 	return new_term(term, PARSE_EVENTS__TERM_TYPE_NUM, type_term,
-			config, NULL, num);
+			config, NULL, num,
+			loc_term ? loc_term->first_column : 0,
+			loc_val ? loc_val->first_column : 0);
 }
 
 int parse_events_term__str(struct parse_events_term **term,
-			   int type_term, char *config, char *str)
+			   int type_term, char *config, char *str,
+			   void *loc_term_, void *loc_val_)
 {
+	YYLTYPE *loc_term = loc_term_;
+	YYLTYPE *loc_val = loc_val_;
+
 	return new_term(term, PARSE_EVENTS__TERM_TYPE_STR, type_term,
-			config, str, 0);
+			config, str, 0,
+			loc_term ? loc_term->first_column : 0,
+			loc_val ? loc_val->first_column : 0);
 }
 
 int parse_events_term__sym_hw(struct parse_events_term **term,
@@ -1514,18 +1630,20 @@ int parse_events_term__sym_hw(struct parse_events_term **term,
 	if (config)
 		return new_term(term, PARSE_EVENTS__TERM_TYPE_STR,
 				PARSE_EVENTS__TERM_TYPE_USER, config,
-				(char *) sym->symbol, 0);
+				(char *) sym->symbol, 0, 0, 0);
 	else
 		return new_term(term, PARSE_EVENTS__TERM_TYPE_STR,
 				PARSE_EVENTS__TERM_TYPE_USER,
-				(char *) "event", (char *) sym->symbol, 0);
+				(char *) "event", (char *) sym->symbol,
+				0, 0, 0);
 }
 
 int parse_events_term__clone(struct parse_events_term **new,
 			     struct parse_events_term *term)
 {
 	return new_term(new, term->type_val, term->type_term, term->config,
-			term->val.str, term->val.num);
+			term->val.str, term->val.num,
+			term->err_term, term->err_val);
 }
 
 void parse_events__free_terms(struct list_head *terms)
@@ -1535,3 +1653,15 @@ void parse_events__free_terms(struct list_head *terms)
 	list_for_each_entry_safe(term, h, terms, list)
 		free(term);
 }
+
+void parse_events_evlist_error(struct parse_events_evlist *data,
+			       int idx, const char *str)
+{
+	struct parse_events_error *err = data->error;
+
+	if (!err)
+		return;
+	err->idx = idx;
+	err->str = strdup(str);
+	WARN_ONCE(!err->str, "WARNING: failed to allocate error string");
+}
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index 52a2dda4f954a7..131f29b2f13258 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -12,6 +12,7 @@
 struct list_head;
 struct perf_evsel;
 struct perf_evlist;
+struct parse_events_error;
 
 struct option;
 
@@ -29,7 +30,8 @@ const char *event_type(int type);
 
 extern int parse_events_option(const struct option *opt, const char *str,
 			       int unset);
-extern int parse_events(struct perf_evlist *evlist, const char *str);
+extern int parse_events(struct perf_evlist *evlist, const char *str,
+			struct parse_events_error *error);
 extern int parse_events_terms(struct list_head *terms, const char *str);
 extern int parse_filter(const struct option *opt, const char *str, int unset);
 
@@ -72,12 +74,23 @@ struct parse_events_term {
 	int type_term;
 	struct list_head list;
 	bool used;
+
+	/* error string indexes for within parsed string */
+	int err_term;
+	int err_val;
+};
+
+struct parse_events_error {
+	int   idx;	/* index in the parsed string */
+	char *str;      /* string to display at the index */
+	char *help;	/* optional help string */
 };
 
 struct parse_events_evlist {
-	struct list_head list;
-	int idx;
-	int nr_groups;
+	struct list_head	   list;
+	int			   idx;
+	int			   nr_groups;
+	struct parse_events_error *error;
 };
 
 struct parse_events_terms {
@@ -85,10 +98,12 @@ struct parse_events_terms {
 };
 
 int parse_events__is_hardcoded_term(struct parse_events_term *term);
-int parse_events_term__num(struct parse_events_term **_term,
-			   int type_term, char *config, u64 num);
-int parse_events_term__str(struct parse_events_term **_term,
-			   int type_term, char *config, char *str);
+int parse_events_term__num(struct parse_events_term **term,
+			   int type_term, char *config, u64 num,
+			   void *loc_term, void *loc_val);
+int parse_events_term__str(struct parse_events_term **term,
+			   int type_term, char *config, char *str,
+			   void *loc_term, void *loc_val);
 int parse_events_term__sym_hw(struct parse_events_term **term,
 			      char *config, unsigned idx);
 int parse_events_term__clone(struct parse_events_term **new,
@@ -99,21 +114,24 @@ int parse_events__modifier_group(struct list_head *list, char *event_mod);
 int parse_events_name(struct list_head *list, char *name);
 int parse_events_add_tracepoint(struct list_head *list, int *idx,
 				char *sys, char *event);
-int parse_events_add_numeric(struct list_head *list, int *idx,
+int parse_events_add_numeric(struct parse_events_evlist *data,
+			     struct list_head *list,
 			     u32 type, u64 config,
 			     struct list_head *head_config);
 int parse_events_add_cache(struct list_head *list, int *idx,
 			   char *type, char *op_result1, char *op_result2);
 int parse_events_add_breakpoint(struct list_head *list, int *idx,
 				void *ptr, char *type, u64 len);
-int parse_events_add_pmu(struct list_head *list, int *idx,
-			 char *pmu , struct list_head *head_config);
+int parse_events_add_pmu(struct parse_events_evlist *data,
+			 struct list_head *list, char *name,
+			 struct list_head *head_config);
 enum perf_pmu_event_symbol_type
 perf_pmu__parse_check(const char *name);
 void parse_events__set_leader(char *name, struct list_head *list);
 void parse_events_update_lists(struct list_head *list_event,
 			       struct list_head *list_all);
-void parse_events_error(void *data, void *scanner, char const *msg);
+void parse_events_evlist_error(struct parse_events_evlist *data,
+			       int idx, const char *str);
 
 void print_events(const char *event_glob, bool name_only);
 
diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
index 8895cf3132ab24..09e738fe9ea279 100644
--- a/tools/perf/util/parse-events.l
+++ b/tools/perf/util/parse-events.l
@@ -3,6 +3,8 @@
 %option bison-bridge
 %option prefix="parse_events_"
 %option stack
+%option bison-locations
+%option yylineno
 
 %{
 #include <errno.h>
@@ -51,6 +53,18 @@ static int str(yyscan_t scanner, int token)
 	return token;
 }
 
+#define REWIND(__alloc)				\
+do {								\
+	YYSTYPE *__yylval = parse_events_get_lval(yyscanner);	\
+	char *text = parse_events_get_text(yyscanner);		\
+								\
+	if (__alloc)						\
+		__yylval->str = strdup(text);			\
+								\
+	yycolumn -= strlen(text);				\
+	yyless(0);						\
+} while (0)
+
 static int pmu_str_check(yyscan_t scanner)
 {
 	YYSTYPE *yylval = parse_events_get_lval(scanner);
@@ -85,6 +99,13 @@ static int term(yyscan_t scanner, int type)
 	return PE_TERM;
 }
 
+#define YY_USER_ACTION					\
+do {							\
+	yylloc->last_column  = yylloc->first_column;	\
+	yylloc->first_column = yycolumn;		\
+	yycolumn += yyleng;				\
+} while (0);
+
 %}
 
 %x mem
@@ -119,6 +140,12 @@ modifier_bp	[rwx]{1,3}
 
 		if (start_token) {
 			parse_events_set_extra(NULL, yyscanner);
+			/*
+			 * The flex parser does not init locations variable
+			 * via the scan_string interface, so we need do the
+			 * init in here.
+			 */
+			yycolumn = 0;
 			return start_token;
 		}
          }
@@ -127,24 +154,30 @@ modifier_bp	[rwx]{1,3}
 <event>{
 
 {group}		{
-			BEGIN(INITIAL); yyless(0);
+			BEGIN(INITIAL);
+			REWIND(0);
 		}
 
 {event_pmu}	|
 {event}		{
-			str(yyscanner, PE_EVENT_NAME);
-			BEGIN(INITIAL); yyless(0);
+			BEGIN(INITIAL);
+			REWIND(1);
 			return PE_EVENT_NAME;
 		}
 
 .		|
 <<EOF>>		{
-			BEGIN(INITIAL); yyless(0);
+			BEGIN(INITIAL);
+			REWIND(0);
 		}
 
 }
 
 <config>{
+	/*
+	 * Please update formats_error_string any time
+	 * new static term is added.
+	 */
 config			{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG); }
 config1			{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG1); }
 config2			{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG2); }
diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y
index 72def077dbbfda..591905a02b926b 100644
--- a/tools/perf/util/parse-events.y
+++ b/tools/perf/util/parse-events.y
@@ -2,6 +2,7 @@
 %parse-param {void *_data}
 %parse-param {void *scanner}
 %lex-param {void* scanner}
+%locations
 
 %{
 
@@ -14,8 +15,6 @@
 #include "parse-events.h"
 #include "parse-events-bison.h"
 
-extern int parse_events_lex (YYSTYPE* lvalp, void* scanner);
-
 #define ABORT_ON(val) \
 do { \
 	if (val) \
@@ -208,7 +207,7 @@ PE_NAME '/' event_config '/'
 	struct list_head *list;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_pmu(list, &data->idx, $1, $3));
+	ABORT_ON(parse_events_add_pmu(data, list, $1, $3));
 	parse_events__free_terms($3);
 	$$ = list;
 }
@@ -219,7 +218,7 @@ PE_NAME '/' '/'
 	struct list_head *list;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_pmu(list, &data->idx, $1, NULL));
+	ABORT_ON(parse_events_add_pmu(data, list, $1, NULL));
 	$$ = list;
 }
 |
@@ -232,11 +231,11 @@ PE_KERNEL_PMU_EVENT sep_dc
 
 	ALLOC_LIST(head);
 	ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
-					$1, 1));
+					$1, 1, &@1, NULL));
 	list_add_tail(&term->list, head);
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_pmu(list, &data->idx, "cpu", head));
+	ABORT_ON(parse_events_add_pmu(data, list, "cpu", head));
 	parse_events__free_terms(head);
 	$$ = list;
 }
@@ -252,7 +251,7 @@ PE_PMU_EVENT_PRE '-' PE_PMU_EVENT_SUF sep_dc
 
 	ALLOC_LIST(head);
 	ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
-					&pmu_name, 1));
+					&pmu_name, 1, &@1, NULL));
 	list_add_tail(&term->list, head);
 
 	ALLOC_LIST(list);
@@ -275,8 +274,7 @@ value_sym '/' event_config '/'
 	int config = $1 & 255;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_numeric(list, &data->idx,
-					  type, config, $3));
+	ABORT_ON(parse_events_add_numeric(data, list, type, config, $3));
 	parse_events__free_terms($3);
 	$$ = list;
 }
@@ -289,8 +287,7 @@ value_sym sep_slash_dc
 	int config = $1 & 255;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_numeric(list, &data->idx,
-					  type, config, NULL));
+	ABORT_ON(parse_events_add_numeric(data, list, type, config, NULL));
 	$$ = list;
 }
 
@@ -389,7 +386,15 @@ PE_NAME ':' PE_NAME
 	struct list_head *list;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_tracepoint(list, &data->idx, $1, $3));
+	if (parse_events_add_tracepoint(list, &data->idx, $1, $3)) {
+		struct parse_events_error *error = data->error;
+
+		if (error) {
+			error->idx = @1.first_column;
+			error->str = strdup("unknown tracepoint");
+		}
+		return -1;
+	}
 	$$ = list;
 }
 
@@ -400,7 +405,7 @@ PE_VALUE ':' PE_VALUE
 	struct list_head *list;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_numeric(list, &data->idx, (u32)$1, $3, NULL));
+	ABORT_ON(parse_events_add_numeric(data, list, (u32)$1, $3, NULL));
 	$$ = list;
 }
 
@@ -411,8 +416,7 @@ PE_RAW
 	struct list_head *list;
 
 	ALLOC_LIST(list);
-	ABORT_ON(parse_events_add_numeric(list, &data->idx,
-					  PERF_TYPE_RAW, $1, NULL));
+	ABORT_ON(parse_events_add_numeric(data, list, PERF_TYPE_RAW, $1, NULL));
 	$$ = list;
 }
 
@@ -450,7 +454,7 @@ PE_NAME '=' PE_NAME
 	struct parse_events_term *term;
 
 	ABORT_ON(parse_events_term__str(&term, PARSE_EVENTS__TERM_TYPE_USER,
-					$1, $3));
+					$1, $3, &@1, &@3));
 	$$ = term;
 }
 |
@@ -459,7 +463,7 @@ PE_NAME '=' PE_VALUE
 	struct parse_events_term *term;
 
 	ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
-					$1, $3));
+					$1, $3, &@1, &@3));
 	$$ = term;
 }
 |
@@ -477,7 +481,7 @@ PE_NAME
 	struct parse_events_term *term;
 
 	ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
-					$1, 1));
+					$1, 1, &@1, NULL));
 	$$ = term;
 }
 |
@@ -494,7 +498,7 @@ PE_TERM '=' PE_NAME
 {
 	struct parse_events_term *term;
 
-	ABORT_ON(parse_events_term__str(&term, (int)$1, NULL, $3));
+	ABORT_ON(parse_events_term__str(&term, (int)$1, NULL, $3, &@1, &@3));
 	$$ = term;
 }
 |
@@ -502,7 +506,7 @@ PE_TERM '=' PE_VALUE
 {
 	struct parse_events_term *term;
 
-	ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, $3));
+	ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, $3, &@1, &@3));
 	$$ = term;
 }
 |
@@ -510,7 +514,7 @@ PE_TERM
 {
 	struct parse_events_term *term;
 
-	ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, 1));
+	ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, 1, &@1, NULL));
 	$$ = term;
 }
 
@@ -520,7 +524,9 @@ sep_slash_dc: '/' | ':' |
 
 %%
 
-void parse_events_error(void *data __maybe_unused, void *scanner __maybe_unused,
+void parse_events_error(YYLTYPE *loc, void *data,
+			void *scanner __maybe_unused,
 			char const *msg __maybe_unused)
 {
+	parse_events_evlist_error(data, loc->last_column, "parser error");
 }
diff --git a/tools/perf/util/parse-options.h b/tools/perf/util/parse-options.h
index 59561fd8627827..367d8b816cc7e7 100644
--- a/tools/perf/util/parse-options.h
+++ b/tools/perf/util/parse-options.h
@@ -123,6 +123,10 @@ struct option {
 #define OPT_LONG(s, l, v, h)        { .type = OPTION_LONG, .short_name = (s), .long_name = (l), .value = check_vtype(v, long *), .help = (h) }
 #define OPT_U64(s, l, v, h)         { .type = OPTION_U64, .short_name = (s), .long_name = (l), .value = check_vtype(v, u64 *), .help = (h) }
 #define OPT_STRING(s, l, v, a, h)   { .type = OPTION_STRING,  .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), (a), .help = (h) }
+#define OPT_STRING_OPTARG(s, l, v, a, h, d) \
+	{ .type = OPTION_STRING,  .short_name = (s), .long_name = (l), \
+	  .value = check_vtype(v, const char **), (a), .help = (h), \
+	  .flags = PARSE_OPT_OPTARG, .defval = (intptr_t)(d) }
 #define OPT_STRING_NOEMPTY(s, l, v, a, h)   { .type = OPTION_STRING,  .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), (a), .help = (h), .flags = PARSE_OPT_NOEMPTY}
 #define OPT_DATE(s, l, v, h) \
 	{ .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = "time", .help = (h), .callback = parse_opt_approxidate_cb }
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 48411674da0f9c..5d3ab7c8ceaf00 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -442,6 +442,10 @@ static struct perf_pmu *pmu_lookup(const char *name)
 	LIST_HEAD(aliases);
 	__u32 type;
 
+	/* No support for intel_bts or intel_pt so disallow them */
+	if (!strcmp(name, "intel_bts") || !strcmp(name, "intel_pt"))
+		return NULL;
+
 	/*
 	 * The pmu data we store & need consists of the pmu
 	 * type value and format definitions. Load both right
@@ -579,6 +583,38 @@ static int pmu_resolve_param_term(struct parse_events_term *term,
 	return -1;
 }
 
+static char *formats_error_string(struct list_head *formats)
+{
+	struct perf_pmu_format *format;
+	char *err, *str;
+	static const char *static_terms = "config,config1,config2,name,period,branch_type\n";
+	unsigned i = 0;
+
+	if (!asprintf(&str, "valid terms:"))
+		return NULL;
+
+	/* sysfs exported terms */
+	list_for_each_entry(format, formats, list) {
+		char c = i++ ? ',' : ' ';
+
+		err = str;
+		if (!asprintf(&str, "%s%c%s", err, c, format->name))
+			goto fail;
+		free(err);
+	}
+
+	/* static terms */
+	err = str;
+	if (!asprintf(&str, "%s,%s", err, static_terms))
+		goto fail;
+
+	free(err);
+	return str;
+fail:
+	free(err);
+	return NULL;
+}
+
 /*
  * Setup one of config[12] attr members based on the
  * user input data - term parameter.
@@ -587,7 +623,7 @@ static int pmu_config_term(struct list_head *formats,
 			   struct perf_event_attr *attr,
 			   struct parse_events_term *term,
 			   struct list_head *head_terms,
-			   bool zero)
+			   bool zero, struct parse_events_error *err)
 {
 	struct perf_pmu_format *format;
 	__u64 *vp;
@@ -611,6 +647,11 @@ static int pmu_config_term(struct list_head *formats,
 	if (!format) {
 		if (verbose)
 			printf("Invalid event/parameter '%s'\n", term->config);
+		if (err) {
+			err->idx  = term->err_term;
+			err->str  = strdup("unknown term");
+			err->help = formats_error_string(formats);
+		}
 		return -EINVAL;
 	}
 
@@ -636,9 +677,14 @@ static int pmu_config_term(struct list_head *formats,
 		val = term->val.num;
 	else if (term->type_val == PARSE_EVENTS__TERM_TYPE_STR) {
 		if (strcmp(term->val.str, "?")) {
-			if (verbose)
+			if (verbose) {
 				pr_info("Invalid sysfs entry %s=%s\n",
 						term->config, term->val.str);
+			}
+			if (err) {
+				err->idx = term->err_val;
+				err->str = strdup("expected numeric value");
+			}
 			return -EINVAL;
 		}
 
@@ -654,12 +700,13 @@ static int pmu_config_term(struct list_head *formats,
 int perf_pmu__config_terms(struct list_head *formats,
 			   struct perf_event_attr *attr,
 			   struct list_head *head_terms,
-			   bool zero)
+			   bool zero, struct parse_events_error *err)
 {
 	struct parse_events_term *term;
 
 	list_for_each_entry(term, head_terms, list) {
-		if (pmu_config_term(formats, attr, term, head_terms, zero))
+		if (pmu_config_term(formats, attr, term, head_terms,
+				    zero, err))
 			return -EINVAL;
 	}
 
@@ -672,12 +719,14 @@ int perf_pmu__config_terms(struct list_head *formats,
  * 2) pmu format definitions - specified by pmu parameter
  */
 int perf_pmu__config(struct perf_pmu *pmu, struct perf_event_attr *attr,
-		     struct list_head *head_terms)
+		     struct list_head *head_terms,
+		     struct parse_events_error *err)
 {
 	bool zero = !!pmu->default_config;
 
 	attr->type = pmu->type;
-	return perf_pmu__config_terms(&pmu->format, attr, head_terms, zero);
+	return perf_pmu__config_terms(&pmu->format, attr, head_terms,
+				      zero, err);
 }
 
 static struct perf_pmu_alias *pmu_find_alias(struct perf_pmu *pmu,
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 6b1249fbdb5f3a..7b9c8cf8ae3e59 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -4,6 +4,7 @@
 #include <linux/bitmap.h>
 #include <linux/perf_event.h>
 #include <stdbool.h>
+#include "parse-events.h"
 
 enum {
 	PERF_PMU_FORMAT_VALUE_CONFIG,
@@ -47,11 +48,12 @@ struct perf_pmu_alias {
 
 struct perf_pmu *perf_pmu__find(const char *name);
 int perf_pmu__config(struct perf_pmu *pmu, struct perf_event_attr *attr,
-		     struct list_head *head_terms);
+		     struct list_head *head_terms,
+		     struct parse_events_error *error);
 int perf_pmu__config_terms(struct list_head *formats,
 			   struct perf_event_attr *attr,
 			   struct list_head *head_terms,
-			   bool zero);
+			   bool zero, struct parse_events_error *error);
 int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms,
 			  struct perf_pmu_info *info);
 struct list_head *perf_pmu__alias(struct perf_pmu *pmu,
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index d05b77cf35f770..d27edef5eb5b55 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -51,6 +51,7 @@
 #define PERFPROBE_GROUP "probe"
 
 bool probe_event_dry_run;	/* Dry run flag */
+struct probe_conf probe_conf;
 
 #define semantic_error(msg ...) pr_err("Semantic error :" msg)
 
@@ -161,8 +162,9 @@ static u64 kernel_get_symbol_address_by_name(const char *name, bool reloc)
 
 static struct map *kernel_get_module_map(const char *module)
 {
-	struct rb_node *nd;
 	struct map_groups *grp = &host_machine->kmaps;
+	struct maps *maps = &grp->maps[MAP__FUNCTION];
+	struct map *pos;
 
 	/* A file path -- this is an offline module */
 	if (module && strchr(module, '/'))
@@ -171,8 +173,7 @@ static struct map *kernel_get_module_map(const char *module)
 	if (!module)
 		module = "kernel";
 
-	for (nd = rb_first(&grp->maps[MAP__FUNCTION]); nd; nd = rb_next(nd)) {
-		struct map *pos = rb_entry(nd, struct map, rb_node);
+	for (pos = maps__first(maps); pos; pos = map__next(pos)) {
 		if (strncmp(pos->dso->short_name + 1, module,
 			    pos->dso->short_name_len - 2) == 0) {
 			return pos;
@@ -194,52 +195,11 @@ static void put_target_map(struct map *map, bool user)
 {
 	if (map && user) {
 		/* Only the user map needs to be released */
-		dso__delete(map->dso);
-		map__delete(map);
+		map__put(map);
 	}
 }
 
 
-static struct dso *kernel_get_module_dso(const char *module)
-{
-	struct dso *dso;
-	struct map *map;
-	const char *vmlinux_name;
-
-	if (module) {
-		list_for_each_entry(dso, &host_machine->kernel_dsos.head,
-				    node) {
-			if (strncmp(dso->short_name + 1, module,
-				    dso->short_name_len - 2) == 0)
-				goto found;
-		}
-		pr_debug("Failed to find module %s.\n", module);
-		return NULL;
-	}
-
-	map = host_machine->vmlinux_maps[MAP__FUNCTION];
-	dso = map->dso;
-
-	vmlinux_name = symbol_conf.vmlinux_name;
-	if (vmlinux_name) {
-		if (dso__load_vmlinux(dso, map, vmlinux_name, false, NULL) <= 0)
-			return NULL;
-	} else {
-		if (dso__load_vmlinux_path(dso, map, NULL) <= 0) {
-			pr_debug("Failed to load kernel map.\n");
-			return NULL;
-		}
-	}
-found:
-	return dso;
-}
-
-const char *kernel_get_module_path(const char *module)
-{
-	struct dso *dso = kernel_get_module_dso(module);
-	return (dso) ? dso->long_name : NULL;
-}
-
 static int convert_exec_to_group(const char *exec, char **result)
 {
 	char *ptr1, *ptr2, *exec_copy;
@@ -287,6 +247,40 @@ static void clear_probe_trace_events(struct probe_trace_event *tevs, int ntevs)
 }
 
 #ifdef HAVE_DWARF_SUPPORT
+
+static int kernel_get_module_dso(const char *module, struct dso **pdso)
+{
+	struct dso *dso;
+	struct map *map;
+	const char *vmlinux_name;
+	int ret = 0;
+
+	if (module) {
+		list_for_each_entry(dso, &host_machine->dsos.head, node) {
+			if (!dso->kernel)
+				continue;
+			if (strncmp(dso->short_name + 1, module,
+				    dso->short_name_len - 2) == 0)
+				goto found;
+		}
+		pr_debug("Failed to find module %s.\n", module);
+		return -ENOENT;
+	}
+
+	map = host_machine->vmlinux_maps[MAP__FUNCTION];
+	dso = map->dso;
+
+	vmlinux_name = symbol_conf.vmlinux_name;
+	dso->load_errno = 0;
+	if (vmlinux_name)
+		ret = dso__load_vmlinux(dso, map, vmlinux_name, false, NULL);
+	else
+		ret = dso__load_vmlinux_path(dso, map, NULL);
+found:
+	*pdso = dso;
+	return ret;
+}
+
 /*
  * Some binaries like glibc have special symbols which are on the symbol
  * table, but not in the debuginfo. If we can find the address of the
@@ -344,15 +338,14 @@ out:
 
 static int get_alternative_probe_event(struct debuginfo *dinfo,
 				       struct perf_probe_event *pev,
-				       struct perf_probe_point *tmp,
-				       const char *target)
+				       struct perf_probe_point *tmp)
 {
 	int ret;
 
 	memcpy(tmp, &pev->point, sizeof(*tmp));
 	memset(&pev->point, 0, sizeof(pev->point));
 	ret = find_alternative_probe_point(dinfo, tmp, &pev->point,
-					   target, pev->uprobes);
+					   pev->target, pev->uprobes);
 	if (ret < 0)
 		memcpy(&pev->point, tmp, sizeof(*tmp));
 
@@ -390,16 +383,25 @@ static int get_alternative_line_range(struct debuginfo *dinfo,
 static struct debuginfo *open_debuginfo(const char *module, bool silent)
 {
 	const char *path = module;
-	struct debuginfo *ret;
+	char reason[STRERR_BUFSIZE];
+	struct debuginfo *ret = NULL;
+	struct dso *dso = NULL;
+	int err;
 
 	if (!module || !strchr(module, '/')) {
-		path = kernel_get_module_path(module);
-		if (!path) {
+		err = kernel_get_module_dso(module, &dso);
+		if (err < 0) {
+			if (!dso || dso->load_errno == 0) {
+				if (!strerror_r(-err, reason, STRERR_BUFSIZE))
+					strcpy(reason, "(unknown)");
+			} else
+				dso__strerror_load(dso, reason, STRERR_BUFSIZE);
 			if (!silent)
-				pr_err("Failed to find path of %s module.\n",
-				       module ?: "kernel");
+				pr_err("Failed to find the path for %s: %s\n",
+					module ?: "kernel", reason);
 			return NULL;
 		}
+		path = dso->long_name;
 	}
 	ret = debuginfo__new(path);
 	if (!ret && !silent) {
@@ -557,8 +559,9 @@ static int post_process_probe_trace_events(struct probe_trace_event *tevs,
 					   bool uprobe)
 {
 	struct ref_reloc_sym *reloc_sym;
+	u64 etext_addr;
 	char *tmp;
-	int i;
+	int i, skipped = 0;
 
 	if (uprobe)
 		return add_exec_to_probe_trace_events(tevs, ntevs, module);
@@ -572,33 +575,45 @@ static int post_process_probe_trace_events(struct probe_trace_event *tevs,
 		pr_warning("Relocated base symbol is not found!\n");
 		return -EINVAL;
 	}
+	/* Get the address of _etext for checking non-probable text symbol */
+	etext_addr = kernel_get_symbol_address_by_name("_etext", false);
 
 	for (i = 0; i < ntevs; i++) {
 		if (tevs[i].point.address && !tevs[i].point.retprobe) {
-			tmp = strdup(reloc_sym->name);
-			if (!tmp)
-				return -ENOMEM;
-			free(tevs[i].point.symbol);
+			/* If we found a wrong one, mark it by NULL symbol */
+			if (etext_addr < tevs[i].point.address) {
+				pr_warning("%s+%lu is out of .text, skip it.\n",
+				   tevs[i].point.symbol, tevs[i].point.offset);
+				tmp = NULL;
+				skipped++;
+			} else {
+				tmp = strdup(reloc_sym->name);
+				if (!tmp)
+					return -ENOMEM;
+			}
+			/* If we have no realname, use symbol for it */
+			if (!tevs[i].point.realname)
+				tevs[i].point.realname = tevs[i].point.symbol;
+			else
+				free(tevs[i].point.symbol);
 			tevs[i].point.symbol = tmp;
 			tevs[i].point.offset = tevs[i].point.address -
 					       reloc_sym->unrelocated_addr;
 		}
 	}
-	return 0;
+	return skipped;
 }
 
 /* Try to find perf_probe_event with debuginfo */
 static int try_to_find_probe_trace_events(struct perf_probe_event *pev,
-					  struct probe_trace_event **tevs,
-					  int max_tevs, const char *target)
+					  struct probe_trace_event **tevs)
 {
 	bool need_dwarf = perf_probe_event_need_dwarf(pev);
 	struct perf_probe_point tmp;
 	struct debuginfo *dinfo;
 	int ntevs, ret = 0;
 
-	dinfo = open_debuginfo(target, !need_dwarf);
-
+	dinfo = open_debuginfo(pev->target, !need_dwarf);
 	if (!dinfo) {
 		if (need_dwarf)
 			return -ENOENT;
@@ -608,13 +623,12 @@ static int try_to_find_probe_trace_events(struct perf_probe_event *pev,
 
 	pr_debug("Try to find probe point from debuginfo.\n");
 	/* Searching trace events corresponding to a probe event */
-	ntevs = debuginfo__find_trace_events(dinfo, pev, tevs, max_tevs);
+	ntevs = debuginfo__find_trace_events(dinfo, pev, tevs);
 
 	if (ntevs == 0)	{  /* Not found, retry with an alternative */
-		ret = get_alternative_probe_event(dinfo, pev, &tmp, target);
+		ret = get_alternative_probe_event(dinfo, pev, &tmp);
 		if (!ret) {
-			ntevs = debuginfo__find_trace_events(dinfo, pev,
-							     tevs, max_tevs);
+			ntevs = debuginfo__find_trace_events(dinfo, pev, tevs);
 			/*
 			 * Write back to the original probe_event for
 			 * setting appropriate (user given) event name
@@ -629,12 +643,15 @@ static int try_to_find_probe_trace_events(struct perf_probe_event *pev,
 	if (ntevs > 0) {	/* Succeeded to find trace events */
 		pr_debug("Found %d probe_trace_events.\n", ntevs);
 		ret = post_process_probe_trace_events(*tevs, ntevs,
-							target, pev->uprobes);
-		if (ret < 0) {
+						pev->target, pev->uprobes);
+		if (ret < 0 || ret == ntevs) {
 			clear_probe_trace_events(*tevs, ntevs);
 			zfree(tevs);
 		}
-		return ret < 0 ? ret : ntevs;
+		if (ret != ntevs)
+			return ret < 0 ? ret : ntevs;
+		ntevs = 0;
+		/* Fall through */
 	}
 
 	if (ntevs == 0)	{	/* No error but failed to find probe point. */
@@ -809,8 +826,7 @@ int show_line_range(struct line_range *lr, const char *module, bool user)
 
 static int show_available_vars_at(struct debuginfo *dinfo,
 				  struct perf_probe_event *pev,
-				  int max_vls, struct strfilter *_filter,
-				  bool externs, const char *target)
+				  struct strfilter *_filter)
 {
 	char *buf;
 	int ret, i, nvars;
@@ -824,13 +840,12 @@ static int show_available_vars_at(struct debuginfo *dinfo,
 		return -EINVAL;
 	pr_debug("Searching variables at %s\n", buf);
 
-	ret = debuginfo__find_available_vars_at(dinfo, pev, &vls,
-						max_vls, externs);
+	ret = debuginfo__find_available_vars_at(dinfo, pev, &vls);
 	if (!ret) {  /* Not found, retry with an alternative */
-		ret = get_alternative_probe_event(dinfo, pev, &tmp, target);
+		ret = get_alternative_probe_event(dinfo, pev, &tmp);
 		if (!ret) {
 			ret = debuginfo__find_available_vars_at(dinfo, pev,
-						&vls, max_vls, externs);
+								&vls);
 			/* Release the old probe_point */
 			clear_perf_probe_point(&tmp);
 		}
@@ -877,8 +892,7 @@ end:
 
 /* Show available variables on given probe point */
 int show_available_vars(struct perf_probe_event *pevs, int npevs,
-			int max_vls, const char *module,
-			struct strfilter *_filter, bool externs)
+			struct strfilter *_filter)
 {
 	int i, ret = 0;
 	struct debuginfo *dinfo;
@@ -887,7 +901,7 @@ int show_available_vars(struct perf_probe_event *pevs, int npevs,
 	if (ret < 0)
 		return ret;
 
-	dinfo = open_debuginfo(module, false);
+	dinfo = open_debuginfo(pevs->target, false);
 	if (!dinfo) {
 		ret = -ENOENT;
 		goto out;
@@ -896,8 +910,7 @@ int show_available_vars(struct perf_probe_event *pevs, int npevs,
 	setup_pager();
 
 	for (i = 0; i < npevs && ret >= 0; i++)
-		ret = show_available_vars_at(dinfo, &pevs[i], max_vls, _filter,
-					     externs, module);
+		ret = show_available_vars_at(dinfo, &pevs[i], _filter);
 
 	debuginfo__delete(dinfo);
 out:
@@ -916,9 +929,7 @@ find_perf_probe_point_from_dwarf(struct probe_trace_point *tp __maybe_unused,
 }
 
 static int try_to_find_probe_trace_events(struct perf_probe_event *pev,
-				struct probe_trace_event **tevs __maybe_unused,
-				int max_tevs __maybe_unused,
-				const char *target __maybe_unused)
+				struct probe_trace_event **tevs __maybe_unused)
 {
 	if (perf_probe_event_need_dwarf(pev)) {
 		pr_warning("Debuginfo-analysis is not supported.\n");
@@ -937,10 +948,8 @@ int show_line_range(struct line_range *lr __maybe_unused,
 }
 
 int show_available_vars(struct perf_probe_event *pevs __maybe_unused,
-			int npevs __maybe_unused, int max_vls __maybe_unused,
-			const char *module __maybe_unused,
-			struct strfilter *filter __maybe_unused,
-			bool externs __maybe_unused)
+			int npevs __maybe_unused,
+			struct strfilter *filter __maybe_unused)
 {
 	pr_warning("Debuginfo-analysis is not supported.\n");
 	return -ENOSYS;
@@ -980,6 +989,18 @@ static int parse_line_num(char **ptr, int *val, const char *what)
 	return 0;
 }
 
+/* Check the name is good for event, group or function */
+static bool is_c_func_name(const char *name)
+{
+	if (!isalpha(*name) && *name != '_')
+		return false;
+	while (*++name != '\0') {
+		if (!isalpha(*name) && !isdigit(*name) && *name != '_')
+			return false;
+	}
+	return true;
+}
+
 /*
  * Stuff 'lr' according to the line range described by 'arg'.
  * The line range syntax is described by:
@@ -1048,10 +1069,15 @@ int parse_line_range_desc(const char *arg, struct line_range *lr)
 			goto err;
 		}
 		lr->function = name;
-	} else if (strchr(name, '.'))
+	} else if (strchr(name, '/') || strchr(name, '.'))
 		lr->file = name;
-	else
+	else if (is_c_func_name(name))/* We reuse it for checking funcname */
 		lr->function = name;
+	else {	/* Invalid name */
+		semantic_error("'%s' is not a valid function name.\n", name);
+		err = -EINVAL;
+		goto err;
+	}
 
 	return 0;
 err:
@@ -1059,24 +1085,13 @@ err:
 	return err;
 }
 
-/* Check the name is good for event/group */
-static bool check_event_name(const char *name)
-{
-	if (!isalpha(*name) && *name != '_')
-		return false;
-	while (*++name != '\0') {
-		if (!isalpha(*name) && !isdigit(*name) && *name != '_')
-			return false;
-	}
-	return true;
-}
-
 /* Parse probepoint definition. */
 static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
 {
 	struct perf_probe_point *pp = &pev->point;
 	char *ptr, *tmp;
 	char c, nc = 0;
+	bool file_spec = false;
 	/*
 	 * <Syntax>
 	 * perf probe [EVENT=]SRC[:LN|;PTN]
@@ -1095,7 +1110,7 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
 			semantic_error("Group name is not supported yet.\n");
 			return -ENOTSUP;
 		}
-		if (!check_event_name(arg)) {
+		if (!is_c_func_name(arg)) {
 			semantic_error("%s is bad for event name -it must "
 				       "follow C symbol-naming rule.\n", arg);
 			return -EINVAL;
@@ -1107,6 +1122,23 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
 		arg = tmp;
 	}
 
+	/*
+	 * Check arg is function or file name and copy it.
+	 *
+	 * We consider arg to be a file spec if and only if it satisfies
+	 * all of the below criteria::
+	 * - it does not include any of "+@%",
+	 * - it includes one of ":;", and
+	 * - it has a period '.' in the name.
+	 *
+	 * Otherwise, we consider arg to be a function specification.
+	 */
+	if (!strpbrk(arg, "+@%") && (ptr = strpbrk(arg, ";:")) != NULL) {
+		/* This is a file spec if it includes a '.' before ; or : */
+		if (memchr(arg, '.', ptr - arg))
+			file_spec = true;
+	}
+
 	ptr = strpbrk(arg, ";:+@%");
 	if (ptr) {
 		nc = *ptr;
@@ -1117,10 +1149,9 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
 	if (tmp == NULL)
 		return -ENOMEM;
 
-	/* Check arg is function or file and copy it */
-	if (strchr(tmp, '.'))	/* File */
+	if (file_spec)
 		pp->file = tmp;
-	else			/* Function */
+	else
 		pp->function = tmp;
 
 	/* Parse other options */
@@ -1762,8 +1793,7 @@ static int find_perf_probe_point_from_map(struct probe_trace_point *tp,
 
 out:
 	if (map && !is_kprobe) {
-		dso__delete(map->dso);
-		map__delete(map);
+		map__put(map);
 	}
 
 	return ret;
@@ -1877,6 +1907,7 @@ static void clear_probe_trace_event(struct probe_trace_event *tev)
 	free(tev->event);
 	free(tev->group);
 	free(tev->point.symbol);
+	free(tev->point.realname);
 	free(tev->point.module);
 	for (i = 0; i < tev->nargs; i++) {
 		free(tev->args[i].name);
@@ -1954,7 +1985,7 @@ static int open_probe_events(const char *trace_file, bool readwrite)
 	if (ret >= 0) {
 		pr_debug("Opening %s write=%d\n", buf, readwrite);
 		if (readwrite && !probe_event_dry_run)
-			ret = open(buf, O_RDWR, O_APPEND);
+			ret = open(buf, O_RDWR | O_APPEND, 0);
 		else
 			ret = open(buf, O_RDONLY, 0);
 
@@ -2131,7 +2162,23 @@ static int show_perf_probe_event(struct perf_probe_event *pev,
 	return ret;
 }
 
-static int __show_perf_probe_events(int fd, bool is_kprobe)
+static bool filter_probe_trace_event(struct probe_trace_event *tev,
+				     struct strfilter *filter)
+{
+	char tmp[128];
+
+	/* At first, check the event name itself */
+	if (strfilter__compare(filter, tev->event))
+		return true;
+
+	/* Next, check the combination of name and group */
+	if (e_snprintf(tmp, 128, "%s:%s", tev->group, tev->event) < 0)
+		return false;
+	return strfilter__compare(filter, tmp);
+}
+
+static int __show_perf_probe_events(int fd, bool is_kprobe,
+				    struct strfilter *filter)
 {
 	int ret = 0;
 	struct probe_trace_event tev;
@@ -2149,12 +2196,15 @@ static int __show_perf_probe_events(int fd, bool is_kprobe)
 	strlist__for_each(ent, rawlist) {
 		ret = parse_probe_trace_command(ent->s, &tev);
 		if (ret >= 0) {
+			if (!filter_probe_trace_event(&tev, filter))
+				goto next;
 			ret = convert_to_perf_probe_event(&tev, &pev,
 								is_kprobe);
 			if (ret >= 0)
 				ret = show_perf_probe_event(&pev,
 							    tev.point.module);
 		}
+next:
 		clear_perf_probe_event(&pev);
 		clear_probe_trace_event(&tev);
 		if (ret < 0)
@@ -2166,7 +2216,7 @@ static int __show_perf_probe_events(int fd, bool is_kprobe)
 }
 
 /* List up current perf-probe events */
-int show_perf_probe_events(void)
+int show_perf_probe_events(struct strfilter *filter)
 {
 	int kp_fd, up_fd, ret;
 
@@ -2178,7 +2228,7 @@ int show_perf_probe_events(void)
 
 	kp_fd = open_kprobe_events(false);
 	if (kp_fd >= 0) {
-		ret = __show_perf_probe_events(kp_fd, true);
+		ret = __show_perf_probe_events(kp_fd, true, filter);
 		close(kp_fd);
 		if (ret < 0)
 			goto out;
@@ -2192,7 +2242,7 @@ int show_perf_probe_events(void)
 	}
 
 	if (up_fd >= 0) {
-		ret = __show_perf_probe_events(up_fd, false);
+		ret = __show_perf_probe_events(up_fd, false, filter);
 		close(up_fd);
 	}
 out:
@@ -2267,6 +2317,9 @@ static int get_new_event_name(char *buf, size_t len, const char *base,
 {
 	int i, ret;
 
+	if (*base == '.')
+		base++;
+
 	/* Try no suffix */
 	ret = e_snprintf(buf, len, "%s", base);
 	if (ret < 0) {
@@ -2332,6 +2385,7 @@ static int __add_probe_trace_events(struct perf_probe_event *pev,
 	struct strlist *namelist;
 	LIST_HEAD(blacklist);
 	struct kprobe_blacklist_node *node;
+	bool safename;
 
 	if (pev->uprobes)
 		fd = open_uprobe_events(true);
@@ -2347,7 +2401,8 @@ static int __add_probe_trace_events(struct perf_probe_event *pev,
 	namelist = get_probe_trace_event_names(fd, false);
 	if (!namelist) {
 		pr_debug("Failed to get current event list.\n");
-		return -EIO;
+		ret = -ENOMEM;
+		goto close_out;
 	}
 	/* Get kprobe blacklist if exists */
 	if (!pev->uprobes) {
@@ -2356,10 +2411,14 @@ static int __add_probe_trace_events(struct perf_probe_event *pev,
 			pr_debug("No kprobe blacklist support, ignored\n");
 	}
 
+	safename = (pev->point.function && !strisglob(pev->point.function));
 	ret = 0;
 	pr_info("Added new event%s\n", (ntevs > 1) ? "s:" : ":");
 	for (i = 0; i < ntevs; i++) {
 		tev = &tevs[i];
+		/* Skip if the symbol is out of .text (marked previously) */
+		if (!tev->point.symbol)
+			continue;
 		/* Ensure that the address is NOT blacklisted */
 		node = kprobe_blacklist__find_by_address(&blacklist,
 							 tev->point.address);
@@ -2371,10 +2430,10 @@ static int __add_probe_trace_events(struct perf_probe_event *pev,
 		if (pev->event)
 			event = pev->event;
 		else
-			if (pev->point.function)
+			if (safename)
 				event = pev->point.function;
 			else
-				event = tev->point.symbol;
+				event = tev->point.realname;
 		if (pev->group)
 			group = pev->group;
 		else
@@ -2430,6 +2489,7 @@ static int __add_probe_trace_events(struct perf_probe_event *pev,
 
 	kprobe_blacklist__delete(&blacklist);
 	strlist__delete(namelist);
+close_out:
 	close(fd);
 	return ret;
 }
@@ -2438,9 +2498,14 @@ static int find_probe_functions(struct map *map, char *name)
 {
 	int found = 0;
 	struct symbol *sym;
+	struct rb_node *tmp;
+
+	if (map__load(map, NULL) < 0)
+		return 0;
 
-	map__for_each_symbol_by_name(map, name, sym) {
-		found++;
+	map__for_each_symbol(map, sym, tmp) {
+		if (strglobmatch(sym->name, name))
+			found++;
 	}
 
 	return found;
@@ -2449,13 +2514,16 @@ static int find_probe_functions(struct map *map, char *name)
 #define strdup_or_goto(str, label)	\
 	({ char *__p = strdup(str); if (!__p) goto label; __p; })
 
+void __weak arch__fix_tev_from_maps(struct perf_probe_event *pev __maybe_unused,
+				struct probe_trace_event *tev __maybe_unused,
+				struct map *map __maybe_unused) { }
+
 /*
  * Find probe function addresses from map.
  * Return an error or the number of found probe_trace_event
  */
 static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
-					    struct probe_trace_event **tevs,
-					    int max_tevs, const char *target)
+					    struct probe_trace_event **tevs)
 {
 	struct map *map = NULL;
 	struct ref_reloc_sym *reloc_sym = NULL;
@@ -2466,7 +2534,7 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
 	int num_matched_functions;
 	int ret, i;
 
-	map = get_target_map(target, pev->uprobes);
+	map = get_target_map(pev->target, pev->uprobes);
 	if (!map) {
 		ret = -EINVAL;
 		goto out;
@@ -2479,12 +2547,12 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
 	num_matched_functions = find_probe_functions(map, pp->function);
 	if (num_matched_functions == 0) {
 		pr_err("Failed to find symbol %s in %s\n", pp->function,
-			target ? : "kernel");
+			pev->target ? : "kernel");
 		ret = -ENOENT;
 		goto out;
-	} else if (num_matched_functions > max_tevs) {
+	} else if (num_matched_functions > probe_conf.max_probes) {
 		pr_err("Too many functions matched in %s\n",
-			target ? : "kernel");
+			pev->target ? : "kernel");
 		ret = -E2BIG;
 		goto out;
 	}
@@ -2532,8 +2600,9 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
 			tp->offset = pp->offset;
 		}
 		tp->retprobe = pp->retprobe;
-		if (target)
-			tev->point.module = strdup_or_goto(target, nomem_out);
+		if (pev->target)
+			tev->point.module = strdup_or_goto(pev->target,
+							   nomem_out);
 		tev->uprobes = pev->uprobes;
 		tev->nargs = pev->nargs;
 		if (tev->nargs) {
@@ -2555,6 +2624,7 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
 					strdup_or_goto(pev->args[i].type,
 							nomem_out);
 		}
+		arch__fix_tev_from_maps(pev, tev, map);
 	}
 
 out:
@@ -2569,27 +2639,34 @@ err_out:
 	goto out;
 }
 
+bool __weak arch__prefers_symtab(void) { return false; }
+
 static int convert_to_probe_trace_events(struct perf_probe_event *pev,
-					  struct probe_trace_event **tevs,
-					  int max_tevs, const char *target)
+					 struct probe_trace_event **tevs)
 {
 	int ret;
 
 	if (pev->uprobes && !pev->group) {
 		/* Replace group name if not given */
-		ret = convert_exec_to_group(target, &pev->group);
+		ret = convert_exec_to_group(pev->target, &pev->group);
 		if (ret != 0) {
 			pr_warning("Failed to make a group name.\n");
 			return ret;
 		}
 	}
 
+	if (arch__prefers_symtab() && !perf_probe_event_need_dwarf(pev)) {
+		ret = find_probe_trace_events_from_map(pev, tevs);
+		if (ret > 0)
+			return ret; /* Found in symbol table */
+	}
+
 	/* Convert perf_probe_event with debuginfo */
-	ret = try_to_find_probe_trace_events(pev, tevs, max_tevs, target);
+	ret = try_to_find_probe_trace_events(pev, tevs);
 	if (ret != 0)
 		return ret;	/* Found in debuginfo or got an error */
 
-	return find_probe_trace_events_from_map(pev, tevs, max_tevs, target);
+	return find_probe_trace_events_from_map(pev, tevs);
 }
 
 struct __event_package {
@@ -2598,8 +2675,7 @@ struct __event_package {
 	int				ntevs;
 };
 
-int add_perf_probe_events(struct perf_probe_event *pevs, int npevs,
-			  int max_tevs, bool force_add)
+int add_perf_probe_events(struct perf_probe_event *pevs, int npevs)
 {
 	int i, j, ret;
 	struct __event_package *pkgs;
@@ -2621,9 +2697,7 @@ int add_perf_probe_events(struct perf_probe_event *pevs, int npevs,
 		pkgs[i].pev = &pevs[i];
 		/* Convert with or without debuginfo */
 		ret  = convert_to_probe_trace_events(pkgs[i].pev,
-						     &pkgs[i].tevs,
-						     max_tevs,
-						     pkgs[i].pev->target);
+						     &pkgs[i].tevs);
 		if (ret < 0)
 			goto end;
 		pkgs[i].ntevs = ret;
@@ -2632,7 +2706,8 @@ int add_perf_probe_events(struct perf_probe_event *pevs, int npevs,
 	/* Loop 2: add all events */
 	for (i = 0; i < npevs; i++) {
 		ret = __add_probe_trace_events(pkgs[i].pev, pkgs[i].tevs,
-						pkgs[i].ntevs, force_add);
+					       pkgs[i].ntevs,
+					       probe_conf.force_add);
 		if (ret < 0)
 			break;
 	}
@@ -2684,40 +2759,39 @@ error:
 	return ret;
 }
 
-static int del_trace_probe_event(int fd, const char *buf,
-						  struct strlist *namelist)
+static int del_trace_probe_events(int fd, struct strfilter *filter,
+				  struct strlist *namelist)
 {
-	struct str_node *ent, *n;
-	int ret = -1;
+	struct str_node *ent;
+	const char *p;
+	int ret = -ENOENT;
 
-	if (strpbrk(buf, "*?")) { /* Glob-exp */
-		strlist__for_each_safe(ent, n, namelist)
-			if (strglobmatch(ent->s, buf)) {
-				ret = __del_trace_probe_event(fd, ent);
-				if (ret < 0)
-					break;
-				strlist__remove(namelist, ent);
-			}
-	} else {
-		ent = strlist__find(namelist, buf);
-		if (ent) {
+	if (!namelist)
+		return -ENOENT;
+
+	strlist__for_each(ent, namelist) {
+		p = strchr(ent->s, ':');
+		if ((p && strfilter__compare(filter, p + 1)) ||
+		    strfilter__compare(filter, ent->s)) {
 			ret = __del_trace_probe_event(fd, ent);
-			if (ret >= 0)
-				strlist__remove(namelist, ent);
+			if (ret < 0)
+				break;
 		}
 	}
 
 	return ret;
 }
 
-int del_perf_probe_events(struct strlist *dellist)
+int del_perf_probe_events(struct strfilter *filter)
 {
-	int ret = -1, ufd = -1, kfd = -1;
-	char buf[128];
-	const char *group, *event;
-	char *p, *str;
-	struct str_node *ent;
+	int ret, ret2, ufd = -1, kfd = -1;
 	struct strlist *namelist = NULL, *unamelist = NULL;
+	char *str = strfilter__string(filter);
+
+	if (!str)
+		return -EINVAL;
+
+	pr_debug("Delete filter: \'%s\'\n", str);
 
 	/* Get current event names */
 	kfd = open_kprobe_events(true);
@@ -2730,49 +2804,23 @@ int del_perf_probe_events(struct strlist *dellist)
 
 	if (kfd < 0 && ufd < 0) {
 		print_both_open_warning(kfd, ufd);
+		ret = kfd;
 		goto error;
 	}
 
-	if (namelist == NULL && unamelist == NULL)
+	ret = del_trace_probe_events(kfd, filter, namelist);
+	if (ret < 0 && ret != -ENOENT)
 		goto error;
 
-	strlist__for_each(ent, dellist) {
-		str = strdup(ent->s);
-		if (str == NULL) {
-			ret = -ENOMEM;
-			goto error;
-		}
-		pr_debug("Parsing: %s\n", str);
-		p = strchr(str, ':');
-		if (p) {
-			group = str;
-			*p = '\0';
-			event = p + 1;
-		} else {
-			group = "*";
-			event = str;
-		}
-
-		ret = e_snprintf(buf, 128, "%s:%s", group, event);
-		if (ret < 0) {
-			pr_err("Failed to copy event.");
-			free(str);
-			goto error;
-		}
-
-		pr_debug("Group: %s, Event: %s\n", group, event);
-
-		if (namelist)
-			ret = del_trace_probe_event(kfd, buf, namelist);
-
-		if (unamelist && ret != 0)
-			ret = del_trace_probe_event(ufd, buf, unamelist);
-
-		if (ret != 0)
-			pr_info("Info: Event \"%s\" does not exist.\n", buf);
-
-		free(str);
+	ret2 = del_trace_probe_events(ufd, filter, unamelist);
+	if (ret2 < 0 && ret2 != -ENOENT) {
+		ret = ret2;
+		goto error;
 	}
+	if (ret == -ENOENT && ret2 == -ENOENT)
+		pr_debug("\"%s\" does not hit any event.\n", str);
+		/* Note that this is silently ignored */
+	ret = 0;
 
 error:
 	if (kfd >= 0) {
@@ -2784,6 +2832,7 @@ error:
 		strlist__delete(unamelist);
 		close(ufd);
 	}
+	free(str);
 
 	return ret;
 }
@@ -2837,8 +2886,7 @@ int show_available_funcs(const char *target, struct strfilter *_filter,
 	dso__fprintf_symbols_by_name(map->dso, map->type, stdout);
 end:
 	if (user) {
-		dso__delete(map->dso);
-		map__delete(map);
+		map__put(map);
 	}
 	exit_symbol_maps();
 
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index d6b783447be95d..31db6ee7db5478 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -6,10 +6,20 @@
 #include "strlist.h"
 #include "strfilter.h"
 
+/* Probe related configurations */
+struct probe_conf {
+	bool	show_ext_vars;
+	bool	show_location_range;
+	bool	force_add;
+	bool	no_inlines;
+	int	max_probes;
+};
+extern struct probe_conf probe_conf;
 extern bool probe_event_dry_run;
 
 /* kprobe-tracer and uprobe-tracer tracing point */
 struct probe_trace_point {
+	char		*realname;	/* function real name (if needed) */
 	char		*symbol;	/* Base symbol */
 	char		*module;	/* Module name */
 	unsigned long	offset;		/* Offset from symbol */
@@ -121,20 +131,18 @@ extern void line_range__clear(struct line_range *lr);
 /* Initialize line range */
 extern int line_range__init(struct line_range *lr);
 
-/* Internal use: Return kernel/module path */
-extern const char *kernel_get_module_path(const char *module);
-
-extern int add_perf_probe_events(struct perf_probe_event *pevs, int npevs,
-				 int max_probe_points, bool force_add);
-extern int del_perf_probe_events(struct strlist *dellist);
-extern int show_perf_probe_events(void);
+extern int add_perf_probe_events(struct perf_probe_event *pevs, int npevs);
+extern int del_perf_probe_events(struct strfilter *filter);
+extern int show_perf_probe_events(struct strfilter *filter);
 extern int show_line_range(struct line_range *lr, const char *module,
 			   bool user);
 extern int show_available_vars(struct perf_probe_event *pevs, int npevs,
-			       int max_probe_points, const char *module,
-			       struct strfilter *filter, bool externs);
+			       struct strfilter *filter);
 extern int show_available_funcs(const char *module, struct strfilter *filter,
 				bool user);
+bool arch__prefers_symtab(void);
+void arch__fix_tev_from_maps(struct perf_probe_event *pev,
+			     struct probe_trace_event *tev, struct map *map);
 
 /* Maximum index number of event-name postfix */
 #define MAX_EVENT_INDEX	1024
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 2a76e14db73289..c50da392e2562a 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -177,7 +177,7 @@ static int convert_variable_location(Dwarf_Die *vr_die, Dwarf_Addr addr,
 	Dwarf_Word offs = 0;
 	bool ref = false;
 	const char *regs;
-	int ret;
+	int ret, ret2 = 0;
 
 	if (dwarf_attr(vr_die, DW_AT_external, &attr) != NULL)
 		goto static_var;
@@ -187,9 +187,19 @@ static int convert_variable_location(Dwarf_Die *vr_die, Dwarf_Addr addr,
 		return -EINVAL;	/* Broken DIE ? */
 	if (dwarf_getlocation_addr(&attr, addr, &op, &nops, 1) <= 0) {
 		ret = dwarf_entrypc(sp_die, &tmp);
-		if (ret || addr != tmp ||
-		    dwarf_tag(vr_die) != DW_TAG_formal_parameter ||
-		    dwarf_highpc(sp_die, &tmp))
+		if (ret)
+			return -ENOENT;
+
+		if (probe_conf.show_location_range &&
+			(dwarf_tag(vr_die) == DW_TAG_variable)) {
+			ret2 = -ERANGE;
+		} else if (addr != tmp ||
+			dwarf_tag(vr_die) != DW_TAG_formal_parameter) {
+			return -ENOENT;
+		}
+
+		ret = dwarf_highpc(sp_die, &tmp);
+		if (ret)
 			return -ENOENT;
 		/*
 		 * This is fuzzed by fentry mcount. We try to find the
@@ -210,7 +220,7 @@ found:
 	if (op->atom == DW_OP_addr) {
 static_var:
 		if (!tvar)
-			return 0;
+			return ret2;
 		/* Static variables on memory (not stack), make @varname */
 		ret = strlen(dwarf_diename(vr_die));
 		tvar->value = zalloc(ret + 2);
@@ -220,7 +230,7 @@ static_var:
 		tvar->ref = alloc_trace_arg_ref((long)offs);
 		if (tvar->ref == NULL)
 			return -ENOMEM;
-		return 0;
+		return ret2;
 	}
 
 	/* If this is based on frame buffer, set the offset */
@@ -250,14 +260,14 @@ static_var:
 	}
 
 	if (!tvar)
-		return 0;
+		return ret2;
 
 	regs = get_arch_regstr(regn);
 	if (!regs) {
 		/* This should be a bug in DWARF or this tool */
 		pr_warning("Mapping for the register number %u "
 			   "missing on this architecture.\n", regn);
-		return -ERANGE;
+		return -ENOTSUP;
 	}
 
 	tvar->value = strdup(regs);
@@ -269,7 +279,7 @@ static_var:
 		if (tvar->ref == NULL)
 			return -ENOMEM;
 	}
-	return 0;
+	return ret2;
 }
 
 #define BYTES_TO_BITS(nb)	((nb) * BITS_PER_LONG / sizeof(long))
@@ -517,10 +527,12 @@ static int convert_variable(Dwarf_Die *vr_die, struct probe_finder *pf)
 
 	ret = convert_variable_location(vr_die, pf->addr, pf->fb_ops,
 					&pf->sp_die, pf->tvar);
-	if (ret == -ENOENT || ret == -EINVAL)
-		pr_err("Failed to find the location of %s at this address.\n"
-		       " Perhaps, it has been optimized out.\n", pf->pvar->var);
-	else if (ret == -ENOTSUP)
+	if (ret == -ENOENT || ret == -EINVAL) {
+		pr_err("Failed to find the location of the '%s' variable at this address.\n"
+		       " Perhaps it has been optimized out.\n"
+		       " Use -V with the --range option to show '%s' location range.\n",
+		       pf->pvar->var, pf->pvar->var);
+	} else if (ret == -ENOTSUP)
 		pr_err("Sorry, we don't support this variable location yet.\n");
 	else if (ret == 0 && pf->pvar->field) {
 		ret = convert_variable_fields(vr_die, pf->pvar->var,
@@ -662,9 +674,15 @@ static int call_probe_finder(Dwarf_Die *sc_die, struct probe_finder *pf)
 	/* If not a real subprogram, find a real one */
 	if (!die_is_func_def(sc_die)) {
 		if (!die_find_realfunc(&pf->cu_die, pf->addr, &pf->sp_die)) {
-			pr_warning("Failed to find probe point in any "
-				   "functions.\n");
-			return -ENOENT;
+			if (die_find_tailfunc(&pf->cu_die, pf->addr, &pf->sp_die)) {
+				pr_warning("Ignoring tail call from %s\n",
+						dwarf_diename(&pf->sp_die));
+				return 0;
+			} else {
+				pr_warning("Failed to find probe point in any "
+					   "functions.\n");
+				return -ENOENT;
+			}
 		}
 	} else
 		memcpy(&pf->sp_die, sc_die, sizeof(Dwarf_Die));
@@ -719,7 +737,7 @@ static int find_best_scope_cb(Dwarf_Die *fn_die, void *data)
 	}
 	/* If the function name is given, that's what user expects */
 	if (fsp->function) {
-		if (die_compare_name(fn_die, fsp->function)) {
+		if (die_match_name(fn_die, fsp->function)) {
 			memcpy(fsp->die_mem, fn_die, sizeof(Dwarf_Die));
 			fsp->found = true;
 			return 1;
@@ -922,13 +940,14 @@ static int probe_point_search_cb(Dwarf_Die *sp_die, void *data)
 
 	/* Check tag and diename */
 	if (!die_is_func_def(sp_die) ||
-	    !die_compare_name(sp_die, pp->function))
+	    !die_match_name(sp_die, pp->function))
 		return DWARF_CB_OK;
 
 	/* Check declared file */
 	if (pp->file && strtailcmp(pp->file, dwarf_decl_file(sp_die)))
 		return DWARF_CB_OK;
 
+	pr_debug("Matched function: %s\n", dwarf_diename(sp_die));
 	pf->fname = dwarf_decl_file(sp_die);
 	if (pp->line) { /* Function relative line */
 		dwarf_decl_line(sp_die, &pf->lno);
@@ -945,10 +964,20 @@ static int probe_point_search_cb(Dwarf_Die *sp_die, void *data)
 			/* TODO: Check the address in this function */
 			param->retval = call_probe_finder(sp_die, pf);
 		}
-	} else
+	} else if (!probe_conf.no_inlines) {
 		/* Inlined function: search instances */
 		param->retval = die_walk_instances(sp_die,
 					probe_point_inline_cb, (void *)pf);
+		/* This could be a non-existed inline definition */
+		if (param->retval == -ENOENT && strisglob(pp->function))
+			param->retval = 0;
+	}
+
+	/* We need to find other candidates */
+	if (strisglob(pp->function) && param->retval >= 0) {
+		param->retval = 0;	/* We have to clear the result */
+		return DWARF_CB_OK;
+	}
 
 	return DWARF_CB_ABORT; /* Exit; no same symbol in this CU. */
 }
@@ -977,7 +1006,7 @@ static int pubname_search_cb(Dwarf *dbg, Dwarf_Global *gl, void *data)
 		if (dwarf_tag(param->sp_die) != DW_TAG_subprogram)
 			return DWARF_CB_OK;
 
-		if (die_compare_name(param->sp_die, param->function)) {
+		if (die_match_name(param->sp_die, param->function)) {
 			if (!dwarf_offdie(dbg, gl->cu_offset, param->cu_die))
 				return DWARF_CB_OK;
 
@@ -1030,7 +1059,7 @@ static int debuginfo__find_probes(struct debuginfo *dbg,
 		return -ENOMEM;
 
 	/* Fastpath: lookup by function name from .debug_pubnames section */
-	if (pp->function) {
+	if (pp->function && !strisglob(pp->function)) {
 		struct pubname_callback_param pubname_param = {
 			.function = pp->function,
 			.file	  = pp->file,
@@ -1089,6 +1118,7 @@ found:
 struct local_vars_finder {
 	struct probe_finder *pf;
 	struct perf_probe_arg *args;
+	bool vars;
 	int max_args;
 	int nargs;
 	int ret;
@@ -1103,7 +1133,7 @@ static int copy_variables_cb(Dwarf_Die *die_mem, void *data)
 
 	tag = dwarf_tag(die_mem);
 	if (tag == DW_TAG_formal_parameter ||
-	    tag == DW_TAG_variable) {
+	    (tag == DW_TAG_variable && vf->vars)) {
 		if (convert_variable_location(die_mem, vf->pf->addr,
 					      vf->pf->fb_ops, &pf->sp_die,
 					      NULL) == 0) {
@@ -1129,26 +1159,28 @@ static int expand_probe_args(Dwarf_Die *sc_die, struct probe_finder *pf,
 	Dwarf_Die die_mem;
 	int i;
 	int n = 0;
-	struct local_vars_finder vf = {.pf = pf, .args = args,
+	struct local_vars_finder vf = {.pf = pf, .args = args, .vars = false,
 				.max_args = MAX_PROBE_ARGS, .ret = 0};
 
 	for (i = 0; i < pf->pev->nargs; i++) {
 		/* var never be NULL */
-		if (strcmp(pf->pev->args[i].var, "$vars") == 0) {
-			pr_debug("Expanding $vars into:");
-			vf.nargs = n;
-			/* Special local variables */
-			die_find_child(sc_die, copy_variables_cb, (void *)&vf,
-				       &die_mem);
-			pr_debug(" (%d)\n", vf.nargs - n);
-			if (vf.ret < 0)
-				return vf.ret;
-			n = vf.nargs;
-		} else {
+		if (strcmp(pf->pev->args[i].var, PROBE_ARG_VARS) == 0)
+			vf.vars = true;
+		else if (strcmp(pf->pev->args[i].var, PROBE_ARG_PARAMS) != 0) {
 			/* Copy normal argument */
 			args[n] = pf->pev->args[i];
 			n++;
+			continue;
 		}
+		pr_debug("Expanding %s into:", pf->pev->args[i].var);
+		vf.nargs = n;
+		/* Special local variables */
+		die_find_child(sc_die, copy_variables_cb, (void *)&vf,
+			       &die_mem);
+		pr_debug(" (%d)\n", vf.nargs - n);
+		if (vf.ret < 0)
+			return vf.ret;
+		n = vf.nargs;
 	}
 	return n;
 }
@@ -1176,6 +1208,10 @@ static int add_probe_trace_event(Dwarf_Die *sc_die, struct probe_finder *pf)
 	if (ret < 0)
 		return ret;
 
+	tev->point.realname = strdup(dwarf_diename(sc_die));
+	if (!tev->point.realname)
+		return -ENOMEM;
+
 	pr_debug("Probe point found: %s+%lu\n", tev->point.symbol,
 		 tev->point.offset);
 
@@ -1213,15 +1249,15 @@ end:
 /* Find probe_trace_events specified by perf_probe_event from debuginfo */
 int debuginfo__find_trace_events(struct debuginfo *dbg,
 				 struct perf_probe_event *pev,
-				 struct probe_trace_event **tevs, int max_tevs)
+				 struct probe_trace_event **tevs)
 {
 	struct trace_event_finder tf = {
 			.pf = {.pev = pev, .callback = add_probe_trace_event},
-			.mod = dbg->mod, .max_tevs = max_tevs};
+			.max_tevs = probe_conf.max_probes, .mod = dbg->mod};
 	int ret;
 
 	/* Allocate result tevs array */
-	*tevs = zalloc(sizeof(struct probe_trace_event) * max_tevs);
+	*tevs = zalloc(sizeof(struct probe_trace_event) * tf.max_tevs);
 	if (*tevs == NULL)
 		return -ENOMEM;
 
@@ -1237,14 +1273,11 @@ int debuginfo__find_trace_events(struct debuginfo *dbg,
 	return (ret < 0) ? ret : tf.ntevs;
 }
 
-#define MAX_VAR_LEN 64
-
 /* Collect available variables in this scope */
 static int collect_variables_cb(Dwarf_Die *die_mem, void *data)
 {
 	struct available_var_finder *af = data;
 	struct variable_list *vl;
-	char buf[MAX_VAR_LEN];
 	int tag, ret;
 
 	vl = &af->vls[af->nvls - 1];
@@ -1255,11 +1288,38 @@ static int collect_variables_cb(Dwarf_Die *die_mem, void *data)
 		ret = convert_variable_location(die_mem, af->pf.addr,
 						af->pf.fb_ops, &af->pf.sp_die,
 						NULL);
-		if (ret == 0) {
-			ret = die_get_varname(die_mem, buf, MAX_VAR_LEN);
-			pr_debug2("Add new var: %s\n", buf);
-			if (ret > 0)
-				strlist__add(vl->vars, buf);
+		if (ret == 0 || ret == -ERANGE) {
+			int ret2;
+			bool externs = !af->child;
+			struct strbuf buf;
+
+			strbuf_init(&buf, 64);
+
+			if (probe_conf.show_location_range) {
+				if (!externs) {
+					if (ret)
+						strbuf_addf(&buf, "[INV]\t");
+					else
+						strbuf_addf(&buf, "[VAL]\t");
+				} else
+					strbuf_addf(&buf, "[EXT]\t");
+			}
+
+			ret2 = die_get_varname(die_mem, &buf);
+
+			if (!ret2 && probe_conf.show_location_range &&
+				!externs) {
+				strbuf_addf(&buf, "\t");
+				ret2 = die_get_var_range(&af->pf.sp_die,
+							die_mem, &buf);
+			}
+
+			pr_debug("Add new var: %s\n", buf.buf);
+			if (ret2 == 0) {
+				strlist__add(vl->vars,
+					strbuf_detach(&buf, NULL));
+			}
+			strbuf_release(&buf);
 		}
 	}
 
@@ -1302,9 +1362,9 @@ static int add_available_vars(Dwarf_Die *sc_die, struct probe_finder *pf)
 	die_find_child(sc_die, collect_variables_cb, (void *)af, &die_mem);
 
 	/* Find external variables */
-	if (!af->externs)
+	if (!probe_conf.show_ext_vars)
 		goto out;
-	/* Don't need to search child DIE for externs. */
+	/* Don't need to search child DIE for external vars. */
 	af->child = false;
 	die_find_child(&pf->cu_die, collect_variables_cb, (void *)af, &die_mem);
 
@@ -1324,17 +1384,16 @@ out:
  */
 int debuginfo__find_available_vars_at(struct debuginfo *dbg,
 				      struct perf_probe_event *pev,
-				      struct variable_list **vls,
-				      int max_vls, bool externs)
+				      struct variable_list **vls)
 {
 	struct available_var_finder af = {
 			.pf = {.pev = pev, .callback = add_available_vars},
 			.mod = dbg->mod,
-			.max_vls = max_vls, .externs = externs};
+			.max_vls = probe_conf.max_probes};
 	int ret;
 
 	/* Allocate result vls array */
-	*vls = zalloc(sizeof(struct variable_list) * max_vls);
+	*vls = zalloc(sizeof(struct variable_list) * af.max_vls);
 	if (*vls == NULL)
 		return -ENOMEM;
 
@@ -1535,7 +1594,7 @@ static int line_range_search_cb(Dwarf_Die *sp_die, void *data)
 		return DWARF_CB_OK;
 
 	if (die_is_func_def(sp_die) &&
-	    die_compare_name(sp_die, lr->function)) {
+	    die_match_name(sp_die, lr->function)) {
 		lf->fname = dwarf_decl_file(sp_die);
 		dwarf_decl_line(sp_die, &lr->offset);
 		pr_debug("fname: %s, lineno:%d\n", lf->fname, lr->offset);
diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h
index ebf8c8c814531f..bed82716e1b449 100644
--- a/tools/perf/util/probe-finder.h
+++ b/tools/perf/util/probe-finder.h
@@ -10,6 +10,9 @@
 #define MAX_PROBES		 128
 #define MAX_PROBE_ARGS		 128
 
+#define PROBE_ARG_VARS		"$vars"
+#define PROBE_ARG_PARAMS	"$params"
+
 static inline int is_c_varname(const char *name)
 {
 	/* TODO */
@@ -37,8 +40,7 @@ extern void debuginfo__delete(struct debuginfo *dbg);
 /* Find probe_trace_events specified by perf_probe_event from debuginfo */
 extern int debuginfo__find_trace_events(struct debuginfo *dbg,
 					struct perf_probe_event *pev,
-					struct probe_trace_event **tevs,
-					int max_tevs);
+					struct probe_trace_event **tevs);
 
 /* Find a perf_probe_point from debuginfo */
 extern int debuginfo__find_probe_point(struct debuginfo *dbg,
@@ -52,8 +54,7 @@ extern int debuginfo__find_line_range(struct debuginfo *dbg,
 /* Find available variables */
 extern int debuginfo__find_available_vars_at(struct debuginfo *dbg,
 					     struct perf_probe_event *pev,
-					     struct variable_list **vls,
-					     int max_points, bool externs);
+					     struct variable_list **vls);
 
 /* Find a src file from a DWARF tag path */
 int get_real_path(const char *raw_path, const char *comp_dir,
@@ -96,7 +97,6 @@ struct available_var_finder {
 	struct variable_list	*vls;		/* Found variable lists */
 	int			nvls;		/* Number of variable lists */
 	int			max_vls;	/* Max no. of variable lists */
-	bool			externs;	/* Find external vars too */
 	bool			child;		/* Search child scopes */
 };
 
diff --git a/tools/perf/util/pstack.c b/tools/perf/util/pstack.c
index a126e6cc6e73ad..b234a6e3d0d4f3 100644
--- a/tools/perf/util/pstack.c
+++ b/tools/perf/util/pstack.c
@@ -74,3 +74,10 @@ void *pstack__pop(struct pstack *pstack)
 	pstack->entries[pstack->top] = NULL;
 	return ret;
 }
+
+void *pstack__peek(struct pstack *pstack)
+{
+	if (pstack->top == 0)
+		return NULL;
+	return pstack->entries[pstack->top - 1];
+}
diff --git a/tools/perf/util/pstack.h b/tools/perf/util/pstack.h
index c3cb6584d52763..ded7f2e36624a9 100644
--- a/tools/perf/util/pstack.h
+++ b/tools/perf/util/pstack.h
@@ -10,5 +10,6 @@ bool pstack__empty(const struct pstack *pstack);
 void pstack__remove(struct pstack *pstack, void *key);
 void pstack__push(struct pstack *pstack, void *key);
 void *pstack__pop(struct pstack *pstack);
+void *pstack__peek(struct pstack *pstack);
 
 #endif /* _PERF_PSTACK_ */
diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c
index 8acd0df88b5c4b..d457c523a33d8b 100644
--- a/tools/perf/util/record.c
+++ b/tools/perf/util/record.c
@@ -20,7 +20,7 @@ static int perf_do_probe_api(setup_probe_fn_t fn, int cpu, const char *str)
 	if (!evlist)
 		return -ENOMEM;
 
-	if (parse_events(evlist, str))
+	if (parse_events(evlist, str, NULL))
 		goto out_delete;
 
 	evsel = perf_evlist__first(evlist);
@@ -119,7 +119,16 @@ void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts)
 			evsel->attr.comm_exec = 1;
 	}
 
-	if (evlist->nr_entries > 1) {
+	if (opts->full_auxtrace) {
+		/*
+		 * Need to be able to synthesize and parse selected events with
+		 * arbitrary sample types, which requires always being able to
+		 * match the id.
+		 */
+		use_sample_identifier = perf_can_sample_identifier();
+		evlist__for_each(evlist, evsel)
+			perf_evsel__set_sample_id(evsel, use_sample_identifier);
+	} else if (evlist->nr_entries > 1) {
 		struct perf_evsel *first = perf_evlist__first(evlist);
 
 		evlist__for_each(evlist, evsel) {
@@ -207,7 +216,7 @@ bool perf_evlist__can_select_event(struct perf_evlist *evlist, const char *str)
 	if (!temp_evlist)
 		return false;
 
-	err = parse_events(temp_evlist, str);
+	err = parse_events(temp_evlist, str, NULL);
 	if (err)
 		goto out_delete;
 
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 0c74012575ac92..39fe09d5a87efb 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -15,12 +15,13 @@
 #include "cpumap.h"
 #include "perf_regs.h"
 #include "asm/bug.h"
+#include "auxtrace.h"
 
-static int machines__deliver_event(struct machines *machines,
-				   struct perf_evlist *evlist,
-				   union perf_event *event,
-				   struct perf_sample *sample,
-				   struct perf_tool *tool, u64 file_offset);
+static int perf_session__deliver_event(struct perf_session *session,
+				       union perf_event *event,
+				       struct perf_sample *sample,
+				       struct perf_tool *tool,
+				       u64 file_offset);
 
 static int perf_session__open(struct perf_session *session)
 {
@@ -105,8 +106,8 @@ static int ordered_events__deliver_event(struct ordered_events *oe,
 		return ret;
 	}
 
-	return machines__deliver_event(&session->machines, session->evlist, event->event,
-				       &sample, session->tool, event->file_offset);
+	return perf_session__deliver_event(session, event->event, &sample,
+					   session->tool, event->file_offset);
 }
 
 struct perf_session *perf_session__new(struct perf_data_file *file,
@@ -119,6 +120,7 @@ struct perf_session *perf_session__new(struct perf_data_file *file,
 
 	session->repipe = repipe;
 	session->tool   = tool;
+	INIT_LIST_HEAD(&session->auxtrace_index);
 	machines__init(&session->machines);
 	ordered_events__init(&session->ordered_events, ordered_events__deliver_event);
 
@@ -185,6 +187,8 @@ static void perf_session_env__delete(struct perf_session_env *env)
 
 void perf_session__delete(struct perf_session *session)
 {
+	auxtrace__free(session);
+	auxtrace_index__free(&session->auxtrace_index);
 	perf_session__destroy_kernel_maps(session);
 	perf_session__delete_threads(session);
 	perf_session_env__delete(&session->header.env);
@@ -262,6 +266,49 @@ static int process_id_index_stub(struct perf_tool *tool __maybe_unused,
 	return 0;
 }
 
+static int process_event_auxtrace_info_stub(struct perf_tool *tool __maybe_unused,
+				union perf_event *event __maybe_unused,
+				struct perf_session *session __maybe_unused)
+{
+	dump_printf(": unhandled!\n");
+	return 0;
+}
+
+static int skipn(int fd, off_t n)
+{
+	char buf[4096];
+	ssize_t ret;
+
+	while (n > 0) {
+		ret = read(fd, buf, min(n, (off_t)sizeof(buf)));
+		if (ret <= 0)
+			return ret;
+		n -= ret;
+	}
+
+	return 0;
+}
+
+static s64 process_event_auxtrace_stub(struct perf_tool *tool __maybe_unused,
+				       union perf_event *event,
+				       struct perf_session *session
+				       __maybe_unused)
+{
+	dump_printf(": unhandled!\n");
+	if (perf_data_file__is_pipe(session->file))
+		skipn(perf_data_file__fd(session->file), event->auxtrace.size);
+	return event->auxtrace.size;
+}
+
+static
+int process_event_auxtrace_error_stub(struct perf_tool *tool __maybe_unused,
+				      union perf_event *event __maybe_unused,
+				      struct perf_session *session __maybe_unused)
+{
+	dump_printf(": unhandled!\n");
+	return 0;
+}
+
 void perf_tool__fill_defaults(struct perf_tool *tool)
 {
 	if (tool->sample == NULL)
@@ -278,6 +325,10 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
 		tool->exit = process_event_stub;
 	if (tool->lost == NULL)
 		tool->lost = perf_event__process_lost;
+	if (tool->aux == NULL)
+		tool->aux = perf_event__process_aux;
+	if (tool->itrace_start == NULL)
+		tool->itrace_start = perf_event__process_itrace_start;
 	if (tool->read == NULL)
 		tool->read = process_event_sample_stub;
 	if (tool->throttle == NULL)
@@ -298,6 +349,12 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
 	}
 	if (tool->id_index == NULL)
 		tool->id_index = process_id_index_stub;
+	if (tool->auxtrace_info == NULL)
+		tool->auxtrace_info = process_event_auxtrace_info_stub;
+	if (tool->auxtrace == NULL)
+		tool->auxtrace = process_event_auxtrace_stub;
+	if (tool->auxtrace_error == NULL)
+		tool->auxtrace_error = process_event_auxtrace_error_stub;
 }
 
 static void swap_sample_id_all(union perf_event *event, void *data)
@@ -390,6 +447,26 @@ static void perf_event__read_swap(union perf_event *event, bool sample_id_all)
 		swap_sample_id_all(event, &event->read + 1);
 }
 
+static void perf_event__aux_swap(union perf_event *event, bool sample_id_all)
+{
+	event->aux.aux_offset = bswap_64(event->aux.aux_offset);
+	event->aux.aux_size   = bswap_64(event->aux.aux_size);
+	event->aux.flags      = bswap_64(event->aux.flags);
+
+	if (sample_id_all)
+		swap_sample_id_all(event, &event->aux + 1);
+}
+
+static void perf_event__itrace_start_swap(union perf_event *event,
+					  bool sample_id_all)
+{
+	event->itrace_start.pid	 = bswap_32(event->itrace_start.pid);
+	event->itrace_start.tid	 = bswap_32(event->itrace_start.tid);
+
+	if (sample_id_all)
+		swap_sample_id_all(event, &event->itrace_start + 1);
+}
+
 static void perf_event__throttle_swap(union perf_event *event,
 				      bool sample_id_all)
 {
@@ -449,6 +526,7 @@ void perf_event__attr_swap(struct perf_event_attr *attr)
 	attr->branch_sample_type = bswap_64(attr->branch_sample_type);
 	attr->sample_regs_user	 = bswap_64(attr->sample_regs_user);
 	attr->sample_stack_user  = bswap_32(attr->sample_stack_user);
+	attr->aux_watermark	 = bswap_32(attr->aux_watermark);
 
 	swap_bitfield((u8 *) (&attr->read_format + 1), sizeof(u64));
 }
@@ -478,6 +556,40 @@ static void perf_event__tracing_data_swap(union perf_event *event,
 	event->tracing_data.size = bswap_32(event->tracing_data.size);
 }
 
+static void perf_event__auxtrace_info_swap(union perf_event *event,
+					   bool sample_id_all __maybe_unused)
+{
+	size_t size;
+
+	event->auxtrace_info.type = bswap_32(event->auxtrace_info.type);
+
+	size = event->header.size;
+	size -= (void *)&event->auxtrace_info.priv - (void *)event;
+	mem_bswap_64(event->auxtrace_info.priv, size);
+}
+
+static void perf_event__auxtrace_swap(union perf_event *event,
+				      bool sample_id_all __maybe_unused)
+{
+	event->auxtrace.size      = bswap_64(event->auxtrace.size);
+	event->auxtrace.offset    = bswap_64(event->auxtrace.offset);
+	event->auxtrace.reference = bswap_64(event->auxtrace.reference);
+	event->auxtrace.idx       = bswap_32(event->auxtrace.idx);
+	event->auxtrace.tid       = bswap_32(event->auxtrace.tid);
+	event->auxtrace.cpu       = bswap_32(event->auxtrace.cpu);
+}
+
+static void perf_event__auxtrace_error_swap(union perf_event *event,
+					    bool sample_id_all __maybe_unused)
+{
+	event->auxtrace_error.type = bswap_32(event->auxtrace_error.type);
+	event->auxtrace_error.code = bswap_32(event->auxtrace_error.code);
+	event->auxtrace_error.cpu  = bswap_32(event->auxtrace_error.cpu);
+	event->auxtrace_error.pid  = bswap_32(event->auxtrace_error.pid);
+	event->auxtrace_error.tid  = bswap_32(event->auxtrace_error.tid);
+	event->auxtrace_error.ip   = bswap_64(event->auxtrace_error.ip);
+}
+
 typedef void (*perf_event__swap_op)(union perf_event *event,
 				    bool sample_id_all);
 
@@ -492,11 +604,16 @@ static perf_event__swap_op perf_event__swap_ops[] = {
 	[PERF_RECORD_THROTTLE]		  = perf_event__throttle_swap,
 	[PERF_RECORD_UNTHROTTLE]	  = perf_event__throttle_swap,
 	[PERF_RECORD_SAMPLE]		  = perf_event__all64_swap,
+	[PERF_RECORD_AUX]		  = perf_event__aux_swap,
+	[PERF_RECORD_ITRACE_START]	  = perf_event__itrace_start_swap,
 	[PERF_RECORD_HEADER_ATTR]	  = perf_event__hdr_attr_swap,
 	[PERF_RECORD_HEADER_EVENT_TYPE]	  = perf_event__event_type_swap,
 	[PERF_RECORD_HEADER_TRACING_DATA] = perf_event__tracing_data_swap,
 	[PERF_RECORD_HEADER_BUILD_ID]	  = NULL,
 	[PERF_RECORD_ID_INDEX]		  = perf_event__all64_swap,
+	[PERF_RECORD_AUXTRACE_INFO]	  = perf_event__auxtrace_info_swap,
+	[PERF_RECORD_AUXTRACE]		  = perf_event__auxtrace_swap,
+	[PERF_RECORD_AUXTRACE_ERROR]	  = perf_event__auxtrace_error_swap,
 	[PERF_RECORD_HEADER_MAX]	  = NULL,
 };
 
@@ -938,12 +1055,34 @@ static int machines__deliver_event(struct machines *machines,
 		return tool->throttle(tool, event, sample, machine);
 	case PERF_RECORD_UNTHROTTLE:
 		return tool->unthrottle(tool, event, sample, machine);
+	case PERF_RECORD_AUX:
+		return tool->aux(tool, event, sample, machine);
+	case PERF_RECORD_ITRACE_START:
+		return tool->itrace_start(tool, event, sample, machine);
 	default:
 		++evlist->stats.nr_unknown_events;
 		return -1;
 	}
 }
 
+static int perf_session__deliver_event(struct perf_session *session,
+				       union perf_event *event,
+				       struct perf_sample *sample,
+				       struct perf_tool *tool,
+				       u64 file_offset)
+{
+	int ret;
+
+	ret = auxtrace__process_event(session, event, sample, tool);
+	if (ret < 0)
+		return ret;
+	if (ret > 0)
+		return 0;
+
+	return machines__deliver_event(&session->machines, session->evlist,
+				       event, sample, tool, file_offset);
+}
+
 static s64 perf_session__process_user_event(struct perf_session *session,
 					    union perf_event *event,
 					    u64 file_offset)
@@ -980,6 +1119,15 @@ static s64 perf_session__process_user_event(struct perf_session *session,
 		return tool->finished_round(tool, event, oe);
 	case PERF_RECORD_ID_INDEX:
 		return tool->id_index(tool, event, session);
+	case PERF_RECORD_AUXTRACE_INFO:
+		return tool->auxtrace_info(tool, event, session);
+	case PERF_RECORD_AUXTRACE:
+		/* setup for reading amidst mmap */
+		lseek(fd, file_offset + event->header.size, SEEK_SET);
+		return tool->auxtrace(tool, event, session);
+	case PERF_RECORD_AUXTRACE_ERROR:
+		perf_session__auxtrace_error_inc(session, event);
+		return tool->auxtrace_error(tool, event, session);
 	default:
 		return -EINVAL;
 	}
@@ -1034,7 +1182,7 @@ int perf_session__peek_event(struct perf_session *session, off_t file_offset,
 		return -1;
 
 	if (lseek(fd, file_offset, SEEK_SET) == (off_t)-1 ||
-	    readn(fd, &buf, hdr_sz) != (ssize_t)hdr_sz)
+	    readn(fd, buf, hdr_sz) != (ssize_t)hdr_sz)
 		return -1;
 
 	event = (union perf_event *)buf;
@@ -1042,12 +1190,12 @@ int perf_session__peek_event(struct perf_session *session, off_t file_offset,
 	if (session->header.needs_swap)
 		perf_event_header__bswap(&event->header);
 
-	if (event->header.size < hdr_sz)
+	if (event->header.size < hdr_sz || event->header.size > buf_sz)
 		return -1;
 
 	rest = event->header.size - hdr_sz;
 
-	if (readn(fd, &buf, rest) != (ssize_t)rest)
+	if (readn(fd, buf, rest) != (ssize_t)rest)
 		return -1;
 
 	if (session->header.needs_swap)
@@ -1096,8 +1244,8 @@ static s64 perf_session__process_event(struct perf_session *session,
 			return ret;
 	}
 
-	return machines__deliver_event(&session->machines, evlist, event,
-				       &sample, tool, file_offset);
+	return perf_session__deliver_event(session, event, &sample, tool,
+					   file_offset);
 }
 
 void perf_event_header__bswap(struct perf_event_header *hdr)
@@ -1168,6 +1316,8 @@ static void perf_session__warn_about_errors(const struct perf_session *session)
 
 	if (oe->nr_unordered_events != 0)
 		ui__warning("%u out of order events recorded.\n", oe->nr_unordered_events);
+
+	events_stats__auxtrace_error_warn(stats);
 }
 
 volatile int session_done;
@@ -1256,10 +1406,14 @@ more:
 done:
 	/* do the final flush for ordered samples */
 	err = ordered_events__flush(oe, OE_FLUSH__FINAL);
+	if (err)
+		goto out_err;
+	err = auxtrace__flush_events(session, tool);
 out_err:
 	free(buf);
 	perf_session__warn_about_errors(session);
 	ordered_events__free(&session->ordered_events);
+	auxtrace__free_events(session);
 	return err;
 }
 
@@ -1402,10 +1556,14 @@ more:
 out:
 	/* do the final flush for ordered samples */
 	err = ordered_events__flush(oe, OE_FLUSH__FINAL);
+	if (err)
+		goto out_err;
+	err = auxtrace__flush_events(session, tool);
 out_err:
 	ui_progress__finish();
 	perf_session__warn_about_errors(session);
 	ordered_events__free(&session->ordered_events);
+	auxtrace__free_events(session);
 	session->one_mmap = false;
 	return err;
 }
@@ -1488,7 +1646,13 @@ size_t perf_session__fprintf_dsos_buildid(struct perf_session *session, FILE *fp
 
 size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp)
 {
-	size_t ret = fprintf(fp, "Aggregated stats:\n");
+	size_t ret;
+	const char *msg = "";
+
+	if (perf_header__has_feat(&session->header, HEADER_AUXTRACE))
+		msg = " (excludes AUX area (e.g. instruction trace) decoded / synthesized events)";
+
+	ret = fprintf(fp, "Aggregated stats:%s\n", msg);
 
 	ret += events_stats__fprintf(&session->evlist->stats, fp);
 	return ret;
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index d5fa7b7916ef40..b44afc75d1cc51 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -15,10 +15,16 @@
 struct ip_callchain;
 struct thread;
 
+struct auxtrace;
+struct itrace_synth_opts;
+
 struct perf_session {
 	struct perf_header	header;
 	struct machines		machines;
 	struct perf_evlist	*evlist;
+	struct auxtrace		*auxtrace;
+	struct itrace_synth_opts *itrace_synth_opts;
+	struct list_head	auxtrace_index;
 	struct trace_event	tevent;
 	bool			repipe;
 	bool			one_mmap;
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 4593f36ecc4c65..09d4696fd9a115 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -89,14 +89,14 @@ static int64_t
 sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
 {
 	/* Compare the addr that should be unique among comm */
-	return comm__str(right->comm) - comm__str(left->comm);
+	return strcmp(comm__str(right->comm), comm__str(left->comm));
 }
 
 static int64_t
 sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
 {
 	/* Compare the addr that should be unique among comm */
-	return comm__str(right->comm) - comm__str(left->comm);
+	return strcmp(comm__str(right->comm), comm__str(left->comm));
 }
 
 static int64_t
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 846036a921dc91..e97cd476d336f2 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -58,15 +58,16 @@ struct he_stat {
 
 struct hist_entry_diff {
 	bool	computed;
+	union {
+		/* PERF_HPP__DELTA */
+		double	period_ratio_delta;
 
-	/* PERF_HPP__DELTA */
-	double	period_ratio_delta;
-
-	/* PERF_HPP__RATIO */
-	double	period_ratio;
+		/* PERF_HPP__RATIO */
+		double	period_ratio;
 
-	/* HISTC_WEIGHTED_DIFF */
-	s64	wdiff;
+		/* HISTC_WEIGHTED_DIFF */
+		s64	wdiff;
+	};
 };
 
 /**
@@ -92,21 +93,28 @@ struct hist_entry {
 	s32			cpu;
 	u8			cpumode;
 
-	struct hist_entry_diff	diff;
-
 	/* We are added by hists__add_dummy_entry. */
 	bool			dummy;
 
-	/* XXX These two should move to some tree widget lib */
-	u16			row_offset;
-	u16			nr_rows;
-
-	bool			init_have_children;
 	char			level;
 	u8			filtered;
+	union {
+		/*
+		 * Since perf diff only supports the stdio output, TUI
+		 * fields are only accessed from perf report (or perf
+		 * top).  So make it an union to reduce memory usage.
+		 */
+		struct hist_entry_diff	diff;
+		struct /* for TUI */ {
+			u16	row_offset;
+			u16	nr_rows;
+			bool	init_have_children;
+			bool	unfolded;
+			bool	has_children;
+		};
+	};
 	char			*srcline;
 	struct symbol		*parent;
-	unsigned long		position;
 	struct rb_root		sorted_chain;
 	struct branch_info	*branch_info;
 	struct hists		*hists;
diff --git a/tools/perf/util/strfilter.c b/tools/perf/util/strfilter.c
index 79a757a2a15c22..bcae659b65462c 100644
--- a/tools/perf/util/strfilter.c
+++ b/tools/perf/util/strfilter.c
@@ -170,6 +170,46 @@ struct strfilter *strfilter__new(const char *rules, const char **err)
 	return filter;
 }
 
+static int strfilter__append(struct strfilter *filter, bool _or,
+			     const char *rules, const char **err)
+{
+	struct strfilter_node *right, *root;
+	const char *ep = NULL;
+
+	if (!filter || !rules)
+		return -EINVAL;
+
+	right = strfilter_node__new(rules, &ep);
+	if (!right || *ep != '\0') {
+		if (err)
+			*err = ep;
+		goto error;
+	}
+	root = strfilter_node__alloc(_or ? OP_or : OP_and, filter->root, right);
+	if (!root) {
+		ep = NULL;
+		goto error;
+	}
+
+	filter->root = root;
+	return 0;
+
+error:
+	strfilter_node__delete(right);
+	return ep ? -EINVAL : -ENOMEM;
+}
+
+int strfilter__or(struct strfilter *filter, const char *rules, const char **err)
+{
+	return strfilter__append(filter, true, rules, err);
+}
+
+int strfilter__and(struct strfilter *filter, const char *rules,
+		   const char **err)
+{
+	return strfilter__append(filter, false, rules, err);
+}
+
 static bool strfilter_node__compare(struct strfilter_node *node,
 				    const char *str)
 {
@@ -197,3 +237,70 @@ bool strfilter__compare(struct strfilter *filter, const char *str)
 		return false;
 	return strfilter_node__compare(filter->root, str);
 }
+
+static int strfilter_node__sprint(struct strfilter_node *node, char *buf);
+
+/* sprint node in parenthesis if needed */
+static int strfilter_node__sprint_pt(struct strfilter_node *node, char *buf)
+{
+	int len;
+	int pt = node->r ? 2 : 0;	/* don't need to check node->l */
+
+	if (buf && pt)
+		*buf++ = '(';
+	len = strfilter_node__sprint(node, buf);
+	if (len < 0)
+		return len;
+	if (buf && pt)
+		*(buf + len) = ')';
+	return len + pt;
+}
+
+static int strfilter_node__sprint(struct strfilter_node *node, char *buf)
+{
+	int len = 0, rlen;
+
+	if (!node || !node->p)
+		return -EINVAL;
+
+	switch (*node->p) {
+	case '|':
+	case '&':
+		len = strfilter_node__sprint_pt(node->l, buf);
+		if (len < 0)
+			return len;
+	case '!':
+		if (buf) {
+			*(buf + len++) = *node->p;
+			buf += len;
+		} else
+			len++;
+		rlen = strfilter_node__sprint_pt(node->r, buf);
+		if (rlen < 0)
+			return rlen;
+		len += rlen;
+		break;
+	default:
+		len = strlen(node->p);
+		if (buf)
+			strcpy(buf, node->p);
+	}
+
+	return len;
+}
+
+char *strfilter__string(struct strfilter *filter)
+{
+	int len;
+	char *ret = NULL;
+
+	len = strfilter_node__sprint(filter->root, NULL);
+	if (len < 0)
+		return NULL;
+
+	ret = malloc(len + 1);
+	if (ret)
+		strfilter_node__sprint(filter->root, ret);
+
+	return ret;
+}
diff --git a/tools/perf/util/strfilter.h b/tools/perf/util/strfilter.h
index fe611f3c9e3965..cff5eda88728b2 100644
--- a/tools/perf/util/strfilter.h
+++ b/tools/perf/util/strfilter.h
@@ -29,6 +29,32 @@ struct strfilter {
 struct strfilter *strfilter__new(const char *rules, const char **err);
 
 /**
+ * strfilter__or - Append an additional rule by logical-or
+ * @filter: Original string filter
+ * @rules: Filter rule to be appended at left of the root of
+ *         @filter by using logical-or.
+ * @err: Pointer which points an error detected on @rules
+ *
+ * Parse @rules and join it to the @filter by using logical-or.
+ * Return 0 if success, or return the error code.
+ */
+int strfilter__or(struct strfilter *filter,
+		  const char *rules, const char **err);
+
+/**
+ * strfilter__add - Append an additional rule by logical-and
+ * @filter: Original string filter
+ * @rules: Filter rule to be appended at left of the root of
+ *         @filter by using logical-and.
+ * @err: Pointer which points an error detected on @rules
+ *
+ * Parse @rules and join it to the @filter by using logical-and.
+ * Return 0 if success, or return the error code.
+ */
+int strfilter__and(struct strfilter *filter,
+		   const char *rules, const char **err);
+
+/**
  * strfilter__compare - compare given string and a string filter
  * @filter: String filter
  * @str: target string
@@ -45,4 +71,13 @@ bool strfilter__compare(struct strfilter *filter, const char *str);
  */
 void strfilter__delete(struct strfilter *filter);
 
+/**
+ * strfilter__string - Reconstruct a rule string from filter
+ * @filter: String filter to reconstruct
+ *
+ * Reconstruct a rule string from @filter. This will be good for
+ * debug messages. Note that returning string must be freed afterward.
+ */
+char *strfilter__string(struct strfilter *filter);
+
 #endif
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index a7ab6063e03894..a93ba85509b287 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -630,6 +630,11 @@ void symsrc__destroy(struct symsrc *ss)
 	close(ss->fd);
 }
 
+bool __weak elf__needs_adjust_symbols(GElf_Ehdr ehdr)
+{
+	return ehdr.e_type == ET_EXEC || ehdr.e_type == ET_REL;
+}
+
 int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name,
 		 enum dso_binary_type type)
 {
@@ -678,6 +683,7 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name,
 		}
 
 		if (!dso__build_id_equal(dso, build_id)) {
+			pr_debug("%s: build id mismatch for %s.\n", __func__, name);
 			dso->load_errno = DSO_LOAD_ERRNO__MISMATCHING_BUILDID;
 			goto out_elf_end;
 		}
@@ -711,8 +717,7 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name,
 						     ".gnu.prelink_undo",
 						     NULL) != NULL);
 	} else {
-		ss->adjust_symbols = ehdr.e_type == ET_EXEC ||
-				     ehdr.e_type == ET_REL;
+		ss->adjust_symbols = elf__needs_adjust_symbols(ehdr);
 	}
 
 	ss->name   = strdup(name);
@@ -771,6 +776,8 @@ static bool want_demangle(bool is_kernel_sym)
 	return is_kernel_sym ? symbol_conf.demangle_kernel : symbol_conf.demangle;
 }
 
+void __weak arch__elf_sym_adjust(GElf_Sym *sym __maybe_unused) { }
+
 int dso__load_sym(struct dso *dso, struct map *map,
 		  struct symsrc *syms_ss, struct symsrc *runtime_ss,
 		  symbol_filter_t filter, int kmodule)
@@ -935,6 +942,8 @@ int dso__load_sym(struct dso *dso, struct map *map,
 		    (sym.st_value & 1))
 			--sym.st_value;
 
+		arch__elf_sym_adjust(&sym);
+
 		if (dso->kernel || kmodule) {
 			char dso_name[PATH_MAX];
 
@@ -963,8 +972,10 @@ int dso__load_sym(struct dso *dso, struct map *map,
 					map->unmap_ip = map__unmap_ip;
 					/* Ensure maps are correctly ordered */
 					if (kmaps) {
+						map__get(map);
 						map_groups__remove(kmaps, map);
 						map_groups__insert(kmaps, map);
+						map__put(map);
 					}
 				}
 
@@ -1020,11 +1031,7 @@ int dso__load_sym(struct dso *dso, struct map *map,
 				}
 				curr_dso->symtab_type = dso->symtab_type;
 				map_groups__insert(kmaps, curr_map);
-				/*
-				 * The new DSO should go to the kernel DSOS
-				 */
-				dsos__add(&map->groups->machine->kernel_dsos,
-					  curr_dso);
+				dsos__add(&map->groups->machine->dsos, curr_dso);
 				dso__set_loaded(curr_dso, map->type);
 			} else
 				curr_dso = curr_map->dso;
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 201f6c4ca738dd..a3e80d6ad70aec 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -85,8 +85,17 @@ static int prefix_underscores_count(const char *str)
 	return tail - str;
 }
 
-#define SYMBOL_A 0
-#define SYMBOL_B 1
+int __weak arch__choose_best_symbol(struct symbol *syma,
+				    struct symbol *symb __maybe_unused)
+{
+	/* Avoid "SyS" kernel syscall aliases */
+	if (strlen(syma->name) >= 3 && !strncmp(syma->name, "SyS", 3))
+		return SYMBOL_B;
+	if (strlen(syma->name) >= 10 && !strncmp(syma->name, "compat_SyS", 10))
+		return SYMBOL_B;
+
+	return SYMBOL_A;
+}
 
 static int choose_best_symbol(struct symbol *syma, struct symbol *symb)
 {
@@ -134,13 +143,7 @@ static int choose_best_symbol(struct symbol *syma, struct symbol *symb)
 	else if (na < nb)
 		return SYMBOL_B;
 
-	/* Avoid "SyS" kernel syscall aliases */
-	if (na >= 3 && !strncmp(syma->name, "SyS", 3))
-		return SYMBOL_B;
-	if (na >= 10 && !strncmp(syma->name, "compat_SyS", 10))
-		return SYMBOL_B;
-
-	return SYMBOL_A;
+	return arch__choose_best_symbol(syma, symb);
 }
 
 void symbols__fixup_duplicate(struct rb_root *symbols)
@@ -199,18 +202,18 @@ void symbols__fixup_end(struct rb_root *symbols)
 
 void __map_groups__fixup_end(struct map_groups *mg, enum map_type type)
 {
-	struct map *prev, *curr;
-	struct rb_node *nd, *prevnd = rb_first(&mg->maps[type]);
+	struct maps *maps = &mg->maps[type];
+	struct map *next, *curr;
 
-	if (prevnd == NULL)
-		return;
+	pthread_rwlock_wrlock(&maps->lock);
 
-	curr = rb_entry(prevnd, struct map, rb_node);
+	curr = maps__first(maps);
+	if (curr == NULL)
+		goto out_unlock;
 
-	for (nd = rb_next(prevnd); nd; nd = rb_next(nd)) {
-		prev = curr;
-		curr = rb_entry(nd, struct map, rb_node);
-		prev->end = curr->start;
+	for (next = map__next(curr); next; next = map__next(curr)) {
+		curr->end = next->start;
+		curr = next;
 	}
 
 	/*
@@ -218,6 +221,9 @@ void __map_groups__fixup_end(struct map_groups *mg, enum map_type type)
 	 * last map final address.
 	 */
 	curr->end = ~0ULL;
+
+out_unlock:
+	pthread_rwlock_unlock(&maps->lock);
 }
 
 struct symbol *symbol__new(u64 start, u64 len, u8 binding, const char *name)
@@ -397,7 +403,7 @@ static struct symbol *symbols__find_by_name(struct rb_root *symbols,
 					    const char *name)
 {
 	struct rb_node *n;
-	struct symbol_name_rb_node *s;
+	struct symbol_name_rb_node *s = NULL;
 
 	if (symbols == NULL)
 		return NULL;
@@ -408,7 +414,7 @@ static struct symbol *symbols__find_by_name(struct rb_root *symbols,
 		int cmp;
 
 		s = rb_entry(n, struct symbol_name_rb_node, rb_node);
-		cmp = strcmp(name, s->sym.name);
+		cmp = arch__compare_symbol_names(name, s->sym.name);
 
 		if (cmp < 0)
 			n = n->rb_left;
@@ -426,7 +432,7 @@ static struct symbol *symbols__find_by_name(struct rb_root *symbols,
 		struct symbol_name_rb_node *tmp;
 
 		tmp = rb_entry(n, struct symbol_name_rb_node, rb_node);
-		if (strcmp(tmp->sym.name, s->sym.name))
+		if (arch__compare_symbol_names(tmp->sym.name, s->sym.name))
 			break;
 
 		s = tmp;
@@ -653,14 +659,14 @@ static int dso__split_kallsyms_for_kcore(struct dso *dso, struct map *map,
 		curr_map = map_groups__find(kmaps, map->type, pos->start);
 
 		if (!curr_map || (filter && filter(curr_map, pos))) {
-			rb_erase(&pos->rb_node, root);
+			rb_erase_init(&pos->rb_node, root);
 			symbol__delete(pos);
 		} else {
 			pos->start -= curr_map->start - curr_map->pgoff;
 			if (pos->end)
 				pos->end -= curr_map->start - curr_map->pgoff;
 			if (curr_map != map) {
-				rb_erase(&pos->rb_node, root);
+				rb_erase_init(&pos->rb_node, root);
 				symbols__insert(
 					&curr_map->dso->symbols[curr_map->type],
 					pos);
@@ -1167,20 +1173,23 @@ static int dso__load_kcore(struct dso *dso, struct map *map,
 	/* Add new maps */
 	while (!list_empty(&md.maps)) {
 		new_map = list_entry(md.maps.next, struct map, node);
-		list_del(&new_map->node);
+		list_del_init(&new_map->node);
 		if (new_map == replacement_map) {
 			map->start	= new_map->start;
 			map->end	= new_map->end;
 			map->pgoff	= new_map->pgoff;
 			map->map_ip	= new_map->map_ip;
 			map->unmap_ip	= new_map->unmap_ip;
-			map__delete(new_map);
 			/* Ensure maps are correctly ordered */
+			map__get(map);
 			map_groups__remove(kmaps, map);
 			map_groups__insert(kmaps, map);
+			map__put(map);
 		} else {
 			map_groups__insert(kmaps, new_map);
 		}
+
+		map__put(new_map);
 	}
 
 	/*
@@ -1205,8 +1214,8 @@ static int dso__load_kcore(struct dso *dso, struct map *map,
 out_err:
 	while (!list_empty(&md.maps)) {
 		map = list_entry(md.maps.next, struct map, node);
-		list_del(&map->node);
-		map__delete(map);
+		list_del_init(&map->node);
+		map__put(map);
 	}
 	close(fd);
 	return -EINVAL;
@@ -1380,12 +1389,22 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
 	struct symsrc *syms_ss = NULL, *runtime_ss = NULL;
 	bool kmod;
 
-	dso__set_loaded(dso, map->type);
+	pthread_mutex_lock(&dso->lock);
+
+	/* check again under the dso->lock */
+	if (dso__loaded(dso, map->type)) {
+		ret = 1;
+		goto out;
+	}
 
-	if (dso->kernel == DSO_TYPE_KERNEL)
-		return dso__load_kernel_sym(dso, map, filter);
-	else if (dso->kernel == DSO_TYPE_GUEST_KERNEL)
-		return dso__load_guest_kernel_sym(dso, map, filter);
+	if (dso->kernel) {
+		if (dso->kernel == DSO_TYPE_KERNEL)
+			ret = dso__load_kernel_sym(dso, map, filter);
+		else if (dso->kernel == DSO_TYPE_GUEST_KERNEL)
+			ret = dso__load_guest_kernel_sym(dso, map, filter);
+
+		goto out;
+	}
 
 	if (map->groups && map->groups->machine)
 		machine = map->groups->machine;
@@ -1398,18 +1417,18 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
 		struct stat st;
 
 		if (lstat(dso->name, &st) < 0)
-			return -1;
+			goto out;
 
 		if (st.st_uid && (st.st_uid != geteuid())) {
 			pr_warning("File %s not owned by current user or root, "
 				"ignoring it.\n", dso->name);
-			return -1;
+			goto out;
 		}
 
 		ret = dso__load_perf_map(dso, map, filter);
 		dso->symtab_type = ret > 0 ? DSO_BINARY_TYPE__JAVA_JIT :
 					     DSO_BINARY_TYPE__NOT_FOUND;
-		return ret;
+		goto out;
 	}
 
 	if (machine)
@@ -1417,7 +1436,7 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
 
 	name = malloc(PATH_MAX);
 	if (!name)
-		return -1;
+		goto out;
 
 	kmod = dso->symtab_type == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE ||
 		dso->symtab_type == DSO_BINARY_TYPE__SYSTEM_PATH_KMODULE_COMP ||
@@ -1498,23 +1517,32 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
 out_free:
 	free(name);
 	if (ret < 0 && strstr(dso->name, " (deleted)") != NULL)
-		return 0;
+		ret = 0;
+out:
+	dso__set_loaded(dso, map->type);
+	pthread_mutex_unlock(&dso->lock);
+
 	return ret;
 }
 
 struct map *map_groups__find_by_name(struct map_groups *mg,
 				     enum map_type type, const char *name)
 {
-	struct rb_node *nd;
+	struct maps *maps = &mg->maps[type];
+	struct map *map;
 
-	for (nd = rb_first(&mg->maps[type]); nd; nd = rb_next(nd)) {
-		struct map *map = rb_entry(nd, struct map, rb_node);
+	pthread_rwlock_rdlock(&maps->lock);
 
+	for (map = maps__first(maps); map; map = map__next(map)) {
 		if (map->dso && strcmp(map->dso->short_name, name) == 0)
-			return map;
+			goto out_unlock;
 	}
 
-	return NULL;
+	map = NULL;
+
+out_unlock:
+	pthread_rwlock_unlock(&maps->lock);
+	return map;
 }
 
 int dso__load_vmlinux(struct dso *dso, struct map *map,
@@ -1802,6 +1830,7 @@ static void vmlinux_path__exit(void)
 {
 	while (--vmlinux_path__nr_entries >= 0)
 		zfree(&vmlinux_path[vmlinux_path__nr_entries]);
+	vmlinux_path__nr_entries = 0;
 
 	zfree(&vmlinux_path);
 }
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 09561500164a07..bef47ead1d9bd1 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -158,8 +158,6 @@ struct ref_reloc_sym {
 struct map_symbol {
 	struct map    *map;
 	struct symbol *sym;
-	bool	      unfolded;
-	bool	      has_children;
 };
 
 struct addr_map_symbol {
@@ -303,4 +301,14 @@ int setup_list(struct strlist **list, const char *list_str,
 int setup_intlist(struct intlist **list, const char *list_str,
 		  const char *list_name);
 
+#ifdef HAVE_LIBELF_SUPPORT
+bool elf__needs_adjust_symbols(GElf_Ehdr ehdr);
+void arch__elf_sym_adjust(GElf_Sym *sym);
+#endif
+
+#define SYMBOL_A 0
+#define SYMBOL_B 1
+
+int arch__choose_best_symbol(struct symbol *syma, struct symbol *symb);
+
 #endif /* __PERF_SYMBOL */
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index 1c8fbc9588c5fd..28c4b746baa19b 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -18,7 +18,7 @@ int thread__init_map_groups(struct thread *thread, struct machine *machine)
 	if (pid == thread->tid || pid == -1) {
 		thread->mg = map_groups__new(machine);
 	} else {
-		leader = machine__findnew_thread(machine, pid, pid);
+		leader = __machine__findnew_thread(machine, pid, pid);
 		if (leader)
 			thread->mg = map_groups__get(leader->mg);
 	}
@@ -53,7 +53,8 @@ struct thread *thread__new(pid_t pid, pid_t tid)
 			goto err_thread;
 
 		list_add(&comm->list, &thread->comm_list);
-
+		atomic_set(&thread->refcnt, 0);
+		RB_CLEAR_NODE(&thread->rb_node);
 	}
 
 	return thread;
@@ -67,6 +68,8 @@ void thread__delete(struct thread *thread)
 {
 	struct comm *comm, *tmp;
 
+	BUG_ON(!RB_EMPTY_NODE(&thread->rb_node));
+
 	thread_stack__free(thread);
 
 	if (thread->mg) {
@@ -84,13 +87,14 @@ void thread__delete(struct thread *thread)
 
 struct thread *thread__get(struct thread *thread)
 {
-	++thread->refcnt;
+	if (thread)
+		atomic_inc(&thread->refcnt);
 	return thread;
 }
 
 void thread__put(struct thread *thread)
 {
-	if (thread && --thread->refcnt == 0) {
+	if (thread && atomic_dec_and_test(&thread->refcnt)) {
 		list_del_init(&thread->node);
 		thread__delete(thread);
 	}
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index 9b8a54dc34a819..a0ac0317affb5f 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -1,6 +1,7 @@
 #ifndef __PERF_THREAD_H
 #define __PERF_THREAD_H
 
+#include <linux/atomic.h>
 #include <linux/rbtree.h>
 #include <linux/list.h>
 #include <unistd.h>
@@ -21,12 +22,12 @@ struct thread {
 	pid_t			tid;
 	pid_t			ppid;
 	int			cpu;
-	int			refcnt;
+	atomic_t		refcnt;
 	char			shortname[3];
 	bool			comm_set;
+	int			comm_len;
 	bool			dead; /* if set thread has exited */
 	struct list_head	comm_list;
-	int			comm_len;
 	u64			db_id;
 
 	void			*priv;
diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
index 51d9e56c0f841d..7f282ad1d2bd44 100644
--- a/tools/perf/util/tool.h
+++ b/tools/perf/util/tool.h
@@ -3,6 +3,8 @@
 
 #include <stdbool.h>
 
+#include <linux/types.h>
+
 struct perf_session;
 union perf_event;
 struct perf_evlist;
@@ -29,6 +31,9 @@ typedef int (*event_op2)(struct perf_tool *tool, union perf_event *event,
 typedef int (*event_oe)(struct perf_tool *tool, union perf_event *event,
 			struct ordered_events *oe);
 
+typedef s64 (*event_op3)(struct perf_tool *tool, union perf_event *event,
+			 struct perf_session *session);
+
 struct perf_tool {
 	event_sample	sample,
 			read;
@@ -38,13 +43,18 @@ struct perf_tool {
 			fork,
 			exit,
 			lost,
+			aux,
+			itrace_start,
 			throttle,
 			unthrottle;
 	event_attr_op	attr;
 	event_op2	tracing_data;
 	event_oe	finished_round;
 	event_op2	build_id,
-			id_index;
+			id_index,
+			auxtrace_info,
+			auxtrace_error;
+	event_op3	auxtrace;
 	bool		ordered_events;
 	bool		ordering_requires_timestamps;
 };
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index 25d6c737be3e67..d4957418657ec3 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -173,7 +173,7 @@ void parse_ftrace_printk(struct pevent *pevent,
 	char *line;
 	char *next = NULL;
 	char *addr_str;
-	char *fmt;
+	char *fmt = NULL;
 
 	line = strtok_r(file, "\n", &next);
 	while (line) {
diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c
index 7b09a443a28042..f079b63f0b7fdc 100644
--- a/tools/perf/util/unwind-libunwind.c
+++ b/tools/perf/util/unwind-libunwind.c
@@ -269,13 +269,14 @@ static int read_unwind_spec_eh_frame(struct dso *dso, struct machine *machine,
 	u64 offset = dso->data.eh_frame_hdr_offset;
 
 	if (offset == 0) {
-		fd = dso__data_fd(dso, machine);
+		fd = dso__data_get_fd(dso, machine);
 		if (fd < 0)
 			return -EINVAL;
 
 		/* Check the .eh_frame section for unwinding info */
 		offset = elf_section_offset(fd, ".eh_frame_hdr");
 		dso->data.eh_frame_hdr_offset = offset;
+		dso__data_put_fd(dso);
 	}
 
 	if (offset)
@@ -294,13 +295,14 @@ static int read_unwind_spec_debug_frame(struct dso *dso,
 	u64 ofs = dso->data.debug_frame_offset;
 
 	if (ofs == 0) {
-		fd = dso__data_fd(dso, machine);
+		fd = dso__data_get_fd(dso, machine);
 		if (fd < 0)
 			return -EINVAL;
 
 		/* Check the .debug_frame section for unwinding info */
 		ofs = elf_section_offset(fd, ".debug_frame");
 		dso->data.debug_frame_offset = ofs;
+		dso__data_put_fd(dso);
 	}
 
 	*offset = ofs;
@@ -353,10 +355,13 @@ find_proc_info(unw_addr_space_t as, unw_word_t ip, unw_proc_info_t *pi,
 #ifndef NO_LIBUNWIND_DEBUG_FRAME
 	/* Check the .debug_frame section for unwinding info */
 	if (!read_unwind_spec_debug_frame(map->dso, ui->machine, &segbase)) {
-		int fd = dso__data_fd(map->dso, ui->machine);
+		int fd = dso__data_get_fd(map->dso, ui->machine);
 		int is_exec = elf_is_exec(fd, map->dso->name);
 		unw_word_t base = is_exec ? 0 : map->start;
 
+		if (fd >= 0)
+			dso__data_put_fd(dso);
+
 		memset(&di, 0, sizeof(di));
 		if (dwarf_find_debug_frame(0, &di, ip, base, map->dso->name,
 					   map->start, map->end))
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index 4ee6d0d4c99317..0c264bc685aceb 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -72,6 +72,49 @@ int mkdir_p(char *path, mode_t mode)
 	return (stat(path, &st) && mkdir(path, mode)) ? -1 : 0;
 }
 
+int rm_rf(char *path)
+{
+	DIR *dir;
+	int ret = 0;
+	struct dirent *d;
+	char namebuf[PATH_MAX];
+
+	dir = opendir(path);
+	if (dir == NULL)
+		return 0;
+
+	while ((d = readdir(dir)) != NULL && !ret) {
+		struct stat statbuf;
+
+		if (!strcmp(d->d_name, ".") || !strcmp(d->d_name, ".."))
+			continue;
+
+		scnprintf(namebuf, sizeof(namebuf), "%s/%s",
+			  path, d->d_name);
+
+		ret = stat(namebuf, &statbuf);
+		if (ret < 0) {
+			pr_debug("stat failed: %s\n", namebuf);
+			break;
+		}
+
+		if (S_ISREG(statbuf.st_mode))
+			ret = unlink(namebuf);
+		else if (S_ISDIR(statbuf.st_mode))
+			ret = rm_rf(namebuf);
+		else {
+			pr_debug("unknown file: %s\n", namebuf);
+			ret = -1;
+		}
+	}
+	closedir(dir);
+
+	if (ret < 0)
+		return ret;
+
+	return rmdir(path);
+}
+
 static int slow_copyfile(const char *from, const char *to, mode_t mode)
 {
 	int err = -1;
@@ -102,11 +145,38 @@ out:
 	return err;
 }
 
+int copyfile_offset(int ifd, loff_t off_in, int ofd, loff_t off_out, u64 size)
+{
+	void *ptr;
+	loff_t pgoff;
+
+	pgoff = off_in & ~(page_size - 1);
+	off_in -= pgoff;
+
+	ptr = mmap(NULL, off_in + size, PROT_READ, MAP_PRIVATE, ifd, pgoff);
+	if (ptr == MAP_FAILED)
+		return -1;
+
+	while (size) {
+		ssize_t ret = pwrite(ofd, ptr + off_in, size, off_out);
+		if (ret < 0 && errno == EINTR)
+			continue;
+		if (ret <= 0)
+			break;
+
+		size -= ret;
+		off_in += ret;
+		off_out -= ret;
+	}
+	munmap(ptr, off_in + size);
+
+	return size ? -1 : 0;
+}
+
 int copyfile_mode(const char *from, const char *to, mode_t mode)
 {
 	int fromfd, tofd;
 	struct stat st;
-	void *addr;
 	int err = -1;
 
 	if (stat(from, &st))
@@ -123,15 +193,8 @@ int copyfile_mode(const char *from, const char *to, mode_t mode)
 	if (tofd < 0)
 		goto out_close_from;
 
-	addr = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fromfd, 0);
-	if (addr == MAP_FAILED)
-		goto out_close_to;
-
-	if (write(tofd, addr, st.st_size) == st.st_size)
-		err = 0;
+	err = copyfile_offset(fromfd, 0, tofd, 0, st.st_size);
 
-	munmap(addr, st.st_size);
-out_close_to:
 	close(tofd);
 	if (err)
 		unlink(to);
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 1ff23e04ad2730..8bce58b47a8269 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -249,14 +249,20 @@ static inline int sane_case(int x, int high)
 }
 
 int mkdir_p(char *path, mode_t mode);
+int rm_rf(char *path);
 int copyfile(const char *from, const char *to);
 int copyfile_mode(const char *from, const char *to, mode_t mode);
+int copyfile_offset(int fromfd, loff_t from_ofs, int tofd, loff_t to_ofs, u64 size);
 
 s64 perf_atoll(const char *str);
 char **argv_split(const char *str, int *argcp);
 void argv_free(char **argv);
 bool strglobmatch(const char *str, const char *pat);
 bool strlazymatch(const char *str, const char *pat);
+static inline bool strisglob(const char *str)
+{
+	return strpbrk(str, "*?[") != NULL;
+}
 int strtailcmp(const char *s1, const char *s2);
 char *strxfrchar(char *s, char from, char to);
 unsigned long convert_unit(unsigned long value, char *unit);
diff --git a/tools/perf/util/vdso.c b/tools/perf/util/vdso.c
index 5c7dd796979d0d..2e8f6886ca72d2 100644
--- a/tools/perf/util/vdso.c
+++ b/tools/perf/util/vdso.c
@@ -101,7 +101,7 @@ static char *get_file(struct vdso_file *vdso_file)
 	return vdso;
 }
 
-void vdso__exit(struct machine *machine)
+void machine__exit_vdso(struct machine *machine)
 {
 	struct vdso_info *vdso_info = machine->vdso_info;
 
@@ -120,14 +120,14 @@ void vdso__exit(struct machine *machine)
 	zfree(&machine->vdso_info);
 }
 
-static struct dso *vdso__new(struct machine *machine, const char *short_name,
-			     const char *long_name)
+static struct dso *machine__addnew_vdso(struct machine *machine, const char *short_name,
+					const char *long_name)
 {
 	struct dso *dso;
 
 	dso = dso__new(short_name);
 	if (dso != NULL) {
-		dsos__add(&machine->user_dsos, dso);
+		dsos__add(&machine->dsos, dso);
 		dso__set_long_name(dso, long_name, false);
 	}
 
@@ -236,7 +236,7 @@ static struct dso *vdso__findnew_compat(struct machine *machine,
 	const char *file_name;
 	struct dso *dso;
 
-	dso = dsos__find(&machine->user_dsos, vdso_file->dso_name, true);
+	dso = dsos__find(&machine->dsos, vdso_file->dso_name, true);
 	if (dso)
 		return dso;
 
@@ -244,10 +244,10 @@ static struct dso *vdso__findnew_compat(struct machine *machine,
 	if (!file_name)
 		return NULL;
 
-	return vdso__new(machine, vdso_file->dso_name, file_name);
+	return machine__addnew_vdso(machine, vdso_file->dso_name, file_name);
 }
 
-static int vdso__dso_findnew_compat(struct machine *machine,
+static int machine__findnew_vdso_compat(struct machine *machine,
 				    struct thread *thread,
 				    struct vdso_info *vdso_info,
 				    struct dso **dso)
@@ -281,8 +281,8 @@ static int vdso__dso_findnew_compat(struct machine *machine,
 
 #endif
 
-struct dso *vdso__dso_findnew(struct machine *machine,
-			      struct thread *thread __maybe_unused)
+struct dso *machine__findnew_vdso(struct machine *machine,
+				  struct thread *thread __maybe_unused)
 {
 	struct vdso_info *vdso_info;
 	struct dso *dso;
@@ -295,11 +295,11 @@ struct dso *vdso__dso_findnew(struct machine *machine,
 		return NULL;
 
 #if BITS_PER_LONG == 64
-	if (vdso__dso_findnew_compat(machine, thread, vdso_info, &dso))
+	if (machine__findnew_vdso_compat(machine, thread, vdso_info, &dso))
 		return dso;
 #endif
 
-	dso = dsos__find(&machine->user_dsos, DSO__NAME_VDSO, true);
+	dso = dsos__find(&machine->dsos, DSO__NAME_VDSO, true);
 	if (!dso) {
 		char *file;
 
@@ -307,7 +307,7 @@ struct dso *vdso__dso_findnew(struct machine *machine,
 		if (!file)
 			return NULL;
 
-		dso = vdso__new(machine, DSO__NAME_VDSO, file);
+		dso = machine__addnew_vdso(machine, DSO__NAME_VDSO, file);
 	}
 
 	return dso;
diff --git a/tools/perf/util/vdso.h b/tools/perf/util/vdso.h
index d97da1616f0c5b..cdc4fabfc2124e 100644
--- a/tools/perf/util/vdso.h
+++ b/tools/perf/util/vdso.h
@@ -23,7 +23,7 @@ bool dso__is_vdso(struct dso *dso);
 struct machine;
 struct thread;
 
-struct dso *vdso__dso_findnew(struct machine *machine, struct thread *thread);
-void vdso__exit(struct machine *machine);
+struct dso *machine__findnew_vdso(struct machine *machine, struct thread *thread);
+void machine__exit_vdso(struct machine *machine);
 
 #endif /* __PERF_VDSO__ */
diff --git a/tools/testing/selftests/rcutorture/bin/configinit.sh b/tools/testing/selftests/rcutorture/bin/configinit.sh
index 15f1a17ca96e69..3f81a109520693 100755
--- a/tools/testing/selftests/rcutorture/bin/configinit.sh
+++ b/tools/testing/selftests/rcutorture/bin/configinit.sh
@@ -66,7 +66,7 @@ make $buildloc $TORTURE_DEFCONFIG > $builddir/Make.defconfig.out 2>&1
 mv $builddir/.config $builddir/.config.sav
 sh $T/upd.sh < $builddir/.config.sav > $builddir/.config
 cp $builddir/.config $builddir/.config.new
-yes '' | make $buildloc oldconfig > $builddir/Make.modconfig.out 2>&1
+yes '' | make $buildloc oldconfig > $builddir/Make.oldconfig.out 2> $builddir/Make.oldconfig.err
 
 # verify new config matches specification.
 configcheck.sh $builddir/.config $c
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
index 4f5b20f367a944..d86bdd6b6cc2df 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
@@ -43,6 +43,10 @@ do
 		if test -f "$i/console.log"
 		then
 			configcheck.sh $i/.config $i/ConfigFragment
+			if test -r $i/Make.oldconfig.err
+			then
+				cat $i/Make.oldconfig.err
+			fi
 			parse-build.sh $i/Make.out $configfile
 			parse-torture.sh $i/console.log $configfile
 			parse-console.sh $i/console.log $configfile
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index dd2812ceb0baba..fbe2dbff1e210c 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -55,7 +55,7 @@ usage () {
 	echo "       --bootargs kernel-boot-arguments"
 	echo "       --bootimage relative-path-to-kernel-boot-image"
 	echo "       --buildonly"
-	echo "       --configs \"config-file list\""
+	echo "       --configs \"config-file list w/ repeat factor (3*TINY01)\""
 	echo "       --cpus N"
 	echo "       --datestamp string"
 	echo "       --defconfig string"
@@ -178,13 +178,26 @@ fi
 touch $T/cfgcpu
 for CF in $configs
 do
-	if test -f "$CONFIGFRAG/$CF"
+	case $CF in
+	[0-9]\**|[0-9][0-9]\**|[0-9][0-9][0-9]\**)
+		config_reps=`echo $CF | sed -e 's/\*.*$//'`
+		CF1=`echo $CF | sed -e 's/^[^*]*\*//'`
+		;;
+	*)
+		config_reps=1
+		CF1=$CF
+		;;
+	esac
+	if test -f "$CONFIGFRAG/$CF1"
 	then
-		cpu_count=`configNR_CPUS.sh $CONFIGFRAG/$CF`
-		cpu_count=`configfrag_boot_cpus "$TORTURE_BOOTARGS" "$CONFIGFRAG/$CF" "$cpu_count"`
-		echo $CF $cpu_count >> $T/cfgcpu
+		cpu_count=`configNR_CPUS.sh $CONFIGFRAG/$CF1`
+		cpu_count=`configfrag_boot_cpus "$TORTURE_BOOTARGS" "$CONFIGFRAG/$CF1" "$cpu_count"`
+		for ((cur_rep=0;cur_rep<$config_reps;cur_rep++))
+		do
+			echo $CF1 $cpu_count >> $T/cfgcpu
+		done
 	else
-		echo "The --configs file $CF does not exist, terminating."
+		echo "The --configs file $CF1 does not exist, terminating."
 		exit 1
 	fi
 done
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
index 49701218dc6204..f824b4c9d9d913 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
+++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
@@ -1,3 +1,5 @@
 CONFIG_RCU_TORTURE_TEST=y
 CONFIG_PRINTK_TIME=y
+CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y
 CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y
+CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N
index 9fbb41b9b3149e..1a087c3c8bb861 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N
@@ -5,3 +5,4 @@ CONFIG_HOTPLUG_CPU=y
 CONFIG_PREEMPT_NONE=y
 CONFIG_PREEMPT_VOLUNTARY=n
 CONFIG_PREEMPT=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P
index 4b6f272dba27f8..4837430a71c0c3 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P
@@ -5,3 +5,4 @@ CONFIG_HOTPLUG_CPU=y
 CONFIG_PREEMPT_NONE=n
 CONFIG_PREEMPT_VOLUNTARY=n
 CONFIG_PREEMPT=y
+#CHECK#CONFIG_RCU_EXPERT=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot
index 238bfe3bd0cccf..84a7d51b7481e7 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot
@@ -1 +1 @@
-rcutorture.torture_type=srcu
+rcutorture.torture_type=srcud
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS01 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS01
index 97f0a0b27ef729..2cc0e60eba6eed 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS01
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS01
@@ -5,5 +5,6 @@ CONFIG_PREEMPT_NONE=n
 CONFIG_PREEMPT_VOLUNTARY=n
 CONFIG_PREEMPT=y
 CONFIG_DEBUG_LOCK_ALLOC=y
-CONFIG_PROVE_RCU=y
-CONFIG_TASKS_RCU=y
+CONFIG_PROVE_LOCKING=n
+#CHECK#CONFIG_PROVE_RCU=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS02 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS02
index 696d2ea74d13bb..ad2be91e5ee762 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS02
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS02
@@ -2,4 +2,3 @@ CONFIG_SMP=n
 CONFIG_PREEMPT_NONE=y
 CONFIG_PREEMPT_VOLUNTARY=n
 CONFIG_PREEMPT=n
-CONFIG_TASKS_RCU=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03
index 9c60da5b5d1dda..c70c51d5ded15a 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03
@@ -6,8 +6,8 @@ CONFIG_HIBERNATION=n
 CONFIG_PREEMPT_NONE=n
 CONFIG_PREEMPT_VOLUNTARY=n
 CONFIG_PREEMPT=y
-CONFIG_TASKS_RCU=y
 CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=n
 CONFIG_NO_HZ_FULL=y
 CONFIG_NO_HZ_FULL_ALL=y
+#CHECK#CONFIG_RCU_EXPERT=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TINY02 b/tools/testing/selftests/rcutorture/configs/rcu/TINY02
index 36e41df3d27aa6..f1892e0371c954 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TINY02
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TINY02
@@ -8,7 +8,7 @@ CONFIG_NO_HZ_IDLE=n
 CONFIG_NO_HZ_FULL=n
 CONFIG_RCU_TRACE=y
 CONFIG_PROVE_LOCKING=y
-CONFIG_PROVE_RCU=y
+#CHECK#CONFIG_PROVE_RCU=y
 CONFIG_DEBUG_LOCK_ALLOC=y
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
 CONFIG_PREEMPT_COUNT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TINY02.boot b/tools/testing/selftests/rcutorture/configs/rcu/TINY02.boot
index 0f0802730014c6..6c1a292a65fb49 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TINY02.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TINY02.boot
@@ -1,2 +1,3 @@
 rcupdate.rcu_self_test=1
 rcupdate.rcu_self_test_bh=1
+rcutorture.torture_type=rcu_bh
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 b/tools/testing/selftests/rcutorture/configs/rcu/TREE01
index f8a10a7500c64f..8e9137f66831f2 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01
@@ -16,3 +16,4 @@ CONFIG_DEBUG_LOCK_ALLOC=n
 CONFIG_RCU_CPU_STALL_INFO=n
 CONFIG_RCU_BOOST=n
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE02 b/tools/testing/selftests/rcutorture/configs/rcu/TREE02
index 629122fb8b4a12..aeea6a204d14b1 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE02
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE02
@@ -14,10 +14,10 @@ CONFIG_SUSPEND=n
 CONFIG_HIBERNATION=n
 CONFIG_RCU_FANOUT=3
 CONFIG_RCU_FANOUT_LEAF=3
-CONFIG_RCU_FANOUT_EXACT=n
 CONFIG_RCU_NOCB_CPU=n
 CONFIG_DEBUG_LOCK_ALLOC=y
 CONFIG_PROVE_LOCKING=n
 CONFIG_RCU_CPU_STALL_INFO=n
 CONFIG_RCU_BOOST=n
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T b/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T
index a25de47888a4fc..2ac9e68ea3d148 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T
@@ -14,7 +14,6 @@ CONFIG_SUSPEND=n
 CONFIG_HIBERNATION=n
 CONFIG_RCU_FANOUT=3
 CONFIG_RCU_FANOUT_LEAF=3
-CONFIG_RCU_FANOUT_EXACT=n
 CONFIG_RCU_NOCB_CPU=n
 CONFIG_DEBUG_LOCK_ALLOC=y
 CONFIG_PROVE_LOCKING=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03 b/tools/testing/selftests/rcutorture/configs/rcu/TREE03
index 53f24e0a0ab618..72aa7d87ea9915 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03
@@ -1,5 +1,5 @@
 CONFIG_SMP=y
-CONFIG_NR_CPUS=8
+CONFIG_NR_CPUS=16
 CONFIG_PREEMPT_NONE=n
 CONFIG_PREEMPT_VOLUNTARY=n
 CONFIG_PREEMPT=y
@@ -9,12 +9,12 @@ CONFIG_NO_HZ_IDLE=n
 CONFIG_NO_HZ_FULL=n
 CONFIG_RCU_TRACE=y
 CONFIG_HOTPLUG_CPU=y
-CONFIG_RCU_FANOUT=4
-CONFIG_RCU_FANOUT_LEAF=4
-CONFIG_RCU_FANOUT_EXACT=n
+CONFIG_RCU_FANOUT=2
+CONFIG_RCU_FANOUT_LEAF=2
 CONFIG_RCU_NOCB_CPU=n
 CONFIG_DEBUG_LOCK_ALLOC=n
 CONFIG_RCU_CPU_STALL_INFO=n
 CONFIG_RCU_BOOST=y
 CONFIG_RCU_KTHREAD_PRIO=2
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot
new file mode 100644
index 00000000000000..120c0c88d1003a
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot
@@ -0,0 +1 @@
+rcutorture.onoff_interval=1 rcutorture.onoff_holdoff=30
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04
index 0f84db35b36d62..3f5112751cda0d 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04
@@ -13,10 +13,10 @@ CONFIG_RCU_TRACE=y
 CONFIG_HOTPLUG_CPU=n
 CONFIG_SUSPEND=n
 CONFIG_HIBERNATION=n
-CONFIG_RCU_FANOUT=2
-CONFIG_RCU_FANOUT_LEAF=2
-CONFIG_RCU_FANOUT_EXACT=n
+CONFIG_RCU_FANOUT=4
+CONFIG_RCU_FANOUT_LEAF=4
 CONFIG_RCU_NOCB_CPU=n
 CONFIG_DEBUG_LOCK_ALLOC=n
-CONFIG_RCU_CPU_STALL_INFO=y
+CONFIG_RCU_CPU_STALL_INFO=n
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE05 b/tools/testing/selftests/rcutorture/configs/rcu/TREE05
index 212e3bfd2b2ac0..c04dfea6fd217e 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE05
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE05
@@ -12,11 +12,11 @@ CONFIG_RCU_TRACE=n
 CONFIG_HOTPLUG_CPU=y
 CONFIG_RCU_FANOUT=6
 CONFIG_RCU_FANOUT_LEAF=6
-CONFIG_RCU_FANOUT_EXACT=n
 CONFIG_RCU_NOCB_CPU=y
 CONFIG_RCU_NOCB_CPU_NONE=y
 CONFIG_DEBUG_LOCK_ALLOC=y
 CONFIG_PROVE_LOCKING=y
-CONFIG_PROVE_RCU=y
+#CHECK#CONFIG_PROVE_RCU=y
 CONFIG_RCU_CPU_STALL_INFO=n
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE06 b/tools/testing/selftests/rcutorture/configs/rcu/TREE06
index 7eee63b4421812..f51d2c73a68ec2 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE06
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE06
@@ -14,10 +14,10 @@ CONFIG_SUSPEND=n
 CONFIG_HIBERNATION=n
 CONFIG_RCU_FANOUT=6
 CONFIG_RCU_FANOUT_LEAF=6
-CONFIG_RCU_FANOUT_EXACT=y
 CONFIG_RCU_NOCB_CPU=n
 CONFIG_DEBUG_LOCK_ALLOC=y
 CONFIG_PROVE_LOCKING=y
-CONFIG_PROVE_RCU=y
+#CHECK#CONFIG_PROVE_RCU=y
 CONFIG_RCU_CPU_STALL_INFO=n
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=y
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot
index da9a03a398db14..dd90f28ed700b3 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot
@@ -1,3 +1,4 @@
 rcupdate.rcu_self_test=1
 rcupdate.rcu_self_test_bh=1
 rcupdate.rcu_self_test_sched=1
+rcutree.rcu_fanout_exact=1
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 b/tools/testing/selftests/rcutorture/configs/rcu/TREE07
index 92a97fa97dec1c..f422af4ff5a31b 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE07
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07
@@ -15,8 +15,8 @@ CONFIG_RCU_TRACE=y
 CONFIG_HOTPLUG_CPU=y
 CONFIG_RCU_FANOUT=2
 CONFIG_RCU_FANOUT_LEAF=2
-CONFIG_RCU_FANOUT_EXACT=n
 CONFIG_RCU_NOCB_CPU=n
 CONFIG_DEBUG_LOCK_ALLOC=n
-CONFIG_RCU_CPU_STALL_INFO=y
+CONFIG_RCU_CPU_STALL_INFO=n
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08 b/tools/testing/selftests/rcutorture/configs/rcu/TREE08
index 5812027d6f9ff0..a24d2ca30646c3 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08
@@ -1,5 +1,5 @@
 CONFIG_SMP=y
-CONFIG_NR_CPUS=16
+CONFIG_NR_CPUS=8
 CONFIG_PREEMPT_NONE=n
 CONFIG_PREEMPT_VOLUNTARY=n
 CONFIG_PREEMPT=y
@@ -13,13 +13,13 @@ CONFIG_HOTPLUG_CPU=n
 CONFIG_SUSPEND=n
 CONFIG_HIBERNATION=n
 CONFIG_RCU_FANOUT=3
-CONFIG_RCU_FANOUT_EXACT=y
 CONFIG_RCU_FANOUT_LEAF=2
 CONFIG_RCU_NOCB_CPU=y
 CONFIG_RCU_NOCB_CPU_ALL=y
 CONFIG_DEBUG_LOCK_ALLOC=n
 CONFIG_PROVE_LOCKING=y
-CONFIG_PROVE_RCU=y
+#CHECK#CONFIG_PROVE_RCU=y
 CONFIG_RCU_CPU_STALL_INFO=n
 CONFIG_RCU_BOOST=n
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T b/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T
index 3eaeccacb08389..b2b8cea69dc993 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T
@@ -13,7 +13,6 @@ CONFIG_HOTPLUG_CPU=n
 CONFIG_SUSPEND=n
 CONFIG_HIBERNATION=n
 CONFIG_RCU_FANOUT=3
-CONFIG_RCU_FANOUT_EXACT=y
 CONFIG_RCU_FANOUT_LEAF=2
 CONFIG_RCU_NOCB_CPU=y
 CONFIG_RCU_NOCB_CPU_ALL=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T.boot
new file mode 100644
index 00000000000000..883149b5f2d19d
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T.boot
@@ -0,0 +1 @@
+rcutree.rcu_fanout_exact=1
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot
index 2561daf605ad5f..fb066dc82769fe 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot
@@ -1,3 +1,4 @@
 rcutorture.torture_type=sched
 rcupdate.rcu_self_test=1
 rcupdate.rcu_self_test_sched=1
+rcutree.rcu_fanout_exact=1
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 b/tools/testing/selftests/rcutorture/configs/rcu/TREE09
index 6076b36f6c0b45..aa4ed08d999d9d 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE09
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE09
@@ -16,3 +16,4 @@ CONFIG_DEBUG_LOCK_ALLOC=n
 CONFIG_RCU_CPU_STALL_INFO=n
 CONFIG_RCU_BOOST=n
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+#CHECK#CONFIG_RCU_EXPERT=n
diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
index ec03c883db0051..b24c0004fc4998 100644
--- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
+++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
@@ -12,13 +12,12 @@ CONFIG_NO_HZ_IDLE -- Do those not otherwise specified. (Groups of two.)
 CONFIG_NO_HZ_FULL -- Do two, one with CONFIG_NO_HZ_FULL_SYSIDLE.
 CONFIG_NO_HZ_FULL_SYSIDLE -- Do one.
 CONFIG_PREEMPT -- Do half.  (First three and #8.)
-CONFIG_PROVE_LOCKING -- Do all but two, covering CONFIG_PROVE_RCU and not.
-CONFIG_PROVE_RCU -- Do all but one under CONFIG_PROVE_LOCKING.
+CONFIG_PROVE_LOCKING -- Do several, covering CONFIG_DEBUG_LOCK_ALLOC=y and not.
+CONFIG_PROVE_RCU -- Hardwired to CONFIG_PROVE_LOCKING.
 CONFIG_RCU_BOOST -- one of PREEMPT_RCU.
 CONFIG_RCU_KTHREAD_PRIO -- set to 2 for _BOOST testing.
-CONFIG_RCU_CPU_STALL_INFO -- Do one.
-CONFIG_RCU_FANOUT -- Cover hierarchy as currently, but overlap with others.
-CONFIG_RCU_FANOUT_EXACT -- Do one.
+CONFIG_RCU_CPU_STALL_INFO -- Now default, avoid at least twice.
+CONFIG_RCU_FANOUT -- Cover hierarchy, but overlap with others.
 CONFIG_RCU_FANOUT_LEAF -- Do one non-default.
 CONFIG_RCU_FAST_NO_HZ -- Do one, but not with CONFIG_RCU_NOCB_CPU_ALL.
 CONFIG_RCU_NOCB_CPU -- Do three, see below.
@@ -27,28 +26,19 @@ CONFIG_RCU_NOCB_CPU_NONE -- Do one.
 CONFIG_RCU_NOCB_CPU_ZERO -- Do one.
 CONFIG_RCU_TRACE -- Do half.
 CONFIG_SMP -- Need one !SMP for PREEMPT_RCU.
+!RCU_EXPERT -- Do a few, but these have to be vanilla configurations.
 RCU-bh: Do one with PREEMPT and one with !PREEMPT.
 RCU-sched: Do one with PREEMPT but not BOOST.
 
 
-Hierarchy:
-
-TREE01.	CONFIG_NR_CPUS=8, CONFIG_RCU_FANOUT=8, CONFIG_RCU_FANOUT_EXACT=n.
-TREE02.	CONFIG_NR_CPUS=8, CONFIG_RCU_FANOUT=3, CONFIG_RCU_FANOUT_EXACT=n,
-	CONFIG_RCU_FANOUT_LEAF=3.
-TREE03.	CONFIG_NR_CPUS=8, CONFIG_RCU_FANOUT=4, CONFIG_RCU_FANOUT_EXACT=n,
-	CONFIG_RCU_FANOUT_LEAF=4.
-TREE04.	CONFIG_NR_CPUS=8, CONFIG_RCU_FANOUT=2, CONFIG_RCU_FANOUT_EXACT=n,
-	CONFIG_RCU_FANOUT_LEAF=2.
-TREE05.	CONFIG_NR_CPUS=8, CONFIG_RCU_FANOUT=6, CONFIG_RCU_FANOUT_EXACT=n
-	CONFIG_RCU_FANOUT_LEAF=6.
-TREE06.	CONFIG_NR_CPUS=8, CONFIG_RCU_FANOUT=6, CONFIG_RCU_FANOUT_EXACT=y
-	CONFIG_RCU_FANOUT_LEAF=6.
-TREE07.	CONFIG_NR_CPUS=16, CONFIG_RCU_FANOUT=2, CONFIG_RCU_FANOUT_EXACT=n,
-	CONFIG_RCU_FANOUT_LEAF=2.
-TREE08.	CONFIG_NR_CPUS=16, CONFIG_RCU_FANOUT=3, CONFIG_RCU_FANOUT_EXACT=y,
-	CONFIG_RCU_FANOUT_LEAF=2.
-TREE09.	CONFIG_NR_CPUS=1.
+Boot parameters:
+
+nohz_full - do at least one.
+maxcpu -- do at least one.
+rcupdate.rcu_self_test_bh -- Do at least one each, offloaded and not.
+rcupdate.rcu_self_test_sched -- Do at least one each, offloaded and not.
+rcupdate.rcu_self_test -- Do at least one each, offloaded and not.
+rcutree.rcu_fanout_exact -- Do at least one.
 
 
 Kconfig Parameters Ignored:
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index 5bdb781163d1f2..caa60d56d7d1f5 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -4,9 +4,11 @@ include ../lib.mk
 
 .PHONY: all all_32 all_64 warn_32bit_failure clean
 
-TARGETS_C_BOTHBITS := sigreturn single_step_syscall
+TARGETS_C_BOTHBITS := sigreturn single_step_syscall sysret_ss_attrs
+TARGETS_C_32BIT_ONLY := entry_from_vm86
 
-BINARIES_32 := $(TARGETS_C_BOTHBITS:%=%_32)
+TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
+BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
 BINARIES_64 := $(TARGETS_C_BOTHBITS:%=%_64)
 
 CFLAGS := -O2 -g -std=gnu99 -pthread -Wall
@@ -32,7 +34,7 @@ all_64: $(BINARIES_64)
 clean:
 	$(RM) $(BINARIES_32) $(BINARIES_64)
 
-$(TARGETS_C_BOTHBITS:%=%_32): %_32: %.c
+$(TARGETS_C_32BIT_ALL:%=%_32): %_32: %.c
 	$(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
 
 $(TARGETS_C_BOTHBITS:%=%_64): %_64: %.c
@@ -55,3 +57,6 @@ warn_32bit_failure:
 	echo "  yum install glibc-devel.*i686";				\
 	exit 0;
 endif
+
+# Some tests have additional dependencies.
+sysret_ss_attrs_64: thunks.S
diff --git a/tools/testing/selftests/x86/entry_from_vm86.c b/tools/testing/selftests/x86/entry_from_vm86.c
new file mode 100644
index 00000000000000..5c38a187677b07
--- /dev/null
+++ b/tools/testing/selftests/x86/entry_from_vm86.c
@@ -0,0 +1,114 @@
+/*
+ * entry_from_vm86.c - tests kernel entries from vm86 mode
+ * Copyright (c) 2014-2015 Andrew Lutomirski
+ *
+ * This exercises a few paths that need to special-case vm86 mode.
+ *
+ * GPL v2.
+ */
+
+#define _GNU_SOURCE
+
+#include <assert.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <sys/signal.h>
+#include <sys/ucontext.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+#include <sys/mman.h>
+#include <err.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <errno.h>
+#include <sys/vm86.h>
+
+static unsigned long load_addr = 0x10000;
+static int nerrs = 0;
+
+asm (
+	".pushsection .rodata\n\t"
+	".type vmcode_bound, @object\n\t"
+	"vmcode:\n\t"
+	"vmcode_bound:\n\t"
+	".code16\n\t"
+	"bound %ax, (2048)\n\t"
+	"int3\n\t"
+	"vmcode_sysenter:\n\t"
+	"sysenter\n\t"
+	".size vmcode, . - vmcode\n\t"
+	"end_vmcode:\n\t"
+	".code32\n\t"
+	".popsection"
+	);
+
+extern unsigned char vmcode[], end_vmcode[];
+extern unsigned char vmcode_bound[], vmcode_sysenter[];
+
+static void do_test(struct vm86plus_struct *v86, unsigned long eip,
+		    const char *text)
+{
+	long ret;
+
+	printf("[RUN]\t%s from vm86 mode\n", text);
+	v86->regs.eip = eip;
+	ret = vm86(VM86_ENTER, v86);
+
+	if (ret == -1 && errno == ENOSYS) {
+		printf("[SKIP]\tvm86 not supported\n");
+		return;
+	}
+
+	if (VM86_TYPE(ret) == VM86_INTx) {
+		char trapname[32];
+		int trapno = VM86_ARG(ret);
+		if (trapno == 13)
+			strcpy(trapname, "GP");
+		else if (trapno == 5)
+			strcpy(trapname, "BR");
+		else if (trapno == 14)
+			strcpy(trapname, "PF");
+		else
+			sprintf(trapname, "%d", trapno);
+
+		printf("[OK]\tExited vm86 mode due to #%s\n", trapname);
+	} else if (VM86_TYPE(ret) == VM86_UNKNOWN) {
+		printf("[OK]\tExited vm86 mode due to unhandled GP fault\n");
+	} else {
+		printf("[OK]\tExited vm86 mode due to type %ld, arg %ld\n",
+		       VM86_TYPE(ret), VM86_ARG(ret));
+	}
+}
+
+int main(void)
+{
+	struct vm86plus_struct v86;
+	unsigned char *addr = mmap((void *)load_addr, 4096,
+				   PROT_READ | PROT_WRITE | PROT_EXEC,
+				   MAP_ANONYMOUS | MAP_PRIVATE, -1,0);
+	if (addr != (unsigned char *)load_addr)
+		err(1, "mmap");
+
+	memcpy(addr, vmcode, end_vmcode - vmcode);
+	addr[2048] = 2;
+	addr[2050] = 3;
+
+	memset(&v86, 0, sizeof(v86));
+
+	v86.regs.cs = load_addr / 16;
+	v86.regs.ss = load_addr / 16;
+	v86.regs.ds = load_addr / 16;
+	v86.regs.es = load_addr / 16;
+
+	assert((v86.regs.cs & 3) == 0);	/* Looks like RPL = 0 */
+
+	/* #BR -- should deliver SIG??? */
+	do_test(&v86, vmcode_bound - vmcode, "#BR");
+
+	/* SYSENTER -- should cause #GP or #UD depending on CPU */
+	do_test(&v86, vmcode_sysenter - vmcode, "SYSENTER");
+
+	return (nerrs == 0 ? 0 : 1);
+}
diff --git a/tools/testing/selftests/x86/sysret_ss_attrs.c b/tools/testing/selftests/x86/sysret_ss_attrs.c
new file mode 100644
index 00000000000000..ce42d5a6400904
--- /dev/null
+++ b/tools/testing/selftests/x86/sysret_ss_attrs.c
@@ -0,0 +1,112 @@
+/*
+ * sysret_ss_attrs.c - test that syscalls return valid hidden SS attributes
+ * Copyright (c) 2015 Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * On AMD CPUs, SYSRET can return with a valid SS descriptor with with
+ * the hidden attributes set to an unusable state.  Make sure the kernel
+ * doesn't let this happen.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <err.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <pthread.h>
+
+static void *threadproc(void *ctx)
+{
+	/*
+	 * Do our best to cause sleeps on this CPU to exit the kernel and
+	 * re-enter with SS = 0.
+	 */
+	while (true)
+		;
+
+	return NULL;
+}
+
+#ifdef __x86_64__
+extern unsigned long call32_from_64(void *stack, void (*function)(void));
+
+asm (".pushsection .text\n\t"
+     ".code32\n\t"
+     "test_ss:\n\t"
+     "pushl $0\n\t"
+     "popl %eax\n\t"
+     "ret\n\t"
+     ".code64");
+extern void test_ss(void);
+#endif
+
+int main()
+{
+	/*
+	 * Start a busy-looping thread on the same CPU we're on.
+	 * For simplicity, just stick everything to CPU 0.  This will
+	 * fail in some containers, but that's probably okay.
+	 */
+	cpu_set_t cpuset;
+	CPU_ZERO(&cpuset);
+	CPU_SET(0, &cpuset);
+	if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
+		printf("[WARN]\tsched_setaffinity failed\n");
+
+	pthread_t thread;
+	if (pthread_create(&thread, 0, threadproc, 0) != 0)
+		err(1, "pthread_create");
+
+#ifdef __x86_64__
+	unsigned char *stack32 = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
+				      MAP_32BIT | MAP_ANONYMOUS | MAP_PRIVATE,
+				      -1, 0);
+	if (stack32 == MAP_FAILED)
+		err(1, "mmap");
+#endif
+
+	printf("[RUN]\tSyscalls followed by SS validation\n");
+
+	for (int i = 0; i < 1000; i++) {
+		/*
+		 * Go to sleep and return using sysret (if we're 64-bit
+		 * or we're 32-bit on AMD on a 64-bit kernel).  On AMD CPUs,
+		 * SYSRET doesn't fix up the cached SS descriptor, so the
+		 * kernel needs some kind of workaround to make sure that we
+		 * end the system call with a valid stack segment.  This
+		 * can be a confusing failure because the SS *selector*
+		 * is the same regardless.
+		 */
+		usleep(2);
+
+#ifdef __x86_64__
+		/*
+		 * On 32-bit, just doing a syscall through glibc is enough
+		 * to cause a crash if our cached SS descriptor is invalid.
+		 * On 64-bit, it's not, so try extra hard.
+		 */
+		call32_from_64(stack32 + 4088, test_ss);
+#endif
+	}
+
+	printf("[OK]\tWe survived\n");
+
+#ifdef __x86_64__
+	munmap(stack32, 4096);
+#endif
+
+	return 0;
+}
diff --git a/tools/testing/selftests/x86/thunks.S b/tools/testing/selftests/x86/thunks.S
new file mode 100644
index 00000000000000..ce8a995bbb17b4
--- /dev/null
+++ b/tools/testing/selftests/x86/thunks.S
@@ -0,0 +1,67 @@
+/*
+ * thunks.S - assembly helpers for mixed-bitness code
+ * Copyright (c) 2015 Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * These are little helpers that make it easier to switch bitness on
+ * the fly.
+ */
+
+	.text
+
+	.global call32_from_64
+	.type call32_from_64, @function
+call32_from_64:
+	// rdi: stack to use
+	// esi: function to call
+
+	// Save registers
+	pushq %rbx
+	pushq %rbp
+	pushq %r12
+	pushq %r13
+	pushq %r14
+	pushq %r15
+	pushfq
+
+	// Switch stacks
+	mov %rsp,(%rdi)
+	mov %rdi,%rsp
+
+	// Switch to compatibility mode
+	pushq $0x23  /* USER32_CS */
+	pushq $1f
+	lretq
+
+1:
+	.code32
+	// Call the function
+	call *%esi
+	// Switch back to long mode
+	jmp $0x33,$1f
+	.code64
+
+1:
+	// Restore the stack
+	mov (%rsp),%rsp
+
+	// Restore registers
+	popfq
+	popq %r15
+	popq %r14
+	popq %r13
+	popq %r12
+	popq %rbp
+	popq %rbx
+
+	ret
+
+.size call32_from_64, .-call32_from_64
author	Michael Ellerman <mpe@ellerman.id.au>	2015-06-05 18:27:08 +1000
committer	Michael Ellerman <mpe@ellerman.id.au>	2015-06-05 18:27:11 +1000
commit	123ed28d0811617e5a0a9f96e650720768ff80d6 (patch)
tree	c5d4b8ca156214b7b0e3d3460ff6764db6427fc4
parent	e36a281282a20be3e51adccc9c9d0be8262e3a7d (diff)
parent	7a6a010964a6fa86c5d41159ac1893cc918589fc (diff)
download	linux-next-123ed28d0811617e5a0a9f96e650720768ff80d6.tar.gz