diff options
author | Sebastian Andrzej Siewior <bigeasy@linutronix.de> | 2016-07-15 13:30:25 +0200 |
---|---|---|
committer | Sebastian Andrzej Siewior <bigeasy@linutronix.de> | 2016-07-15 13:30:25 +0200 |
commit | 0cc16024f914f3e629502ed3f0a35407e773c4d7 (patch) | |
tree | a5a4d5fe10716cf2e4eacac5555b659d8beb0075 | |
parent | 7a203491ad9e98ea6506f7822995c41c8995681d (diff) | |
download | 4.9-rt-patches-0cc16024f914f3e629502ed3f0a35407e773c4d7.tar.gz |
[ANNOUNCE] 4.6.4-rt8
Dear RT folks!
I'm pleased to announce the v4.6.4-rt8 patch set.
Changes since v4.6.4-rt7:
- Import Thomas' timer rework known as "timer: Refactor the timer
wheel" patch set which made its way into the -TIP tree. With this
changes we get NOHZ_FULL working. Finally.
- Avoid warning of an unused symbol in the !RT case
(preemptible_lazy())
- Replace the "trace event preempt count" fixup with Steven's version.
Known issues
- CPU hotplug got a little better but can deadlock.
The delta patch against 4.6.4-rt7 is appended below and can be found here:
https://cdn.kernel.org/pub/linux/kernel/projects/rt/4.6/incr/patch-4.6.4-rt7-rt8.patch.xz
You can get this release via the git tree at:
git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v4.6.4-rt8
The RT patch against 4.6.4 can be found here:
https://cdn.kernel.org/pub/linux/kernel/projects/rt/4.6/patch-4.6.4-rt8.patch.xz
The split quilt queue is available at:
https://cdn.kernel.org/pub/linux/kernel/projects/rt/4.6/patches-4.6.4-rt8.tar.xz
Sebastian
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
42 files changed, 3731 insertions, 294 deletions
diff --git a/patches/driver-net-ethernet-tile-Initialize-timer-as-pinned.patch b/patches/driver-net-ethernet-tile-Initialize-timer-as-pinned.patch new file mode 100644 index 00000000000000..975115f8219909 --- /dev/null +++ b/patches/driver-net-ethernet-tile-Initialize-timer-as-pinned.patch @@ -0,0 +1,40 @@ +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 4 Jul 2016 09:50:19 +0000 +Subject: [PATCH 05/22] driver/net/ethernet/tile: Initialize timer as pinned + +Pinned timers must carry that attribute in the timer itself. No functional +change. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Chris Mason <clm@fb.com> +Cc: Eric Dumazet <edumazet@google.com> +Cc: rt@linutronix.de +Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> +Cc: Arjan van de Ven <arjan@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/net/ethernet/tile/tilepro.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/tile/tilepro.c ++++ b/drivers/net/ethernet/tile/tilepro.c +@@ -588,7 +588,7 @@ static bool tile_net_lepp_free_comps(str + static void tile_net_schedule_egress_timer(struct tile_net_cpu *info) + { + if (!info->egress_timer_scheduled) { +- mod_timer_pinned(&info->egress_timer, jiffies + 1); ++ mod_timer(&info->egress_timer, jiffies + 1); + info->egress_timer_scheduled = true; + } + } +@@ -1004,7 +1004,7 @@ static void tile_net_register(void *dev_ + BUG(); + + /* Initialize the egress timer. 
*/ +- init_timer(&info->egress_timer); ++ init_timer_pinned(&info->egress_timer); + info->egress_timer.data = (long)info; + info->egress_timer.function = tile_net_handle_egress_timer; + diff --git a/patches/drivers-tty-metag_da-Initialize-timer-as-pinned.patch b/patches/drivers-tty-metag_da-Initialize-timer-as-pinned.patch new file mode 100644 index 00000000000000..465ef3c020498b --- /dev/null +++ b/patches/drivers-tty-metag_da-Initialize-timer-as-pinned.patch @@ -0,0 +1,37 @@ +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 4 Jul 2016 09:50:21 +0000 +Subject: [PATCH 06/22] drivers/tty/metag_da: Initialize timer as pinned + +Pinned timers must carry that attribute in the timer itself. No functional +change. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Chris Mason <clm@fb.com> +Cc: Eric Dumazet <edumazet@google.com> +Cc: rt@linutronix.de +Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> +Cc: Arjan van de Ven <arjan@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/tty/metag_da.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/tty/metag_da.c ++++ b/drivers/tty/metag_da.c +@@ -323,12 +323,12 @@ static void dashtty_timer(unsigned long + if (channel >= 0) + fetch_data(channel); + +- mod_timer_pinned(&poll_timer, jiffies + DA_TTY_POLL); ++ mod_timer(&poll_timer, jiffies + DA_TTY_POLL); + } + + static void add_poll_timer(struct timer_list *poll_timer) + { +- setup_timer(poll_timer, dashtty_timer, 0); ++ setup_pinned_timer(poll_timer, dashtty_timer, 0); + poll_timer->expires = jiffies + DA_TTY_POLL; + + /* diff --git a/patches/drivers-tty-mips_ejtag-Initialize-timer-as-pinned.patch b/patches/drivers-tty-mips_ejtag-Initialize-timer-as-pinned.patch new file mode 100644 index 00000000000000..010ddaecf79a61 --- /dev/null +++ 
b/patches/drivers-tty-mips_ejtag-Initialize-timer-as-pinned.patch @@ -0,0 +1,40 @@ +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 4 Jul 2016 09:50:22 +0000 +Subject: [PATCH 07/22] drivers/tty/mips_ejtag: Initialize timer as pinned + +Pinned timers must carry that attribute in the timer itself. No functional +change. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Chris Mason <clm@fb.com> +Cc: Eric Dumazet <edumazet@google.com> +Cc: rt@linutronix.de +Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> +Cc: Arjan van de Ven <arjan@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/tty/mips_ejtag_fdc.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/tty/mips_ejtag_fdc.c ++++ b/drivers/tty/mips_ejtag_fdc.c +@@ -689,7 +689,7 @@ static void mips_ejtag_fdc_tty_timer(uns + + mips_ejtag_fdc_handle(priv); + if (!priv->removing) +- mod_timer_pinned(&priv->poll_timer, jiffies + FDC_TTY_POLL); ++ mod_timer(&priv->poll_timer, jiffies + FDC_TTY_POLL); + } + + /* TTY Port operations */ +@@ -1002,7 +1002,7 @@ static int mips_ejtag_fdc_tty_probe(stru + raw_spin_unlock_irq(&priv->lock); + } else { + /* If we didn't get an usable IRQ, poll instead */ +- setup_timer(&priv->poll_timer, mips_ejtag_fdc_tty_timer, ++ setup_pinned_timer(&priv->poll_timer, mips_ejtag_fdc_tty_timer, + (unsigned long)priv); + priv->poll_timer.expires = jiffies + FDC_TTY_POLL; + /* diff --git a/patches/drm-i915-Use-consistent-forcewake-auto-release-timeo.patch b/patches/drm-i915-Use-consistent-forcewake-auto-release-timeo.patch new file mode 100644 index 00000000000000..0f2cdf98c7ca31 --- /dev/null +++ b/patches/drm-i915-Use-consistent-forcewake-auto-release-timeo.patch @@ -0,0 +1,151 @@ +From: Tvrtko Ursulin <tvrtko.ursulin@intel.com> +Date: Thu, 7 Apr 2016 17:04:32 +0100 +Subject: [PATCH] drm/i915: Use consistent 
forcewake auto-release timeout + across kernel configs + +Upstream commit fde61b596b994195b9dd83feb325df95d99702ce + +Because it is based on jiffies, current implementation releases the +forcewake at any time between straight away and between 1ms and 10ms, +depending on the kernel configuration (CONFIG_HZ). + +This is probably not what has been desired, since the dynamics of keeping +parts of the GPU awake should not be correlated with this kernel +configuration parameter. + +Change the auto-release mechanism to use hrtimers and set the timeout to +1ms with a 1ms of slack. This should make the GPU power consistent +across kernel configs, and timer slack should enable some timer coalescing +where multiple force-wake domains exist, or with unrelated timers. + +For GlBench/T-Rex this decreases the number of forcewake releases from +~480 to ~300 per second, and for a heavy combined OGL/OCL test from +~670 to ~360 (HZ=1000 kernel). + +Even though this reduction can be attributed to the average release period +extending from 0-1ms to 1-2ms, as discussed above, it will make the +forcewake timeout consistent for different CONFIG_HZ values. + +Real life measurements with the above workload has shown that, with this +patch, both manage to auto-release the forcewake between 2-4 times per +10ms, even though the number of forcewake gets is dramatically different. + +T-Rex requests between 5-10 explicit gets and 5-10 implict gets in each +10ms period, while the OGL/OCL test requests 250 and 380 times in the same +period. + +The two data points together suggest that the nature of the forwake +accesses is bursty and that further changes and potential timeout +extensions, or moving the start of timeout from the first to the last +automatic forcewake grab, should be carefully measured for power and +performance effects. + +v2: + * Commit spelling. (Dave Gordon) + * More discussion on numbers in the commit. 
(Chris Wilson) + +Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> +Reviewed-by: Dave Gordon <david.s.gordon@intel.com> +Cc: Chris Wilson <chris@chris-wilson.co.uk> +Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/gpu/drm/i915/i915_drv.h | 2 +- + drivers/gpu/drm/i915/intel_uncore.c | 25 ++++++++++++++++--------- + 2 files changed, 17 insertions(+), 10 deletions(-) + +--- a/drivers/gpu/drm/i915/i915_drv.h ++++ b/drivers/gpu/drm/i915/i915_drv.h +@@ -714,7 +714,7 @@ struct intel_uncore { + struct drm_i915_private *i915; + enum forcewake_domain_id id; + unsigned wake_count; +- struct timer_list timer; ++ struct hrtimer timer; + i915_reg_t reg_set; + u32 val_set; + u32 val_clear; +--- a/drivers/gpu/drm/i915/intel_uncore.c ++++ b/drivers/gpu/drm/i915/intel_uncore.c +@@ -60,7 +60,11 @@ fw_domain_reset(const struct intel_uncor + static inline void + fw_domain_arm_timer(struct intel_uncore_forcewake_domain *d) + { +- mod_timer_pinned(&d->timer, jiffies + 1); ++ d->wake_count++; ++ hrtimer_start_range_ns(&d->timer, ++ ktime_set(0, NSEC_PER_MSEC), ++ NSEC_PER_MSEC, ++ HRTIMER_MODE_REL); + } + + static inline void +@@ -224,9 +228,11 @@ static int __gen6_gt_wait_for_fifo(struc + return ret; + } + +-static void intel_uncore_fw_release_timer(unsigned long arg) ++static enum hrtimer_restart ++intel_uncore_fw_release_timer(struct hrtimer *timer) + { +- struct intel_uncore_forcewake_domain *domain = (void *)arg; ++ struct intel_uncore_forcewake_domain *domain = ++ container_of(timer, struct intel_uncore_forcewake_domain, timer); + unsigned long irqflags; + + assert_rpm_device_not_suspended(domain->i915); +@@ -240,6 +246,8 @@ static void intel_uncore_fw_release_time + 1 << domain->id); + + spin_unlock_irqrestore(&domain->i915->uncore.lock, irqflags); ++ ++ return HRTIMER_NORESTART; + } + + void intel_uncore_forcewake_reset(struct drm_device *dev, bool restore) +@@ -259,16 +267,16 @@ void 
intel_uncore_forcewake_reset(struct + active_domains = 0; + + for_each_fw_domain(domain, dev_priv, id) { +- if (del_timer_sync(&domain->timer) == 0) ++ if (hrtimer_cancel(&domain->timer) == 0) + continue; + +- intel_uncore_fw_release_timer((unsigned long)domain); ++ intel_uncore_fw_release_timer(&domain->timer); + } + + spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); + + for_each_fw_domain(domain, dev_priv, id) { +- if (timer_pending(&domain->timer)) ++ if (hrtimer_active(&domain->timer)) + active_domains |= (1 << id); + } + +@@ -491,7 +499,6 @@ static void __intel_uncore_forcewake_put + if (--domain->wake_count) + continue; + +- domain->wake_count++; + fw_domain_arm_timer(domain); + } + } +@@ -732,7 +739,6 @@ static inline void __force_wake_get(stru + continue; + } + +- domain->wake_count++; + fw_domain_arm_timer(domain); + } + +@@ -1150,7 +1156,8 @@ static void fw_domain_init(struct drm_i9 + d->i915 = dev_priv; + d->id = domain_id; + +- setup_timer(&d->timer, intel_uncore_fw_release_timer, (unsigned long)d); ++ hrtimer_init(&d->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ d->timer.function = intel_uncore_fw_release_timer; + + dev_priv->uncore.fw_domains |= (1 << domain_id); + diff --git a/patches/hlist-Add-hlist_is_singular_node-helper.patch b/patches/hlist-Add-hlist_is_singular_node-helper.patch new file mode 100644 index 00000000000000..f81ebe5c39ff81 --- /dev/null +++ b/patches/hlist-Add-hlist_is_singular_node-helper.patch @@ -0,0 +1,38 @@ +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 4 Jul 2016 09:50:27 +0000 +Subject: [PATCH 11/22] hlist: Add hlist_is_singular_node() helper + +Required to figure out whether the entry is the only one in the hlist. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Chris Mason <clm@fb.com> +Cc: Eric Dumazet <edumazet@google.com> +Cc: rt@linutronix.de +Cc: "Paul E. 
McKenney" <paulmck@linux.vnet.ibm.com> +Cc: Arjan van de Ven <arjan@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/list.h | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +--- a/include/linux/list.h ++++ b/include/linux/list.h +@@ -679,6 +679,16 @@ static inline bool hlist_fake(struct hli + } + + /* ++ * Check whether the node is the only node of the head without ++ * accessing head. ++ */ ++static inline bool hlist_is_singular_node(struct hlist_node *n, ++ struct hlist_head *h) ++{ ++ return !n->next && n->pprev == &h->first; ++} ++ ++/* + * Move a list from one list head to another. Fixup the pprev + * reference of the first entry if it exists. + */ diff --git a/patches/hrtimer-fixup-hrtimer-callback-changes-for-preempt-r.patch b/patches/hrtimer-fixup-hrtimer-callback-changes-for-preempt-r.patch index 2698c67604bc9a..5b432c637667c0 100644 --- a/patches/hrtimer-fixup-hrtimer-callback-changes-for-preempt-r.patch +++ b/patches/hrtimer-fixup-hrtimer-callback-changes-for-preempt-r.patch @@ -315,7 +315,7 @@ Signed-off-by: Ingo Molnar <mingo@elte.hu> /** --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c -@@ -1213,6 +1213,7 @@ void tick_setup_sched_timer(void) +@@ -1194,6 +1194,7 @@ void tick_setup_sched_timer(void) * Emulate tick processing via per-CPU hrtimers: */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); diff --git a/patches/irqwork-Move-irq-safe-work-to-irq-context.patch b/patches/irqwork-Move-irq-safe-work-to-irq-context.patch index 8bd359a81417fe..9254680a86df3e 100644 --- a/patches/irqwork-Move-irq-safe-work-to-irq-context.patch +++ b/patches/irqwork-Move-irq-safe-work-to-irq-context.patch @@ -55,7 +55,7 @@ Cc: stable-rt@vger.kernel.org * Synchronize against the irq_work @entry, ensures the entry is not --- a/kernel/time/timer.c +++ b/kernel/time/timer.c -@@ -1484,7 +1484,7 @@ void update_process_times(int user_tick) +@@ -1642,7 +1642,7 @@ void update_process_times(int 
user_tick) scheduler_tick(); run_local_timers(); rcu_check_callbacks(user_tick); @@ -64,14 +64,14 @@ Cc: stable-rt@vger.kernel.org if (in_irq()) irq_work_tick(); #endif -@@ -1498,9 +1498,7 @@ static void run_timer_softirq(struct sof +@@ -1682,9 +1682,7 @@ static void run_timer_softirq(struct sof { - struct tvec_base *base = this_cpu_ptr(&tvec_bases); + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); -#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL) - irq_work_tick(); -#endif + irq_work_tick_soft(); - if (time_after_eq(jiffies, base->timer_jiffies)) - __run_timers(base); + __run_timers(base); + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) diff --git a/patches/irqwork-push_most_work_into_softirq_context.patch b/patches/irqwork-push_most_work_into_softirq_context.patch index b79d7896641c6b..7bb75c907e78bb 100644 --- a/patches/irqwork-push_most_work_into_softirq_context.patch +++ b/patches/irqwork-push_most_work_into_softirq_context.patch @@ -163,7 +163,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> /* --- a/kernel/time/timer.c +++ b/kernel/time/timer.c -@@ -1484,7 +1484,7 @@ void update_process_times(int user_tick) +@@ -1642,7 +1642,7 @@ void update_process_times(int user_tick) scheduler_tick(); run_local_timers(); rcu_check_callbacks(user_tick); @@ -172,14 +172,14 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> if (in_irq()) irq_work_tick(); #endif -@@ -1498,6 +1498,10 @@ static void run_timer_softirq(struct sof +@@ -1682,6 +1682,10 @@ static void run_timer_softirq(struct sof { - struct tvec_base *base = this_cpu_ptr(&tvec_bases); + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL) + irq_work_tick(); +#endif + - if (time_after_eq(jiffies, base->timer_jiffies)) - __run_timers(base); - } + __run_timers(base); + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) + 
__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); diff --git a/patches/localversion.patch b/patches/localversion.patch index bbb08330835de1..68c7b973cc487c 100644 --- a/patches/localversion.patch +++ b/patches/localversion.patch @@ -10,4 +10,4 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- /dev/null +++ b/localversion-rt @@ -0,0 +1 @@ -+-rt7 ++-rt8 diff --git a/patches/mm-zsmalloc-Use-get-put_cpu_light-in-zs_map_object-z.patch b/patches/mm-zsmalloc-Use-get-put_cpu_light-in-zs_map_object-z.patch index 5db2c5898b05f6..d3b55e4c8a67a6 100644 --- a/patches/mm-zsmalloc-Use-get-put_cpu_light-in-zs_map_object-z.patch +++ b/patches/mm-zsmalloc-Use-get-put_cpu_light-in-zs_map_object-z.patch @@ -1,4 +1,3 @@ -From 1fd1b32ad881496d3a3b4caac77965555cc021b0 Mon Sep 17 00:00:00 2001 From: Mike Galbraith <umgwanakikbuti@gmail.com> Date: Tue, 22 Mar 2016 11:16:09 +0100 Subject: [PATCH] mm/zsmalloc: Use get/put_cpu_light in diff --git a/patches/net-ipv4-inet-Initialize-timers-as-pinned.patch b/patches/net-ipv4-inet-Initialize-timers-as-pinned.patch new file mode 100644 index 00000000000000..be993ee580c273 --- /dev/null +++ b/patches/net-ipv4-inet-Initialize-timers-as-pinned.patch @@ -0,0 +1,65 @@ +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 4 Jul 2016 09:50:23 +0000 +Subject: [PATCH 08/22] net/ipv4/inet: Initialize timers as pinned + +Pinned timers must carry that attribute in the timer itself. No functional +change. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Chris Mason <clm@fb.com> +Cc: Eric Dumazet <edumazet@google.com> +Cc: rt@linutronix.de +Cc: "Paul E. 
McKenney" <paulmck@linux.vnet.ibm.com> +Cc: Arjan van de Ven <arjan@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + net/ipv4/inet_connection_sock.c | 7 ++++--- + net/ipv4/inet_timewait_sock.c | 5 +++-- + 2 files changed, 7 insertions(+), 5 deletions(-) + +--- a/net/ipv4/inet_connection_sock.c ++++ b/net/ipv4/inet_connection_sock.c +@@ -603,7 +603,7 @@ static void reqsk_timer_handler(unsigned + if (req->num_timeout++ == 0) + atomic_dec(&queue->young); + timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); +- mod_timer_pinned(&req->rsk_timer, jiffies + timeo); ++ mod_timer(&req->rsk_timer, jiffies + timeo); + return; + } + drop: +@@ -617,8 +617,9 @@ static void reqsk_queue_hash_req(struct + req->num_timeout = 0; + req->sk = NULL; + +- setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); +- mod_timer_pinned(&req->rsk_timer, jiffies + timeout); ++ setup_pinned_timer(&req->rsk_timer, reqsk_timer_handler, ++ (unsigned long)req); ++ mod_timer(&req->rsk_timer, jiffies + timeout); + + inet_ehash_insert(req_to_sk(req), NULL); + /* before letting lookups find us, make sure all req fields +--- a/net/ipv4/inet_timewait_sock.c ++++ b/net/ipv4/inet_timewait_sock.c +@@ -188,7 +188,8 @@ struct inet_timewait_sock *inet_twsk_all + tw->tw_prot = sk->sk_prot_creator; + atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); + twsk_net_set(tw, sock_net(sk)); +- setup_timer(&tw->tw_timer, tw_timer_handler, (unsigned long)tw); ++ setup_pinned_timer(&tw->tw_timer, tw_timer_handler, ++ (unsigned long)tw); + /* + * Because we use RCU lookups, we should not set tw_refcnt + * to a non null value before everything is setup for this +@@ -248,7 +249,7 @@ void __inet_twsk_schedule(struct inet_ti + + tw->tw_kill = timeo <= 4*HZ; + if (!rearm) { +- BUG_ON(mod_timer_pinned(&tw->tw_timer, jiffies + timeo)); ++ BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo)); + atomic_inc(&tw->tw_dr->tw_count); + } else { + 
mod_timer_pending(&tw->tw_timer, jiffies + timeo); diff --git a/patches/sched-lazy_preempt-avoid-a-warning-in-the-RT-case.patch b/patches/sched-lazy_preempt-avoid-a-warning-in-the-RT-case.patch new file mode 100644 index 00000000000000..0223da8789643d --- /dev/null +++ b/patches/sched-lazy_preempt-avoid-a-warning-in-the-RT-case.patch @@ -0,0 +1,20 @@ +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Thu, 14 Jul 2016 14:57:07 +0200 +Subject: [PATCH] sched: lazy_preempt: avoid a warning in the !RT case + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/sched/core.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -3518,7 +3518,7 @@ static __always_inline int preemptible_l + + #else + +-static int preemptible_lazy(void) ++static inline int preemptible_lazy(void) + { + return 1; + } diff --git a/patches/series b/patches/series index 04677f6bdd2cbe..1ba1075f1418b0 100644 --- a/patches/series +++ b/patches/series @@ -6,10 +6,38 @@ # UPSTREAM changes queued ############################################################ ARM-imx-always-use-TWD-on-IMX6Q.patch +tracing-Show-the-preempt-count-of-when-the-event-was.patch + +# timer: "Refactor the timer wheel v4' + 2 prerequisites +drm-i915-Use-consistent-forcewake-auto-release-timeo.patch +timer-add-setup_deferrable_timer-macro.patch + +timer-Make-pinned-a-timer-property.patch +x86-apic-uv-Initialize-timer-as-pinned.patch +x86-mce-Initialize-timer-as-pinned.patch +driver-net-ethernet-tile-Initialize-timer-as-pinned.patch +drivers-tty-metag_da-Initialize-timer-as-pinned.patch +drivers-tty-mips_ejtag-Initialize-timer-as-pinned.patch +net-ipv4-inet-Initialize-timers-as-pinned.patch +timer-Remove-mod_timer_pinned.patch +signal-Use-hrtimer-for-sigtimedwait.patch +hlist-Add-hlist_is_singular_node-helper.patch +timer-Give-a-few-structs-and-members-proper-names.patch +timer-Reduce-the-CPU-index-space-to-256k.patch 
+timer-Switch-to-a-non-cascading-wheel.patch +timer-Remove-slack-leftovers.patch +timer-Move-__run_timers-function.patch +timer-Optimize-collect-timers-for-NOHZ.patch +tick-sched-Remove-pointless-empty-function.patch +timer-Forward-wheel-clock-whenever-possible.patch +timer-Only-wake-softirq-if-necessary.patch +timer-Split-out-index-calculation.patch +timer-Optimization-for-same-expiry-time-in-mod_timer.patch ############################################################ # UPSTREAM FIXES, patches pending ############################################################ +timer-make-the-base-lock-raw.patch ############################################################ # Stuff broken upstream, patches submitted @@ -74,7 +102,6 @@ kernel-SRCU-provide-a-static-initializer.patch ############################################################ # Stuff which should go upstream ASAP ############################################################ -trace-correct-off-by-one-while-recording-the-trace-e.patch # SCHED BLOCK/WQ block-shorten-interrupt-disabled-regions.patch @@ -262,11 +289,7 @@ relay-fix-timer-madness.patch # TIMERS timers-prepare-for-full-preemption.patch -timers-wakeup-all-timer-waiters.patch -timers-wakeup-all-timer-waiters-without-holding-the-.patch -timers-preempt-rt-support.patch timer-delay-waking-softirqs-from-the-jiffy-tick.patch -timers-avoid-the-base-null-otptimization-on-rt.patch # HRTIMERS hrtimers-prepare-full-preemption.patch @@ -556,6 +579,7 @@ rcu-make-RCU_BOOST-default-on-RT.patch # PREEMPT LAZY preempt-lazy-support.patch preempt-lazy-check-preempt_schedule.patch +sched-lazy_preempt-avoid-a-warning-in-the-RT-case.patch x86-preempt-lazy.patch arm-preempt-lazy-support.patch arm-lazy-preempt-correct-resched-condition.patch diff --git a/patches/signal-Use-hrtimer-for-sigtimedwait.patch b/patches/signal-Use-hrtimer-for-sigtimedwait.patch new file mode 100644 index 00000000000000..8d2f263aa91a75 --- /dev/null +++ b/patches/signal-Use-hrtimer-for-sigtimedwait.patch @@ 
-0,0 +1,77 @@ +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 4 Jul 2016 09:50:25 +0000 +Subject: [PATCH 10/22] signal: Use hrtimer for sigtimedwait + +We've converted most timeout related syscalls to hrtimers. sigtimedwait() did +not get this treatment. Convert it so we get a reasonable accuracy and remove +the user space exposure to the timer wheel properties. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com> +Cc: Cyril Hrubis <chrubis@suse.cz> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/signal.c | 24 ++++++++++-------------- + 1 file changed, 10 insertions(+), 14 deletions(-) + +--- a/kernel/signal.c ++++ b/kernel/signal.c +@@ -2751,23 +2751,18 @@ int copy_siginfo_to_user(siginfo_t __use + * @ts: upper bound on process time suspension + */ + int do_sigtimedwait(const sigset_t *which, siginfo_t *info, +- const struct timespec *ts) ++ const struct timespec *ts) + { ++ ktime_t *to = NULL, timeout = { .tv64 = KTIME_MAX }; + struct task_struct *tsk = current; +- long timeout = MAX_SCHEDULE_TIMEOUT; + sigset_t mask = *which; +- int sig; ++ int sig, ret = 0; + + if (ts) { + if (!timespec_valid(ts)) + return -EINVAL; +- timeout = timespec_to_jiffies(ts); +- /* +- * We can be close to the next tick, add another one +- * to ensure we will wait at least the time asked for. 
+- */ +- if (ts->tv_sec || ts->tv_nsec) +- timeout++; ++ timeout = timespec_to_ktime(*ts); ++ to = &timeout; + } + + /* +@@ -2778,7 +2773,7 @@ int do_sigtimedwait(const sigset_t *whic + + spin_lock_irq(&tsk->sighand->siglock); + sig = dequeue_signal(tsk, &mask, info); +- if (!sig && timeout) { ++ if (!sig && timeout.tv64) { + /* + * None ready, temporarily unblock those we're interested + * while we are sleeping in so that we'll be awakened when +@@ -2790,8 +2785,9 @@ int do_sigtimedwait(const sigset_t *whic + recalc_sigpending(); + spin_unlock_irq(&tsk->sighand->siglock); + +- timeout = freezable_schedule_timeout_interruptible(timeout); +- ++ __set_current_state(TASK_INTERRUPTIBLE); ++ ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns, ++ HRTIMER_MODE_REL); + spin_lock_irq(&tsk->sighand->siglock); + __set_task_blocked(tsk, &tsk->real_blocked); + sigemptyset(&tsk->real_blocked); +@@ -2801,7 +2797,7 @@ int do_sigtimedwait(const sigset_t *whic + + if (sig) + return sig; +- return timeout ? -EINTR : -EAGAIN; ++ return ret ? 
-EINTR : -EAGAIN; + } + + /** diff --git a/patches/softirq-split-locks.patch b/patches/softirq-split-locks.patch index 5bc8e60f001aee..2d06ad6e540632 100644 --- a/patches/softirq-split-locks.patch +++ b/patches/softirq-split-locks.patch @@ -785,7 +785,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> .thread_comm = "ksoftirqd/%u", --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c -@@ -866,14 +866,7 @@ static bool can_stop_idle_tick(int cpu, +@@ -878,14 +878,7 @@ static bool can_stop_idle_tick(int cpu, return false; if (unlikely(local_softirq_pending() && cpu_online(cpu))) { diff --git a/patches/tick-sched-Remove-pointless-empty-function.patch b/patches/tick-sched-Remove-pointless-empty-function.patch new file mode 100644 index 00000000000000..8dcbae1957996e --- /dev/null +++ b/patches/tick-sched-Remove-pointless-empty-function.patch @@ -0,0 +1,70 @@ +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 4 Jul 2016 09:50:35 +0000 +Subject: [PATCH 18/22] tick/sched: Remove pointless empty function + +This was a failed attempt to optimize the timer expiry in idle, which was +disabled and never revisited. Remove the cruft. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Frederic Weisbecker <fweisbec@gmail.com> +Cc: Chris Mason <clm@fb.com> +Cc: Eric Dumazet <edumazet@google.com> +Cc: rt@linutronix.de +Cc: "Paul E. 
McKenney" <paulmck@linux.vnet.ibm.com> +Cc: Arjan van de Ven <arjan@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/time/tick-sched.c | 33 +-------------------------------- + 1 file changed, 1 insertion(+), 32 deletions(-) + +--- a/kernel/time/tick-sched.c ++++ b/kernel/time/tick-sched.c +@@ -1091,35 +1091,6 @@ static void tick_nohz_switch_to_nohz(voi + tick_nohz_activate(ts, NOHZ_MODE_LOWRES); + } + +-/* +- * When NOHZ is enabled and the tick is stopped, we need to kick the +- * tick timer from irq_enter() so that the jiffies update is kept +- * alive during long running softirqs. That's ugly as hell, but +- * correctness is key even if we need to fix the offending softirq in +- * the first place. +- * +- * Note, this is different to tick_nohz_restart. We just kick the +- * timer and do not touch the other magic bits which need to be done +- * when idle is left. +- */ +-static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now) +-{ +-#if 0 +- /* Switch back to 2.6.27 behaviour */ +- ktime_t delta; +- +- /* +- * Do not touch the tick device, when the next expiry is either +- * already reached or less/equal than the tick period. 
+- */ +- delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); +- if (delta.tv64 <= tick_period.tv64) +- return; +- +- tick_nohz_restart(ts, now); +-#endif +-} +- + static inline void tick_nohz_irq_enter(void) + { + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); +@@ -1130,10 +1101,8 @@ static inline void tick_nohz_irq_enter(v + now = ktime_get(); + if (ts->idle_active) + tick_nohz_stop_idle(ts, now); +- if (ts->tick_stopped) { ++ if (ts->tick_stopped) + tick_nohz_update_jiffies(now); +- tick_nohz_kick_tick(ts, now); +- } + } + + #else diff --git a/patches/timer-Forward-wheel-clock-whenever-possible.patch b/patches/timer-Forward-wheel-clock-whenever-possible.patch new file mode 100644 index 00000000000000..f5b1dfe5d8c7dd --- /dev/null +++ b/patches/timer-Forward-wheel-clock-whenever-possible.patch @@ -0,0 +1,240 @@ +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 4 Jul 2016 09:50:36 +0000 +Subject: [PATCH 19/22] timer: Forward wheel clock whenever possible + +The wheel clock is stale when a cpu goes into a long idle sleep. This has the +side effect, that timers which are queued end up in the outer wheel +levels. That results in coarser granularity. + +To solve this, we keep track of the idle state and forward the wheel clock +whenever it's possible. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Frederic Weisbecker <fweisbec@gmail.com> +Cc: Chris Mason <clm@fb.com> +Cc: Eric Dumazet <edumazet@google.com> +Cc: rt@linutronix.de +Cc: "Paul E. 
McKenney" <paulmck@linux.vnet.ibm.com> +Cc: Arjan van de Ven <arjan@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/time/tick-internal.h | 1 + kernel/time/tick-sched.c | 12 ++++ + kernel/time/timer.c | 128 ++++++++++++++++++++++++++++++++++++-------- + 3 files changed, 120 insertions(+), 21 deletions(-) + +--- a/kernel/time/tick-internal.h ++++ b/kernel/time/tick-internal.h +@@ -164,3 +164,4 @@ static inline void timers_update_migrati + DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); + + extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); ++void timer_clear_idle(void); +--- a/kernel/time/tick-sched.c ++++ b/kernel/time/tick-sched.c +@@ -700,6 +700,12 @@ static ktime_t tick_nohz_stop_sched_tick + delta = next_tick - basemono; + if (delta <= (u64)TICK_NSEC) { + tick.tv64 = 0; ++ ++ /* ++ * Tell the timer code that the base is not idle, i.e. undo ++ * the effect of get_next_timer_interrupt(). ++ */ ++ timer_clear_idle(); + /* + * We've not stopped the tick yet, and there's a timer in the + * next period, so no point in stopping it either, bail. +@@ -808,6 +814,12 @@ static void tick_nohz_restart_sched_tick + tick_do_update_jiffies64(now); + update_cpu_load_nohz(active); + ++ /* ++ * Clear the timer idle flag, so we avoid IPIs on remote queueing and ++ * the clock forward checks in the enqueue path. 
++ */ ++ timer_clear_idle(); ++ + calc_load_exit_idle(); + touch_softlockup_watchdog_sched(); + /* +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -196,9 +196,11 @@ struct timer_base { + spinlock_t lock; + struct timer_list *running_timer; + unsigned long clk; ++ unsigned long next_expiry; + unsigned int cpu; + bool migration_enabled; + bool nohz_active; ++ bool is_idle; + DECLARE_BITMAP(pending_map, WHEEL_SIZE); + struct hlist_head vectors[WHEEL_SIZE]; + } ____cacheline_aligned; +@@ -519,24 +521,37 @@ static void internal_add_timer(struct ti + { + __internal_add_timer(base, timer); + ++ if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) ++ return; ++ + /* +- * Check whether the other CPU is in dynticks mode and needs +- * to be triggered to reevaluate the timer wheel. We are +- * protected against the other CPU fiddling with the timer by +- * holding the timer base lock. This also makes sure that a +- * CPU on the way to stop its tick can not evaluate the timer +- * wheel. +- * +- * Spare the IPI for deferrable timers on idle targets though. +- * The next busy ticks will take care of it. Except full dynticks +- * require special care against races with idle_cpu(), lets deal +- * with that later. +- */ +- if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) { +- if (!(timer->flags & TIMER_DEFERRABLE) || +- tick_nohz_full_cpu(base->cpu)) ++ * This wants some optimizing similar to the below, but we do that ++ * when we switch from push to pull for deferrable timers. ++ */ ++ if (timer->flags & TIMER_DEFERRABLE) { ++ if (tick_nohz_full_cpu(base->cpu)) + wake_up_nohz_cpu(base->cpu); ++ return; + } ++ ++ /* ++ * We might have to IPI the remote CPU if the base is idle and the ++ * timer is not deferrable. If the other cpu is on the way to idle ++ * then it can't set base->is_idle as we hold base lock. 
++ */ ++ if (!base->is_idle) ++ return; ++ ++ /* Check whether this is the new first expiring timer */ ++ if (time_after_eq(timer->expires, base->next_expiry)) ++ return; ++ ++ /* ++ * Set the next expiry time and kick the cpu so it can reevaluate the ++ * wheel ++ */ ++ base->next_expiry = timer->expires; ++ wake_up_nohz_cpu(base->cpu); + } + + #ifdef CONFIG_TIMER_STATS +@@ -859,10 +874,11 @@ static inline struct timer_base *get_tim + return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); + } + +-static inline struct timer_base *get_target_base(struct timer_base *base, +- unsigned tflags) ++#ifdef CONFIG_NO_HZ_COMMON ++static inline struct timer_base *__get_target_base(struct timer_base *base, ++ unsigned tflags) + { +-#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) ++#ifdef CONFIG_SMP + if ((tflags & TIMER_PINNED) || !base->migration_enabled) + return get_timer_this_cpu_base(tflags); + return get_timer_cpu_base(tflags, get_nohz_timer_target()); +@@ -871,6 +887,43 @@ static inline struct timer_base *get_tar + #endif + } + ++static inline void forward_timer_base(struct timer_base *base) ++{ ++ /* ++ * We only forward the base when it's idle and we have a delta between ++ * base clock and jiffies. ++ */ ++ if (!base->is_idle || (long) (jiffies - base->clk) < 2) ++ return; ++ ++ /* ++ * If the next expiry value is > jiffies, then we fast forward to ++ * jiffies otherwise we forward to the next expiry value. 
++ */ ++ if (time_after(base->next_expiry, jiffies)) ++ base->clk = jiffies; ++ else ++ base->clk = base->next_expiry; ++} ++#else ++static inline struct timer_base *__get_target_base(struct timer_base *base, ++ unsigned tflags) ++{ ++ return get_timer_this_cpu_base(tflags); ++} ++ ++static inline void forward_timer_base(struct timer_base *base) { } ++#endif ++ ++static inline struct timer_base *get_target_base(struct timer_base *base, ++ unsigned tflags) ++{ ++ struct timer_base *target = __get_target_base(base, tflags); ++ ++ forward_timer_base(target); ++ return target; ++} ++ + /* + * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means + * that all timers which are tied to this base are locked, and the base itself +@@ -1432,16 +1485,49 @@ u64 get_next_timer_interrupt(unsigned lo + + spin_lock(&base->lock); + nextevt = __next_timer_interrupt(base); +- spin_unlock(&base->lock); ++ base->next_expiry = nextevt; ++ /* ++ * We have a fresh next event. Check whether we can forward the base. ++ */ ++ if (time_after(nextevt, jiffies)) ++ base->clk = jiffies; ++ else if (time_after(nextevt, base->clk)) ++ base->clk = nextevt; + +- if (time_before_eq(nextevt, basej)) ++ if (time_before_eq(nextevt, basej)) { + expires = basem; +- else ++ base->is_idle = false; ++ } else { + expires = basem + (nextevt - basej) * TICK_NSEC; ++ /* ++ * If we expect to sleep more than a tick, mark the base idle. ++ */ ++ if ((expires - basem) > TICK_NSEC) ++ base->is_idle = true; ++ } ++ spin_unlock(&base->lock); + + return cmp_next_hrtimer_event(basem, expires); + } + ++/** ++ * timer_clear_idle - Clear the idle state of the timer base ++ * ++ * Called with interrupts disabled ++ */ ++void timer_clear_idle(void) ++{ ++ struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); ++ ++ /* ++ * We do this unlocked. 
The worst outcome is a remote enqueue sending ++ * a pointless IPI, but taking the lock would just make the window for ++ * sending the IPI a few instructions smaller for the cost of taking ++ * the lock in the exit from idle path. ++ */ ++ base->is_idle = false; ++} ++ + static int collect_expired_timers(struct timer_base *base, + struct hlist_head *heads) + { diff --git a/patches/timer-Give-a-few-structs-and-members-proper-names.patch b/patches/timer-Give-a-few-structs-and-members-proper-names.patch new file mode 100644 index 00000000000000..23e201d24177fe --- /dev/null +++ b/patches/timer-Give-a-few-structs-and-members-proper-names.patch @@ -0,0 +1,421 @@ +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 4 Jul 2016 09:50:28 +0000 +Subject: [PATCH 12/22] timer: Give a few structs and members proper names + +Some of the names are not longer correct and others are simply too long to +type. Clean it up before we switch the wheel implementation over to the new +scheme. + +No functional change. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Chris Mason <clm@fb.com> +Cc: Eric Dumazet <edumazet@google.com> +Cc: rt@linutronix.de +Cc: "Paul E. 
McKenney" <paulmck@linux.vnet.ibm.com> +Cc: Arjan van de Ven <arjan@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/time/timer.c | 118 ++++++++++++++++++++++++++-------------------------- + 1 file changed, 59 insertions(+), 59 deletions(-) + +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -77,10 +77,10 @@ struct tvec_root { + struct hlist_head vec[TVR_SIZE]; + }; + +-struct tvec_base { ++struct timer_base { + spinlock_t lock; + struct timer_list *running_timer; +- unsigned long timer_jiffies; ++ unsigned long clk; + unsigned long next_timer; + unsigned long active_timers; + unsigned long all_timers; +@@ -95,7 +95,7 @@ struct tvec_base { + } ____cacheline_aligned; + + +-static DEFINE_PER_CPU(struct tvec_base, tvec_bases); ++static DEFINE_PER_CPU(struct timer_base, timer_bases); + + #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + unsigned int sysctl_timer_migration = 1; +@@ -106,15 +106,15 @@ void timers_update_migration(bool update + unsigned int cpu; + + /* Avoid the loop, if nothing to update */ +- if (this_cpu_read(tvec_bases.migration_enabled) == on) ++ if (this_cpu_read(timer_bases.migration_enabled) == on) + return; + + for_each_possible_cpu(cpu) { +- per_cpu(tvec_bases.migration_enabled, cpu) = on; ++ per_cpu(timer_bases.migration_enabled, cpu) = on; + per_cpu(hrtimer_bases.migration_enabled, cpu) = on; + if (!update_nohz) + continue; +- per_cpu(tvec_bases.nohz_active, cpu) = true; ++ per_cpu(timer_bases.nohz_active, cpu) = true; + per_cpu(hrtimer_bases.nohz_active, cpu) = true; + } + } +@@ -134,18 +134,18 @@ int timer_migration_handler(struct ctl_t + return ret; + } + +-static inline struct tvec_base *get_target_base(struct tvec_base *base, ++static inline struct timer_base *get_target_base(struct timer_base *base, + int pinned) + { + if (pinned || !base->migration_enabled) +- return this_cpu_ptr(&tvec_bases); +- return per_cpu_ptr(&tvec_bases, get_nohz_timer_target()); ++ return 
this_cpu_ptr(&timer_bases); ++ return per_cpu_ptr(&timer_bases, get_nohz_timer_target()); + } + #else +-static inline struct tvec_base *get_target_base(struct tvec_base *base, ++static inline struct timer_base *get_target_base(struct timer_base *base, + int pinned) + { +- return this_cpu_ptr(&tvec_bases); ++ return this_cpu_ptr(&timer_bases); + } + #endif + +@@ -371,10 +371,10 @@ void set_timer_slack(struct timer_list * + EXPORT_SYMBOL_GPL(set_timer_slack); + + static void +-__internal_add_timer(struct tvec_base *base, struct timer_list *timer) ++__internal_add_timer(struct timer_base *base, struct timer_list *timer) + { + unsigned long expires = timer->expires; +- unsigned long idx = expires - base->timer_jiffies; ++ unsigned long idx = expires - base->clk; + struct hlist_head *vec; + + if (idx < TVR_SIZE) { +@@ -394,7 +394,7 @@ static void + * Can happen if you add a timer with expires == jiffies, + * or you set a timer to go off in the past + */ +- vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); ++ vec = base->tv1.vec + (base->clk & TVR_MASK); + } else { + int i; + /* If the timeout is larger than MAX_TVAL (on 64-bit +@@ -403,7 +403,7 @@ static void + */ + if (idx > MAX_TVAL) { + idx = MAX_TVAL; +- expires = idx + base->timer_jiffies; ++ expires = idx + base->clk; + } + i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; + vec = base->tv5.vec + i; +@@ -412,11 +412,11 @@ static void + hlist_add_head(&timer->entry, vec); + } + +-static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) ++static void internal_add_timer(struct timer_base *base, struct timer_list *timer) + { + /* Advance base->jiffies, if the base is empty */ + if (!base->all_timers++) +- base->timer_jiffies = jiffies; ++ base->clk = jiffies; + + __internal_add_timer(base, timer); + /* +@@ -722,7 +722,7 @@ static inline void detach_timer(struct t + } + + static inline void +-detach_expired_timer(struct timer_list *timer, struct tvec_base *base) 
++detach_expired_timer(struct timer_list *timer, struct timer_base *base) + { + detach_timer(timer, true); + if (!(timer->flags & TIMER_DEFERRABLE)) +@@ -730,7 +730,7 @@ detach_expired_timer(struct timer_list * + base->all_timers--; + } + +-static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, ++static int detach_if_pending(struct timer_list *timer, struct timer_base *base, + bool clear_pending) + { + if (!timer_pending(timer)) +@@ -740,16 +740,16 @@ static int detach_if_pending(struct time + if (!(timer->flags & TIMER_DEFERRABLE)) { + base->active_timers--; + if (timer->expires == base->next_timer) +- base->next_timer = base->timer_jiffies; ++ base->next_timer = base->clk; + } + /* If this was the last timer, advance base->jiffies */ + if (!--base->all_timers) +- base->timer_jiffies = jiffies; ++ base->clk = jiffies; + return 1; + } + + /* +- * We are using hashed locking: holding per_cpu(tvec_bases).lock ++ * We are using hashed locking: holding per_cpu(timer_bases).lock + * means that all timers which are tied to this base via timer->base are + * locked, and the base itself is locked too. 
+ * +@@ -759,16 +759,16 @@ static int detach_if_pending(struct time + * When the timer's base is locked and removed from the list, the + * TIMER_MIGRATING flag is set, FIXME + */ +-static struct tvec_base *lock_timer_base(struct timer_list *timer, ++static struct timer_base *lock_timer_base(struct timer_list *timer, + unsigned long *flags) + __acquires(timer->base->lock) + { + for (;;) { + u32 tf = timer->flags; +- struct tvec_base *base; ++ struct timer_base *base; + + if (!(tf & TIMER_MIGRATING)) { +- base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK); ++ base = per_cpu_ptr(&timer_bases, tf & TIMER_CPUMASK); + spin_lock_irqsave(&base->lock, *flags); + if (timer->flags == tf) + return base; +@@ -781,7 +781,7 @@ static struct tvec_base *lock_timer_base + static inline int + __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) + { +- struct tvec_base *base, *new_base; ++ struct timer_base *base, *new_base; + unsigned long flags; + int ret = 0; + +@@ -948,8 +948,8 @@ EXPORT_SYMBOL(add_timer); + */ + void add_timer_on(struct timer_list *timer, int cpu) + { +- struct tvec_base *new_base = per_cpu_ptr(&tvec_bases, cpu); +- struct tvec_base *base; ++ struct timer_base *new_base = per_cpu_ptr(&timer_bases, cpu); ++ struct timer_base *base; + unsigned long flags; + + timer_stats_timer_set_start_info(timer); +@@ -990,7 +990,7 @@ EXPORT_SYMBOL_GPL(add_timer_on); + */ + int del_timer(struct timer_list *timer) + { +- struct tvec_base *base; ++ struct timer_base *base; + unsigned long flags; + int ret = 0; + +@@ -1016,7 +1016,7 @@ EXPORT_SYMBOL(del_timer); + */ + int try_to_del_timer_sync(struct timer_list *timer) + { +- struct tvec_base *base; ++ struct timer_base *base; + unsigned long flags; + int ret = -1; + +@@ -1100,7 +1100,7 @@ int del_timer_sync(struct timer_list *ti + EXPORT_SYMBOL(del_timer_sync); + #endif + +-static int cascade(struct tvec_base *base, struct tvec *tv, int index) ++static int cascade(struct timer_base *base, struct tvec 
*tv, int index) + { + /* cascade all the timers from tv up one level */ + struct timer_list *timer; +@@ -1164,7 +1164,7 @@ static void call_timer_fn(struct timer_l + } + } + +-#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) ++#define INDEX(N) ((base->clk >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) + + /** + * __run_timers - run all expired timers (if any) on this CPU. +@@ -1173,23 +1173,23 @@ static void call_timer_fn(struct timer_l + * This function cascades all vectors and executes all expired timer + * vectors. + */ +-static inline void __run_timers(struct tvec_base *base) ++static inline void __run_timers(struct timer_base *base) + { + struct timer_list *timer; + + spin_lock_irq(&base->lock); + +- while (time_after_eq(jiffies, base->timer_jiffies)) { ++ while (time_after_eq(jiffies, base->clk)) { + struct hlist_head work_list; + struct hlist_head *head = &work_list; + int index; + + if (!base->all_timers) { +- base->timer_jiffies = jiffies; ++ base->clk = jiffies; + break; + } + +- index = base->timer_jiffies & TVR_MASK; ++ index = base->clk & TVR_MASK; + + /* + * Cascade timers: +@@ -1199,7 +1199,7 @@ static inline void __run_timers(struct t + (!cascade(base, &base->tv3, INDEX(1))) && + !cascade(base, &base->tv4, INDEX(2))) + cascade(base, &base->tv5, INDEX(3)); +- ++base->timer_jiffies; ++ ++base->clk; + hlist_move_list(base->tv1.vec + index, head); + while (!hlist_empty(head)) { + void (*fn)(unsigned long); +@@ -1237,16 +1237,16 @@ static inline void __run_timers(struct t + * is used on S/390 to stop all activity when a CPU is idle. + * This function needs to be called with interrupts disabled. 
+ */ +-static unsigned long __next_timer_interrupt(struct tvec_base *base) ++static unsigned long __next_timer_interrupt(struct timer_base *base) + { +- unsigned long timer_jiffies = base->timer_jiffies; +- unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; ++ unsigned long clk = base->clk; ++ unsigned long expires = clk + NEXT_TIMER_MAX_DELTA; + int index, slot, array, found = 0; + struct timer_list *nte; + struct tvec *varray[4]; + + /* Look for timer events in tv1. */ +- index = slot = timer_jiffies & TVR_MASK; ++ index = slot = clk & TVR_MASK; + do { + hlist_for_each_entry(nte, base->tv1.vec + slot, entry) { + if (nte->flags & TIMER_DEFERRABLE) +@@ -1265,8 +1265,8 @@ static unsigned long __next_timer_interr + cascade: + /* Calculate the next cascade event */ + if (index) +- timer_jiffies += TVR_SIZE - index; +- timer_jiffies >>= TVR_BITS; ++ clk += TVR_SIZE - index; ++ clk >>= TVR_BITS; + + /* Check tv2-tv5. */ + varray[0] = &base->tv2; +@@ -1277,7 +1277,7 @@ static unsigned long __next_timer_interr + for (array = 0; array < 4; array++) { + struct tvec *varp = varray[array]; + +- index = slot = timer_jiffies & TVN_MASK; ++ index = slot = clk & TVN_MASK; + do { + hlist_for_each_entry(nte, varp->vec + slot, entry) { + if (nte->flags & TIMER_DEFERRABLE) +@@ -1301,8 +1301,8 @@ static unsigned long __next_timer_interr + } while (slot != index); + + if (index) +- timer_jiffies += TVN_SIZE - index; +- timer_jiffies >>= TVN_BITS; ++ clk += TVN_SIZE - index; ++ clk >>= TVN_BITS; + } + return expires; + } +@@ -1350,7 +1350,7 @@ static u64 cmp_next_hrtimer_event(u64 ba + */ + u64 get_next_timer_interrupt(unsigned long basej, u64 basem) + { +- struct tvec_base *base = this_cpu_ptr(&tvec_bases); ++ struct timer_base *base = this_cpu_ptr(&timer_bases); + u64 expires = KTIME_MAX; + unsigned long nextevt; + +@@ -1363,7 +1363,7 @@ u64 get_next_timer_interrupt(unsigned lo + + spin_lock(&base->lock); + if (base->active_timers) { +- if (time_before_eq(base->next_timer, 
base->timer_jiffies)) ++ if (time_before_eq(base->next_timer, base->clk)) + base->next_timer = __next_timer_interrupt(base); + nextevt = base->next_timer; + if (time_before_eq(nextevt, basej)) +@@ -1402,9 +1402,9 @@ void update_process_times(int user_tick) + */ + static void run_timer_softirq(struct softirq_action *h) + { +- struct tvec_base *base = this_cpu_ptr(&tvec_bases); ++ struct timer_base *base = this_cpu_ptr(&timer_bases); + +- if (time_after_eq(jiffies, base->timer_jiffies)) ++ if (time_after_eq(jiffies, base->clk)) + __run_timers(base); + } + +@@ -1549,7 +1549,7 @@ signed long __sched schedule_timeout_idl + EXPORT_SYMBOL(schedule_timeout_idle); + + #ifdef CONFIG_HOTPLUG_CPU +-static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) ++static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head) + { + struct timer_list *timer; + int cpu = new_base->cpu; +@@ -1565,13 +1565,13 @@ static void migrate_timer_list(struct tv + + static void migrate_timers(int cpu) + { +- struct tvec_base *old_base; +- struct tvec_base *new_base; ++ struct timer_base *old_base; ++ struct timer_base *new_base; + int i; + + BUG_ON(cpu_online(cpu)); +- old_base = per_cpu_ptr(&tvec_bases, cpu); +- new_base = get_cpu_ptr(&tvec_bases); ++ old_base = per_cpu_ptr(&timer_bases, cpu); ++ new_base = get_cpu_ptr(&timer_bases); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. 
+@@ -1595,7 +1595,7 @@ static void migrate_timers(int cpu) + + spin_unlock(&old_base->lock); + spin_unlock_irq(&new_base->lock); +- put_cpu_ptr(&tvec_bases); ++ put_cpu_ptr(&timer_bases); + } + + static int timer_cpu_notify(struct notifier_block *self, +@@ -1623,13 +1623,13 @@ static inline void timer_register_cpu_no + + static void __init init_timer_cpu(int cpu) + { +- struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu); ++ struct timer_base *base = per_cpu_ptr(&timer_bases, cpu); + + base->cpu = cpu; + spin_lock_init(&base->lock); + +- base->timer_jiffies = jiffies; +- base->next_timer = base->timer_jiffies; ++ base->clk = jiffies; ++ base->next_timer = base->clk; + } + + static void __init init_timer_cpus(void) diff --git a/patches/timer-Make-pinned-a-timer-property.patch b/patches/timer-Make-pinned-a-timer-property.patch new file mode 100644 index 00000000000000..53e82de75aa237 --- /dev/null +++ b/patches/timer-Make-pinned-a-timer-property.patch @@ -0,0 +1,144 @@ +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 4 Jul 2016 09:50:15 +0000 +Subject: [PATCH 01/22] timer: Make pinned a timer property + +We want to move the timer migration from a push to a pull model. This requires +to store the pinned attribute of a timer in the timer itself. This must happen +at initialization time. + +Add the helper macros for this. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Chris Mason <clm@fb.com> +Cc: Eric Dumazet <edumazet@google.com> +Cc: rt@linutronix.de +Cc: "Paul E. 
McKenney" <paulmck@linux.vnet.ibm.com> +Cc: Arjan van de Ven <arjan@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/timer.h | 25 ++++++++++++++++++++++--- + kernel/time/timer.c | 10 +++++----- + 2 files changed, 27 insertions(+), 8 deletions(-) + +--- a/include/linux/timer.h ++++ b/include/linux/timer.h +@@ -62,7 +62,8 @@ struct timer_list { + #define TIMER_MIGRATING 0x00080000 + #define TIMER_BASEMASK (TIMER_CPUMASK | TIMER_MIGRATING) + #define TIMER_DEFERRABLE 0x00100000 +-#define TIMER_IRQSAFE 0x00200000 ++#define TIMER_PINNED 0x00200000 ++#define TIMER_IRQSAFE 0x00400000 + + #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \ + .entry = { .next = TIMER_ENTRY_STATIC }, \ +@@ -78,9 +79,15 @@ struct timer_list { + #define TIMER_INITIALIZER(_function, _expires, _data) \ + __TIMER_INITIALIZER((_function), (_expires), (_data), 0) + ++#define TIMER_PINNED_INITIALIZER(_function, _expires, _data) \ ++ __TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_PINNED) ++ + #define TIMER_DEFERRED_INITIALIZER(_function, _expires, _data) \ + __TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_DEFERRABLE) + ++#define TIMER_PINNED_DEFERRED_INITIALIZER(_function, _expires, _data) \ ++ __TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_DEFERRABLE | TIMER_PINNED) ++ + #define DEFINE_TIMER(_name, _function, _expires, _data) \ + struct timer_list _name = \ + TIMER_INITIALIZER(_function, _expires, _data) +@@ -124,8 +131,12 @@ static inline void init_timer_on_stack_k + + #define init_timer(timer) \ + __init_timer((timer), 0) ++#define init_timer_pinned(timer) \ ++ __init_timer((timer), TIMER_PINNED) + #define init_timer_deferrable(timer) \ + __init_timer((timer), TIMER_DEFERRABLE) ++#define init_timer_pinned_deferrable(timer) \ ++ __init_timer((timer), TIMER_DEFERRABLE | TIMER_PINNED) + #define init_timer_on_stack(timer) \ + __init_timer_on_stack((timer), 0) + +@@ -145,12 +156,20 @@ static inline 
void init_timer_on_stack_k + + #define setup_timer(timer, fn, data) \ + __setup_timer((timer), (fn), (data), 0) ++#define setup_pinned_timer(timer, fn, data) \ ++ __setup_timer((timer), (fn), (data), TIMER_PINNED) + #define setup_deferrable_timer(timer, fn, data) \ + __setup_timer((timer), (fn), (data), TIMER_DEFERRABLE) ++#define setup_pinned_deferrable_timer(timer, fn, data) \ ++ __setup_timer((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED) + #define setup_timer_on_stack(timer, fn, data) \ + __setup_timer_on_stack((timer), (fn), (data), 0) ++#define setup_pinned_timer_on_stack(timer, fn, data) \ ++ __setup_timer_on_stack((timer), (fn), (data), TIMER_PINNED) + #define setup_deferrable_timer_on_stack(timer, fn, data) \ + __setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE) ++#define setup_pinned_deferrable_timer_on_stack(timer, fn, data) \ ++ __setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED) + + /** + * timer_pending - is a timer pending? +@@ -175,8 +194,8 @@ extern int mod_timer_pinned(struct timer + + extern void set_timer_slack(struct timer_list *time, int slack_hz); + +-#define TIMER_NOT_PINNED 0 +-#define TIMER_PINNED 1 ++#define MOD_TIMER_NOT_PINNED 0 ++#define MOD_TIMER_PINNED 1 + /* + * The jiffies value which is added to now, when there is no timer + * in the timer wheel: +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -797,7 +797,7 @@ static inline int + + debug_activate(timer, expires); + +- new_base = get_target_base(base, pinned); ++ new_base = get_target_base(base, pinned || timer->flags & TIMER_PINNED); + + if (base != new_base) { + /* +@@ -840,7 +840,7 @@ static inline int + */ + int mod_timer_pending(struct timer_list *timer, unsigned long expires) + { +- return __mod_timer(timer, expires, true, TIMER_NOT_PINNED); ++ return __mod_timer(timer, expires, true, MOD_TIMER_NOT_PINNED); + } + EXPORT_SYMBOL(mod_timer_pending); + +@@ -915,7 +915,7 @@ int mod_timer(struct timer_list *timer, + if 
(timer_pending(timer) && timer->expires == expires) + return 1; + +- return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); ++ return __mod_timer(timer, expires, false, MOD_TIMER_NOT_PINNED); + } + EXPORT_SYMBOL(mod_timer); + +@@ -943,7 +943,7 @@ int mod_timer_pinned(struct timer_list * + if (timer->expires == expires && timer_pending(timer)) + return 1; + +- return __mod_timer(timer, expires, false, TIMER_PINNED); ++ return __mod_timer(timer, expires, false, MOD_TIMER_PINNED); + } + EXPORT_SYMBOL(mod_timer_pinned); + +@@ -1527,7 +1527,7 @@ signed long __sched schedule_timeout(sig + expire = timeout + jiffies; + + setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); +- __mod_timer(&timer, expire, false, TIMER_NOT_PINNED); ++ __mod_timer(&timer, expire, false, MOD_TIMER_NOT_PINNED); + schedule(); + del_singleshot_timer_sync(&timer); + diff --git a/patches/timer-Move-__run_timers-function.patch b/patches/timer-Move-__run_timers-function.patch new file mode 100644 index 00000000000000..2ceee9302bb7e0 --- /dev/null +++ b/patches/timer-Move-__run_timers-function.patch @@ -0,0 +1,91 @@ +From: Anna-Maria Gleixner <anna-maria@linutronix.de> +Date: Mon, 4 Jul 2016 09:50:33 +0000 +Subject: [PATCH 16/22] timer: Move __run_timers() function + +Move __run_timers() below __next_timer_interrupt() and next_pending_bucket() +in preparation for __run_timers() NOHZ optimization. + +No functional change. + +Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Frederic Weisbecker <fweisbec@gmail.com> +Cc: Chris Mason <clm@fb.com> +Cc: Eric Dumazet <edumazet@google.com> +Cc: rt@linutronix.de +Cc: "Paul E. 
McKenney" <paulmck@linux.vnet.ibm.com> +Cc: Arjan van de Ven <arjan@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/time/timer.c | 52 ++++++++++++++++++++++++++-------------------------- + 1 file changed, 26 insertions(+), 26 deletions(-) + +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1292,32 +1292,6 @@ static int collect_expired_timers(struct + return levels; + } + +-/** +- * __run_timers - run all expired timers (if any) on this CPU. +- * @base: the timer vector to be processed. +- */ +-static inline void __run_timers(struct timer_base *base) +-{ +- struct hlist_head heads[LVL_DEPTH]; +- int levels; +- +- if (!time_after_eq(jiffies, base->clk)) +- return; +- +- spin_lock_irq(&base->lock); +- +- while (time_after_eq(jiffies, base->clk)) { +- +- levels = collect_expired_timers(base, heads); +- base->clk++; +- +- while (levels--) +- expire_timers(base, heads + levels); +- } +- base->running_timer = NULL; +- spin_unlock_irq(&base->lock); +-} +- + #ifdef CONFIG_NO_HZ_COMMON + /* + * Find the next pending bucket of a level. Search from @offset + @clk upwards +@@ -1487,6 +1461,32 @@ void update_process_times(int user_tick) + run_posix_cpu_timers(p); + } + ++/** ++ * __run_timers - run all expired timers (if any) on this CPU. ++ * @base: the timer vector to be processed. ++ */ ++static inline void __run_timers(struct timer_base *base) ++{ ++ struct hlist_head heads[LVL_DEPTH]; ++ int levels; ++ ++ if (!time_after_eq(jiffies, base->clk)) ++ return; ++ ++ spin_lock_irq(&base->lock); ++ ++ while (time_after_eq(jiffies, base->clk)) { ++ ++ levels = collect_expired_timers(base, heads); ++ base->clk++; ++ ++ while (levels--) ++ expire_timers(base, heads + levels); ++ } ++ base->running_timer = NULL; ++ spin_unlock_irq(&base->lock); ++} ++ + /* + * This function runs timers and the timer-tq in bottom half context. 
+ */ diff --git a/patches/timer-Only-wake-softirq-if-necessary.patch b/patches/timer-Only-wake-softirq-if-necessary.patch new file mode 100644 index 00000000000000..22f297d82d68d3 --- /dev/null +++ b/patches/timer-Only-wake-softirq-if-necessary.patch @@ -0,0 +1,34 @@ +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 4 Jul 2016 09:50:37 +0000 +Subject: [PATCH 20/22] timer: Only wake softirq if necessary + +With the wheel forwading in place and with the HZ=1000 4ms folding we can +avoid running the softirq at all. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/time/timer.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1623,7 +1623,18 @@ static void run_timer_softirq(struct sof + */ + void run_local_timers(void) + { ++ struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); ++ + hrtimer_run_queues(); ++ /* Raise the softirq only if required. */ ++ if (time_before(jiffies, base->clk)) { ++ if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) ++ return; ++ /* CPU is awake, so check the deferrable base. */ ++ base++; ++ if (time_before(jiffies, base->clk)) ++ return; ++ } + raise_softirq(TIMER_SOFTIRQ); + } + diff --git a/patches/timer-Optimization-for-same-expiry-time-in-mod_timer.patch b/patches/timer-Optimization-for-same-expiry-time-in-mod_timer.patch new file mode 100644 index 00000000000000..da3390e3b336a0 --- /dev/null +++ b/patches/timer-Optimization-for-same-expiry-time-in-mod_timer.patch @@ -0,0 +1,130 @@ +From: Anna-Maria Gleixner <anna-maria@linutronix.de> +Date: Mon, 4 Jul 2016 09:50:40 +0000 +Subject: [PATCH 22/22] timer: Optimization for same expiry time in mod_timer() + +The existing optimization for same expiry time in mod_timer() checks whether +the timer expiry time is the same as the new requested expiry time. 
In the old +timer wheel implementation this does not take the slack batching into account, +neither does the new implementation evaluate whether the new expiry time will +requeue the timer to the same bucket. + +To optimize that, we can calculate the resulting bucket and check if the new +expiry time is different from the current expiry time. This calculation +happens outside the base lock held region. If the resulting bucket is the same +we can avoid taking the base lock and requeueing the timer. + +If the timer needs to be requeued then we have to check under the base lock +whether the base time has changed between the lockless calculation and taking +the lock. If it has changed we need to recalculate under the lock. + +This optimization takes effect for timers which are enqueued into the less +granular wheel levels (1 and above). With a simple test case the functionality +has been verified: + + Before After +Match: 5.5% 86.6% +Requeue: 94.5% 13.4% +Recalc: <0.01% + +In the non optimized case the timer is requeued in 94.5% of the cases. With +the index optimization in place the requeue rate drops to 13.4%. The case +where the lockless index calculation has to be redone is less than 0.01%. + +With a real world test case (networking) we observed the following changes: + + Before After +Match: 97.8% 99.7% +Requeue: 2.2% 0.3% +Recalc: <0.001% + +That means two percent less lock/requeue/unlock operations in one of the hot +path use cases of timers. + +Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Frederic Weisbecker <fweisbec@gmail.com> +Cc: Chris Mason <clm@fb.com> +Cc: Eric Dumazet <edumazet@google.com> +Cc: rt@linutronix.de +Cc: "Paul E. 
McKenney" <paulmck@linux.vnet.ibm.com> +Cc: Arjan van de Ven <arjan@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/time/timer.c | 51 +++++++++++++++++++++++++++++++++++---------------- + 1 file changed, 35 insertions(+), 16 deletions(-) + +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -975,28 +975,36 @@ static inline int + __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) + { + struct timer_base *base, *new_base; +- unsigned long flags; ++ unsigned int idx = UINT_MAX; ++ unsigned long clk = 0, flags; + int ret = 0; + + /* +- * TODO: Calculate the array bucket of the timer right here w/o +- * holding the base lock. This allows to check not only +- * timer->expires == expires below, but also whether the timer +- * ends up in the same bucket. If we really need to requeue +- * the timer then we check whether base->clk have +- * advanced between here and locking the timer base. If +- * jiffies advanced we have to recalc the array bucket with the +- * lock held. +- */ +- +- /* +- * This is a common optimization triggered by the +- * networking code - if the timer is re-modified +- * to be the same thing then just return: ++ * This is a common optimization triggered by the networking code - if ++ * the timer is re-modified to be the same thing or ends up in the ++ * same array bucket then just return: + */ + if (timer_pending(timer)) { + if (timer->expires == expires) + return 1; ++ /* ++ * Take the current timer_jiffies of base, but without holding ++ * the lock! ++ */ ++ base = get_timer_base(timer->flags); ++ clk = base->clk; ++ ++ idx = calc_wheel_index(expires, clk); ++ ++ /* ++ * Retrieve and compare the array index of the pending ++ * timer. If it matches set the expiry to the new value so a ++ * subsequent call will exit in the expires check above. 
++ */ ++ if (idx == timer_get_idx(timer)) { ++ timer->expires = expires; ++ return 1; ++ } + } + + timer_stats_timer_set_start_info(timer); +@@ -1033,7 +1041,18 @@ static inline int + } + + timer->expires = expires; +- internal_add_timer(base, timer); ++ /* ++ * If idx was calculated above and the base time did not advance ++ * between calculating idx and taking the lock, only enqueue_timer() ++ * and trigger_dyntick_cpu() is required. Otherwise we need to ++ * (re)calculate the wheel index via internal_add_timer(). ++ */ ++ if (idx != UINT_MAX && clk == base->clk) { ++ enqueue_timer(base, timer, idx); ++ trigger_dyntick_cpu(base, timer); ++ } else { ++ internal_add_timer(base, timer); ++ } + + out_unlock: + spin_unlock_irqrestore(&base->lock, flags); diff --git a/patches/timer-Optimize-collect-timers-for-NOHZ.patch b/patches/timer-Optimize-collect-timers-for-NOHZ.patch new file mode 100644 index 00000000000000..9f1cdd55c6b5de --- /dev/null +++ b/patches/timer-Optimize-collect-timers-for-NOHZ.patch @@ -0,0 +1,128 @@ +From: Anna-Maria Gleixner <anna-maria@linutronix.de> +Date: Mon, 4 Jul 2016 09:50:34 +0000 +Subject: [PATCH 17/22] timer: Optimize collect timers for NOHZ + +After a NOHZ idle sleep the wheel must be forwarded to current jiffies. There +might be expired timers so the current code loops and checks the epxired +buckets for timers. This can take quite some time for long NOHZ idle periods. + +The pending bitmask in the timer base allows us to do a quick search for the +next expiring timer and therefor a fast forward of the base time which +prevents pointless long lasting loops. + +For a 3 second idle sleep this reduces the catchup time from ~1ms to 5us. 
+ +Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Frederic Weisbecker <fweisbec@gmail.com> +Cc: Chris Mason <clm@fb.com> +Cc: Eric Dumazet <edumazet@google.com> +Cc: rt@linutronix.de +Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> +Cc: Arjan van de Ven <arjan@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/time/timer.c | 49 +++++++++++++++++++++++++++++++++++++++++-------- + 1 file changed, 41 insertions(+), 8 deletions(-) + +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1267,8 +1267,8 @@ static void expire_timers(struct timer_b + } + } + +-static int collect_expired_timers(struct timer_base *base, +- struct hlist_head *heads) ++static int __collect_expired_timers(struct timer_base *base, ++ struct hlist_head *heads) + { + unsigned long clk = base->clk; + struct hlist_head *vec; +@@ -1294,9 +1294,9 @@ static int collect_expired_timers(struct + + #ifdef CONFIG_NO_HZ_COMMON + /* +- * Find the next pending bucket of a level. Search from @offset + @clk upwards +- * and if nothing there, search from start of the level (@offset) up to +- * @offset + clk. ++ * Find the next pending bucket of a level. Search from level start (@offset) ++ * + @clk upwards and if nothing there, search from start of the level ++ * (@offset) up to @offset + clk. + */ + static int next_pending_bucket(struct timer_base *base, unsigned offset, + unsigned clk) +@@ -1313,14 +1313,14 @@ static int next_pending_bucket(struct ti + } + + /* +- * Search the first expiring timer in the various clock levels. ++ * Search the first expiring timer in the various clock levels. Caller must ++ * hold base->lock. 
+ */ + static unsigned long __next_timer_interrupt(struct timer_base *base) + { + unsigned long clk, next, adj; + unsigned lvl, offset = 0; + +- spin_lock(&base->lock); + next = base->clk + NEXT_TIMER_MAX_DELTA; + clk = base->clk; + for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { +@@ -1373,7 +1373,6 @@ static unsigned long __next_timer_interr + clk >>= LVL_CLK_SHIFT; + clk += adj; + } +- spin_unlock(&base->lock); + return next; + } + +@@ -1431,7 +1430,10 @@ u64 get_next_timer_interrupt(unsigned lo + if (cpu_is_offline(smp_processor_id())) + return expires; + ++ spin_lock(&base->lock); + nextevt = __next_timer_interrupt(base); ++ spin_unlock(&base->lock); ++ + if (time_before_eq(nextevt, basej)) + expires = basem; + else +@@ -1439,6 +1441,37 @@ u64 get_next_timer_interrupt(unsigned lo + + return cmp_next_hrtimer_event(basem, expires); + } ++ ++static int collect_expired_timers(struct timer_base *base, ++ struct hlist_head *heads) ++{ ++ /* ++ * NOHZ optimization. After a long idle sleep we need to forward the ++ * base to current jiffies. Avoid a loop by searching the bitfield for ++ * the next expiring timer. ++ */ ++ if ((long)(jiffies - base->clk) > 2) { ++ unsigned long next = __next_timer_interrupt(base); ++ ++ /* ++ * If the next timer is ahead of time forward to current ++ * jiffies, otherwise forward to the next expiry time. ++ */ ++ if (time_after(next, jiffies)) { ++ /* The call site will increment clock! 
*/ ++ base->clk = jiffies - 1; ++ return 0; ++ } ++ base->clk = next; ++ } ++ return __collect_expired_timers(base, heads); ++} ++#else ++static inline int collect_expired_timers(struct timer_base *base, ++ struct hlist_head *heads) ++{ ++ return __collect_expired_timers(base, heads); ++} + #endif + + /* diff --git a/patches/timer-Reduce-the-CPU-index-space-to-256k.patch b/patches/timer-Reduce-the-CPU-index-space-to-256k.patch new file mode 100644 index 00000000000000..e153377d4b95c5 --- /dev/null +++ b/patches/timer-Reduce-the-CPU-index-space-to-256k.patch @@ -0,0 +1,34 @@ +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 4 Jul 2016 09:50:29 +0000 +Subject: [PATCH 13/22] timer: Reduce the CPU index space to 256k + +We want to store the array index in the flags space. 256k CPUs should be +enough for a while. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/timer.h | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +--- a/include/linux/timer.h ++++ b/include/linux/timer.h +@@ -58,12 +58,12 @@ struct timer_list { + * workqueue locking issues. It's not meant for executing random crap + * with interrupts disabled. Abuse is monitored! 
+ */ +-#define TIMER_CPUMASK 0x0007FFFF +-#define TIMER_MIGRATING 0x00080000 ++#define TIMER_CPUMASK 0x0003FFFF ++#define TIMER_MIGRATING 0x00040000 + #define TIMER_BASEMASK (TIMER_CPUMASK | TIMER_MIGRATING) +-#define TIMER_DEFERRABLE 0x00100000 +-#define TIMER_PINNED 0x00200000 +-#define TIMER_IRQSAFE 0x00400000 ++#define TIMER_DEFERRABLE 0x00080000 ++#define TIMER_PINNED 0x00100000 ++#define TIMER_IRQSAFE 0x00200000 + + #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \ + .entry = { .next = TIMER_ENTRY_STATIC }, \ diff --git a/patches/timer-Remove-mod_timer_pinned.patch b/patches/timer-Remove-mod_timer_pinned.patch new file mode 100644 index 00000000000000..d61b24e8c027c7 --- /dev/null +++ b/patches/timer-Remove-mod_timer_pinned.patch @@ -0,0 +1,116 @@ +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 4 Jul 2016 09:50:24 +0000 +Subject: [PATCH 09/22] timer: Remove mod_timer_pinned + +We switched all users to initialize the timers as pinned and call +mod_timer(). Remove the now unused function. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Chris Mason <clm@fb.com> +Cc: Eric Dumazet <edumazet@google.com> +Cc: rt@linutronix.de +Cc: "Paul E. 
McKenney" <paulmck@linux.vnet.ibm.com> +Cc: Arjan van de Ven <arjan@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/timer.h | 3 --- + kernel/time/timer.c | 39 +++++---------------------------------- + 2 files changed, 5 insertions(+), 37 deletions(-) + +--- a/include/linux/timer.h ++++ b/include/linux/timer.h +@@ -190,12 +190,9 @@ extern void add_timer_on(struct timer_li + extern int del_timer(struct timer_list * timer); + extern int mod_timer(struct timer_list *timer, unsigned long expires); + extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); +-extern int mod_timer_pinned(struct timer_list *timer, unsigned long expires); + + extern void set_timer_slack(struct timer_list *time, int slack_hz); + +-#define MOD_TIMER_NOT_PINNED 0 +-#define MOD_TIMER_PINNED 1 + /* + * The jiffies value which is added to now, when there is no timer + * in the timer wheel: +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -779,8 +779,7 @@ static struct tvec_base *lock_timer_base + } + + static inline int +-__mod_timer(struct timer_list *timer, unsigned long expires, +- bool pending_only, int pinned) ++__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) + { + struct tvec_base *base, *new_base; + unsigned long flags; +@@ -797,7 +796,7 @@ static inline int + + debug_activate(timer, expires); + +- new_base = get_target_base(base, pinned || timer->flags & TIMER_PINNED); ++ new_base = get_target_base(base, timer->flags & TIMER_PINNED); + + if (base != new_base) { + /* +@@ -840,7 +839,7 @@ static inline int + */ + int mod_timer_pending(struct timer_list *timer, unsigned long expires) + { +- return __mod_timer(timer, expires, true, MOD_TIMER_NOT_PINNED); ++ return __mod_timer(timer, expires, true); + } + EXPORT_SYMBOL(mod_timer_pending); + +@@ -915,39 +914,11 @@ int mod_timer(struct timer_list *timer, + if (timer_pending(timer) && timer->expires == expires) + return 1; + +- 
return __mod_timer(timer, expires, false, MOD_TIMER_NOT_PINNED); ++ return __mod_timer(timer, expires, false); + } + EXPORT_SYMBOL(mod_timer); + + /** +- * mod_timer_pinned - modify a timer's timeout +- * @timer: the timer to be modified +- * @expires: new timeout in jiffies +- * +- * mod_timer_pinned() is a way to update the expire field of an +- * active timer (if the timer is inactive it will be activated) +- * and to ensure that the timer is scheduled on the current CPU. +- * +- * Note that this does not prevent the timer from being migrated +- * when the current CPU goes offline. If this is a problem for +- * you, use CPU-hotplug notifiers to handle it correctly, for +- * example, cancelling the timer when the corresponding CPU goes +- * offline. +- * +- * mod_timer_pinned(timer, expires) is equivalent to: +- * +- * del_timer(timer); timer->expires = expires; add_timer(timer); +- */ +-int mod_timer_pinned(struct timer_list *timer, unsigned long expires) +-{ +- if (timer->expires == expires && timer_pending(timer)) +- return 1; +- +- return __mod_timer(timer, expires, false, MOD_TIMER_PINNED); +-} +-EXPORT_SYMBOL(mod_timer_pinned); +- +-/** + * add_timer - start a timer + * @timer: the timer to be added + * +@@ -1527,7 +1498,7 @@ signed long __sched schedule_timeout(sig + expire = timeout + jiffies; + + setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); +- __mod_timer(&timer, expire, false, MOD_TIMER_NOT_PINNED); ++ __mod_timer(&timer, expire, false); + schedule(); + del_singleshot_timer_sync(&timer); + diff --git a/patches/timer-Remove-slack-leftovers.patch b/patches/timer-Remove-slack-leftovers.patch new file mode 100644 index 00000000000000..4ec5f4014086ee --- /dev/null +++ b/patches/timer-Remove-slack-leftovers.patch @@ -0,0 +1,161 @@ +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 4 Jul 2016 09:50:31 +0000 +Subject: [PATCH 15/22] timer: Remove slack leftovers + +We now have implicit batching in the timer wheel. 
The slack is not longer +used. Remove it. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Frederic Weisbecker <fweisbec@gmail.com> +Cc: Chris Mason <clm@fb.com> +Cc: Eric Dumazet <edumazet@google.com> +Cc: rt@linutronix.de +Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> +Cc: Arjan van de Ven <arjan@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + block/genhd.c | 5 ----- + drivers/mmc/host/jz4740_mmc.c | 2 -- + drivers/power/bq27xxx_battery.c | 5 +---- + drivers/usb/host/ohci-hcd.c | 1 - + drivers/usb/host/xhci.c | 2 -- + include/linux/timer.h | 4 ---- + kernel/time/timer.c | 19 ------------------- + lib/random32.c | 1 - + 8 files changed, 1 insertion(+), 38 deletions(-) + +--- a/block/genhd.c ++++ b/block/genhd.c +@@ -1523,12 +1523,7 @@ static void __disk_unblock_events(struct + if (--ev->block) + goto out_unlock; + +- /* +- * Not exactly a latency critical operation, set poll timer +- * slack to 25% and kick event check. +- */ + intv = disk_events_poll_jiffies(disk); +- set_timer_slack(&ev->dwork.timer, intv / 4); + if (check_now) + queue_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, 0); +--- a/drivers/mmc/host/jz4740_mmc.c ++++ b/drivers/mmc/host/jz4740_mmc.c +@@ -1068,8 +1068,6 @@ static int jz4740_mmc_probe(struct platf + jz4740_mmc_clock_disable(host); + setup_timer(&host->timeout_timer, jz4740_mmc_timeout, + (unsigned long)host); +- /* It is not important when it times out, it just needs to timeout. */ +- set_timer_slack(&host->timeout_timer, HZ); + + host->use_dma = true; + if (host->use_dma && jz4740_mmc_acquire_dma_channels(host) != 0) +--- a/drivers/power/bq27xxx_battery.c ++++ b/drivers/power/bq27xxx_battery.c +@@ -735,11 +735,8 @@ static void bq27xxx_battery_poll(struct + + bq27xxx_battery_update(di); + +- if (poll_interval > 0) { +- /* The timer does not have to be accurate. 
*/ +- set_timer_slack(&di->work.timer, poll_interval * HZ / 4); ++ if (poll_interval > 0) + schedule_delayed_work(&di->work, poll_interval * HZ); +- } + } + + /* +--- a/drivers/usb/host/ohci-hcd.c ++++ b/drivers/usb/host/ohci-hcd.c +@@ -500,7 +500,6 @@ static int ohci_init (struct ohci_hcd *o + + setup_timer(&ohci->io_watchdog, io_watchdog_func, + (unsigned long) ohci); +- set_timer_slack(&ohci->io_watchdog, msecs_to_jiffies(20)); + + ohci->hcca = dma_alloc_coherent (hcd->self.controller, + sizeof(*ohci->hcca), &ohci->hcca_dma, GFP_KERNEL); +--- a/drivers/usb/host/xhci.c ++++ b/drivers/usb/host/xhci.c +@@ -490,8 +490,6 @@ static void compliance_mode_recovery_tim + xhci->comp_mode_recovery_timer.expires = jiffies + + msecs_to_jiffies(COMP_MODE_RCVRY_MSECS); + +- set_timer_slack(&xhci->comp_mode_recovery_timer, +- msecs_to_jiffies(COMP_MODE_RCVRY_MSECS)); + add_timer(&xhci->comp_mode_recovery_timer); + xhci_dbg_trace(xhci, trace_xhci_dbg_quirks, + "Compliance mode recovery timer initialized"); +--- a/include/linux/timer.h ++++ b/include/linux/timer.h +@@ -19,7 +19,6 @@ struct timer_list { + void (*function)(unsigned long); + unsigned long data; + u32 flags; +- int slack; + + #ifdef CONFIG_TIMER_STATS + int start_pid; +@@ -73,7 +72,6 @@ struct timer_list { + .expires = (_expires), \ + .data = (_data), \ + .flags = (_flags), \ +- .slack = -1, \ + __TIMER_LOCKDEP_MAP_INITIALIZER( \ + __FILE__ ":" __stringify(__LINE__)) \ + } +@@ -193,8 +191,6 @@ extern int del_timer(struct timer_list * + extern int mod_timer(struct timer_list *timer, unsigned long expires); + extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); + +-extern void set_timer_slack(struct timer_list *time, int slack_hz); +- + /* + * The jiffies value which is added to now, when there is no timer + * in the timer wheel: +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -447,24 +447,6 @@ unsigned long round_jiffies_up_relative( + } + 
EXPORT_SYMBOL_GPL(round_jiffies_up_relative); + +-/** +- * set_timer_slack - set the allowed slack for a timer +- * @timer: the timer to be modified +- * @slack_hz: the amount of time (in jiffies) allowed for rounding +- * +- * Set the amount of time, in jiffies, that a certain timer has +- * in terms of slack. By setting this value, the timer subsystem +- * will schedule the actual timer somewhere between +- * the time mod_timer() asks for, and that time plus the slack. +- * +- * By setting the slack to -1, a percentage of the delay is used +- * instead. +- */ +-void set_timer_slack(struct timer_list *timer, int slack_hz) +-{ +- timer->slack = slack_hz; +-} +-EXPORT_SYMBOL_GPL(set_timer_slack); + + static inline unsigned int timer_get_idx(struct timer_list *timer) + { +@@ -790,7 +772,6 @@ static void do_init_timer(struct timer_l + { + timer->entry.pprev = NULL; + timer->flags = flags | raw_smp_processor_id(); +- timer->slack = -1; + #ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; + timer->start_pid = -1; +--- a/lib/random32.c ++++ b/lib/random32.c +@@ -233,7 +233,6 @@ static void __prandom_timer(unsigned lon + + static void __init __prandom_start_seed_timer(void) + { +- set_timer_slack(&seed_timer, HZ); + seed_timer.expires = jiffies + msecs_to_jiffies(40 * MSEC_PER_SEC); + add_timer(&seed_timer); + } diff --git a/patches/timer-Split-out-index-calculation.patch b/patches/timer-Split-out-index-calculation.patch new file mode 100644 index 00000000000000..6bf4ebe1982124 --- /dev/null +++ b/patches/timer-Split-out-index-calculation.patch @@ -0,0 +1,105 @@ +From: Anna-Maria Gleixner <anna-maria@linutronix.de> +Date: Mon, 4 Jul 2016 09:50:39 +0000 +Subject: [PATCH 21/22] timer: Split out index calculation + +For further optimizations we need to seperate index calculation and +queueing. No functional change. 
+ +Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Frederic Weisbecker <fweisbec@gmail.com> +Cc: Chris Mason <clm@fb.com> +Cc: Eric Dumazet <edumazet@google.com> +Cc: rt@linutronix.de +Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> +Cc: Arjan van de Ven <arjan@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/time/timer.c | 47 ++++++++++++++++++++++++++++++++--------------- + 1 file changed, 32 insertions(+), 15 deletions(-) + +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -471,12 +471,9 @@ static inline unsigned calc_index(unsign + return LVL_OFFS(lvl) + (expires & LVL_MASK); + } + +-static void +-__internal_add_timer(struct timer_base *base, struct timer_list *timer) ++static int calc_wheel_index(unsigned long expires, unsigned long clk) + { +- unsigned long expires = timer->expires; +- unsigned long delta = expires - base->clk; +- struct hlist_head *vec; ++ unsigned long delta = expires - clk; + unsigned int idx; + + if (delta < LVL_START(1)) { +@@ -496,7 +493,7 @@ static void + } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { + idx = calc_index(expires, 7); + } else if ((long) delta < 0) { +- idx = base->clk & LVL_MASK; ++ idx = clk & LVL_MASK; + } else { + /* + * Force expire obscene large timeouts to expire at the +@@ -507,20 +504,33 @@ static void + + idx = calc_index(expires, LVL_DEPTH - 1); + } +- /* +- * Enqueue the timer into the array bucket, mark it pending in +- * the bitmap and store the index in the timer flags. +- */ +- vec = base->vectors + idx; +- hlist_add_head(&timer->entry, vec); ++ return idx; ++} ++ ++/* ++ * Enqueue the timer into the hash bucket, mark it pending in ++ * the bitmap and store the index in the timer flags. 
++ */ ++static void enqueue_timer(struct timer_base *base, struct timer_list *timer, ++ unsigned int idx) ++{ ++ hlist_add_head(&timer->entry, base->vectors + idx); + __set_bit(idx, base->pending_map); + timer_set_idx(timer, idx); + } + +-static void internal_add_timer(struct timer_base *base, struct timer_list *timer) ++static void ++__internal_add_timer(struct timer_base *base, struct timer_list *timer) + { +- __internal_add_timer(base, timer); ++ unsigned int idx; ++ ++ idx = calc_wheel_index(timer->expires, base->clk); ++ enqueue_timer(base, timer, idx); ++} + ++static void ++trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) ++{ + if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) + return; + +@@ -551,7 +561,14 @@ static void internal_add_timer(struct ti + * wheel + */ + base->next_expiry = timer->expires; +- wake_up_nohz_cpu(base->cpu); ++ wake_up_nohz_cpu(base->cpu); ++} ++ ++static void ++internal_add_timer(struct timer_base *base, struct timer_list *timer) ++{ ++ __internal_add_timer(base, timer); ++ trigger_dyntick_cpu(base, timer); + } + + #ifdef CONFIG_TIMER_STATS diff --git a/patches/timer-Switch-to-a-non-cascading-wheel.patch b/patches/timer-Switch-to-a-non-cascading-wheel.patch new file mode 100644 index 00000000000000..c43ad1b47ceb83 --- /dev/null +++ b/patches/timer-Switch-to-a-non-cascading-wheel.patch @@ -0,0 +1,1169 @@ +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 4 Jul 2016 09:50:30 +0000 +Subject: [PATCH 14/22] timer: Switch to a non cascading wheel + +The current timer wheel has some drawbacks: + +1) Cascading + + Cascading can be an unbound operation and is completely pointless in most + cases because the vast majority of the timer wheel timers are canceled or + rearmed before expiration. + +2) No fast lookup of the next expiring timer + + In NOHZ scenarios the first timer soft interrupt after a long NOHZ period + must fast forward the base time to current jiffies. 
As we have no way to + find the next expiring timer fast, the code loops and increments the base + time by one and checks for expired timers in each step. + +After a thorough analysis of real world data gathered on laptops, +workstations, webservers and other machines (thanks Chris!) I came to the +conclusion that the current 'classic' timer wheel implementation can be +modified to address the above issues. + +The vast majority of timer wheel timers is canceled or rearmed before +expiry. Most of them are timeouts for networking and other I/O tasks. The +nature of timeouts is to catch the exception from normal operation (TCP ack +timed out, disk does not respond, etc.). For these kind of timeouts the +accuracy is not really a concern. In case the timeout fires, performance is +down the drain already. + +The few timers which actually expire can be split into two categories: + + 1) Short expiry times which expect halfways accurate expiry + + 2) Long term expiry times are inaccurate today already due to the batching + which is done for NOHZ. + +So for long term expiry timers we can avoid the cascading property and just +leave them in the less granular outer wheels until expiry or +cancelation. Timers which are armed with a timeout larger than the wheel +capacity are not longer cascaded. We expire them with the longest possible +timeout (6+ days). We have not observed such timeouts in our data collection, +but at least we handle them with the least surprising effect. + +To avoid extending the wheel levels for HZ=1000 so we can accomodate the +longest observed timeouts (5 days in the network conntrack code) we reduce the +first level granularity on HZ=1000 to 4ms, which effectively is the same as +the HZ=250 behaviour. From our data analysis there is nothing which relies on +that 1ms granularity and as a side effect we get better batching and timer +locality for the networking code as well. 
+ +Contrary to the classic wheel the granularity of the next wheel is not the +capacity of the first wheel. The granularities of the wheels are in the +currently chosen setting 8 times the granularity of the previous wheel. So for +HZ=250 we end up with the following granularity levels: + +Level Offset Granularity Range + 0 0 4 ms 0 ms - 252 ms + 1 64 32 ms 256 ms - 2044 ms (256ms - ~2s) + 2 128 256 ms 2048 ms - 16380 ms (~2s - ~16s) + 3 192 2048 ms (~2s) 16384 ms - 131068 ms (~16s - ~2m) + 4 256 16384 ms (~16s) 131072 ms - 1048572 ms (~2m - ~17m) + 5 320 131072 ms (~2m) 1048576 ms - 8388604 ms (~17m - ~2h) + 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h) + 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d) + +That's a worst case inaccuracy of 12.5% for the timers which are queued at the +beginning of a level. + +So the new wheel concept addresses the old issues: + +1) Cascading is avoided (except for extreme long time timers) + +2) By keeping the timers in the bucket until expiry/cancelation we can track + the buckets which have timers enqueued in a bucket bitmap and therefor can + lookup the next expiring timer fast and time bound. + +A further benefit of the concept is, that the slack calculation which is done +on every timer start is not longer necessary because the granularity levels +provide natural batching already. + +Our extensive testing with various loads did not show any performance +degradation vs. the current wheel implementation. + +This patch does not address the 'fast lookup' issue as we wanted to make sure +that there is no regression introduced by the wheel redesign. The +optimizations are in follow up patches. 
+ +[ Contains fixes from Anna-Maria Gleixner and Richard Cochran ] + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Frederic Weisbecker <fweisbec@gmail.com> +Cc: Chris Mason <clm@fb.com> +Cc: Eric Dumazet <edumazet@google.com> +Cc: rt@linutronix.de +Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> +Cc: Arjan van de Ven <arjan@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/timer.h | 2 + kernel/time/timer.c | 825 ++++++++++++++++++++++++++++---------------------- + 2 files changed, 467 insertions(+), 360 deletions(-) + +--- a/include/linux/timer.h ++++ b/include/linux/timer.h +@@ -64,6 +64,8 @@ struct timer_list { + #define TIMER_DEFERRABLE 0x00080000 + #define TIMER_PINNED 0x00100000 + #define TIMER_IRQSAFE 0x00200000 ++#define TIMER_ARRAYSHIFT 22 ++#define TIMER_ARRAYMASK 0xFFC00000 + + #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \ + .entry = { .next = TIMER_ENTRY_STATIC }, \ +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -59,43 +59,151 @@ + EXPORT_SYMBOL(jiffies_64); + + /* +- * per-CPU timer vector definitions: ++ * The timer wheel has LVL_DEPTH array levels. Each level provides an array of ++ * LVL_SIZE buckets. Each level is driven by its own clock and therefor each ++ * level has a different granularity. ++ * ++ * The level granularity is: LVL_CLK_DIV ^ lvl ++ * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level) ++ * ++ * The array level of a newly armed timer depends on the relative expiry ++ * time. The farther the expiry time is away the higher the array level and ++ * therefor the granularity becomes. ++ * ++ * Contrary to the original timer wheel implementation, which aims for 'exact' ++ * expiry of the timers, this implementation removes the need for recascading ++ * the timers into the lower array levels. 
The previous 'classic' timer wheel ++ * implementation of the kernel already violated the 'exact' expiry by adding ++ * slack to the expiry time to provide batched expiration. The granularity ++ * levels provide implicit batching. ++ * ++ * This is an optimization of the original timer wheel implementation for the ++ * majority of the timer wheel use cases: timeouts. The vast majority of ++ * timeout timers (networking, disk I/O ...) are canceled before expiry. If ++ * the timeout expires it indicates that normal operation is disturbed, so it ++ * does not matter much whether the timeout comes with a slight delay. ++ * ++ * The only exception to this are networking timers with a small expiry ++ * time. They rely on the granularity. Those fit into the first wheel level, ++ * which has HZ granularity. ++ * ++ * We don't have cascading anymore. timers with a expiry time above the ++ * capacity of the last wheel level are force expired at the maximum timeout ++ * value of the last wheel level. From data sampling we know that the maximum ++ * value observed is 5 days (network connection tracking), so this should not ++ * be an issue. ++ * ++ * The currently chosen array constants values are a good compromise between ++ * array size and granularity. 
++ * ++ * This results in the following granularity and range levels: ++ * ++ * HZ 1000 steps ++ * Level Offset Granularity Range ++ * 0 0 1 ms 0 ms - 63 ms ++ * 1 64 8 ms 64 ms - 511 ms ++ * 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s) ++ * 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s) ++ * 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m) ++ * 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m) ++ * 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h) ++ * 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d) ++ * 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d) ++ * ++ * HZ 300 ++ * Level Offset Granularity Range ++ * 0 0 3 ms 0 ms - 210 ms ++ * 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s) ++ * 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s) ++ * 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m) ++ * 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m) ++ * 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h) ++ * 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h) ++ * 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d) ++ * 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d) ++ * ++ * HZ 250 ++ * Level Offset Granularity Range ++ * 0 0 4 ms 0 ms - 255 ms ++ * 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s) ++ * 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s) ++ * 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m) ++ * 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m) ++ * 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h) ++ * 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h) ++ * 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d) ++ * 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d) ++ * ++ * HZ 100 ++ * Level Offset Granularity Range ++ * 0 0 10 ms 0 ms - 630 ms ++ * 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s) ++ * 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s) ++ * 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m) ++ * 4 256 
40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m) ++ * 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h) ++ * 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d) ++ * 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d) ++ */ ++ ++/* Clock divisor for the next level */ ++#define LVL_CLK_SHIFT 3 ++#define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT) ++#define LVL_CLK_MASK (LVL_CLK_DIV - 1) ++#define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT) ++#define LVL_GRAN(n) (1UL << LVL_SHIFT(n)) ++ ++/* ++ * The time start value for each level to select the bucket at enqueue ++ * time. + */ +-#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) +-#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) +-#define TVN_SIZE (1 << TVN_BITS) +-#define TVR_SIZE (1 << TVR_BITS) +-#define TVN_MASK (TVN_SIZE - 1) +-#define TVR_MASK (TVR_SIZE - 1) +-#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1)) ++#define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT)) + +-struct tvec { +- struct hlist_head vec[TVN_SIZE]; +-}; ++/* Size of each clock level */ ++#define LVL_BITS 6 ++#define LVL_SIZE (1UL << LVL_BITS) ++#define LVL_MASK (LVL_SIZE - 1) ++#define LVL_OFFS(n) ((n) * LVL_SIZE) ++ ++/* Level depth */ ++#if HZ > 100 ++# define LVL_DEPTH 9 ++# else ++# define LVL_DEPTH 8 ++#endif + +-struct tvec_root { +- struct hlist_head vec[TVR_SIZE]; +-}; ++/* The cutoff (max. capacity of the wheel) */ ++#define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH)) ++#define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1)) ++ ++/* ++ * The resulting wheel size. If NOHZ is configured we allocate two ++ * wheels so we have a separate storage for the deferrable timers. 
++ */ ++#define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH) ++ ++#ifdef CONFIG_NO_HZ_COMMON ++# define NR_BASES 2 ++# define BASE_STD 0 ++# define BASE_DEF 1 ++#else ++# define NR_BASES 1 ++# define BASE_STD 0 ++# define BASE_DEF 0 ++#endif + + struct timer_base { +- spinlock_t lock; +- struct timer_list *running_timer; +- unsigned long clk; +- unsigned long next_timer; +- unsigned long active_timers; +- unsigned long all_timers; +- int cpu; +- bool migration_enabled; +- bool nohz_active; +- struct tvec_root tv1; +- struct tvec tv2; +- struct tvec tv3; +- struct tvec tv4; +- struct tvec tv5; ++ spinlock_t lock; ++ struct timer_list *running_timer; ++ unsigned long clk; ++ unsigned int cpu; ++ bool migration_enabled; ++ bool nohz_active; ++ DECLARE_BITMAP(pending_map, WHEEL_SIZE); ++ struct hlist_head vectors[WHEEL_SIZE]; + } ____cacheline_aligned; + +- +-static DEFINE_PER_CPU(struct timer_base, timer_bases); ++static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]); + + #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + unsigned int sysctl_timer_migration = 1; +@@ -106,15 +214,17 @@ void timers_update_migration(bool update + unsigned int cpu; + + /* Avoid the loop, if nothing to update */ +- if (this_cpu_read(timer_bases.migration_enabled) == on) ++ if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on) + return; + + for_each_possible_cpu(cpu) { +- per_cpu(timer_bases.migration_enabled, cpu) = on; ++ per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on; ++ per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on; + per_cpu(hrtimer_bases.migration_enabled, cpu) = on; + if (!update_nohz) + continue; +- per_cpu(timer_bases.nohz_active, cpu) = true; ++ per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true; ++ per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true; + per_cpu(hrtimer_bases.nohz_active, cpu) = true; + } + } +@@ -133,20 +243,6 @@ int timer_migration_handler(struct ctl_t + mutex_unlock(&mutex); + return ret; + } +- +-static inline 
struct timer_base *get_target_base(struct timer_base *base, +- int pinned) +-{ +- if (pinned || !base->migration_enabled) +- return this_cpu_ptr(&timer_bases); +- return per_cpu_ptr(&timer_bases, get_nohz_timer_target()); +-} +-#else +-static inline struct timer_base *get_target_base(struct timer_base *base, +- int pinned) +-{ +- return this_cpu_ptr(&timer_bases); +-} + #endif + + static unsigned long round_jiffies_common(unsigned long j, int cpu, +@@ -370,78 +466,91 @@ void set_timer_slack(struct timer_list * + } + EXPORT_SYMBOL_GPL(set_timer_slack); + ++static inline unsigned int timer_get_idx(struct timer_list *timer) ++{ ++ return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT; ++} ++ ++static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) ++{ ++ timer->flags = (timer->flags & ~TIMER_ARRAYMASK) | ++ idx << TIMER_ARRAYSHIFT; ++} ++ ++/* ++ * Helper function to calculate the array index for a given expiry ++ * time. ++ */ ++static inline unsigned calc_index(unsigned expires, unsigned lvl) ++{ ++ expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); ++ return LVL_OFFS(lvl) + (expires & LVL_MASK); ++} ++ + static void + __internal_add_timer(struct timer_base *base, struct timer_list *timer) + { + unsigned long expires = timer->expires; +- unsigned long idx = expires - base->clk; ++ unsigned long delta = expires - base->clk; + struct hlist_head *vec; ++ unsigned int idx; + +- if (idx < TVR_SIZE) { +- int i = expires & TVR_MASK; +- vec = base->tv1.vec + i; +- } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { +- int i = (expires >> TVR_BITS) & TVN_MASK; +- vec = base->tv2.vec + i; +- } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { +- int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; +- vec = base->tv3.vec + i; +- } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { +- int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; +- vec = base->tv4.vec + i; +- } else if ((signed long) idx < 0) { +- /* +- * Can happen if you add a timer 
with expires == jiffies, +- * or you set a timer to go off in the past +- */ +- vec = base->tv1.vec + (base->clk & TVR_MASK); ++ if (delta < LVL_START(1)) { ++ idx = calc_index(expires, 0); ++ } else if (delta < LVL_START(2)) { ++ idx = calc_index(expires, 1); ++ } else if (delta < LVL_START(3)) { ++ idx = calc_index(expires, 2); ++ } else if (delta < LVL_START(4)) { ++ idx = calc_index(expires, 3); ++ } else if (delta < LVL_START(5)) { ++ idx = calc_index(expires, 4); ++ } else if (delta < LVL_START(6)) { ++ idx = calc_index(expires, 5); ++ } else if (delta < LVL_START(7)) { ++ idx = calc_index(expires, 6); ++ } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { ++ idx = calc_index(expires, 7); ++ } else if ((long) delta < 0) { ++ idx = base->clk & LVL_MASK; + } else { +- int i; +- /* If the timeout is larger than MAX_TVAL (on 64-bit +- * architectures or with CONFIG_BASE_SMALL=1) then we +- * use the maximum timeout. ++ /* ++ * Force expire obscene large timeouts to expire at the ++ * capacity limit of the wheel. + */ +- if (idx > MAX_TVAL) { +- idx = MAX_TVAL; +- expires = idx + base->clk; +- } +- i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; +- vec = base->tv5.vec + i; +- } ++ if (expires >= WHEEL_TIMEOUT_CUTOFF) ++ expires = WHEEL_TIMEOUT_MAX; + ++ idx = calc_index(expires, LVL_DEPTH - 1); ++ } ++ /* ++ * Enqueue the timer into the array bucket, mark it pending in ++ * the bitmap and store the index in the timer flags. 
++ */ ++ vec = base->vectors + idx; + hlist_add_head(&timer->entry, vec); ++ __set_bit(idx, base->pending_map); ++ timer_set_idx(timer, idx); + } + + static void internal_add_timer(struct timer_base *base, struct timer_list *timer) + { +- /* Advance base->jiffies, if the base is empty */ +- if (!base->all_timers++) +- base->clk = jiffies; +- + __internal_add_timer(base, timer); +- /* +- * Update base->active_timers and base->next_timer +- */ +- if (!(timer->flags & TIMER_DEFERRABLE)) { +- if (!base->active_timers++ || +- time_before(timer->expires, base->next_timer)) +- base->next_timer = timer->expires; +- } + + /* + * Check whether the other CPU is in dynticks mode and needs +- * to be triggered to reevaluate the timer wheel. +- * We are protected against the other CPU fiddling +- * with the timer by holding the timer base lock. This also +- * makes sure that a CPU on the way to stop its tick can not +- * evaluate the timer wheel. ++ * to be triggered to reevaluate the timer wheel. We are ++ * protected against the other CPU fiddling with the timer by ++ * holding the timer base lock. This also makes sure that a ++ * CPU on the way to stop its tick can not evaluate the timer ++ * wheel. + * + * Spare the IPI for deferrable timers on idle targets though. + * The next busy ticks will take care of it. Except full dynticks + * require special care against races with idle_cpu(), lets deal + * with that later. 
+ */ +- if (base->nohz_active) { ++ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) { + if (!(timer->flags & TIMER_DEFERRABLE) || + tick_nohz_full_cpu(base->cpu)) + wake_up_nohz_cpu(base->cpu); +@@ -721,54 +830,87 @@ static inline void detach_timer(struct t + entry->next = LIST_POISON2; + } + +-static inline void +-detach_expired_timer(struct timer_list *timer, struct timer_base *base) +-{ +- detach_timer(timer, true); +- if (!(timer->flags & TIMER_DEFERRABLE)) +- base->active_timers--; +- base->all_timers--; +-} +- + static int detach_if_pending(struct timer_list *timer, struct timer_base *base, + bool clear_pending) + { ++ unsigned idx = timer_get_idx(timer); ++ + if (!timer_pending(timer)) + return 0; + ++ if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) ++ __clear_bit(idx, base->pending_map); ++ + detach_timer(timer, clear_pending); +- if (!(timer->flags & TIMER_DEFERRABLE)) { +- base->active_timers--; +- if (timer->expires == base->next_timer) +- base->next_timer = base->clk; +- } +- /* If this was the last timer, advance base->jiffies */ +- if (!--base->all_timers) +- base->clk = jiffies; + return 1; + } + ++static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) ++{ ++ struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu); ++ ++ /* ++ * If the timer is deferrable and nohz is active then we need to use ++ * the deferrable base. ++ */ ++ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && ++ (tflags & TIMER_DEFERRABLE)) ++ base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); ++ return base; ++} ++ ++static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) ++{ ++ struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); ++ ++ /* ++ * If the timer is deferrable and nohz is active then we need to use ++ * the deferrable base. 
++ */ ++ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && ++ (tflags & TIMER_DEFERRABLE)) ++ base = this_cpu_ptr(&timer_bases[BASE_DEF]); ++ return base; ++} ++ ++static inline struct timer_base *get_timer_base(u32 tflags) ++{ ++ return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); ++} ++ ++static inline struct timer_base *get_target_base(struct timer_base *base, ++ unsigned tflags) ++{ ++#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) ++ if ((tflags & TIMER_PINNED) || !base->migration_enabled) ++ return get_timer_this_cpu_base(tflags); ++ return get_timer_cpu_base(tflags, get_nohz_timer_target()); ++#else ++ return get_timer_this_cpu_base(tflags); ++#endif ++} ++ + /* +- * We are using hashed locking: holding per_cpu(timer_bases).lock +- * means that all timers which are tied to this base via timer->base are +- * locked, and the base itself is locked too. ++ * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means ++ * that all timers which are tied to this base are locked, and the base itself ++ * is locked too. + * + * So __run_timers/migrate_timers can safely modify all timers which could +- * be found on ->tvX lists. ++ * be found in the base->vectors array. + * +- * When the timer's base is locked and removed from the list, the +- * TIMER_MIGRATING flag is set, FIXME ++ * When a timer is migrating then the TIMER_MIGRATING flag is set and we need ++ * to wait until the migration is done. 
+ */ + static struct timer_base *lock_timer_base(struct timer_list *timer, +- unsigned long *flags) ++ unsigned long *flags) + __acquires(timer->base->lock) + { + for (;;) { +- u32 tf = timer->flags; + struct timer_base *base; ++ u32 tf = timer->flags; + + if (!(tf & TIMER_MIGRATING)) { +- base = per_cpu_ptr(&timer_bases, tf & TIMER_CPUMASK); ++ base = get_timer_base(tf); + spin_lock_irqsave(&base->lock, *flags); + if (timer->flags == tf) + return base; +@@ -785,6 +927,27 @@ static inline int + unsigned long flags; + int ret = 0; + ++ /* ++ * TODO: Calculate the array bucket of the timer right here w/o ++ * holding the base lock. This allows to check not only ++ * timer->expires == expires below, but also whether the timer ++ * ends up in the same bucket. If we really need to requeue ++ * the timer then we check whether base->clk have ++ * advanced between here and locking the timer base. If ++ * jiffies advanced we have to recalc the array bucket with the ++ * lock held. ++ */ ++ ++ /* ++ * This is a common optimization triggered by the ++ * networking code - if the timer is re-modified ++ * to be the same thing then just return: ++ */ ++ if (timer_pending(timer)) { ++ if (timer->expires == expires) ++ return 1; ++ } ++ + timer_stats_timer_set_start_info(timer); + BUG_ON(!timer->function); + +@@ -796,15 +959,15 @@ static inline int + + debug_activate(timer, expires); + +- new_base = get_target_base(base, timer->flags & TIMER_PINNED); ++ new_base = get_target_base(base, timer->flags); + + if (base != new_base) { + /* +- * We are trying to schedule the timer on the local CPU. ++ * We are trying to schedule the timer on the new base. + * However we can't change timer's base while it is running, + * otherwise del_timer_sync() can't detect that the timer's +- * handler yet has not finished. This also guarantees that +- * the timer is serialized wrt itself. ++ * handler yet has not finished. This also guarantees that the ++ * timer is serialized wrt itself. 
+ */ + if (likely(base->running_timer != timer)) { + /* See the comment in lock_timer_base() */ +@@ -843,45 +1006,6 @@ int mod_timer_pending(struct timer_list + } + EXPORT_SYMBOL(mod_timer_pending); + +-/* +- * Decide where to put the timer while taking the slack into account +- * +- * Algorithm: +- * 1) calculate the maximum (absolute) time +- * 2) calculate the highest bit where the expires and new max are different +- * 3) use this bit to make a mask +- * 4) use the bitmask to round down the maximum time, so that all last +- * bits are zeros +- */ +-static inline +-unsigned long apply_slack(struct timer_list *timer, unsigned long expires) +-{ +- unsigned long expires_limit, mask; +- int bit; +- +- if (timer->slack >= 0) { +- expires_limit = expires + timer->slack; +- } else { +- long delta = expires - jiffies; +- +- if (delta < 256) +- return expires; +- +- expires_limit = expires + delta / 256; +- } +- mask = expires ^ expires_limit; +- if (mask == 0) +- return expires; +- +- bit = __fls(mask); +- +- mask = (1UL << bit) - 1; +- +- expires_limit = expires_limit & ~(mask); +- +- return expires_limit; +-} +- + /** + * mod_timer - modify a timer's timeout + * @timer: the timer to be modified +@@ -904,16 +1028,6 @@ unsigned long apply_slack(struct timer_l + */ + int mod_timer(struct timer_list *timer, unsigned long expires) + { +- expires = apply_slack(timer, expires); +- +- /* +- * This is a common optimization triggered by the +- * networking code - if the timer is re-modified +- * to be the same thing then just return: +- */ +- if (timer_pending(timer) && timer->expires == expires) +- return 1; +- + return __mod_timer(timer, expires, false); + } + EXPORT_SYMBOL(mod_timer); +@@ -948,13 +1062,14 @@ EXPORT_SYMBOL(add_timer); + */ + void add_timer_on(struct timer_list *timer, int cpu) + { +- struct timer_base *new_base = per_cpu_ptr(&timer_bases, cpu); +- struct timer_base *base; ++ struct timer_base *new_base, *base; + unsigned long flags; + + 
timer_stats_timer_set_start_info(timer); + BUG_ON(timer_pending(timer) || !timer->function); + ++ new_base = get_timer_cpu_base(timer->flags, cpu); ++ + /* + * If @timer was on a different CPU, it should be migrated with the + * old base locked to prevent other operations proceeding with the +@@ -1100,27 +1215,6 @@ int del_timer_sync(struct timer_list *ti + EXPORT_SYMBOL(del_timer_sync); + #endif + +-static int cascade(struct timer_base *base, struct tvec *tv, int index) +-{ +- /* cascade all the timers from tv up one level */ +- struct timer_list *timer; +- struct hlist_node *tmp; +- struct hlist_head tv_list; +- +- hlist_move_list(tv->vec + index, &tv_list); +- +- /* +- * We are removing _all_ timers from the list, so we +- * don't have to detach them individually. +- */ +- hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) { +- /* No accounting, while moving them */ +- __internal_add_timer(base, timer); +- } +- +- return index; +-} +- + static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), + unsigned long data) + { +@@ -1164,68 +1258,80 @@ static void call_timer_fn(struct timer_l + } + } + +-#define INDEX(N) ((base->clk >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) ++static void expire_timers(struct timer_base *base, struct hlist_head *head) ++{ ++ while (!hlist_empty(head)) { ++ struct timer_list *timer; ++ void (*fn)(unsigned long); ++ unsigned long data; ++ ++ timer = hlist_entry(head->first, struct timer_list, entry); ++ timer_stats_account_timer(timer); ++ ++ base->running_timer = timer; ++ detach_timer(timer, true); ++ ++ fn = timer->function; ++ data = timer->data; ++ ++ if (timer->flags & TIMER_IRQSAFE) { ++ spin_unlock(&base->lock); ++ call_timer_fn(timer, fn, data); ++ spin_lock(&base->lock); ++ } else { ++ spin_unlock_irq(&base->lock); ++ call_timer_fn(timer, fn, data); ++ spin_lock_irq(&base->lock); ++ } ++ } ++} ++ ++static int collect_expired_timers(struct timer_base *base, ++ struct hlist_head *heads) ++{ ++ unsigned 
long clk = base->clk; ++ struct hlist_head *vec; ++ int i, levels = 0; ++ unsigned int idx; ++ ++ for (i = 0; i < LVL_DEPTH; i++) { ++ idx = (clk & LVL_MASK) + i * LVL_SIZE; ++ ++ if (__test_and_clear_bit(idx, base->pending_map)) { ++ vec = base->vectors + idx; ++ hlist_move_list(vec, heads++); ++ levels++; ++ } ++ /* Is it time to look at the next level? */ ++ if (clk & LVL_CLK_MASK) ++ break; ++ /* Shift clock for the next level granularity */ ++ clk >>= LVL_CLK_SHIFT; ++ } ++ return levels; ++} + + /** + * __run_timers - run all expired timers (if any) on this CPU. + * @base: the timer vector to be processed. +- * +- * This function cascades all vectors and executes all expired timer +- * vectors. + */ + static inline void __run_timers(struct timer_base *base) + { +- struct timer_list *timer; ++ struct hlist_head heads[LVL_DEPTH]; ++ int levels; ++ ++ if (!time_after_eq(jiffies, base->clk)) ++ return; + + spin_lock_irq(&base->lock); + + while (time_after_eq(jiffies, base->clk)) { +- struct hlist_head work_list; +- struct hlist_head *head = &work_list; +- int index; + +- if (!base->all_timers) { +- base->clk = jiffies; +- break; +- } +- +- index = base->clk & TVR_MASK; ++ levels = collect_expired_timers(base, heads); ++ base->clk++; + +- /* +- * Cascade timers: +- */ +- if (!index && +- (!cascade(base, &base->tv2, INDEX(0))) && +- (!cascade(base, &base->tv3, INDEX(1))) && +- !cascade(base, &base->tv4, INDEX(2))) +- cascade(base, &base->tv5, INDEX(3)); +- ++base->clk; +- hlist_move_list(base->tv1.vec + index, head); +- while (!hlist_empty(head)) { +- void (*fn)(unsigned long); +- unsigned long data; +- bool irqsafe; +- +- timer = hlist_entry(head->first, struct timer_list, entry); +- fn = timer->function; +- data = timer->data; +- irqsafe = timer->flags & TIMER_IRQSAFE; +- +- timer_stats_account_timer(timer); +- +- base->running_timer = timer; +- detach_expired_timer(timer, base); +- +- if (irqsafe) { +- spin_unlock(&base->lock); +- call_timer_fn(timer, fn, data); 
+- spin_lock(&base->lock); +- } else { +- spin_unlock_irq(&base->lock); +- call_timer_fn(timer, fn, data); +- spin_lock_irq(&base->lock); +- } +- } ++ while (levels--) ++ expire_timers(base, heads + levels); + } + base->running_timer = NULL; + spin_unlock_irq(&base->lock); +@@ -1233,78 +1339,87 @@ static inline void __run_timers(struct t + + #ifdef CONFIG_NO_HZ_COMMON + /* +- * Find out when the next timer event is due to happen. This +- * is used on S/390 to stop all activity when a CPU is idle. +- * This function needs to be called with interrupts disabled. ++ * Find the next pending bucket of a level. Search from @offset + @clk upwards ++ * and if nothing there, search from start of the level (@offset) up to ++ * @offset + clk. ++ */ ++static int next_pending_bucket(struct timer_base *base, unsigned offset, ++ unsigned clk) ++{ ++ unsigned pos, start = offset + clk; ++ unsigned end = offset + LVL_SIZE; ++ ++ pos = find_next_bit(base->pending_map, end, start); ++ if (pos < end) ++ return pos - start; ++ ++ pos = find_next_bit(base->pending_map, start, offset); ++ return pos < start ? pos + LVL_SIZE - start : -1; ++} ++ ++/* ++ * Search the first expiring timer in the various clock levels. + */ + static unsigned long __next_timer_interrupt(struct timer_base *base) + { +- unsigned long clk = base->clk; +- unsigned long expires = clk + NEXT_TIMER_MAX_DELTA; +- int index, slot, array, found = 0; +- struct timer_list *nte; +- struct tvec *varray[4]; +- +- /* Look for timer events in tv1. */ +- index = slot = clk & TVR_MASK; +- do { +- hlist_for_each_entry(nte, base->tv1.vec + slot, entry) { +- if (nte->flags & TIMER_DEFERRABLE) +- continue; +- +- found = 1; +- expires = nte->expires; +- /* Look at the cascade bucket(s)? 
*/ +- if (!index || slot < index) +- goto cascade; +- return expires; +- } +- slot = (slot + 1) & TVR_MASK; +- } while (slot != index); ++ unsigned long clk, next, adj; ++ unsigned lvl, offset = 0; + +-cascade: +- /* Calculate the next cascade event */ +- if (index) +- clk += TVR_SIZE - index; +- clk >>= TVR_BITS; +- +- /* Check tv2-tv5. */ +- varray[0] = &base->tv2; +- varray[1] = &base->tv3; +- varray[2] = &base->tv4; +- varray[3] = &base->tv5; +- +- for (array = 0; array < 4; array++) { +- struct tvec *varp = varray[array]; +- +- index = slot = clk & TVN_MASK; +- do { +- hlist_for_each_entry(nte, varp->vec + slot, entry) { +- if (nte->flags & TIMER_DEFERRABLE) +- continue; +- +- found = 1; +- if (time_before(nte->expires, expires)) +- expires = nte->expires; +- } +- /* +- * Do we still search for the first timer or are +- * we looking up the cascade buckets ? +- */ +- if (found) { +- /* Look at the cascade bucket(s)? */ +- if (!index || slot < index) +- break; +- return expires; +- } +- slot = (slot + 1) & TVN_MASK; +- } while (slot != index); +- +- if (index) +- clk += TVN_SIZE - index; +- clk >>= TVN_BITS; ++ spin_lock(&base->lock); ++ next = base->clk + NEXT_TIMER_MAX_DELTA; ++ clk = base->clk; ++ for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { ++ int pos = next_pending_bucket(base, offset, clk & LVL_MASK); ++ ++ if (pos >= 0) { ++ unsigned long tmp = clk + (unsigned long) pos; ++ ++ tmp <<= LVL_SHIFT(lvl); ++ if (time_before(tmp, next)) ++ next = tmp; ++ } ++ /* ++ * Clock for the next level. If the current level clock lower ++ * bits are zero, we look at the next level as is. If not we ++ * need to advance it by one because that's going to be the ++ * next expiring bucket in that level. base->clk is the next ++ * expiring jiffie. So in case of: ++ * ++ * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 ++ * 0 0 0 0 0 0 ++ * ++ * we have to look at all levels @index 0. 
With ++ * ++ * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 ++ * 0 0 0 0 0 2 ++ * ++ * LVL0 has the next expiring bucket @index 2. The upper ++ * levels have the next expiring bucket @index 1. ++ * ++ * In case that the propagation wraps the next level the same ++ * rules apply: ++ * ++ * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 ++ * 0 0 0 0 F 2 ++ * ++ * So after looking at LVL0 we get: ++ * ++ * LVL5 LVL4 LVL3 LVL2 LVL1 ++ * 0 0 0 1 0 ++ * ++ * So no propagation from LVL1 to LVL2 because that happened ++ * with the add already, but then we need to propagate further ++ * from LVL2 to LVL3. ++ * ++ * So the simple check whether the lower bits of the current ++ * level are 0 or not is sufficient for all cases. ++ */ ++ adj = clk & LVL_CLK_MASK ? 1 : 0; ++ clk >>= LVL_CLK_SHIFT; ++ clk += adj; + } +- return expires; ++ spin_unlock(&base->lock); ++ return next; + } + + /* +@@ -1350,7 +1465,7 @@ static u64 cmp_next_hrtimer_event(u64 ba + */ + u64 get_next_timer_interrupt(unsigned long basej, u64 basem) + { +- struct timer_base *base = this_cpu_ptr(&timer_bases); ++ struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + u64 expires = KTIME_MAX; + unsigned long nextevt; + +@@ -1361,17 +1476,11 @@ u64 get_next_timer_interrupt(unsigned lo + if (cpu_is_offline(smp_processor_id())) + return expires; + +- spin_lock(&base->lock); +- if (base->active_timers) { +- if (time_before_eq(base->next_timer, base->clk)) +- base->next_timer = __next_timer_interrupt(base); +- nextevt = base->next_timer; +- if (time_before_eq(nextevt, basej)) +- expires = basem; +- else +- expires = basem + (nextevt - basej) * TICK_NSEC; +- } +- spin_unlock(&base->lock); ++ nextevt = __next_timer_interrupt(base); ++ if (time_before_eq(nextevt, basej)) ++ expires = basem; ++ else ++ expires = basem + (nextevt - basej) * TICK_NSEC; + + return cmp_next_hrtimer_event(basem, expires); + } +@@ -1402,10 +1511,11 @@ void update_process_times(int user_tick) + */ + static void run_timer_softirq(struct softirq_action *h) + { +- 
struct timer_base *base = this_cpu_ptr(&timer_bases); ++ struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + +- if (time_after_eq(jiffies, base->clk)) +- __run_timers(base); ++ __run_timers(base); ++ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) ++ __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); + } + + /* +@@ -1556,7 +1666,6 @@ static void migrate_timer_list(struct ti + + while (!hlist_empty(head)) { + timer = hlist_entry(head->first, struct timer_list, entry); +- /* We ignore the accounting on the dying cpu */ + detach_timer(timer, false); + timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; + internal_add_timer(new_base, timer); +@@ -1567,35 +1676,29 @@ static void migrate_timers(int cpu) + { + struct timer_base *old_base; + struct timer_base *new_base; +- int i; ++ int b, i; + + BUG_ON(cpu_online(cpu)); +- old_base = per_cpu_ptr(&timer_bases, cpu); +- new_base = get_cpu_ptr(&timer_bases); +- /* +- * The caller is globally serialized and nobody else +- * takes two locks at once, deadlock is not possible. +- */ +- spin_lock_irq(&new_base->lock); +- spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + +- BUG_ON(old_base->running_timer); ++ for (b = 0; b < NR_BASES; b++) { ++ old_base = per_cpu_ptr(&timer_bases[b], cpu); ++ new_base = get_cpu_ptr(&timer_bases[b]); ++ /* ++ * The caller is globally serialized and nobody else ++ * takes two locks at once, deadlock is not possible. 
++ */ ++ spin_lock_irq(&new_base->lock); ++ spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); ++ ++ BUG_ON(old_base->running_timer); ++ ++ for (i = 0; i < WHEEL_SIZE; i++) ++ migrate_timer_list(new_base, old_base->vectors + i); + +- for (i = 0; i < TVR_SIZE; i++) +- migrate_timer_list(new_base, old_base->tv1.vec + i); +- for (i = 0; i < TVN_SIZE; i++) { +- migrate_timer_list(new_base, old_base->tv2.vec + i); +- migrate_timer_list(new_base, old_base->tv3.vec + i); +- migrate_timer_list(new_base, old_base->tv4.vec + i); +- migrate_timer_list(new_base, old_base->tv5.vec + i); +- } +- +- old_base->active_timers = 0; +- old_base->all_timers = 0; +- +- spin_unlock(&old_base->lock); +- spin_unlock_irq(&new_base->lock); +- put_cpu_ptr(&timer_bases); ++ spin_unlock(&old_base->lock); ++ spin_unlock_irq(&new_base->lock); ++ put_cpu_ptr(&timer_bases); ++ } + } + + static int timer_cpu_notify(struct notifier_block *self, +@@ -1623,13 +1726,15 @@ static inline void timer_register_cpu_no + + static void __init init_timer_cpu(int cpu) + { +- struct timer_base *base = per_cpu_ptr(&timer_bases, cpu); +- +- base->cpu = cpu; +- spin_lock_init(&base->lock); ++ struct timer_base *base; ++ int i; + +- base->clk = jiffies; +- base->next_timer = base->clk; ++ for (i = 0; i < NR_BASES; i++) { ++ base = per_cpu_ptr(&timer_bases[i], cpu); ++ base->cpu = cpu; ++ spin_lock_init(&base->lock); ++ base->clk = jiffies; ++ } + } + + static void __init init_timer_cpus(void) diff --git a/patches/timer-add-setup_deferrable_timer-macro.patch b/patches/timer-add-setup_deferrable_timer-macro.patch new file mode 100644 index 00000000000000..3c7546c54ef447 --- /dev/null +++ b/patches/timer-add-setup_deferrable_timer-macro.patch @@ -0,0 +1,26 @@ +From: Lucas Stach <l.stach@pengutronix.de> +Date: Tue, 12 Jan 2016 18:17:19 +0100 +Subject: [PATCH] timer: add setup_deferrable_timer macro + +Upstream commit 6f3ffc19157a14b182d9d0c449cd613cef421fe1 + +Add the trivial missing macro to setup a deferrable 
timer. + +Signed-off-by: Lucas Stach <l.stach@pengutronix.de> +Acked-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/timer.h | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/include/linux/timer.h ++++ b/include/linux/timer.h +@@ -145,6 +145,8 @@ static inline void init_timer_on_stack_k + + #define setup_timer(timer, fn, data) \ + __setup_timer((timer), (fn), (data), 0) ++#define setup_deferrable_timer(timer, fn, data) \ ++ __setup_timer((timer), (fn), (data), TIMER_DEFERRABLE) + #define setup_timer_on_stack(timer, fn, data) \ + __setup_timer_on_stack((timer), (fn), (data), 0) + #define setup_deferrable_timer_on_stack(timer, fn, data) \ diff --git a/patches/timer-delay-waking-softirqs-from-the-jiffy-tick.patch b/patches/timer-delay-waking-softirqs-from-the-jiffy-tick.patch index 204dda85151b67..beb893e15ac260 100644 --- a/patches/timer-delay-waking-softirqs-from-the-jiffy-tick.patch +++ b/patches/timer-delay-waking-softirqs-from-the-jiffy-tick.patch @@ -58,7 +58,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- a/kernel/time/timer.c +++ b/kernel/time/timer.c -@@ -1456,13 +1456,13 @@ void update_process_times(int user_tick) +@@ -1639,13 +1639,13 @@ void update_process_times(int user_tick) /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); diff --git a/patches/timer-make-the-base-lock-raw.patch b/patches/timer-make-the-base-lock-raw.patch new file mode 100644 index 00000000000000..937534e3789c4b --- /dev/null +++ b/patches/timer-make-the-base-lock-raw.patch @@ -0,0 +1,180 @@ +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Wed, 13 Jul 2016 18:22:23 +0200 +Subject: [PATCH] timer: make the base lock raw + +The part where the base lock is held got more predictable / shorter after the +timer rework. One reason is the lack of re-cascading. +That means the lock can be made raw and held in IRQ context. 
+ +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/time/timer.c | 48 ++++++++++++++++++++++++------------------------ + 1 file changed, 24 insertions(+), 24 deletions(-) + +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -193,7 +193,7 @@ EXPORT_SYMBOL(jiffies_64); + #endif + + struct timer_base { +- spinlock_t lock; ++ raw_spinlock_t lock; + struct timer_list *running_timer; + unsigned long clk; + unsigned long next_expiry; +@@ -962,10 +962,10 @@ static struct timer_base *lock_timer_bas + + if (!(tf & TIMER_MIGRATING)) { + base = get_timer_base(tf); +- spin_lock_irqsave(&base->lock, *flags); ++ raw_spin_lock_irqsave(&base->lock, *flags); + if (timer->flags == tf) + return base; +- spin_unlock_irqrestore(&base->lock, *flags); ++ raw_spin_unlock_irqrestore(&base->lock, *flags); + } + cpu_relax(); + } +@@ -1032,9 +1032,9 @@ static inline int + /* See the comment in lock_timer_base() */ + timer->flags |= TIMER_MIGRATING; + +- spin_unlock(&base->lock); ++ raw_spin_unlock(&base->lock); + base = new_base; +- spin_lock(&base->lock); ++ raw_spin_lock(&base->lock); + WRITE_ONCE(timer->flags, + (timer->flags & ~TIMER_BASEMASK) | base->cpu); + } +@@ -1055,7 +1055,7 @@ static inline int + } + + out_unlock: +- spin_unlock_irqrestore(&base->lock, flags); ++ raw_spin_unlock_irqrestore(&base->lock, flags); + + return ret; + } +@@ -1149,16 +1149,16 @@ void add_timer_on(struct timer_list *tim + if (base != new_base) { + timer->flags |= TIMER_MIGRATING; + +- spin_unlock(&base->lock); ++ raw_spin_unlock(&base->lock); + base = new_base; +- spin_lock(&base->lock); ++ raw_spin_lock(&base->lock); + WRITE_ONCE(timer->flags, + (timer->flags & ~TIMER_BASEMASK) | cpu); + } + + debug_activate(timer, timer->expires); + internal_add_timer(base, timer); +- spin_unlock_irqrestore(&base->lock, flags); ++ raw_spin_unlock_irqrestore(&base->lock, flags); + } + EXPORT_SYMBOL_GPL(add_timer_on); + +@@ -1185,7 +1185,7 @@ int del_timer(struct timer_list *timer) + if 
(timer_pending(timer)) { + base = lock_timer_base(timer, &flags); + ret = detach_if_pending(timer, base, true); +- spin_unlock_irqrestore(&base->lock, flags); ++ raw_spin_unlock_irqrestore(&base->lock, flags); + } + + return ret; +@@ -1213,7 +1213,7 @@ int try_to_del_timer_sync(struct timer_l + timer_stats_timer_clear_start_info(timer); + ret = detach_if_pending(timer, base, true); + } +- spin_unlock_irqrestore(&base->lock, flags); ++ raw_spin_unlock_irqrestore(&base->lock, flags); + + return ret; + } +@@ -1345,13 +1345,13 @@ static void expire_timers(struct timer_b + data = timer->data; + + if (timer->flags & TIMER_IRQSAFE) { +- spin_unlock(&base->lock); ++ raw_spin_unlock(&base->lock); + call_timer_fn(timer, fn, data); +- spin_lock(&base->lock); ++ raw_spin_lock(&base->lock); + } else { +- spin_unlock_irq(&base->lock); ++ raw_spin_unlock_irq(&base->lock); + call_timer_fn(timer, fn, data); +- spin_lock_irq(&base->lock); ++ raw_spin_lock_irq(&base->lock); + } + } + } +@@ -1519,7 +1519,7 @@ u64 get_next_timer_interrupt(unsigned lo + if (cpu_is_offline(smp_processor_id())) + return expires; + +- spin_lock(&base->lock); ++ raw_spin_lock(&base->lock); + nextevt = __next_timer_interrupt(base); + base->next_expiry = nextevt; + /* +@@ -1541,7 +1541,7 @@ u64 get_next_timer_interrupt(unsigned lo + if ((expires - basem) > TICK_NSEC) + base->is_idle = true; + } +- spin_unlock(&base->lock); ++ raw_spin_unlock(&base->lock); + + return cmp_next_hrtimer_event(basem, expires); + } +@@ -1628,7 +1628,7 @@ static inline void __run_timers(struct t + if (!time_after_eq(jiffies, base->clk)) + return; + +- spin_lock_irq(&base->lock); ++ raw_spin_lock_irq(&base->lock); + + while (time_after_eq(jiffies, base->clk)) { + +@@ -1639,7 +1639,7 @@ static inline void __run_timers(struct t + expire_timers(base, heads + levels); + } + base->running_timer = NULL; +- spin_unlock_irq(&base->lock); ++ raw_spin_unlock_irq(&base->lock); + } + + /* +@@ -1834,16 +1834,16 @@ static void migrate_timers(int 
cpu) + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ +- spin_lock_irq(&new_base->lock); +- spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); ++ raw_spin_lock_irq(&new_base->lock); ++ raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + + BUG_ON(old_base->running_timer); + + for (i = 0; i < WHEEL_SIZE; i++) + migrate_timer_list(new_base, old_base->vectors + i); + +- spin_unlock(&old_base->lock); +- spin_unlock_irq(&new_base->lock); ++ raw_spin_unlock(&old_base->lock); ++ raw_spin_unlock_irq(&new_base->lock); + put_cpu_ptr(&timer_bases); + } + } +@@ -1879,7 +1879,7 @@ static void __init init_timer_cpu(int cp + for (i = 0; i < NR_BASES; i++) { + base = per_cpu_ptr(&timer_bases[i], cpu); + base->cpu = cpu; +- spin_lock_init(&base->lock); ++ raw_spin_lock_init(&base->lock); + base->clk = jiffies; + } + } diff --git a/patches/timers-avoid-the-base-null-otptimization-on-rt.patch b/patches/timers-avoid-the-base-null-otptimization-on-rt.patch deleted file mode 100644 index 4dbef989266724..00000000000000 --- a/patches/timers-avoid-the-base-null-otptimization-on-rt.patch +++ /dev/null @@ -1,73 +0,0 @@ -Subject: timers: Avoid the switch timers base set to NULL trick on RT -From: Thomas Gleixner <tglx@linutronix.de> -Date: Thu, 21 Jul 2011 15:23:39 +0200 - -On RT that code is preemptible, so we cannot assign NULL to timers -base as a preempter would spin forever in lock_timer_base(). 
- -Signed-off-by: Thomas Gleixner <tglx@linutronix.de> ---- - kernel/time/timer.c | 45 +++++++++++++++++++++++++++++++++++---------- - 1 file changed, 35 insertions(+), 10 deletions(-) - ---- a/kernel/time/timer.c -+++ b/kernel/time/timer.c -@@ -780,6 +780,39 @@ static struct tvec_base *lock_timer_base - cpu_relax(); - } - } -+#ifdef CONFIG_PREEMPT_RT_FULL -+static inline struct tvec_base *switch_timer_base(struct timer_list *timer, -+ struct tvec_base *old, -+ struct tvec_base *new) -+{ -+ /* -+ * We cannot do the below because we might be preempted and -+ * then the preempter would see NULL and loop forever. -+ */ -+ if (spin_trylock(&new->lock)) { -+ WRITE_ONCE(timer->flags, -+ (timer->flags & ~TIMER_BASEMASK) | new->cpu); -+ spin_unlock(&old->lock); -+ return new; -+ } -+ return old; -+} -+ -+#else -+static inline struct tvec_base *switch_timer_base(struct timer_list *timer, -+ struct tvec_base *old, -+ struct tvec_base *new) -+{ -+ /* See the comment in lock_timer_base() */ -+ timer->flags |= TIMER_MIGRATING; -+ -+ spin_unlock(&old->lock); -+ spin_lock(&new->lock); -+ WRITE_ONCE(timer->flags, -+ (timer->flags & ~TIMER_BASEMASK) | new->cpu); -+ return new; -+} -+#endif - - static inline int - __mod_timer(struct timer_list *timer, unsigned long expires, -@@ -810,16 +843,8 @@ static inline int - * handler yet has not finished. This also guarantees that - * the timer is serialized wrt itself. 
- */ -- if (likely(base->running_timer != timer)) { -- /* See the comment in lock_timer_base() */ -- timer->flags |= TIMER_MIGRATING; -- -- spin_unlock(&base->lock); -- base = new_base; -- spin_lock(&base->lock); -- WRITE_ONCE(timer->flags, -- (timer->flags & ~TIMER_BASEMASK) | base->cpu); -- } -+ if (likely(base->running_timer != timer)) -+ base = switch_timer_base(timer, base, new_base); - } - - timer->expires = expires; diff --git a/patches/timers-preempt-rt-support.patch b/patches/timers-preempt-rt-support.patch deleted file mode 100644 index d87595a0106f41..00000000000000 --- a/patches/timers-preempt-rt-support.patch +++ /dev/null @@ -1,54 +0,0 @@ -From: Ingo Molnar <mingo@elte.hu> -Date: Fri, 3 Jul 2009 08:30:20 -0500 -Subject: timers: Preempt-rt support - -The base->lock is a sleeping lock. Try not to workaround with a -spin_trylock(). The rt-mutex lock is not irq save even the try-lock -due to way how the inner lock accessed. Even with this fixed have the -problem that the owner is not the current process on the CPU but his -pid is used while taking the lock. Therefore we go with ext jiffy for -the wakeup. Also drop preempt_disable() usage since we need just to -ensure not to switch CPUs (the data structures have own locks). - -[bigeasy: dropped that spin try lock] -Signed-off-by: Ingo Molnar <mingo@elte.hu> -Signed-off-by: Thomas Gleixner <tglx@linutronix.de> ---- - kernel/time/timer.c | 12 ++++++++++-- - 1 file changed, 10 insertions(+), 2 deletions(-) - ---- a/kernel/time/timer.c -+++ b/kernel/time/timer.c -@@ -1422,6 +1422,14 @@ u64 get_next_timer_interrupt(unsigned lo - if (cpu_is_offline(smp_processor_id())) - return expires; - -+#ifdef CONFIG_PREEMPT_RT_FULL -+ /* -+ * On PREEMPT_RT we cannot sleep here. As a result we can't take -+ * the base lock to check when the next timer is pending and so -+ * we assume the next jiffy. 
-+ */ -+ return basem + TICK_NSEC; -+#endif - spin_lock(&base->lock); - if (base->active_timers) { - if (time_before_eq(base->next_timer, base->timer_jiffies)) -@@ -1632,7 +1640,7 @@ static void migrate_timers(int cpu) - - BUG_ON(cpu_online(cpu)); - old_base = per_cpu_ptr(&tvec_bases, cpu); -- new_base = get_cpu_ptr(&tvec_bases); -+ new_base = get_local_ptr(&tvec_bases); - /* - * The caller is globally serialized and nobody else - * takes two locks at once, deadlock is not possible. -@@ -1656,7 +1664,7 @@ static void migrate_timers(int cpu) - - spin_unlock(&old_base->lock); - spin_unlock_irq(&new_base->lock); -- put_cpu_ptr(&tvec_bases); -+ put_local_ptr(&tvec_bases); - } - - static int timer_cpu_notify(struct notifier_block *self, diff --git a/patches/timers-prepare-for-full-preemption.patch b/patches/timers-prepare-for-full-preemption.patch index 2ffc37e1e32400..328409275a6a1d 100644 --- a/patches/timers-prepare-for-full-preemption.patch +++ b/patches/timers-prepare-for-full-preemption.patch @@ -12,12 +12,12 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/linux/timer.h | 2 +- kernel/sched/core.c | 9 +++++++-- - kernel/time/timer.c | 41 ++++++++++++++++++++++++++++++++++++++--- - 3 files changed, 46 insertions(+), 6 deletions(-) + kernel/time/timer.c | 44 ++++++++++++++++++++++++++++++++++++++++---- + 3 files changed, 48 insertions(+), 7 deletions(-) --- a/include/linux/timer.h +++ b/include/linux/timer.h -@@ -225,7 +225,7 @@ extern void add_timer(struct timer_list +@@ -241,7 +241,7 @@ extern void add_timer(struct timer_list extern int try_to_del_timer_sync(struct timer_list *timer); @@ -56,17 +56,17 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> /* --- a/kernel/time/timer.c +++ b/kernel/time/timer.c -@@ -80,6 +80,9 @@ struct tvec_root { - struct tvec_base { - spinlock_t lock; - struct timer_list *running_timer; +@@ -195,6 +195,9 @@ EXPORT_SYMBOL(jiffies_64); + struct timer_base { + raw_spinlock_t lock; + struct timer_list 
*running_timer; +#ifdef CONFIG_PREEMPT_RT_FULL -+ wait_queue_head_t wait_for_running_timer; ++ struct swait_queue_head wait_for_running_timer; +#endif - unsigned long timer_jiffies; - unsigned long next_timer; - unsigned long active_timers; -@@ -1006,6 +1009,33 @@ void add_timer_on(struct timer_list *tim + unsigned long clk; + unsigned long next_expiry; + unsigned int cpu; +@@ -1162,6 +1165,33 @@ void add_timer_on(struct timer_list *tim } EXPORT_SYMBOL_GPL(add_timer_on); @@ -76,18 +76,18 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> + */ +static void wait_for_running_timer(struct timer_list *timer) +{ -+ struct tvec_base *base; ++ struct timer_base *base; + u32 tf = timer->flags; + + if (tf & TIMER_MIGRATING) + return; + -+ base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK); -+ wait_event(base->wait_for_running_timer, ++ base = get_timer_base(tf); ++ swait_event(base->wait_for_running_timer, + base->running_timer != timer); +} + -+# define wakeup_timer_waiters(b) wake_up(&(b)->wait_for_running_timer) ++# define wakeup_timer_waiters(b) swake_up_all(&(b)->wait_for_running_timer) +#else +static inline void wait_for_running_timer(struct timer_list *timer) +{ @@ -100,7 +100,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> /** * del_timer - deactive a timer. * @timer: the timer to be deactivated -@@ -1063,7 +1093,7 @@ int try_to_del_timer_sync(struct timer_l +@@ -1219,7 +1249,7 @@ int try_to_del_timer_sync(struct timer_l } EXPORT_SYMBOL(try_to_del_timer_sync); @@ -109,7 +109,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> /** * del_timer_sync - deactivate a timer and wait for the handler to finish. 
* @timer: the timer to be deactivated -@@ -1123,7 +1153,7 @@ int del_timer_sync(struct timer_list *ti +@@ -1279,7 +1309,7 @@ int del_timer_sync(struct timer_list *ti int ret = try_to_del_timer_sync(timer); if (ret >= 0) return ret; @@ -118,32 +118,41 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> } } EXPORT_SYMBOL(del_timer_sync); -@@ -1248,15 +1278,17 @@ static inline void __run_timers(struct t - if (irqsafe) { - spin_unlock(&base->lock); - call_timer_fn(timer, fn, data); -+ base->running_timer = NULL; - spin_lock(&base->lock); - } else { - spin_unlock_irq(&base->lock); - call_timer_fn(timer, fn, data); -+ base->running_timer = NULL; - spin_lock_irq(&base->lock); - } +@@ -1344,13 +1374,16 @@ static void expire_timers(struct timer_b + fn = timer->function; + data = timer->data; + +- if (timer->flags & TIMER_IRQSAFE) { ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && ++ timer->flags & TIMER_IRQSAFE) { + raw_spin_unlock(&base->lock); + call_timer_fn(timer, fn, data); ++ base->running_timer = NULL; + raw_spin_lock(&base->lock); + } else { + raw_spin_unlock_irq(&base->lock); + call_timer_fn(timer, fn, data); ++ base->running_timer = NULL; + raw_spin_lock_irq(&base->lock); } } +@@ -1638,8 +1671,8 @@ static inline void __run_timers(struct t + while (levels--) + expire_timers(base, heads + levels); + } - base->running_timer = NULL; + raw_spin_unlock_irq(&base->lock); + wakeup_timer_waiters(base); - spin_unlock_irq(&base->lock); } -@@ -1656,6 +1688,9 @@ static void __init init_timer_cpu(int cp - - base->cpu = cpu; - spin_lock_init(&base->lock); + /* +@@ -1881,6 +1914,9 @@ static void __init init_timer_cpu(int cp + base->cpu = cpu; + raw_spin_lock_init(&base->lock); + base->clk = jiffies; +#ifdef CONFIG_PREEMPT_RT_FULL -+ init_waitqueue_head(&base->wait_for_running_timer); ++ init_swait_queue_head(&base->wait_for_running_timer); +#endif + } + } - base->timer_jiffies = jiffies; - base->next_timer = base->timer_jiffies; diff --git 
a/patches/timers-wakeup-all-timer-waiters-without-holding-the-.patch b/patches/timers-wakeup-all-timer-waiters-without-holding-the-.patch deleted file mode 100644 index 2b51d4de5d2111..00000000000000 --- a/patches/timers-wakeup-all-timer-waiters-without-holding-the-.patch +++ /dev/null @@ -1,26 +0,0 @@ -From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> -Date: Wed, 13 Jul 2016 17:13:23 +0200 -Subject: [PATCH] timers: wakeup all timer waiters without holding the base lock - -There should be no need to hold the base lock during the wakeup. There -should be no boosting involved, the wakeup list has its own lock so it -should be safe to do this without the lock. - -Cc: stable-rt@vger.kernel.org -Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> ---- - kernel/time/timer.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - ---- a/kernel/time/timer.c -+++ b/kernel/time/timer.c -@@ -1288,8 +1288,8 @@ static inline void __run_timers(struct t - } - } - } -- wakeup_timer_waiters(base); - spin_unlock_irq(&base->lock); -+ wakeup_timer_waiters(base); - } - - #ifdef CONFIG_NO_HZ_COMMON diff --git a/patches/timers-wakeup-all-timer-waiters.patch b/patches/timers-wakeup-all-timer-waiters.patch deleted file mode 100644 index bc0956d1d01c32..00000000000000 --- a/patches/timers-wakeup-all-timer-waiters.patch +++ /dev/null @@ -1,30 +0,0 @@ -From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> -Date: Wed, 13 Jul 2016 17:13:23 +0200 -Subject: [PATCH] timers: wakeup all timer waiters - -The base lock is dropped during the invocation if the timer. That means -it is possible that we have one waiter while timer1 is running and once -this one finished, we get another waiter while timer2 is running. Since -we wake up only one waiter it is possible that we miss the other one. -This will probably heal itself over time because most of the time we -complete timers without an active wake up. 
-To avoid the scenario where we don't wake up all waiters at once, -wake_up_all() is used. - -Cc: stable-rt@vger.kernel.org -Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> ---- - kernel/time/timer.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - ---- a/kernel/time/timer.c -+++ b/kernel/time/timer.c -@@ -1026,7 +1026,7 @@ static void wait_for_running_timer(struc - base->running_timer != timer); - } - --# define wakeup_timer_waiters(b) wake_up(&(b)->wait_for_running_timer) -+# define wakeup_timer_waiters(b) wake_up_all(&(b)->wait_for_running_timer) - #else - static inline void wait_for_running_timer(struct timer_list *timer) - { diff --git a/patches/trace-correct-off-by-one-while-recording-the-trace-e.patch b/patches/trace-correct-off-by-one-while-recording-the-trace-e.patch deleted file mode 100644 index 5461b472b3d7bc..00000000000000 --- a/patches/trace-correct-off-by-one-while-recording-the-trace-e.patch +++ /dev/null @@ -1,49 +0,0 @@ -From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> -Date: Wed, 25 May 2016 14:03:50 +0200 -Subject: [PATCH] trace: correct off by one while recording the trace-event - -Trace events like raw_syscalls show always a preempt code of one. The -reason is that on PREEMPT kernels rcu_read_lock_sched_notrace() -increases the preemption counter and the function recording the counter -is caller within the RCU section. - -Cc: stable-rt@vger.kernel.org -Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> ---- - include/linux/tracepoint.h | 13 +++++++++++++ - kernel/trace/trace_events.c | 2 +- - 2 files changed, 14 insertions(+), 1 deletion(-) - ---- a/include/linux/tracepoint.h -+++ b/include/linux/tracepoint.h -@@ -33,6 +33,19 @@ struct trace_enum_map { - - #define TRACEPOINT_DEFAULT_PRIO 10 - -+/* -+ * The preempt count recorded in trace_event_raw_event_# are off by one due to -+ * rcu_read_lock_sched_notrace() in __DO_TRACE. This is corrected here. 
-+ */ -+static inline int event_preempt_count(void) -+{ -+#ifdef CONFIG_PREEMPT -+ return preempt_count() - 1; -+#else -+ return 0; -+#endif -+} -+ - extern int - tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data); - extern int ---- a/kernel/trace/trace_events.c -+++ b/kernel/trace/trace_events.c -@@ -243,7 +243,7 @@ void *trace_event_buffer_reserve(struct - return NULL; - - local_save_flags(fbuffer->flags); -- fbuffer->pc = preempt_count(); -+ fbuffer->pc = event_preempt_count(); - fbuffer->trace_file = trace_file; - - fbuffer->event = diff --git a/patches/tracing-Show-the-preempt-count-of-when-the-event-was.patch b/patches/tracing-Show-the-preempt-count-of-when-the-event-was.patch new file mode 100644 index 00000000000000..552c50520ece10 --- /dev/null +++ b/patches/tracing-Show-the-preempt-count-of-when-the-event-was.patch @@ -0,0 +1,41 @@ +From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> +Date: Fri, 17 Jun 2016 17:40:58 -0400 +Subject: [PATCH] tracing: Show the preempt count of when the event was called + +Upstream commit e947841c0dce9db675a957182214ef8091ac3d61 + +Because tracepoint callbacks are done with preemption enabled, the trace +events are always called with preempt disable due to the +rcu_read_lock_sched_notrace() in __DO_TRACE(). This causes the preempt count +shown in the recorded trace event to be inaccurate. It is always one more +that what the preempt_count was when the tracepoint was called. + +If CONFIG_PREEMPT is enabled, subtract 1 from the preempt_count before +recording it in the trace buffer. 
+ +Link: http://lkml.kernel.org/r/20160525132537.GA10808@linutronix.de + +Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Steven Rostedt <rostedt@goodmis.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/trace/trace_events.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/kernel/trace/trace_events.c ++++ b/kernel/trace/trace_events.c +@@ -244,6 +244,14 @@ void *trace_event_buffer_reserve(struct + + local_save_flags(fbuffer->flags); + fbuffer->pc = preempt_count(); ++ /* ++ * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables ++ * preemption (adding one to the preempt_count). Since we are ++ * interested in the preempt_count at the time the tracepoint was ++ * hit, we need to subtract one to offset the increment. ++ */ ++ if (IS_ENABLED(CONFIG_PREEMPT)) ++ fbuffer->pc--; + fbuffer->trace_file = trace_file; + + fbuffer->event = diff --git a/patches/tty-serial-8250-don-t-take-the-trylock-during-oops.patch b/patches/tty-serial-8250-don-t-take-the-trylock-during-oops.patch index 42b3efebfb4b78..231ef6120cde00 100644 --- a/patches/tty-serial-8250-don-t-take-the-trylock-during-oops.patch +++ b/patches/tty-serial-8250-don-t-take-the-trylock-during-oops.patch @@ -1,4 +1,3 @@ -From 08552bb6e497a6f37a31884083cdd2c046d0f674 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Date: Mon, 11 Apr 2016 16:55:02 +0200 Subject: [PATCH] tty: serial: 8250: don't take the trylock during oops diff --git a/patches/x86-apic-uv-Initialize-timer-as-pinned.patch b/patches/x86-apic-uv-Initialize-timer-as-pinned.patch new file mode 100644 index 00000000000000..4f2deba3c165ab --- /dev/null +++ b/patches/x86-apic-uv-Initialize-timer-as-pinned.patch @@ -0,0 +1,40 @@ +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 4 Jul 2016 09:50:16 +0000 +Subject: [PATCH 02/22] x86/apic/uv: Initialize timer as pinned + +Pinned timers must carry that attribute in the timer itself. 
No functional +change. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Chris Mason <clm@fb.com> +Cc: Eric Dumazet <edumazet@google.com> +Cc: rt@linutronix.de +Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> +Cc: Arjan van de Ven <arjan@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/x86/kernel/apic/x2apic_uv_x.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/x86/kernel/apic/x2apic_uv_x.c ++++ b/arch/x86/kernel/apic/x2apic_uv_x.c +@@ -755,7 +755,7 @@ static void uv_heartbeat(unsigned long i + uv_set_scir_bits(bits); + + /* enable next timer period */ +- mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL); ++ mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); + } + + static void uv_heartbeat_enable(int cpu) +@@ -764,7 +764,7 @@ static void uv_heartbeat_enable(int cpu) + struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; + + uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); +- setup_timer(timer, uv_heartbeat, cpu); ++ setup_pinned_timer(timer, uv_heartbeat, cpu); + timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; + add_timer_on(timer, cpu); + uv_cpu_hub_info(cpu)->scir.enabled = 1; diff --git a/patches/x86-mce-Initialize-timer-as-pinned.patch b/patches/x86-mce-Initialize-timer-as-pinned.patch new file mode 100644 index 00000000000000..3cae7d29e8f836 --- /dev/null +++ b/patches/x86-mce-Initialize-timer-as-pinned.patch @@ -0,0 +1,40 @@ +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 4 Jul 2016 09:50:17 +0000 +Subject: [PATCH 03/22] x86/mce: Initialize timer as pinned + +Pinned timers must carry that attribute in the timer itself. No functional +change. 
+ +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Chris Mason <clm@fb.com> +Cc: Eric Dumazet <edumazet@google.com> +Cc: rt@linutronix.de +Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> +Cc: Arjan van de Ven <arjan@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/x86/kernel/cpu/mcheck/mce.c ++++ b/arch/x86/kernel/cpu/mcheck/mce.c +@@ -1258,7 +1258,7 @@ static void __restart_timer(struct timer + + if (timer_pending(t)) { + if (time_before(when, t->expires)) +- mod_timer_pinned(t, when); ++ mod_timer(t, when); + } else { + t->expires = round_jiffies(when); + add_timer_on(t, smp_processor_id()); +@@ -1672,7 +1672,7 @@ static void __mcheck_cpu_init_timer(void + struct timer_list *t = this_cpu_ptr(&mce_timer); + unsigned int cpu = smp_processor_id(); + +- setup_timer(t, mce_timer_fn, cpu); ++ setup_pinned_timer(t, mce_timer_fn, cpu); + mce_start_timer(cpu, t); + } + diff --git a/patches/x86-mce-timer-hrtimer.patch b/patches/x86-mce-timer-hrtimer.patch index 2215344ce34f3e..97996cda925d8e 100644 --- a/patches/x86-mce-timer-hrtimer.patch +++ b/patches/x86-mce-timer-hrtimer.patch @@ -57,7 +57,7 @@ fold in: - - if (timer_pending(t)) { - if (time_before(when, t->expires)) -- mod_timer_pinned(t, when); +- mod_timer(t, when); - } else { - t->expires = round_jiffies(when); - add_timer_on(t, smp_processor_id()); @@ -134,7 +134,7 @@ fold in: + struct hrtimer *t = this_cpu_ptr(&mce_timer); unsigned int cpu = smp_processor_id(); -- setup_timer(t, mce_timer_fn, cpu); +- setup_pinned_timer(t, mce_timer_fn, cpu); + hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + t->function = mce_timer_fn; mce_start_timer(cpu, t); |