diff options
Diffstat (limited to 'kernel/sched/deadline.c')
-rw-r--r-- | kernel/sched/deadline.c | 882 |
1 files changed, 622 insertions, 260 deletions
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 23973e5714d972..eb743649f24a58 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -17,11 +17,18 @@ */ #include "sched.h" #include "pelt.h" +#include <linux/cpuset.h> struct dl_bandwidth def_dl_bandwidth; +static bool dl_server(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_server; +} + static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) { + BUG_ON(dl_server(dl_se)); return container_of(dl_se, struct task_struct, dl); } @@ -30,12 +37,19 @@ static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq) return container_of(dl_rq, struct rq, dl); } -static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se) +static inline struct rq *rq_of_dl_se(struct sched_dl_entity *dl_se) { - struct task_struct *p = dl_task_of(dl_se); - struct rq *rq = task_rq(p); + struct rq *rq = dl_se->rq; - return &rq->dl; + if (!dl_server(dl_se)) + rq = task_rq(dl_task_of(dl_se)); + + return rq; +} + +static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se) +{ + return &rq_of_dl_se(dl_se)->dl; } static inline int on_dl_rq(struct sched_dl_entity *dl_se) @@ -230,19 +244,12 @@ void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) __sub_running_bw(dl_se->dl_bw, dl_rq); } -static void dl_change_utilization(struct task_struct *p, u64 new_bw) +static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_se, u64 new_bw) { - struct rq *rq; - - BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV); - - if (task_on_rq_queued(p)) - return; + if (dl_se->dl_non_contending) { + sub_running_bw(dl_se, &rq->dl); + dl_se->dl_non_contending = 0; - rq = task_rq(p); - if (p->dl.dl_non_contending) { - sub_running_bw(&p->dl, &rq->dl); - p->dl.dl_non_contending = 0; /* * If the timer handler is currently running and the * timer cannot be canceled, inactive_task_timer() @@ -250,13 +257,27 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw) * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) { + if (!dl_server(dl_se)) + put_task_struct(dl_task_of(dl_se)); + } } - __sub_rq_bw(p->dl.dl_bw, &rq->dl); + __sub_rq_bw(dl_se->dl_bw, &rq->dl); __add_rq_bw(new_bw, &rq->dl); } +static void dl_change_utilization(struct task_struct *p, u64 new_bw) +{ + WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); + + if (task_on_rq_queued(p)) + return; + + dl_rq_change_utilization(task_rq(p), &p->dl, new_bw); +} + +static void __dl_clear_params(struct sched_dl_entity *dl_se); + /* * The utilization of a task cannot be immediately removed from * the rq active utilization (running_bw) when the task blocks. @@ -311,12 +332,11 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw) * up, and checks if the task is still in the "ACTIVE non contending" * state or not (in the second case, it updates running_bw). */ -static void task_non_contending(struct task_struct *p) +static void task_non_contending(struct sched_dl_entity *dl_se) { - struct sched_dl_entity *dl_se = &p->dl; struct hrtimer *timer = &dl_se->inactive_timer; - struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rq *rq = rq_of_dl_rq(dl_rq); + struct rq *rq = rq_of_dl_se(dl_se); + struct dl_rq *dl_rq = &rq->dl; s64 zerolag_time; /* @@ -346,24 +366,33 @@ static void task_non_contending(struct task_struct *p) * utilization now, instead of starting a timer */ if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) { - if (dl_task(p)) + if (dl_server(dl_se)) { sub_running_bw(dl_se, dl_rq); - if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - - if (READ_ONCE(p->__state) == TASK_DEAD) - sub_rq_bw(&p->dl, &rq->dl); - raw_spin_lock(&dl_b->lock); - __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); - __dl_clear_params(p); - raw_spin_unlock(&dl_b->lock); + } else { + struct task_struct *p = dl_task_of(dl_se); + + if (dl_task(p)) + sub_running_bw(dl_se, dl_rq); + + if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + + if (READ_ONCE(p->__state) == TASK_DEAD) + sub_rq_bw(dl_se, &rq->dl); + raw_spin_lock(&dl_b->lock); + __dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p))); + raw_spin_unlock(&dl_b->lock); + __dl_clear_params(dl_se); + } } return; } dl_se->dl_non_contending = 1; - get_task_struct(p); + if (!dl_server(dl_se)) + get_task_struct(dl_task_of(dl_se)); + hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD); } @@ -390,8 +419,10 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) - put_task_struct(dl_task_of(dl_se)); + if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) { + if (!dl_server(dl_se)) + put_task_struct(dl_task_of(dl_se)); + } } else { /* * Since "dl_non_contending" is not set, the @@ -404,9 +435,8 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) } } -static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) +static inline int is_leftmost(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { - struct sched_dl_entity *dl_se = &p->dl; return dl_rq->root.rb_leftmost == &dl_se->rb_node; } @@ -440,7 +470,6 @@ void init_dl_rq(struct dl_rq *dl_rq) /* zero means no -deadline tasks */ dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; - dl_rq->dl_nr_migratory = 0; dl_rq->overloaded = 0; dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED; #else @@ -484,39 +513,6 @@ static inline void dl_clear_overload(struct rq *rq) cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask); } -static void update_dl_migration(struct dl_rq *dl_rq) -{ - if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) { - if (!dl_rq->overloaded) { - dl_set_overload(rq_of_dl_rq(dl_rq)); - dl_rq->overloaded = 1; - } - } else if (dl_rq->overloaded) { - dl_clear_overload(rq_of_dl_rq(dl_rq)); - dl_rq->overloaded = 0; - } -} - -static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) -{ - struct task_struct *p = dl_task_of(dl_se); - - if (p->nr_cpus_allowed > 1) - dl_rq->dl_nr_migratory++; - - update_dl_migration(dl_rq); -} - -static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) -{ - struct task_struct *p = dl_task_of(dl_se); - - if (p->nr_cpus_allowed > 1) - dl_rq->dl_nr_migratory--; - - update_dl_migration(dl_rq); -} - #define __node_2_pdl(node) \ rb_entry((node), struct task_struct, pushable_dl_tasks) @@ -525,6 +521,11 @@ static inline bool __pushable_less(struct rb_node *a, const struct rb_node *b) return dl_entity_preempt(&__node_2_pdl(a)->dl, &__node_2_pdl(b)->dl); } +static inline int has_pushable_dl_tasks(struct rq *rq) +{ + return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root); +} + /* * The list of pushable -deadline task is not a plist, like in * sched_rt.c, it is an rb-tree with tasks ordered by deadline. @@ -540,6 +541,11 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) __pushable_less); if (leftmost) rq->dl.earliest_dl.next = p->dl.deadline; + + if (!rq->dl.overloaded) { + dl_set_overload(rq); + rq->dl.overloaded = 1; + } } static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) @@ -556,11 +562,11 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline; RB_CLEAR_NODE(&p->pushable_dl_tasks); -} -static inline int has_pushable_dl_tasks(struct rq *rq) -{ - return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root); + if (!has_pushable_dl_tasks(rq) && rq->dl.overloaded) { + dl_clear_overload(rq); + rq->dl.overloaded = 0; + } } static int push_dl_task(struct rq *rq); @@ -701,10 +707,28 @@ static inline void deadline_queue_pull_task(struct rq *rq) } #endif /* CONFIG_SMP */ +static void +enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags); static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); -static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); +static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags); static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags); +static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, + struct rq *rq) +{ + /* for non-boosted task, pi_of(dl_se) == dl_se */ + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; + + /* + * If it is a deferred reservation, throttle it. + */ + if (dl_se->dl_defer) { + dl_se->dl_throttled = 1; + dl_se->dl_defer_armed = 1; + } +} + /* * We are being explicitly informed that a new instance is starting, * and this means that: @@ -738,10 +762,12 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) * future; in fact, we must consider execution overheads (time * spent on hardirq context, etc.). */ - dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline; - dl_se->runtime = dl_se->dl_runtime; + replenish_dl_new_period(dl_se, rq); } +static int start_dl_timer(struct sched_dl_entity *dl_se); +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t); + /* * Pure Earliest Deadline First (EDF) scheduling does not deal with the * possibility of a entity lasting more than what it declared, and thus @@ -770,8 +796,15 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) /* * This could be the case for a !-dl task that is boosted. * Just go with full inherited parameters. + * + * Or, it could be the case of a deferred reservation that + * was not able to consume its runtime in background and + * reached this point with current u > U. + * + * In both cases, set a new period. */ - if (dl_se->dl_deadline == 0) { + if (dl_se->dl_deadline == 0 || + (dl_se->dl_defer_armed && dl_entity_overflow(dl_se, rq_clock(rq)))) { dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; dl_se->runtime = pi_of(dl_se)->dl_runtime; } @@ -801,14 +834,44 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { printk_deferred_once("sched: DL replenish lagged too much\n"); - dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; - dl_se->runtime = pi_of(dl_se)->dl_runtime; + replenish_dl_new_period(dl_se, rq); } if (dl_se->dl_yielded) dl_se->dl_yielded = 0; if (dl_se->dl_throttled) dl_se->dl_throttled = 0; + + /* + * If this is the replenishment of a deferred reservation, + * clear the flag and return. + */ + if (dl_se->dl_defer_armed) { + dl_se->dl_defer_armed = 0; + return; + } + + /* + * A this point, if the deferred server is not armed, and the deadline + * is in the future, throttle the server and arm the zero laxity timer. + */ + if (dl_se->dl_defer && + dl_time_before(rq_clock(rq), dl_se->deadline - dl_se->runtime)) { + if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) { + dl_se->dl_defer_armed = 1; + dl_se->dl_throttled = 1; + if (!start_dl_timer(dl_se)) { + /* + * If for whatever reason (delays), if a previous timer was + * queued but not serviced, cancel it. + */ + hrtimer_try_to_cancel(&dl_se->dl_timer); + dl_se->dl_defer_armed = 0; + dl_se->dl_throttled = 0; + return; + } + } + } } /* @@ -946,8 +1009,7 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) */ static void update_dl_entity(struct sched_dl_entity *dl_se) { - struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rq *rq = rq_of_dl_rq(dl_rq); + struct rq *rq = rq_of_dl_se(dl_se); if (dl_time_before(dl_se->deadline, rq_clock(rq)) || dl_entity_overflow(dl_se, rq_clock(rq))) { @@ -959,8 +1021,14 @@ static void update_dl_entity(struct sched_dl_entity *dl_se) return; } - dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; - dl_se->runtime = pi_of(dl_se)->dl_runtime; + replenish_dl_new_period(dl_se, rq); + } else if (dl_server(dl_se) && dl_se->dl_defer) { + /* + * The server can still use its previous deadline, so throttle + * and arm the zero-laxity timer. + */ + dl_se->dl_defer_armed = 1; + dl_se->dl_throttled = 1; } } @@ -979,11 +1047,11 @@ static inline u64 dl_next_period(struct sched_dl_entity *dl_se) * actually started or not (i.e., the replenishment instant is in * the future or in the past). */ -static int start_dl_timer(struct task_struct *p) +static int start_dl_timer(struct sched_dl_entity *dl_se) { - struct sched_dl_entity *dl_se = &p->dl; struct hrtimer *timer = &dl_se->dl_timer; - struct rq *rq = task_rq(p); + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); ktime_t now, act; s64 delta; @@ -993,8 +1061,20 @@ static int start_dl_timer(struct task_struct *p) * We want the timer to fire at the deadline, but considering * that it is actually coming from rq->clock and not from * hrtimer's time base reading. + * + * The deferred reservation will have its timer set to the zero laxity + * time (deadline - runtime). At that point, the CBS rule will decide + * if the current deadline can be used, or if a replenishment is + * required to avoid add too much pressure on the system (current u > + * U). */ - act = ns_to_ktime(dl_next_period(dl_se)); + if (dl_se->dl_defer_armed) { + WARN_ON_ONCE(!dl_se->dl_throttled); + act = ns_to_ktime(dl_se->deadline - dl_se->runtime); + } else { + act = ns_to_ktime(dl_next_period(dl_se)); + } + now = hrtimer_cb_get_time(timer); delta = ktime_to_ns(now) - rq_clock(rq); act = ktime_add_ns(act, delta); @@ -1017,13 +1097,33 @@ static int start_dl_timer(struct task_struct *p) * and observe our state. */ if (!hrtimer_is_queued(timer)) { - get_task_struct(p); + if (!dl_server(dl_se)) + get_task_struct(dl_task_of(dl_se)); hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD); } return 1; } +static void __push_dl_task(struct rq *rq, struct rq_flags *rf) +{ +#ifdef CONFIG_SMP + /* + * Queueing this task back might have overloaded rq, check if we need + * to kick someone away. + */ + if (has_pushable_dl_tasks(rq)) { + /* + * Nothing relies on rq->lock after this, so its safe to drop + * rq->lock. + */ + rq_unpin_lock(rq, rf); + push_dl_task(rq); + rq_repin_lock(rq, rf); + } +#endif +} + /* * This is the bandwidth enforcement timer callback. If here, we know * a task is not on its dl_rq, since the fact that the timer was running @@ -1042,10 +1142,34 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) struct sched_dl_entity *dl_se = container_of(timer, struct sched_dl_entity, dl_timer); - struct task_struct *p = dl_task_of(dl_se); + struct task_struct *p; struct rq_flags rf; struct rq *rq; + if (dl_server(dl_se)) { + struct rq *rq = rq_of_dl_se(dl_se); + struct rq_flags rf; + + rq_lock(rq, &rf); + if (dl_se->dl_throttled) { + sched_clock_tick(); + update_rq_clock(rq); + + if (dl_se->server_has_tasks(dl_se)) { + enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); + resched_curr(rq); + __push_dl_task(rq, &rf); + } else { + replenish_dl_entity(dl_se); + } + + } + rq_unlock(rq, &rf); + + return HRTIMER_NORESTART; + } + + p = dl_task_of(dl_se); rq = task_rq_lock(p, &rf); /* @@ -1116,21 +1240,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) else resched_curr(rq); -#ifdef CONFIG_SMP - /* - * Queueing this task back might have overloaded rq, check if we need - * to kick someone away. - */ - if (has_pushable_dl_tasks(rq)) { - /* - * Nothing relies on rq->lock after this, so its safe to drop - * rq->lock. - */ - rq_unpin_lock(rq, &rf); - push_dl_task(rq); - rq_repin_lock(rq, &rf); - } -#endif + __push_dl_task(rq, &rf); unlock: task_rq_unlock(rq, p, &rf); @@ -1144,7 +1254,7 @@ unlock: return HRTIMER_NORESTART; } -void init_dl_task_timer(struct sched_dl_entity *dl_se) +static void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; @@ -1172,12 +1282,11 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) */ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) { - struct task_struct *p = dl_task_of(dl_se); - struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se)); + struct rq *rq = rq_of_dl_se(dl_se); if (dl_time_before(dl_se->deadline, rq_clock(rq)) && dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { - if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p))) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) return; dl_se->dl_throttled = 1; if (dl_se->runtime > 0) @@ -1234,48 +1343,9 @@ static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) return (delta * u_act) >> BW_SHIFT; } -/* - * Update the current task's runtime statistics (provided it is still - * a -deadline task and has not been removed from the dl_rq). - */ -static void update_curr_dl(struct rq *rq) +s64 dl_scalled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) { - struct task_struct *curr = rq->curr; - struct sched_dl_entity *dl_se = &curr->dl; - u64 delta_exec, scaled_delta_exec; - int cpu = cpu_of(rq); - u64 now; - - if (!dl_task(curr) || !on_dl_rq(dl_se)) - return; - - /* - * Consumed budget is computed considering the time as - * observed by schedulable tasks (excluding time spent - * in hardirq context, etc.). Deadlines are instead - * computed using hard walltime. This seems to be the more - * natural solution, but the full ramifications of this - * approach need further study. - */ - now = rq_clock_task(rq); - delta_exec = now - curr->se.exec_start; - if (unlikely((s64)delta_exec <= 0)) { - if (unlikely(dl_se->dl_yielded)) - goto throttle; - return; - } - - schedstat_set(curr->se.statistics.exec_max, - max(curr->se.statistics.exec_max, delta_exec)); - - curr->se.sum_exec_runtime += delta_exec; - account_group_exec_runtime(curr, delta_exec); - - curr->se.exec_start = now; - cgroup_account_cputime(curr, delta_exec); - - if (dl_entity_is_special(dl_se)) - return; + s64 scaled_delta_exec; /* * For tasks that participate in GRUB, we implement GRUB-PA: the @@ -1285,10 +1355,9 @@ static void update_curr_dl(struct rq *rq) * according to current frequency and CPU maximum capacity. */ if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) { - scaled_delta_exec = grub_reclaim(delta_exec, - rq, - &curr->dl); + scaled_delta_exec = grub_reclaim(delta_exec, rq, dl_se); } else { + int cpu = cpu_of(rq); unsigned long scale_freq = arch_scale_freq_capacity(cpu); unsigned long scale_cpu = arch_scale_cpu_capacity(cpu); @@ -1296,8 +1365,58 @@ static void update_curr_dl(struct rq *rq) scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); } + return scaled_delta_exec; +} + +static inline void +update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, + int flags); +static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) +{ + s64 scaled_delta_exec; + + if (unlikely(delta_exec <= 0)) { + if (unlikely(dl_se->dl_yielded)) + goto throttle; + return; + } + + if (dl_server(dl_se) && dl_se->dl_throttled && !dl_se->dl_defer) + return; + + if (dl_entity_is_special(dl_se)) + return; + + scaled_delta_exec = dl_scalled_delta_exec(rq, dl_se, delta_exec); + dl_se->runtime -= scaled_delta_exec; + if (dl_server(dl_se) && !rq->curr->pid) + return; + + /* + * The fair server can consume its runtime while throttled (not queued). + * + * If the server consumes its entire runtime in this state. The server + * is not required for the current period. Thus, reset the server by + * starting a new period, pushing the activation to the zero-lax time. + */ + if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) { + hrtimer_try_to_cancel(&dl_se->dl_timer); + + replenish_dl_new_period(dl_se, dl_se->rq); + + /* + * Not being able to start the timer seems problematic. If it could not + * be started for whatever reason, we need to "unthrottle" the DL server + * and queue right away. Otherwise nothing might queue it. That's similar + * to what enqueue_dl_entity() does on start_dl_timer==0. For now, just warn. + */ + WARN_ON_ONCE(!start_dl_timer(dl_se)); + + return; + } + throttle: if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { dl_se->dl_throttled = 1; @@ -1307,15 +1426,30 @@ throttle: (dl_se->flags & SCHED_FLAG_DL_OVERRUN)) dl_se->dl_overrun = 1; - __dequeue_task_dl(rq, curr, 0); - if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr))) - enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); + dequeue_dl_entity(dl_se, 0); + if (!dl_server(dl_se)) + dequeue_pushable_dl_task(rq, dl_task_of(dl_se)); - if (!is_leftmost(curr, &rq->dl)) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) { + if (dl_server(dl_se)) + enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); + else + enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH); + } + + if (!is_leftmost(dl_se, &rq->dl)) resched_curr(rq); } /* + * The fair server (sole dl_server) does not account for real-time + * workload because it is running fair work. + */ + if (dl_se == &rq->fair_server) + return; + +#ifdef CONFIG_RT_GROUP_SCHED + /* * Because -- for now -- we share the rt bandwidth, we need to * account our runtime there too, otherwise actual rt tasks * would be able to exceed the shared quota. @@ -1339,6 +1473,121 @@ throttle: rt_rq->rt_time += delta_exec; raw_spin_unlock(&rt_rq->rt_runtime_lock); } +#endif +} + +void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) +{ + update_curr_dl_se(dl_se->rq, dl_se, delta_exec); +} + +void dl_server_start(struct sched_dl_entity *dl_se) +{ + /* + * XXX: the apply do not work fine at the init phase for the + * fair server because things are not yet set. We need to improve + * this before getting generic. + */ + if (!dl_server(dl_se)) { + u64 runtime = 50 * NSEC_PER_MSEC; + u64 period = 1000 * NSEC_PER_MSEC; + + dl_server_apply_params(dl_se, runtime, period, 1); + + dl_se->dl_defer = 1; + dl_se->dl_server = 1; + setup_new_dl_entity(dl_se); + } + + enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP); +} + +void dl_server_stop(struct sched_dl_entity *dl_se) +{ + dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); + hrtimer_try_to_cancel(&dl_se->dl_timer); + dl_se->dl_defer_armed = 0; + dl_se->dl_throttled = 0; +} + +void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, + dl_server_has_tasks_f has_tasks, + dl_server_pick_f pick_next, + dl_server_pick_f pick_task) +{ + dl_se->rq = rq; + dl_se->server_has_tasks = has_tasks; + dl_se->server_pick_next = pick_next; + dl_se->server_pick_task = pick_task; +} + +int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init) +{ + u64 old_bw = init ? 0 : to_ratio(dl_se->dl_period, dl_se->dl_runtime); + u64 new_bw = to_ratio(period, runtime); + struct rq *rq = dl_se->rq; + int cpu = cpu_of(rq); + struct dl_bw *dl_b; + unsigned long cap; + int retval = 0; + int cpus; + + dl_b = dl_bw_of(cpu); + raw_spin_lock(&dl_b->lock); + cpus = dl_bw_cpus(cpu); + cap = dl_bw_capacity(cpu); + + if (__dl_overflow(dl_b, cap, old_bw, new_bw)) { + retval = -EBUSY; + goto out; + } + + if (init) { + __add_rq_bw(new_bw, &rq->dl); + __dl_add(dl_b, new_bw, cpus); + } else { + __dl_sub(dl_b, dl_se->dl_bw, cpus); + __dl_add(dl_b, new_bw, cpus); + + dl_rq_change_utilization(rq, dl_se, new_bw); + } + + rq->fair_server.dl_runtime = runtime; + rq->fair_server.dl_deadline = period; + rq->fair_server.dl_period = period; + + dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); + dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); + +out: + raw_spin_unlock(&dl_b->lock); + + return retval; +} + +/* + * Update the current task's runtime statistics (provided it is still + * a -deadline task and has not been removed from the dl_rq). + */ +static void update_curr_dl(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct sched_dl_entity *dl_se = &curr->dl; + s64 delta_exec; + + if (!dl_task(curr) || !on_dl_rq(dl_se)) + return; + + /* + * Consumed budget is computed considering the time as + * observed by schedulable tasks (excluding time spent + * in hardirq context, etc.). Deadlines are instead + * computed using hard walltime. This seems to be the more + * natural solution, but the full ramifications of this + * approach need further study. + */ + delta_exec = update_curr_common(rq); + update_curr_dl_se(rq, dl_se, delta_exec); } static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) @@ -1346,15 +1595,24 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) struct sched_dl_entity *dl_se = container_of(timer, struct sched_dl_entity, inactive_timer); - struct task_struct *p = dl_task_of(dl_se); + struct task_struct *p = NULL; struct rq_flags rf; struct rq *rq; - rq = task_rq_lock(p, &rf); + if (!dl_server(dl_se)) { + p = dl_task_of(dl_se); + rq = task_rq_lock(p, &rf); + } else { + rq = dl_se->rq; + rq_lock(rq, &rf); + } sched_clock_tick(); update_rq_clock(rq); + if (dl_server(dl_se)) + goto no_task; + if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); @@ -1367,23 +1625,30 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) raw_spin_lock(&dl_b->lock); __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&dl_b->lock); - __dl_clear_params(p); + __dl_clear_params(dl_se); goto unlock; } + +no_task: if (dl_se->dl_non_contending == 0) goto unlock; sub_running_bw(dl_se, &rq->dl); dl_se->dl_non_contending = 0; unlock: - task_rq_unlock(rq, p, &rf); - put_task_struct(p); + + if (!dl_server(dl_se)) { + task_rq_unlock(rq, p, &rf); + put_task_struct(p); + } else { + rq_unlock(rq, &rf); + } return HRTIMER_NORESTART; } -void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) +static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->inactive_timer; @@ -1439,29 +1704,22 @@ static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} static inline void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { - int prio = dl_task_of(dl_se)->prio; u64 deadline = dl_se->deadline; - WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; add_nr_running(rq_of_dl_rq(dl_rq), 1); inc_dl_deadline(dl_rq, deadline); - inc_dl_migration(dl_se, dl_rq); } static inline void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { - int prio = dl_task_of(dl_se)->prio; - - WARN_ON(!dl_prio(prio)); WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; sub_nr_running(rq_of_dl_rq(dl_rq), 1); dec_dl_deadline(dl_rq, dl_se->deadline); - dec_dl_migration(dl_se, dl_rq); } #define __node_2_dle(node) \ @@ -1503,6 +1761,41 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) BUG_ON(on_dl_rq(dl_se)); /* + * Check if a constrained deadline task was activated + * after the deadline but before the next period. + * If that is the case, the task will be throttled and + * the replenishment timer will be set to the next period. + */ + if (!dl_se->dl_throttled && !dl_is_implicit(dl_se)) + dl_check_constrained_dl(dl_se); + + if (flags & (ENQUEUE_RESTORE|ENQUEUE_MIGRATING)) { + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + add_rq_bw(dl_se, dl_rq); + add_running_bw(dl_se, dl_rq); + } + + /* + * If p is throttled, we do not enqueue it. In fact, if it exhausted + * its budget it needs a replenishment and, since it now is on + * its rq, the bandwidth timer callback (which clearly has not + * run yet) will take care of this. + * However, the active utilization does not depend on the fact + * that the task is on the runqueue or not (but depends on the + * task's state - in GRUB parlance, "inactive" vs "active contending"). + * In other words, even if a task is throttled its utilization must + * be counted in the active utilization; hence, we need to call + * add_running_bw(). + */ + if (!dl_se->dl_defer && dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { + if (flags & ENQUEUE_WAKEUP) + task_contending(dl_se, flags); + + return; + } + + /* * If this is a wakeup or a new instance, the scheduling * parameters of the task might need updating. Otherwise, * we want a replenishment of its runtime. @@ -1513,17 +1806,54 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se); } else if ((flags & ENQUEUE_RESTORE) && - dl_time_before(dl_se->deadline, - rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) { + dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) { setup_new_dl_entity(dl_se); } + /* + * If we are still throttled, eg. we got replenished but are a + * zero-laxity task and still got to wait, don't enqueue. + */ + if (dl_se->dl_throttled && start_dl_timer(dl_se)) + return; + + /* + * We're about to enqueue, make sure we're not ->dl_throttled! + * In case the timer was not started, say because the 0-lax time + * has passed, mark as not throttled and mark unarmed. + * Also cancel earlier timers, since letting those run is pointless. + */ + if (dl_se->dl_throttled) { + hrtimer_try_to_cancel(&dl_se->dl_timer); + dl_se->dl_defer_armed = 0; + dl_se->dl_throttled = 0; + } + __enqueue_dl_entity(dl_se); } -static void dequeue_dl_entity(struct sched_dl_entity *dl_se) +static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags) { __dequeue_dl_entity(dl_se); + + if (flags & (DEQUEUE_SAVE|DEQUEUE_MIGRATING)) { + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + sub_running_bw(dl_se, dl_rq); + sub_rq_bw(dl_se, dl_rq); + } + + /* + * This check allows to start the inactive timer (or to immediately + * decrease the active utilization, if needed) in two cases: + * when the task blocks and when it is terminating + * (p->state == TASK_DEAD). We can handle the two cases in the same + * way, because from GRUB's point of view the same thing is happening + * (the task moves from "active contending" to "active non contending" + * or "inactive") + */ + if (flags & DEQUEUE_SLEEP) + task_non_contending(dl_se); } static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) @@ -1568,72 +1898,28 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) return; } - /* - * Check if a constrained deadline task was activated - * after the deadline but before the next period. - * If that is the case, the task will be throttled and - * the replenishment timer will be set to the next period. - */ - if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl)) - dl_check_constrained_dl(&p->dl); - - if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) { - add_rq_bw(&p->dl, &rq->dl); - add_running_bw(&p->dl, &rq->dl); - } + if (p->on_rq == TASK_ON_RQ_MIGRATING) + flags |= ENQUEUE_MIGRATING; - /* - * If p is throttled, we do not enqueue it. In fact, if it exhausted - * its budget it needs a replenishment and, since it now is on - * its rq, the bandwidth timer callback (which clearly has not - * run yet) will take care of this. - * However, the active utilization does not depend on the fact - * that the task is on the runqueue or not (but depends on the - * task's state - in GRUB parlance, "inactive" vs "active contending"). - * In other words, even if a task is throttled its utilization must - * be counted in the active utilization; hence, we need to call - * add_running_bw(). - */ - if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) { - if (flags & ENQUEUE_WAKEUP) - task_contending(&p->dl, flags); + enqueue_dl_entity(&p->dl, flags); + if (dl_server(&p->dl)) return; - } - - enqueue_dl_entity(&p->dl, flags); - if (!task_current(rq, p) && p->nr_cpus_allowed > 1) + if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } -static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) -{ - dequeue_dl_entity(&p->dl); - dequeue_pushable_dl_task(rq, p); -} - static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); - __dequeue_task_dl(rq, p, flags); - if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) { - sub_running_bw(&p->dl, &rq->dl); - sub_rq_bw(&p->dl, &rq->dl); - } + if (p->on_rq == TASK_ON_RQ_MIGRATING) + flags |= DEQUEUE_MIGRATING; - /* - * This check allows to start the inactive timer (or to immediately - * decrease the active utilization, if needed) in two cases: - * when the task blocks and when it is terminating - * (p->state == TASK_DEAD). We can handle the two cases in the same - * way, because from GRUB's point of view the same thing is happening - * (the task moves from "active contending" to "active non contending" - * or "inactive") - */ - if (flags & DEQUEUE_SLEEP) - task_non_contending(p); + dequeue_dl_entity(&p->dl, flags); + if (!p->dl.dl_throttled && !dl_server(&p->dl)) + dequeue_pushable_dl_task(rq, p); } /* @@ -1817,12 +2103,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, } #ifdef CONFIG_SCHED_HRTICK -static void start_hrtick_dl(struct rq *rq, struct task_struct *p) +static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { - hrtick_start(rq, p->dl.runtime); + hrtick_start(rq, dl_se->runtime); } #else /* !CONFIG_SCHED_HRTICK */ -static void start_hrtick_dl(struct rq *rq, struct task_struct *p) +static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { } #endif @@ -1837,9 +2123,6 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) if (!first) return; - if (hrtick_enabled_dl(rq)) - start_hrtick_dl(rq, p); - if (rq->curr->sched_class != &dl_sched_class) update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); @@ -1856,30 +2139,64 @@ static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) return rb_entry(left, struct sched_dl_entity, rb_node); } -static struct task_struct *pick_task_dl(struct rq *rq) +/* + * __pick_next_task_dl - Helper to pick the next -deadline task to run. + * @rq: The runqueue to pick the next task from. + * @peek: If true, just peek at the next task. Only relevant for dlserver. + */ +static struct task_struct *__pick_next_task_dl(struct rq *rq, bool peek) { struct sched_dl_entity *dl_se; struct dl_rq *dl_rq = &rq->dl; struct task_struct *p; +again: if (!sched_dl_runnable(rq)) return NULL; dl_se = pick_next_dl_entity(dl_rq); BUG_ON(!dl_se); - p = dl_task_of(dl_se); + + if (dl_server(dl_se)) { + if (IS_ENABLED(CONFIG_SMP) && peek) + p = dl_se->server_pick_task(dl_se); + else + p = dl_se->server_pick_next(dl_se); + if (!p) { + WARN_ON_ONCE(1); + dl_se->dl_yielded = 1; + update_curr_dl_se(rq, dl_se, 0); + goto again; + } + p->dl_server = dl_se; + } else { + p = dl_task_of(dl_se); + } return p; } +#ifdef CONFIG_SMP +static struct task_struct *pick_task_dl(struct rq *rq) +{ + return __pick_next_task_dl(rq, true); +} +#endif + static struct task_struct *pick_next_task_dl(struct rq *rq) { struct task_struct *p; - p = pick_task_dl(rq); - if (p) + p = __pick_next_task_dl(rq, false); + if (!p) + return p; + + if (!p->dl_server) set_next_task_dl(rq, p, true); + if (hrtick_enabled(rq)) + start_hrtick_dl(rq, &p->dl); + return p; } @@ -1911,8 +2228,8 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) * be set and schedule() will start a new hrtick for the next task. */ if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 && - is_leftmost(p, &rq->dl)) - start_hrtick_dl(rq, p); + is_leftmost(&p->dl, &rq->dl)) + start_hrtick_dl(rq, &p->dl); } static void task_fork_dl(struct task_struct *p) @@ -2141,9 +2458,6 @@ static int push_dl_task(struct rq *rq) struct rq *later_rq; int ret = 0; - if (!rq->dl.overloaded) - return 0; - next_task = pick_next_pushable_dl_task(rq); if (!next_task) return 0; @@ -2307,9 +2621,11 @@ skip: double_unlock_balance(this_rq, src_rq); if (push_task) { + preempt_disable(); raw_spin_rq_unlock(this_rq); stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, push_task, &src_rq->push_work); + preempt_enable(); raw_spin_rq_lock(this_rq); } } @@ -2444,7 +2760,13 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) * will reset the task parameters. */ if (task_on_rq_queued(p) && p->dl.dl_runtime) - task_non_contending(p); + task_non_contending(&p->dl); + + /* + * In case a task is setscheduled out from SCHED_DEADLINE we need to + * keep track of that on its cpuset (for correct bandwidth tracking). + */ + dec_dl_tasks_cs(p); if (!task_on_rq_queued(p)) { /* @@ -2486,6 +2808,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) put_task_struct(p); + /* + * In case a task is setscheduled to SCHED_DEADLINE we need to keep + * track of that on its cpuset (for correct bandwidth tracking). + */ + inc_dl_tasks_cs(p); + /* If p is not queued we will update its parameters at next wakeup. */ if (!task_on_rq_queued(p)) { add_rq_bw(&p->dl, &rq->dl); @@ -2829,10 +3157,8 @@ bool __checkparam_dl(const struct sched_attr *attr) /* * This function clears the sched_dl_entity static params. */ -void __dl_clear_params(struct task_struct *p) +static void __dl_clear_params(struct sched_dl_entity *dl_se) { - struct sched_dl_entity *dl_se = &p->dl; - dl_se->dl_runtime = 0; dl_se->dl_deadline = 0; dl_se->dl_period = 0; @@ -2844,12 +3170,21 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_yielded = 0; dl_se->dl_non_contending = 0; dl_se->dl_overrun = 0; + dl_se->dl_server = 0; #ifdef CONFIG_RT_MUTEXES dl_se->pi_se = dl_se; #endif } +void init_dl_entity(struct sched_dl_entity *dl_se) +{ + RB_CLEAR_NODE(&dl_se->rb_node); + init_dl_task_timer(dl_se); + init_dl_inactive_task_timer(dl_se); + __dl_clear_params(dl_se); +} + bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) { struct sched_dl_entity *dl_se = &p->dl; @@ -2885,26 +3220,38 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, return ret; } -int dl_cpu_busy(int cpu, struct task_struct *p) +enum dl_bw_request { + dl_bw_req_check_overflow = 0, + dl_bw_req_alloc, + dl_bw_req_free +}; + +static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) { - unsigned long flags, cap; + unsigned long flags; struct dl_bw *dl_b; - bool overflow; + bool overflow = 0; rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - cap = dl_bw_capacity(cpu); - overflow = __dl_overflow(dl_b, cap, 0, p ? p->dl.dl_bw : 0); - if (!overflow && p) { - /* - * We reserve space for this task in the destination - * root_domain, as we can't fail after this point. - * We will free resources in the source root_domain - * later on (see set_cpus_allowed_dl()). - */ - __dl_add(dl_b, p->dl.dl_bw, dl_bw_cpus(cpu)); + if (req == dl_bw_req_free) { + __dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu)); + } else { + unsigned long cap = dl_bw_capacity(cpu); + + overflow = __dl_overflow(dl_b, cap, 0, dl_bw); + + if (req == dl_bw_req_alloc && !overflow) { + /* + * We reserve space in the destination + * root_domain, as we can't fail after this point. + * We will free resources in the source root_domain + * later on (see set_cpus_allowed_dl()). + */ + __dl_add(dl_b, dl_bw, dl_bw_cpus(cpu)); + } } raw_spin_unlock_irqrestore(&dl_b->lock, flags); @@ -2912,6 +3259,21 @@ int dl_cpu_busy(int cpu, struct task_struct *p) return overflow ? -EBUSY : 0; } + +int dl_bw_check_overflow(int cpu) +{ + return dl_bw_manage(dl_bw_req_check_overflow, cpu, 0); +} + +int dl_bw_alloc(int cpu, u64 dl_bw) +{ + return dl_bw_manage(dl_bw_req_alloc, cpu, dl_bw); +} + +void dl_bw_free(int cpu, u64 dl_bw) +{ + dl_bw_manage(dl_bw_req_free, cpu, dl_bw); +} #endif #ifdef CONFIG_SCHED_DEBUG |