Merge tag 'sched-urgent-2026-01-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Ingo Molnar:
 "Misc deadline scheduler fixes, mainly for a new category of bugs that
  were discovered and fixed recently:

   - Fix a race condition in the DL server

   - Fix a DL server bug which can result in incorrectly going idle when
     there's work available

   - Fix a DL server bug which triggers a WARN() due to broken
     get_prio_dl() logic and subsequent misbehavior

   - Fix double update_rq_clock() calls

   - Fix setscheduler() assumption about static priorities

   - Make sure balancing callbacks are always called

   - Plus a handful of preparatory commits for the fixes"

* tag 'sched-urgent-2026-01-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/deadline: Use ENQUEUE_MOVE to allow priority change
  sched: Deadline has dynamic priority
  sched: Audit MOVE vs balance_callbacks
  sched: Fold rq-pin swizzle into __balance_callbacks()
  sched/deadline: Avoid double update_rq_clock()
  sched/deadline: Ensure get_prio_dl() is up-to-date
  sched/deadline: Fix server stopping with runnable tasks
  sched: Provide idle_rq() helper
  sched/deadline: Fix potential race in dl_add_task_root_domain()
  sched/deadline: Remove unnecessary comment in dl_add_task_root_domain()
Linus Torvalds
2026-01-18 10:17:40 -08:00
6 changed files with 59 additions and 56 deletions
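
To illustrate the "Deadline has dynamic priority" / ENQUEUE_MOVE fixes above, here is a minimal userspace sketch. It is not kernel code; every name in it (toy_task, requeue(), etc.) is invented for illustration. The point it models: a deadline-ordered queue goes stale if a task's deadline is changed in place without a dequeue/enqueue cycle, which is why setscheduler() and rt_mutex_setprio() can no longer drop DEQUEUE_MOVE for deadline tasks merely because the static priority is unchanged.

```c
/* Toy userspace model of "deadline is a dynamic priority"; not kernel code. */
#include <stdio.h>
#include <stdlib.h>

struct toy_task {
	const char *name;
	unsigned long long deadline;	/* earlier deadline == higher priority */
};

static int cmp_deadline(const void *a, const void *b)
{
	const struct toy_task *ta = a, *tb = b;

	return (ta->deadline > tb->deadline) - (ta->deadline < tb->deadline);
}

/* "Enqueue" by keeping the array sorted, i.e. the MOVE path. */
static void requeue(struct toy_task *rq, size_t nr)
{
	qsort(rq, nr, sizeof(*rq), cmp_deadline);
}

static void dump(const char *what, const struct toy_task *rq, size_t nr)
{
	printf("%s:", what);
	for (size_t i = 0; i < nr; i++)
		printf(" %s(%llu)", rq[i].name, rq[i].deadline);
	printf("\n");
}

int main(void)
{
	struct toy_task rq[] = {
		{ "A", 100 }, { "B", 200 }, { "C", 300 },
	};
	size_t nr = sizeof(rq) / sizeof(rq[0]);

	dump("initial (sorted by deadline)", rq, nr);

	/* A's deadline is pushed back, e.g. after a parameter change. */
	rq[0].deadline = 400;
	dump("in-place update, no MOVE (stale order)", rq, nr);

	/* With MOVE semantics the task is re-inserted at the right spot. */
	requeue(rq, nr);
	dump("after requeue (MOVE)", rq, nr);

	return 0;
}
```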

@@ -1874,7 +1874,6 @@ static inline int task_nice(const struct task_struct *p)
extern int can_nice(const struct task_struct *p, const int nice);
extern int task_curr(const struct task_struct *p);
extern int idle_cpu(int cpu);
extern int available_idle_cpu(int cpu);
extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
extern void sched_set_fifo(struct task_struct *p);

@@ -4950,9 +4950,13 @@ struct balance_callback *splice_balance_callbacks(struct rq *rq)
return __splice_balance_callbacks(rq, true);
}

static void __balance_callbacks(struct rq *rq)
void __balance_callbacks(struct rq *rq, struct rq_flags *rf)
{
	if (rf)
		rq_unpin_lock(rq, rf);

	do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));

	if (rf)
		rq_repin_lock(rq, rf);
}

void balance_callbacks(struct rq *rq, struct balance_callback *head)
@@ -4991,7 +4995,7 @@ static inline void finish_lock_switch(struct rq *rq)
* prev into current:
*/
spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
__balance_callbacks(rq);
__balance_callbacks(rq, NULL);
raw_spin_rq_unlock_irq(rq);
}
@@ -6867,7 +6871,7 @@ keep_resched:
proxy_tag_curr(rq, next);
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
__balance_callbacks(rq, NULL);
raw_spin_rq_unlock_irq(rq);
}
trace_sched_exit_tp(is_switch);
@@ -7316,7 +7320,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
trace_sched_pi_setprio(p, pi_task);
oldprio = p->prio;
if (oldprio == prio)
if (oldprio == prio && !dl_prio(prio))
queue_flag &= ~DEQUEUE_MOVE;
prev_class = p->sched_class;
@@ -7362,9 +7366,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
out_unlock:
/* Caller holds task_struct::pi_lock, IRQs are still disabled */
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
rq_repin_lock(rq, &rf);
__balance_callbacks(rq, &rf);
__task_rq_unlock(rq, p, &rf);
}
#endif /* CONFIG_RT_MUTEXES */
@@ -9124,6 +9126,8 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup)
if (resched)
resched_curr(rq);
__balance_callbacks(rq, &rq_guard.rf);
}
static struct cgroup_subsys_state *

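The core.c hunks above rework __balance_callbacks() to take the rq_flags and do the unpin/repin dance itself, so every caller runs pending balance work the same way. As a rough illustration of the underlying pattern, here is a self-contained userspace sketch; toy_rq, toy_queue_balance_callback() and friends are invented names, not the kernel API. Callbacks are queued on a per-rq list while the lock is held, then spliced off and run in one place.

```c
/* Userspace sketch of the splice-and-run balance-callback pattern. */
#include <stdio.h>
#include <stddef.h>

struct toy_rq;

struct toy_callback {
	struct toy_callback *next;
	void (*func)(struct toy_rq *rq);
};

struct toy_rq {
	int cpu;
	struct toy_callback *balance_callback;	/* pending work, LIFO */
};

static void toy_queue_balance_callback(struct toy_rq *rq, struct toy_callback *cb,
				       void (*func)(struct toy_rq *rq))
{
	cb->func = func;
	cb->next = rq->balance_callback;
	rq->balance_callback = cb;
}

/* Splice the whole list off the rq, then run it. */
static void toy_balance_callbacks(struct toy_rq *rq)
{
	struct toy_callback *head = rq->balance_callback;

	rq->balance_callback = NULL;

	/* kernel: rq_unpin_lock(rq, rf) would go here when rf != NULL */
	while (head) {
		struct toy_callback *next = head->next;

		head->func(rq);
		head = next;
	}
	/* kernel: rq_repin_lock(rq, rf) */
}

static void push_work(struct toy_rq *rq)
{
	printf("push/pull work on cpu %d\n", rq->cpu);
}

int main(void)
{
	struct toy_rq rq = { .cpu = 0, .balance_callback = NULL };
	struct toy_callback a, b;

	toy_queue_balance_callback(&rq, &a, push_work);
	toy_queue_balance_callback(&rq, &b, push_work);

	toy_balance_callbacks(&rq);	/* runs both queued callbacks */
	return 0;
}
```
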
@@ -752,8 +752,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
update_rq_clock(rq);
WARN_ON(is_dl_boosted(dl_se));
WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
@@ -1420,7 +1418,7 @@ update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, int
static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec)
{
bool idle = rq->curr == rq->idle;
bool idle = idle_rq(rq);
s64 scaled_delta_exec;
if (unlikely(delta_exec <= 0)) {
@@ -1603,8 +1601,8 @@ void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
* | 8 | B:zero_laxity-wait | | |
* | | | <---+ |
* | +--------------------------------+ |
* | | ^ ^ 2 |
* | | 7 | 2 +--------------------+
* | | ^ ^ 2 |
* | | 7 | 2, 1 +----------------+
* | v |
* | +-------------+ |
* +-- | C:idle-wait | -+
@@ -1649,8 +1647,11 @@ void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
* dl_defer_idle = 0
*
*
* [1] A->B, A->D
* [1] A->B, A->D, C->B
* dl_server_start()
* dl_defer_idle = 0;
* if (dl_server_active)
* return; // [B]
* dl_server_active = 1;
* enqueue_dl_entity()
* update_dl_entity(WAKEUP)
@@ -1759,6 +1760,7 @@ void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
* "B:zero_laxity-wait" -> "C:idle-wait" [label="7:dl_server_update_idle"]
* "B:zero_laxity-wait" -> "D:running" [label="3:dl_server_timer"]
* "C:idle-wait" -> "A:init" [label="8:dl_server_timer"]
* "C:idle-wait" -> "B:zero_laxity-wait" [label="1:dl_server_start"]
* "C:idle-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"]
* "C:idle-wait" -> "C:idle-wait" [label="7:dl_server_update_idle"]
* "D:running" -> "A:init" [label="4:pick_task_dl"]
@@ -1784,6 +1786,7 @@ void dl_server_start(struct sched_dl_entity *dl_se)
{
struct rq *rq = dl_se->rq;
dl_se->dl_defer_idle = 0;
if (!dl_server(dl_se) || dl_se->dl_server_active)
return;
@@ -1834,6 +1837,7 @@ void sched_init_dl_servers(void)
rq = cpu_rq(cpu);
guard(rq_lock_irq)(rq);
update_rq_clock(rq);
dl_se = &rq->fair_server;
@@ -2210,7 +2214,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
update_dl_entity(dl_se);
} else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se);
} else if ((flags & ENQUEUE_RESTORE) &&
} else if ((flags & ENQUEUE_MOVE) &&
!is_dl_boosted(dl_se) &&
dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
setup_new_dl_entity(dl_se);
@@ -3154,7 +3158,7 @@ void dl_add_task_root_domain(struct task_struct *p)
struct rq *rq;
struct dl_bw *dl_b;
unsigned int cpu;
struct cpumask *msk = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
struct cpumask *msk;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
if (!dl_task(p) || dl_entity_is_special(&p->dl)) {
@@ -3162,20 +3166,12 @@ void dl_add_task_root_domain(struct task_struct *p)
return;
}
/*
* Get an active rq, whose rq->rd traces the correct root
* domain.
* Ideally this would be under cpuset reader lock until rq->rd is
* fetched. However, sleepable locks cannot nest inside pi_lock, so we
 * rely on the caller of dl_add_task_root_domain() holding 'cpuset_mutex'
* to guarantee the CPU stays in the cpuset.
*/
msk = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
dl_get_task_effective_cpus(p, msk);
cpu = cpumask_first_and(cpu_active_mask, msk);
BUG_ON(cpu >= nr_cpu_ids);
rq = cpu_rq(cpu);
dl_b = &rq->rd->dl_bw;
/* End of fetching rd */
raw_spin_lock(&dl_b->lock);
__dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
@@ -3299,6 +3295,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
static u64 get_prio_dl(struct rq *rq, struct task_struct *p)
{
	/*
	 * Make sure to update current so we don't return a stale value.
	 */
	if (task_current_donor(rq, p))
		update_curr_dl(rq);

	return p->dl.deadline;
}

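The state-machine comment updated in the deadline.c hunks above gains a C:idle-wait -> B:zero_laxity-wait edge taken via dl_server_start(), matching the "Fix server stopping with runnable tasks" change. The toy C encoding below is illustrative only: the enum, event and function names are invented, only the transitions visible in this hunk are modeled, and the real code drives them from hrtimers and rq state.

```c
/* Toy encoding of the DL-server state diagram shown in the comment above. */
#include <stdio.h>

enum toy_state { S_INIT, S_ZERO_LAXITY_WAIT, S_IDLE_WAIT, S_RUNNING };
enum toy_event { E_SERVER_START, E_SERVER_UPDATE, E_TIMER, E_PICK_TASK, E_UPDATE_IDLE };

static const char *const state_name[] = {
	"A:init", "B:zero_laxity-wait", "C:idle-wait", "D:running",
};

static enum toy_state toy_step(enum toy_state s, enum toy_event e)
{
	switch (s) {
	case S_INIT:
		if (e == E_SERVER_START)
			return S_ZERO_LAXITY_WAIT;	/* or D:running when not deferred */
		break;
	case S_ZERO_LAXITY_WAIT:
		if (e == E_TIMER)
			return S_RUNNING;
		if (e == E_UPDATE_IDLE)
			return S_IDLE_WAIT;
		break;
	case S_IDLE_WAIT:
		if (e == E_SERVER_START || e == E_SERVER_UPDATE)
			return S_ZERO_LAXITY_WAIT;	/* new edge: a restart leaves idle-wait */
		if (e == E_TIMER)
			return S_INIT;
		break;
	case S_RUNNING:
		if (e == E_PICK_TASK)
			return S_INIT;
		break;
	}
	return s;	/* unmodeled events keep the current state */
}

int main(void)
{
	enum toy_state s = S_INIT;
	const enum toy_event walk[] = {
		E_SERVER_START, E_UPDATE_IDLE, E_SERVER_START, E_TIMER, E_PICK_TASK,
	};

	for (unsigned int i = 0; i < sizeof(walk) / sizeof(walk[0]); i++) {
		s = toy_step(s, walk[i]);
		printf("-> %s\n", state_name[s]);
	}
	return 0;
}
```
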
@@ -545,6 +545,7 @@ static void scx_task_iter_start(struct scx_task_iter *iter)
static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
{
if (iter->locked_task) {
__balance_callbacks(iter->rq, &iter->rf);
task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
iter->locked_task = NULL;
}

@@ -1364,6 +1364,28 @@ static inline u32 sched_rng(void)
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() raw_cpu_ptr(&runqueues)

static inline bool idle_rq(struct rq *rq)
{
	return rq->curr == rq->idle && !rq->nr_running && !rq->ttwu_pending;
}

/**
 * available_idle_cpu - is a given CPU idle for enqueuing work.
 * @cpu: the CPU in question.
 *
 * Return: 1 if the CPU is currently idle. 0 otherwise.
 */
static inline bool available_idle_cpu(int cpu)
{
	if (!idle_rq(cpu_rq(cpu)))
		return 0;

	if (vcpu_is_preempted(cpu))
		return 0;

	return 1;
}

#ifdef CONFIG_SCHED_PROXY_EXEC
static inline void rq_set_donor(struct rq *rq, struct task_struct *t)
{
@@ -2366,7 +2388,8 @@ extern const u32 sched_prio_to_wmult[40];
* should preserve as much state as possible.
*
* MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
* in the runqueue.
* in the runqueue. IOW the priority is allowed to change. Callers
* must expect to deal with balance callbacks.
*
* NOCLOCK - skip the update_rq_clock() (avoids double updates)
*
@@ -3947,6 +3970,8 @@ extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
extern void __balance_callbacks(struct rq *rq, struct rq_flags *rf);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
/*

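The new idle_rq() helper above centralizes the "current is the idle task, nothing runnable, no pending ttwu" test so that idle_cpu(), available_idle_cpu() and the deadline server agree on what "idle" means. Below is a minimal userspace model of that check; the struct fields mirror the diff, while everything else (the toy_ names, the vcpu_is_preempted() stand-in, the main() harness) is made up for illustration.

```c
/* Minimal userspace model of the consolidated idle_rq() check. */
#include <stdbool.h>
#include <stdio.h>

struct toy_task { const char *comm; };

struct toy_rq {
	const struct toy_task *curr;
	const struct toy_task *idle;
	unsigned int nr_running;
	int ttwu_pending;
};

static bool toy_idle_rq(const struct toy_rq *rq)
{
	return rq->curr == rq->idle && !rq->nr_running && !rq->ttwu_pending;
}

/* Stand-in for vcpu_is_preempted(); always "not preempted" here. */
static bool toy_vcpu_is_preempted(int cpu)
{
	(void)cpu;
	return false;
}

static bool toy_available_idle_cpu(const struct toy_rq *rq, int cpu)
{
	return toy_idle_rq(rq) && !toy_vcpu_is_preempted(cpu);
}

int main(void)
{
	const struct toy_task idle_task = { "swapper/0" };
	struct toy_rq rq = { .curr = &idle_task, .idle = &idle_task };

	printf("idle: %d, available: %d\n",
	       toy_idle_rq(&rq), toy_available_idle_cpu(&rq, 0));

	rq.ttwu_pending = 1;	/* a wakeup is in flight: no longer idle */
	printf("idle: %d, available: %d\n",
	       toy_idle_rq(&rq), toy_available_idle_cpu(&rq, 0));

	return 0;
}
```
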
@@ -180,35 +180,7 @@ int task_prio(const struct task_struct *p)
*/
int idle_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
if (rq->curr != rq->idle)
return 0;
if (rq->nr_running)
return 0;
if (rq->ttwu_pending)
return 0;
return 1;
}
/**
* available_idle_cpu - is a given CPU idle for enqueuing work.
* @cpu: the CPU in question.
*
* Return: 1 if the CPU is currently idle. 0 otherwise.
*/
int available_idle_cpu(int cpu)
{
if (!idle_cpu(cpu))
return 0;
if (vcpu_is_preempted(cpu))
return 0;
return 1;
return idle_rq(cpu_rq(cpu));
}
/**
@@ -667,7 +639,7 @@ change:
* itself.
*/
newprio = rt_effective_prio(p, newprio);
if (newprio == oldprio)
if (newprio == oldprio && !dl_prio(newprio))
queue_flags &= ~DEQUEUE_MOVE;
}