Merge tag 'sched-urgent-2026-01-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:
"Misc deadline scheduler fixes, mainly for a new category of bugs that
were discovered and fixed recently:
- Fix a race condition in the DL server
- Fix a DL server bug which can result in incorrectly going idle when
there's work available
- Fix a DL server bug which triggers a WARN() due to broken
get_prio_dl() logic and subsequent misbehavior
- Fix double update_rq_clock() calls
- Fix setscheduler() assumption about static priorities
- Make sure balancing callbacks are always called
- Plus a handful of preparatory commits for the fixes"
* tag 'sched-urgent-2026-01-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/deadline: Use ENQUEUE_MOVE to allow priority change
sched: Deadline has dynamic priority
sched: Audit MOVE vs balance_callbacks
sched: Fold rq-pin swizzle into __balance_callbacks()
sched/deadline: Avoid double update_rq_clock()
sched/deadline: Ensure get_prio_dl() is up-to-date
sched/deadline: Fix server stopping with runnable tasks
sched: Provide idle_rq() helper
sched/deadline: Fix potential race in dl_add_task_root_domain()
sched/deadline: Remove unnecessary comment in dl_add_task_root_domain()
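
As a rough orientation for the hunks below: several call sites are converted to a single __balance_callbacks(rq, rf) entry point that does the unpin/run-callbacks/repin dance itself, with callers that have already dropped the pin passing NULL. The following self-contained userspace sketch only models that calling convention; the struct and helper names in it are invented stand-ins for illustration, not the scheduler's real types or APIs.

/*
 * Minimal userspace model (not kernel code) of the pattern used below:
 * the callback runner drops and re-takes the caller's lock "pin" around
 * invoking the queued callbacks when the caller passes its pin state,
 * and skips that dance when the caller passes NULL.
 */
#include <stdio.h>
#include <stddef.h>

struct cb {
	void (*func)(void);
	struct cb *next;
};

struct fake_rq {
	struct cb *head;	/* queued balance-style callbacks */
	int pinned;		/* models rq_pin_lock()/rq_unpin_lock() state */
};

struct fake_rf {
	int dummy;		/* models struct rq_flags */
};

static void unpin(struct fake_rq *rq, struct fake_rf *rf)
{
	(void)rf;
	rq->pinned = 0;
}

static void repin(struct fake_rq *rq, struct fake_rf *rf)
{
	(void)rf;
	rq->pinned = 1;
}

/* Detach the queued callbacks from the runqueue and return the list head. */
static struct cb *splice_callbacks(struct fake_rq *rq)
{
	struct cb *head = rq->head;

	rq->head = NULL;
	return head;
}

/*
 * Single place that handles the pin swizzle: callers that still hold a
 * pinned lock pass their rf; callers that already unpinned pass NULL.
 */
static void run_balance_callbacks(struct fake_rq *rq, struct fake_rf *rf)
{
	struct cb *head;

	if (rf)
		unpin(rq, rf);

	for (head = splice_callbacks(rq); head; head = head->next)
		head->func();

	if (rf)
		repin(rq, rf);
}

static void say_hi(void)
{
	printf("balance callback ran\n");
}

int main(void)
{
	struct cb cb = { .func = say_hi, .next = NULL };
	struct fake_rq rq = { .head = &cb, .pinned = 1 };
	struct fake_rf rf = { 0 };

	/* Caller holding a pinned lock: let the helper do the swizzle. */
	run_balance_callbacks(&rq, &rf);

	/* Caller that has already unpinned passes NULL instead. */
	rq.head = &cb;
	run_balance_callbacks(&rq, NULL);

	return 0;
}
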
@@ -1874,7 +1874,6 @@ static inline int task_nice(const struct task_struct *p)
 extern int can_nice(const struct task_struct *p, const int nice);
 extern int task_curr(const struct task_struct *p);
 extern int idle_cpu(int cpu);
-extern int available_idle_cpu(int cpu);
 extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
 extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
 extern void sched_set_fifo(struct task_struct *p);
@@ -4950,9 +4950,13 @@ struct balance_callback *splice_balance_callbacks(struct rq *rq)
         return __splice_balance_callbacks(rq, true);
 }
 
-static void __balance_callbacks(struct rq *rq)
+void __balance_callbacks(struct rq *rq, struct rq_flags *rf)
 {
+        if (rf)
+                rq_unpin_lock(rq, rf);
         do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
+        if (rf)
+                rq_repin_lock(rq, rf);
 }
 
 void balance_callbacks(struct rq *rq, struct balance_callback *head)
@@ -4991,7 +4995,7 @@ static inline void finish_lock_switch(struct rq *rq)
          * prev into current:
          */
         spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
-        __balance_callbacks(rq);
+        __balance_callbacks(rq, NULL);
         raw_spin_rq_unlock_irq(rq);
 }
 
@@ -6867,7 +6871,7 @@ keep_resched:
                 proxy_tag_curr(rq, next);
 
                 rq_unpin_lock(rq, &rf);
-                __balance_callbacks(rq);
+                __balance_callbacks(rq, NULL);
                 raw_spin_rq_unlock_irq(rq);
         }
         trace_sched_exit_tp(is_switch);
@@ -7316,7 +7320,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
         trace_sched_pi_setprio(p, pi_task);
         oldprio = p->prio;
 
-        if (oldprio == prio)
+        if (oldprio == prio && !dl_prio(prio))
                 queue_flag &= ~DEQUEUE_MOVE;
 
         prev_class = p->sched_class;
@@ -7362,9 +7366,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 out_unlock:
         /* Caller holds task_struct::pi_lock, IRQs are still disabled */
 
-        rq_unpin_lock(rq, &rf);
-        __balance_callbacks(rq);
-        rq_repin_lock(rq, &rf);
+        __balance_callbacks(rq, &rf);
         __task_rq_unlock(rq, p, &rf);
 }
 #endif /* CONFIG_RT_MUTEXES */
@@ -9124,6 +9126,8 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup)
 
         if (resched)
                 resched_curr(rq);
+
+        __balance_callbacks(rq, &rq_guard.rf);
 }
 
 static struct cgroup_subsys_state *
@@ -752,8 +752,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
         struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
         struct rq *rq = rq_of_dl_rq(dl_rq);
 
-        update_rq_clock(rq);
-
         WARN_ON(is_dl_boosted(dl_se));
         WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
 
@@ -1420,7 +1418,7 @@ update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, int
 
 static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec)
 {
-        bool idle = rq->curr == rq->idle;
+        bool idle = idle_rq(rq);
         s64 scaled_delta_exec;
 
         if (unlikely(delta_exec <= 0)) {
@@ -1603,8 +1601,8 @@ void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
  * | 8 | B:zero_laxity-wait | | |
  * | | | <---+ |
  * | +--------------------------------+ |
- * | | ^ ^ 2 |
- * | | 7 | 2 +--------------------+
+ * | | ^ ^ 2 |
+ * | | 7 | 2, 1 +----------------+
  * | v |
  * | +-------------+ |
  * +-- | C:idle-wait | -+
@@ -1649,8 +1647,11 @@ void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
  *   dl_defer_idle = 0
  *
  *
- * [1] A->B, A->D
+ * [1] A->B, A->D, C->B
  * dl_server_start()
+ *   dl_defer_idle = 0;
+ *   if (dl_server_active)
+ *     return; // [B]
  *   dl_server_active = 1;
  *   enqueue_dl_entity()
  *     update_dl_entity(WAKEUP)
@@ -1759,6 +1760,7 @@ void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
  * "B:zero_laxity-wait" -> "C:idle-wait" [label="7:dl_server_update_idle"]
  * "B:zero_laxity-wait" -> "D:running" [label="3:dl_server_timer"]
  * "C:idle-wait" -> "A:init" [label="8:dl_server_timer"]
+ * "C:idle-wait" -> "B:zero_laxity-wait" [label="1:dl_server_start"]
 * "C:idle-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"]
 * "C:idle-wait" -> "C:idle-wait" [label="7:dl_server_update_idle"]
 * "D:running" -> "A:init" [label="4:pick_task_dl"]
@@ -1784,6 +1786,7 @@ void dl_server_start(struct sched_dl_entity *dl_se)
 {
         struct rq *rq = dl_se->rq;
 
+        dl_se->dl_defer_idle = 0;
         if (!dl_server(dl_se) || dl_se->dl_server_active)
                 return;
 
@@ -1834,6 +1837,7 @@ void sched_init_dl_servers(void)
                 rq = cpu_rq(cpu);
 
                 guard(rq_lock_irq)(rq);
+                update_rq_clock(rq);
 
                 dl_se = &rq->fair_server;
 
@@ -2210,7 +2214,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
                 update_dl_entity(dl_se);
         } else if (flags & ENQUEUE_REPLENISH) {
                 replenish_dl_entity(dl_se);
-        } else if ((flags & ENQUEUE_RESTORE) &&
+        } else if ((flags & ENQUEUE_MOVE) &&
                    !is_dl_boosted(dl_se) &&
                    dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
                 setup_new_dl_entity(dl_se);
@@ -3154,7 +3158,7 @@ void dl_add_task_root_domain(struct task_struct *p)
         struct rq *rq;
         struct dl_bw *dl_b;
         unsigned int cpu;
-        struct cpumask *msk = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
+        struct cpumask *msk;
 
         raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
         if (!dl_task(p) || dl_entity_is_special(&p->dl)) {
@@ -3162,20 +3166,12 @@ void dl_add_task_root_domain(struct task_struct *p)
                 return;
         }
 
-        /*
-         * Get an active rq, whose rq->rd traces the correct root
-         * domain.
-         * Ideally this would be under cpuset reader lock until rq->rd is
-         * fetched. However, sleepable locks cannot nest inside pi_lock, so we
-         * rely on the caller of dl_add_task_root_domain() holds 'cpuset_mutex'
-         * to guarantee the CPU stays in the cpuset.
-         */
+        msk = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
         dl_get_task_effective_cpus(p, msk);
         cpu = cpumask_first_and(cpu_active_mask, msk);
         BUG_ON(cpu >= nr_cpu_ids);
         rq = cpu_rq(cpu);
         dl_b = &rq->rd->dl_bw;
-        /* End of fetching rd */
 
         raw_spin_lock(&dl_b->lock);
         __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
@@ -3299,6 +3295,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
 
 static u64 get_prio_dl(struct rq *rq, struct task_struct *p)
 {
+        /*
+         * Make sure to update current so we don't return a stale value.
+         */
+        if (task_current_donor(rq, p))
+                update_curr_dl(rq);
+
         return p->dl.deadline;
 }
 
@@ -545,6 +545,7 @@ static void scx_task_iter_start(struct scx_task_iter *iter)
 static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
 {
         if (iter->locked_task) {
+                __balance_callbacks(iter->rq, &iter->rf);
                 task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
                 iter->locked_task = NULL;
         }
@@ -1364,6 +1364,28 @@ static inline u32 sched_rng(void)
 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
 #define raw_rq() raw_cpu_ptr(&runqueues)
 
+static inline bool idle_rq(struct rq *rq)
+{
+        return rq->curr == rq->idle && !rq->nr_running && !rq->ttwu_pending;
+}
+
+/**
+ * available_idle_cpu - is a given CPU idle for enqueuing work.
+ * @cpu: the CPU in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+static inline bool available_idle_cpu(int cpu)
+{
+        if (!idle_rq(cpu_rq(cpu)))
+                return 0;
+
+        if (vcpu_is_preempted(cpu))
+                return 0;
+
+        return 1;
+}
+
 #ifdef CONFIG_SCHED_PROXY_EXEC
 static inline void rq_set_donor(struct rq *rq, struct task_struct *t)
 {
@@ -2366,7 +2388,8 @@ extern const u32 sched_prio_to_wmult[40];
  * should preserve as much state as possible.
  *
  * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
- *        in the runqueue.
+ *        in the runqueue. IOW the priority is allowed to change. Callers
+ *        must expect to deal with balance callbacks.
  *
  * NOCLOCK - skip the update_rq_clock() (avoids double updates)
 *
@@ -3947,6 +3970,8 @@ extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
 extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
 
 extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
+
+extern void __balance_callbacks(struct rq *rq, struct rq_flags *rf);
 extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
 
 /*
@@ -180,35 +180,7 @@ int task_prio(const struct task_struct *p)
  */
 int idle_cpu(int cpu)
 {
-        struct rq *rq = cpu_rq(cpu);
-
-        if (rq->curr != rq->idle)
-                return 0;
-
-        if (rq->nr_running)
-                return 0;
-
-        if (rq->ttwu_pending)
-                return 0;
-
-        return 1;
-}
-
-/**
- * available_idle_cpu - is a given CPU idle for enqueuing work.
- * @cpu: the CPU in question.
- *
- * Return: 1 if the CPU is currently idle. 0 otherwise.
- */
-int available_idle_cpu(int cpu)
-{
-        if (!idle_cpu(cpu))
-                return 0;
-
-        if (vcpu_is_preempted(cpu))
-                return 0;
-
-        return 1;
+        return idle_rq(cpu_rq(cpu));
 }
 
 /**
@@ -667,7 +639,7 @@ change:
                  * itself.
                  */
                 newprio = rt_effective_prio(p, newprio);
-                if (newprio == oldprio)
+                if (newprio == oldprio && !dl_prio(newprio))
                         queue_flags &= ~DEQUEUE_MOVE;
         }
 