Merge tag 'sched_urgent_for_v5.17_rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Borislav Petkov:
 "A bunch of fixes: forced idle time accounting, utilization values
  propagation in the sched hierarchies and other minor cleanups and
  improvements"

* tag 'sched_urgent_for_v5.17_rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  kernel/sched: Remove dl_boosted flag comment
  sched: Avoid double preemption in __cond_resched_*lock*()
  sched/fair: Fix all kernel-doc warnings
  sched/core: Accounting forceidle time for all tasks except idle task
  sched/pelt: Relax the sync of load_sum with load_avg
  sched/pelt: Relax the sync of runnable_sum with runnable_avg
  sched/pelt: Continue to relax the sync of util_sum with util_avg
  sched/pelt: Relax the sync of util_sum with util_avg
  psi: Fix uaf issue when psi trigger is destroyed while being polled
torvalds committed Jan 23, 2022
2 parents 0f9e042 + 0e38724 commit 10c64a0
Showing 10 changed files with 125 additions and 103 deletions.
3 changes: 2 additions & 1 deletion Documentation/accounting/psi.rst
@@ -92,7 +92,8 @@ Triggers can be set on more than one psi metric and more than one trigger
for the same psi metric can be specified. However for each trigger a separate
file descriptor is required to be able to poll it separately from others,
therefore for each trigger a separate open() syscall should be made even
when opening the same psi interface file.
when opening the same psi interface file. Write operations to a file descriptor
with an already existing psi trigger will fail with EBUSY.

Monitors activate only when system enters stall state for the monitored
psi metric and deactivates upon exit from the stall state. While system is
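
The trigger interface documented above can be exercised from userspace roughly as follows. This is a minimal sketch based on the example in Documentation/accounting/psi.rst; the threshold and window values are illustrative, and with the change above a second write to the same file descriptor fails with EBUSY.

/*
 * Minimal sketch of the documented psi trigger usage: one trigger per
 * open file descriptor, notified via POLLPRI. The threshold/window
 * values (150ms of stall per 1s window) are illustrative only.
 */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char trig[] = "some 150000 1000000";
	struct pollfd fds;
	int fd;

	fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (write(fd, trig, strlen(trig) + 1) < 0) {
		perror("write");	/* a second write to this fd would now fail with EBUSY */
		close(fd);
		return 1;
	}

	fds.fd = fd;
	fds.events = POLLPRI;

	/* Block until the monitored stall threshold is crossed. */
	if (poll(&fds, 1, -1) > 0 && (fds.revents & POLLPRI))
		printf("memory pressure event\n");

	close(fd);
	return 0;
}
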
2 changes: 1 addition & 1 deletion include/linux/psi.h
@@ -33,7 +33,7 @@ void cgroup_move_task(struct task_struct *p, struct css_set *to);

struct psi_trigger *psi_trigger_create(struct psi_group *group,
char *buf, size_t nbytes, enum psi_res res);
void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *t);
void psi_trigger_destroy(struct psi_trigger *t);

__poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
poll_table *wait);
3 changes: 0 additions & 3 deletions include/linux/psi_types.h
@@ -141,9 +141,6 @@ struct psi_trigger {
* events to one per window
*/
u64 last_event_time;

/* Refcounting to prevent premature destruction */
struct kref refcount;
};

struct psi_group {
4 changes: 0 additions & 4 deletions include/linux/sched.h
@@ -619,10 +619,6 @@ struct sched_dl_entity {
* task has to wait for a replenishment to be performed at the
* next firing of dl_timer.
*
* @dl_boosted tells if we are boosted due to DI. If so we are
* outside bandwidth enforcement mechanism (but only until we
* exit the critical section);
*
* @dl_yielded tells if task gave up the CPU before consuming
* all its available runtime during the last job.
*
11 changes: 8 additions & 3 deletions kernel/cgroup/cgroup.c
@@ -3643,15 +3643,20 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
cgroup_get(cgrp);
cgroup_kn_unlock(of->kn);

/* Allow only one trigger per file descriptor */
if (ctx->psi.trigger) {
cgroup_put(cgrp);
return -EBUSY;
}

psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
new = psi_trigger_create(psi, buf, nbytes, res);
if (IS_ERR(new)) {
cgroup_put(cgrp);
return PTR_ERR(new);
}

psi_trigger_replace(&ctx->psi.trigger, new);

smp_store_release(&ctx->psi.trigger, new);
cgroup_put(cgrp);

return nbytes;
@@ -3690,7 +3695,7 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
{
struct cgroup_file_ctx *ctx = of->priv;

psi_trigger_replace(&ctx->psi.trigger, NULL);
psi_trigger_destroy(ctx->psi.trigger);
}

bool cgroup_psi_enabled(void)
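
The move from the refcounted psi_trigger_replace() to a single smp_store_release() publication, with psi_trigger_destroy() called only from the release path, follows the usual release/acquire publication pattern. The sketch below is a self-contained userspace analogue using C11 atomics, not the kernel code itself; publish_trigger() and poll_trigger() are invented names for illustration.

/*
 * Userspace analogue of the publish/consume pattern used above: the
 * writer fully initialises the object, then publishes the pointer with
 * a release store; readers load it with acquire semantics and so see
 * either NULL or a fully initialised object.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct trigger {
	long threshold;
	long window;
};

static _Atomic(struct trigger *) published;

static int publish_trigger(long threshold, long window)
{
	struct trigger *t;

	/* Allow only one trigger per "file descriptor", as in the patch. */
	if (atomic_load_explicit(&published, memory_order_relaxed))
		return -1;	/* would be -EBUSY in the kernel */

	t = malloc(sizeof(*t));
	if (!t)
		return -1;
	t->threshold = threshold;
	t->window = window;

	/* Pairs with the acquire load in poll_trigger(). */
	atomic_store_explicit(&published, t, memory_order_release);
	return 0;
}

static void poll_trigger(void)
{
	struct trigger *t = atomic_load_explicit(&published, memory_order_acquire);

	if (!t) {
		printf("no trigger registered\n");
		return;
	}
	printf("trigger: %ld us stall per %ld us window\n", t->threshold, t->window);
}

int main(void)
{
	publish_trigger(150000, 1000000);
	poll_trigger();
	free(atomic_load_explicit(&published, memory_order_relaxed));
	return 0;
}
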
15 changes: 4 additions & 11 deletions kernel/sched/core.c
@@ -5822,8 +5822,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}

if (schedstat_enabled() && rq->core->core_forceidle_count) {
if (cookie)
rq->core->core_forceidle_start = rq_clock(rq->core);
rq->core->core_forceidle_start = rq_clock(rq->core);
rq->core->core_forceidle_occupation = occ;
}

@@ -8219,9 +8218,7 @@ int __cond_resched_lock(spinlock_t *lock)

if (spin_needbreak(lock) || resched) {
spin_unlock(lock);
if (resched)
preempt_schedule_common();
else
if (!_cond_resched())
cpu_relax();
ret = 1;
spin_lock(lock);
@@ -8239,9 +8236,7 @@ int __cond_resched_rwlock_read(rwlock_t *lock)

if (rwlock_needbreak(lock) || resched) {
read_unlock(lock);
if (resched)
preempt_schedule_common();
else
if (!_cond_resched())
cpu_relax();
ret = 1;
read_lock(lock);
@@ -8259,9 +8254,7 @@ int __cond_resched_rwlock_write(rwlock_t *lock)

if (rwlock_needbreak(lock) || resched) {
write_unlock(lock);
if (resched)
preempt_schedule_common();
else
if (!_cond_resched())
cpu_relax();
ret = 1;
write_lock(lock);
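
Pieced together from the hunks above, __cond_resched_lock() after this change reads roughly as follows; the declarations and the return path are reconstructed context rather than part of the diff, and the two rwlock variants have the same shape.

/* Sketch of __cond_resched_lock() after this change (context reconstructed). */
int __cond_resched_lock(spinlock_t *lock)
{
	int resched = should_resched(PREEMPT_LOCK_OFFSET);
	int ret = 0;

	lockdep_assert_held(lock);

	if (spin_needbreak(lock) || resched) {
		spin_unlock(lock);
		/*
		 * _cond_resched() reschedules only when needed; on fully
		 * preemptible kernels the unlock above can already preempt,
		 * so the old unconditional preempt_schedule_common() risked
		 * scheduling twice back to back, which is what the "avoid
		 * double preemption" subject above refers to.
		 */
		if (!_cond_resched())
			cpu_relax();
		ret = 1;
		spin_lock(lock);
	}
	return ret;
}
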
2 changes: 1 addition & 1 deletion kernel/sched/core_sched.c
@@ -277,7 +277,7 @@ void __sched_core_account_forceidle(struct rq *rq)
rq_i = cpu_rq(i);
p = rq_i->core_pick ?: rq_i->curr;

if (!p->core_cookie)
if (p == rq_i->idle)
continue;

__schedstat_add(p->stats.core_forceidle_sum, delta);
118 changes: 77 additions & 41 deletions kernel/sched/fair.c
@@ -3028,9 +3028,11 @@ enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
u32 divider = get_pelt_divider(&se->avg);
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
/* See update_cfs_rq_load_avg() */
cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}
#else
static inline void
@@ -3381,7 +3383,6 @@ void set_task_rq_fair(struct sched_entity *se,
se->avg.last_update_time = n_last_update_time;
}


/*
* When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
* propagate its contribution. The key to this propagation is the invariant
@@ -3449,15 +3450,14 @@ void set_task_rq_fair(struct sched_entity *se,
* XXX: only do this for the part of runnable > running ?
*
*/

static inline void
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
u32 divider;
long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
u32 new_sum, divider;

/* Nothing to update */
if (!delta)
if (!delta_avg)
return;

/*
@@ -3466,23 +3466,30 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
*/
divider = get_pelt_divider(&cfs_rq->avg);


/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
se->avg.util_sum = se->avg.util_avg * divider;
new_sum = se->avg.util_avg * divider;
delta_sum = (long)new_sum - (long)se->avg.util_sum;
se->avg.util_sum = new_sum;

/* Update parent cfs_rq utilization */
add_positive(&cfs_rq->avg.util_avg, delta);
cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
add_positive(&cfs_rq->avg.util_avg, delta_avg);
add_positive(&cfs_rq->avg.util_sum, delta_sum);

/* See update_cfs_rq_load_avg() */
cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
}

static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
u32 divider;
long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
u32 new_sum, divider;

/* Nothing to update */
if (!delta)
if (!delta_avg)
return;

/*
@@ -3493,19 +3500,25 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf

/* Set new sched_entity's runnable */
se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
se->avg.runnable_sum = se->avg.runnable_avg * divider;
new_sum = se->avg.runnable_avg * divider;
delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
se->avg.runnable_sum = new_sum;

/* Update parent cfs_rq runnable */
add_positive(&cfs_rq->avg.runnable_avg, delta);
cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
/* See update_cfs_rq_load_avg() */
cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
}

static inline void
update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
unsigned long load_avg;
u64 load_sum = 0;
s64 delta_sum;
u32 divider;

if (!runnable_sum)
@@ -3532,7 +3545,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
* assuming all tasks are equally runnable.
*/
if (scale_load_down(gcfs_rq->load.weight)) {
load_sum = div_s64(gcfs_rq->avg.load_sum,
load_sum = div_u64(gcfs_rq->avg.load_sum,
scale_load_down(gcfs_rq->load.weight));
}

@@ -3549,19 +3562,22 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
runnable_sum = max(runnable_sum, running_sum);

load_sum = (s64)se_weight(se) * runnable_sum;
load_avg = div_s64(load_sum, divider);

se->avg.load_sum = runnable_sum;
load_sum = se_weight(se) * runnable_sum;
load_avg = div_u64(load_sum, divider);

delta = load_avg - se->avg.load_avg;
if (!delta)
delta_avg = load_avg - se->avg.load_avg;
if (!delta_avg)
return;

se->avg.load_avg = load_avg;
delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;

add_positive(&cfs_rq->avg.load_avg, delta);
cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
se->avg.load_sum = runnable_sum;
se->avg.load_avg = load_avg;
add_positive(&cfs_rq->avg.load_avg, delta_avg);
add_positive(&cfs_rq->avg.load_sum, delta_sum);
/* See update_cfs_rq_load_avg() */
cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}

static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3652,7 +3668,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
*
* cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
*
* Returns true if the load decayed or we removed load.
* Return: true if the load decayed or we removed load.
*
* Since both these conditions indicate a changed cfs_rq->avg.load we should
* call update_tg_load_avg() when this function returns true.
@@ -3677,15 +3693,32 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)

r = removed_load;
sub_positive(&sa->load_avg, r);
sa->load_sum = sa->load_avg * divider;
sub_positive(&sa->load_sum, r * divider);
/* See sa->util_sum below */
sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);

r = removed_util;
sub_positive(&sa->util_avg, r);
sa->util_sum = sa->util_avg * divider;
sub_positive(&sa->util_sum, r * divider);
/*
* Because of rounding, se->util_sum might ends up being +1 more than
* cfs->util_sum. Although this is not a problem by itself, detaching
* a lot of tasks with the rounding problem between 2 updates of
* util_avg (~1ms) can make cfs->util_sum becoming null whereas
* cfs_util_avg is not.
* Check that util_sum is still above its lower bound for the new
* util_avg. Given that period_contrib might have moved since the last
* sync, we are only sure that util_sum must be above or equal to
* util_avg * minimum possible divider
*/
sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);

r = removed_runnable;
sub_positive(&sa->runnable_avg, r);
sa->runnable_sum = sa->runnable_avg * divider;
sub_positive(&sa->runnable_sum, r * divider);
/* See sa->util_sum above */
sa->runnable_sum = max_t(u32, sa->runnable_sum,
sa->runnable_avg * PELT_MIN_DIVIDER);

/*
* removed_runnable is the unweighted version of removed_load so we
@@ -3772,17 +3805,18 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*/
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/*
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
u32 divider = get_pelt_divider(&cfs_rq->avg);

dequeue_load_avg(cfs_rq, se);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
/* See update_cfs_rq_load_avg() */
cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);

sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
/* See update_cfs_rq_load_avg() */
cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);

add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);

@@ -8539,6 +8573,8 @@ group_type group_classify(unsigned int imbalance_pct,
*
* If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings
* of @dst_cpu are idle and @sg has lower priority.
*
* Return: true if @dst_cpu can pull tasks, false otherwise.
*/
static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
struct sg_lb_stats *sgs,
@@ -8614,6 +8650,7 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
* @sds: Load-balancing data with statistics of the local group.
* @group: sched_group whose statistics are to be updated.
* @sgs: variable to hold the statistics for this group.
* @sg_status: Holds flag indicating the status of the sched_group
@@ -9421,12 +9458,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/**
* find_busiest_group - Returns the busiest group within the sched_domain
* if there is an imbalance.
* @env: The load balancing environment.
*
* Also calculates the amount of runnable load which should be moved
* to restore balance.
*
* @env: The load balancing environment.
*
* Return: - The busiest group if imbalance exists.
*/
static struct sched_group *find_busiest_group(struct lb_env *env)
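
The sum/avg relaxation in the PELT hunks above is easiest to see with concrete numbers. The standalone program below uses made-up values (and assumes LOAD_AVG_MAX is 47742, its value in this kernel) to mimic the detach path: subtracting entity sums directly can leave the cfs_rq sum well below what its avg implies, which is what the max_t() clamp against avg * PELT_MIN_DIVIDER repairs.

/*
 * Standalone illustration of the rounding drift the PELT patches above
 * guard against. Values are invented; only the arithmetic pattern
 * matters. Because util_avg is (roughly) util_sum / divider with
 * truncation, an entity's sum may sit up to (divider - 1) above
 * avg * divider, so detaching several entities sum-first can leave the
 * remaining cfs_rq sum inconsistent with its remaining avg.
 */
#include <stdio.h>

#define LOAD_AVG_MAX		47742
#define PELT_MIN_DIVIDER	(LOAD_AVG_MAX - 1024)

int main(void)
{
	unsigned int divider = PELT_MIN_DIVIDER + 1023;	/* period_contrib at its max */
	unsigned long long se_util_avg = 10;
	/* Truncation lets this entity's sum sit (divider - 1) above avg * divider. */
	unsigned long long se_util_sum = se_util_avg * divider + (divider - 1);
	/* A cfs_rq carrying three such entities plus 5 units of other utilization. */
	unsigned long long rq_util_avg = 3 * se_util_avg + 5;
	unsigned long long rq_util_sum = rq_util_avg * divider;	/* no slack of its own */
	int i;

	for (i = 0; i < 3; i++) {
		rq_util_avg -= se_util_avg;
		/* mimic sub_positive(): clamp at zero rather than underflow */
		rq_util_sum = rq_util_sum > se_util_sum ? rq_util_sum - se_util_sum : 0;
	}

	/* avg is now 5, but sum corresponds to roughly 2: they have drifted apart. */
	printf("after detach: avg=%llu sum=%llu (sum/divider=%llu)\n",
	       rq_util_avg, rq_util_sum, rq_util_sum / divider);

	/* The fix above: never let sum fall below avg times the minimum divider. */
	if (rq_util_sum < rq_util_avg * PELT_MIN_DIVIDER)
		rq_util_sum = rq_util_avg * PELT_MIN_DIVIDER;

	printf("after clamp:  avg=%llu sum=%llu\n", rq_util_avg, rq_util_sum);
	return 0;
}
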
4 changes: 3 additions & 1 deletion kernel/sched/pelt.h
@@ -37,9 +37,11 @@ update_irq_load_avg(struct rq *rq, u64 running)
}
#endif

#define PELT_MIN_DIVIDER (LOAD_AVG_MAX - 1024)

static inline u32 get_pelt_divider(struct sched_avg *avg)
{
return LOAD_AVG_MAX - 1024 + avg->period_contrib;
return PELT_MIN_DIVIDER + avg->period_contrib;
}

static inline void cfs_se_util_change(struct sched_avg *avg)
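
Why PELT_MIN_DIVIDER is a valid lower bound follows from the definition above: assuming LOAD_AVG_MAX is 47742 (its value in this kernel) and period_contrib always stays below 1024, the divider never drops under LOAD_AVG_MAX - 1024. The tiny standalone check below restates that bound; pelt_divider() is a stand-in for get_pelt_divider(), which takes a struct sched_avg in the kernel.

/*
 * Restatement of the divider bounds implied by the hunk above: with
 * period_contrib in [0, 1023], the divider stays in
 * [PELT_MIN_DIVIDER, LOAD_AVG_MAX - 1], which is why
 * "avg * PELT_MIN_DIVIDER" is a safe lower bound for the matching sum
 * even after period_contrib has moved since the last sync.
 */
#include <assert.h>
#include <stdio.h>

#define LOAD_AVG_MAX		47742
#define PELT_MIN_DIVIDER	(LOAD_AVG_MAX - 1024)

static unsigned int pelt_divider(unsigned int period_contrib)
{
	return PELT_MIN_DIVIDER + period_contrib;
}

int main(void)
{
	unsigned int contrib;

	for (contrib = 0; contrib < 1024; contrib++) {
		unsigned int div = pelt_divider(contrib);

		assert(div >= PELT_MIN_DIVIDER && div <= LOAD_AVG_MAX - 1);
	}
	printf("divider range: [%d, %d]\n", PELT_MIN_DIVIDER, LOAD_AVG_MAX - 1);
	return 0;
}
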