sched: Change nohz idle load balancing logic to push model
In the new push model, all idle CPUs indeed go into nohz mode. There is
still the concept of an idle load balancer (performing the load balancing
on behalf of all the idle CPUs in the system). A busy CPU kicks the nohz
balancer when any of the nohz CPUs needs idle load balancing.
The kicked CPU then does the idle load balancing on behalf of all idle
CPUs instead of the normal idle balance.
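
As a rough stand-alone sketch of this kick flow (the rest of the scheduler
changes live in hunks not shown on this page; every name, type, and
threshold below is invented for illustration, not the commit's code):

/* Hypothetical model of the push-model kick: a busy CPU notices it has
 * extra load while other CPUs sit in nohz idle, and "kicks" one
 * designated idle CPU to balance on behalf of all of them. */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

struct rq { int nr_running; bool nohz_idle; };

static struct rq rqs[NR_CPUS];
static int ilb_cpu = -1;	/* idle CPU elected as the nohz balancer */

static bool nohz_cpus_present(void)
{
	for (int i = 0; i < NR_CPUS; i++)
		if (rqs[i].nohz_idle)
			return true;
	return false;
}

/* Runs from the busy CPU's periodic tick: kick the balancer only when
 * there is actually work for the idle CPUs to pull. */
static void scheduler_tick_on(int cpu)
{
	if (rqs[cpu].nr_running > 1 && nohz_cpus_present() && ilb_cpu >= 0)
		printf("cpu%d kicks cpu%d to run idle load balance\n",
		       cpu, ilb_cpu);
}

int main(void)
{
	rqs[0].nr_running = 3;		/* overloaded busy CPU */
	rqs[2].nohz_idle = true;
	rqs[3].nohz_idle = true;
	ilb_cpu = 2;
	scheduler_tick_on(0);		/* -> cpu0 kicks cpu2 */
	return 0;
}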

This addresses the following two problems with the current nohz ilb logic:
* The idle load balancer continued to have periodic ticks during idle and
  woke up frequently, even though it did not have any rebalancing to do on
  behalf of any of the idle CPUs.
* On x86, and on CPUs whose APIC timer stops in idle, this periodic
  wakeup can result in an additional periodic interrupt on the CPU doing
  the timer broadcast.

Also, we currently migrate unpinned timers from an idle CPU to the CPU
doing idle load balancing (when all the CPUs in the system are idle,
there is no idle-load-balancing CPU and timers get added to the same idle
CPU where the request was made, so the existing optimization works only
on a semi-idle system).

In a semi-idle system, we no longer have periodic ticks on the idle load
balancer CPU. Using that CPU will add more delay to the timers than
intended (as that CPU's timer base may not be up to date with respect to
jiffies etc.). This was causing mysterious slowdowns during boot etc.

For now, in the semi-idle case, use the nearest busy CPU for migrating
timers from an idle CPU. This is good for power savings anyway.
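
The kernel/sched.c hunk below implements this as get_nohz_timer_target();
purely as a stand-alone sketch of the same nearest-busy-CPU walk, with a
made-up two-level domain layout (all names and data here are hypothetical):

/* Hypothetical model: starting from the idle CPU, scan progressively
 * wider "domains" and return the first non-idle CPU, falling back to
 * the caller's CPU when the whole system is idle. */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS   4
#define NR_LEVELS 2

static bool cpu_idle_now[NR_CPUS] = { true, true, false, true };

/* Domains around CPU0: its SMT sibling first, then the whole package. */
static const int domain_span[NR_LEVELS][NR_CPUS] = {
	{ 0, 1, -1, -1 },
	{ 0, 1, 2, 3 },
};

static int pick_timer_target(int cpu)
{
	for (int lvl = 0; lvl < NR_LEVELS; lvl++)
		for (int i = 0; i < NR_CPUS; i++) {
			int c = domain_span[lvl][i];
			if (c >= 0 && !cpu_idle_now[c])
				return c;	/* nearest busy CPU */
		}
	return cpu;	/* fully idle system: keep the timer local */
}

int main(void)
{
	/* An unpinned timer armed from idle CPU0 lands on busy CPU2. */
	printf("timer target = cpu%d\n", pick_timer_target(0));
	return 0;
}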

Signed-off-by: Venkatesh Pallipadi <[email protected]>
Signed-off-by: Suresh Siddha <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Venkatesh Pallipadi authored and Ingo Molnar committed Jun 9, 2010
1 parent fdf3e95 commit 83cd4fe
Showing 6 changed files with 237 additions and 159 deletions.
9 changes: 3 additions & 6 deletions include/linux/sched.h
@@ -271,14 +271,11 @@ extern int runqueue_is_locked(int cpu);
 
 extern cpumask_var_t nohz_cpu_mask;
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
-extern int select_nohz_load_balancer(int cpu);
-extern int get_nohz_load_balancer(void);
+extern void select_nohz_load_balancer(int stop_tick);
+extern int get_nohz_timer_target(void);
 extern int nohz_ratelimit(int cpu);
 #else
-static inline int select_nohz_load_balancer(int cpu)
-{
-	return 0;
-}
+static inline void select_nohz_load_balancer(int stop_tick) { }
 
 static inline int nohz_ratelimit(int cpu)
 {
8 changes: 2 additions & 6 deletions kernel/hrtimer.c
@@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
 static int hrtimer_get_target(int this_cpu, int pinned)
 {
 #ifdef CONFIG_NO_HZ
-	if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
-		int preferred_cpu = get_nohz_load_balancer();
-
-		if (preferred_cpu >= 0)
-			return preferred_cpu;
-	}
+	if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
+		return get_nohz_timer_target();
 #endif
 	return this_cpu;
 }
34 changes: 31 additions & 3 deletions kernel/sched.c
@@ -460,7 +460,7 @@ struct rq {
 	unsigned long last_load_update_tick;
 #ifdef CONFIG_NO_HZ
 	u64 nohz_stamp;
-	unsigned char in_nohz_recently;
+	unsigned char nohz_balance_kick;
 #endif
 	unsigned int skip_clock_update;
 
@@ -1194,6 +1194,27 @@ static void resched_cpu(int cpu)
 }
 
 #ifdef CONFIG_NO_HZ
+/*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu. This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ */
+int get_nohz_timer_target(void)
+{
+	int cpu = smp_processor_id();
+	int i;
+	struct sched_domain *sd;
+
+	for_each_domain(cpu, sd) {
+		for_each_cpu(i, sched_domain_span(sd))
+			if (!idle_cpu(i))
+				return i;
+	}
+	return cpu;
+}
 /*
  * When add_timer_on() enqueues a timer into the timer wheel of an
  * idle CPU then this timer might expire before the next timer event
@@ -7791,6 +7812,10 @@ void __init sched_init(void)
 		rq->idle_stamp = 0;
 		rq->avg_idle = 2*sysctl_sched_migration_cost;
 		rq_attach_root(rq, &def_root_domain);
+#ifdef CONFIG_NO_HZ
+		rq->nohz_balance_kick = 0;
+		init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
+#endif
 #endif
 		init_rq_hrtick(rq);
 		atomic_set(&rq->nr_iowait, 0);
@@ -7835,8 +7860,11 @@ void __init sched_init(void)
 	zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
-	zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
-	alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
+	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+	alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
+	atomic_set(&nohz.load_balancer, nr_cpu_ids);
+	atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
+	atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
 #endif
 	/* May be allocated at isolcpus cmdline parse time */
 	if (cpu_isolated_map == NULL)