Kernel调度器负载均衡(一)
来源:互联网 发布:fm2016经典球星数据库 编辑:程序博客网 时间:2024/06/08 02:04
Kernel中的CPU负载均衡是对调度器的增强,在多处理器上(SMP/NUMA),必须要考虑CPU的负载均衡,包括:1.CPU负荷尽可能公平地在所有处理器上共享。2.内核必须能够将进程从一个CPU迁移到另一个CPU上。
在Kernel中调度器进行负载均衡的时机有4个:
1.在时钟中断时,周期性调度器scheduler_tick会被调用,它在最后会调用trigger_load_balance函数,该函数可以触发负载均衡软中断。
2.在主调度器schedule中会判断当前CPU的rq上的进程个数是否为0,如果为0的话,就调用idle_balance函数,进行负载均衡。
3.在进程通过exec执行新程序时,函数sched_exec会进行负载均衡。
4.在使用try_to_wake_up函数唤醒进程时,会进行负载均衡。
下面对这四种情况一一介绍。
1.周期性负载均衡。
scheduler_tick会调用trigger_load_balance,trigger_load_balance的代码如下:
/*
 * Decide, for the cpu that owns @rq, whether SCHED_SOFTIRQ has to be
 * raised so that load balancing runs.
 *
 * @rq:  runqueue whose tick state (idle_at_tick, in_nohz_recently,
 *       next_balance) is examined
 * @cpu: id of the cpu being checked
 */
static inline void trigger_load_balance(struct rq *rq, int cpu)
{
#ifdef CONFIG_NO_HZ
	/*
	 * If we were in the nohz mode recently and busy at the current
	 * scheduler tick, then check if we need to nominate new idle
	 * load balancer.
	 *
	 * (Article author's note: nohz mode means the periodic tick of an
	 * idle cpu is switched off to save power.)
	 */
	if (rq->in_nohz_recently && !rq->idle_at_tick) {
		rq->in_nohz_recently = 0;

		/* This cpu is busy now, so it gives up the idle load balancer role. */
		if (atomic_read(&nohz.load_balancer) == cpu) {
			cpumask_clear_cpu(cpu, nohz.cpu_mask);
			atomic_set(&nohz.load_balancer, -1);
		}

		/* Nobody owns the role: pick a new idle load balancer and kick it. */
		if (atomic_read(&nohz.load_balancer) == -1) {
			int ilb = find_new_ilb(cpu);

			if (ilb < nr_cpu_ids)
				resched_cpu(ilb);	/* make it reschedule so it starts idle balancing */
		}
	}

	/*
	 * If this cpu is idle and doing idle load balancing for all the
	 * cpus with ticks stopped, is it time for that to stop?
	 *
	 * Every online cpu is in nohz.cpu_mask, i.e. the whole system is
	 * idle, so there is nothing to balance.
	 */
	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
	    cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
		resched_cpu(cpu);
		return;
	}

	/*
	 * If this cpu is idle and the idle load balancing is done by
	 * someone else, then no need raise the SCHED_SOFTIRQ
	 */
	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
	    cpumask_test_cpu(cpu, nohz.cpu_mask))
		return;
#endif
	/* Don't need to rebalance while attached to NULL domain */
	if (time_after_eq(jiffies, rq->next_balance) &&
	    likely(!on_null_domain(cpu)))
		raise_softirq(SCHED_SOFTIRQ);	/* handled by run_rebalance_domains */
}
下面看下run_rebalance_domains的代码
/*
 * run_rebalance_domains is triggered when needed from the scheduler tick.
 * In CONFIG_NO_HZ case, the idle load balance owner will do the
 * rebalancing for all the cpus for whom scheduler ticks are stopped.
 *
 * @h: softirq action descriptor; not used by this handler.
 */
static void run_rebalance_domains(struct softirq_action *h)
{
	int this_cpu = smp_processor_id();
	struct rq *this_rq = cpu_rq(this_cpu);
	/*
	 * The idle state is passed down to rebalance_domains(), which uses
	 * it to choose the balancing interval (busy cpus balance less often).
	 */
	enum cpu_idle_type idle = this_rq->idle_at_tick ?
						CPU_IDLE : CPU_NOT_IDLE;

	/* Walk this cpu's sched domains and balance where an imbalance exists. */
	rebalance_domains(this_cpu, idle);

#ifdef CONFIG_NO_HZ
	/*
	 * If this cpu is the owner for idle load balancing, then do the
	 * balancing on behalf of the other idle cpus whose ticks are
	 * stopped.
	 */
	if (this_rq->idle_at_tick &&
	    atomic_read(&nohz.load_balancer) == this_cpu) {
		struct rq *rq;
		int balance_cpu;

		for_each_cpu(balance_cpu, nohz.cpu_mask) {
			if (balance_cpu == this_cpu)
				continue;

			/*
			 * If this cpu gets work to do, stop the load balancing
			 * work being done for other cpus. Next load
			 * balancing owner will pick it up.
			 */
			if (need_resched())
				break;

			/*
			 * Balance on behalf of balance_cpu, i.e. pull tasks
			 * from busy cpus onto balance_cpu if needed.
			 */
			rebalance_domains(balance_cpu, CPU_IDLE);

			/*
			 * Keep this rq's next_balance at the earliest deadline
			 * among the rqs it balances for.
			 * NOTE(review): presumably so the owner wakes up in time
			 * for the most urgent proxied cpu - confirm.
			 */
			rq = cpu_rq(balance_cpu);
			if (time_after(this_rq->next_balance, rq->next_balance))
				this_rq->next_balance = rq->next_balance;
		}
	}
#endif
}
函数rebalance_domains的代码如下:
/*
 * Walk every sched domain of @cpu and run load_balance() on each domain
 * whose balancing interval has expired, then record in the runqueue when
 * the next balance is due.
 *
 * @cpu:  cpu whose domains are balanced
 * @idle: idle state of @cpu; a non-idle cpu uses a longer interval
 */
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
	int balance = 1;
	struct rq *rq = cpu_rq(cpu);
	unsigned long interval;
	struct sched_domain *sd;
	/* Earliest time when we have to do rebalance again */
	unsigned long next_balance = jiffies + 60*HZ;
	int update_next_balance = 0;
	int need_serialize;

	for_each_domain(cpu, sd) {
		/* This domain is flagged as not taking part in load balancing. */
		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

		interval = sd->balance_interval;
		/* A busy cpu balances less often: stretch by busy_factor. */
		if (idle != CPU_IDLE)
			interval *= sd->busy_factor;

		/* scale ms to jiffies */
		interval = msecs_to_jiffies(interval);
		if (unlikely(!interval))
			interval = 1;
		/* The interval is capped. */
		if (interval > HZ*NR_CPUS/10)
			interval = HZ*NR_CPUS/10;

		need_serialize = sd->flags & SD_SERIALIZE;

		/* Serialized domains: skip this round if the lock is already held. */
		if (need_serialize) {
			if (!spin_trylock(&balancing))
				goto out;
		}

		if (time_after_eq(jiffies, sd->last_balance + interval)) {
			if (load_balance(cpu, rq, sd, idle, &balance)) {
				/*
				 * We've pulled tasks over so either we're no
				 * longer idle, or one of our SMT siblings is
				 * not idle.
				 */
				idle = CPU_NOT_IDLE;
			}
			sd->last_balance = jiffies;
		}
		if (need_serialize)
			spin_unlock(&balancing);
out:
		if (time_after(next_balance, sd->last_balance + interval)) {
			next_balance = sd->last_balance + interval;
			update_next_balance = 1;
		}

		/*
		 * Stop the load balance at this level. There is another
		 * CPU in our sched group which is doing load balancing more
		 * actively.
		 */
		if (!balance)
			break;
	}

	/*
	 * next_balance will be updated only when there is a need.
	 * When the cpu is attached to null domain for ex, it will not be
	 * updated.
	 */
	if (likely(update_next_balance))
		rq->next_balance = next_balance;
}
函数load_balance的代码如下:
/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 *
 * @this_cpu: cpu that tasks are pulled to
 * @this_rq:  runqueue of @this_cpu
 * @sd:       sched domain to balance within
 * @idle:     idle state of @this_cpu as supplied by the caller
 * @balance:  passed through to find_busiest_group(); when it comes back
 *            as 0 the function takes the out_balanced path
 *
 * Returns the final value of ld_moved: the move_tasks() result when it is
 * non-zero, otherwise 0, or -1 when nothing was moved while !sd_idle, the
 * domain has SD_SHARE_CPUPOWER and its parent lacks
 * SD_POWERSAVINGS_BALANCE.
 */
static int load_balance(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *balance)
{
	int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
	struct sched_group *group;
	unsigned long imbalance;
	struct rq *busiest;
	unsigned long flags;
	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);

	cpumask_copy(cpus, cpu_active_mask);

	/*
	 * When power savings policy is enabled for the parent domain, idle
	 * sibling can pick up load irrespective of busy siblings. In this case,
	 * let the state of idle sibling percolate up as CPU_IDLE, instead of
	 * portraying it as CPU_NOT_IDLE.
	 */
	if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		sd_idle = 1;

	schedstat_inc(sd, lb_count[idle]);

redo:
	update_shares(sd);
	/*
	 * Find the busiest group of this domain if there is an imbalance.
	 * Even without an obvious imbalance find_busiest_group() may still
	 * hand back a group picked by its power-saving check.
	 */
	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
				   cpus, balance);

	/* this_cpu is not the cpu that should balance at this level. */
	if (*balance == 0)
		goto out_balanced;

	/* No busiest group: nothing to pull from. */
	if (!group) {
		schedstat_inc(sd, lb_nobusyg[idle]);
		goto out_balanced;
	}

	/* Pick the busiest runqueue inside the busiest group. */
	busiest = find_busiest_queue(group, idle, imbalance, cpus);
	if (!busiest) {
		schedstat_inc(sd, lb_nobusyq[idle]);
		goto out_balanced;
	}

	BUG_ON(busiest == this_rq);

	schedstat_add(sd, lb_imbalance[idle], imbalance);

	ld_moved = 0;
	if (busiest->nr_running > 1) {
		/*
		 * Attempt to move tasks. If find_busiest_group has found
		 * an imbalance but busiest->nr_running <= 1, the group is
		 * still unbalanced. ld_moved simply stays zero, so it is
		 * correctly treated as an imbalance.
		 */
		local_irq_save(flags);
		double_rq_lock(this_rq, busiest);
		/* Pull tasks from busiest to this_rq; imbalance is the amount of load to move. */
		ld_moved = move_tasks(this_rq, this_cpu, busiest,
				      imbalance, sd, idle, &all_pinned);
		double_rq_unlock(this_rq, busiest);
		local_irq_restore(flags);

		/*
		 * some other cpu did the load balance for us.
		 * this_cpu received tasks, so make it reschedule.
		 */
		if (ld_moved && this_cpu != smp_processor_id())
			resched_cpu(this_cpu);

		/*
		 * All tasks on this runqueue were pinned by CPU affinity.
		 * Drop that cpu from the candidate mask and retry with the
		 * remaining cpus, if any.
		 */
		if (unlikely(all_pinned)) {
			cpumask_clear_cpu(cpu_of(busiest), cpus);
			if (!cpumask_empty(cpus))
				goto redo;
			goto out_balanced;
		}
	}

	if (!ld_moved) {
		/* Nothing could be moved. */
		schedstat_inc(sd, lb_failed[idle]);
		/*
		 * Increment the failure counter only on periodic balance.
		 * We do not want newidle balance, which can be very
		 * frequent, pollute the failure counter causing
		 * excessive cache_hot migrations and active balances.
		 */
		if (idle != CPU_NEWLY_IDLE)
			sd->nr_balance_failed++;

		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {

			spin_lock_irqsave(&busiest->lock, flags);

			/* don't kick the migration_thread, if the curr
			 * task on busiest cpu can't be moved to this_cpu
			 */
			if (!cpumask_test_cpu(this_cpu,
					      &busiest->curr->cpus_allowed)) {
				spin_unlock_irqrestore(&busiest->lock, flags);
				all_pinned = 1;
				goto out_one_pinned;
			}

			/* Arm active balancing: the busiest cpu pushes a task to this_cpu. */
			if (!busiest->active_balance) {
				busiest->active_balance = 1;
				busiest->push_cpu = this_cpu;
				active_balance = 1;
			}
			spin_unlock_irqrestore(&busiest->lock, flags);
			if (active_balance)
				wake_up_process(busiest->migration_thread);

			/*
			 * We've kicked active balancing, reset the failure
			 * counter.
			 */
			sd->nr_balance_failed = sd->cache_nice_tries+1;
		}
	} else
		sd->nr_balance_failed = 0;

	if (likely(!active_balance)) {
		/* We were unbalanced, so reset the balancing interval */
		sd->balance_interval = sd->min_interval;
	} else {
		/*
		 * If we've begun active balancing, start to back off. This
		 * case may not be covered by the all_pinned logic if there
		 * is only 1 task on the busy runqueue (because we don't call
		 * move_tasks).
		 */
		if (sd->balance_interval < sd->max_interval)
			sd->balance_interval *= 2;
	}

	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		ld_moved = -1;

	goto out;

out_balanced:
	schedstat_inc(sd, lb_balanced[idle]);

	sd->nr_balance_failed = 0;

out_one_pinned:
	/* tune up the balancing interval */
	if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
	    (sd->balance_interval < sd->max_interval))
		sd->balance_interval *= 2;

	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		ld_moved = -1;
	else
		ld_moved = 0;
out:
	if (ld_moved)
		update_shares(sd);
	return ld_moved;
}
/*
 * Find the group of @sd that @this_cpu should pull load from.
 *
 * @sd:        sched domain being balanced
 * @this_cpu:  cpu on whose behalf balancing is done
 * @imbalance: out; filled by calculate_imbalance() (or by the power-save
 *             check), set to 0 when NULL is returned
 * @idle:      idle state of @this_cpu
 * @sd_idle:   forwarded to update_sd_lb_stats()
 * @cpus:      cpus that may be considered
 * @balance:   forwarded to update_sd_lb_stats(); if it comes back as 0
 *             the function returns NULL immediately
 *
 * Returns sds.busiest when an imbalance (or a power-saving opportunity)
 * was found, NULL otherwise.
 */
static struct sched_group *find_busiest_group(struct sched_domain *sd, int this_cpu,
		unsigned long *imbalance, enum cpu_idle_type idle,
		int *sd_idle, const struct cpumask *cpus, int *balance)
{
	struct sd_lb_stats sds;

	memset(&sds, 0, sizeof(sds));

	/*
	 * Compute the various statistics relavent for load balancing at
	 * this level.
	 */
	update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, balance, &sds);

	/* Cases where imbalance does not exist from POV of this_cpu */
	/* 1) this_cpu is not the appropriate cpu to perform load balancing
	 *    at this level.
	 * 2) There is no busy sibling group to pull from.
	 * 3) This group is the busiest group.
	 * 4) This group is more busy than the avg busieness at this
	 *    sched_domain.
	 * 5) The imbalance is within the specified limit.
	 *
	 * Note: when doing newidle balance, if the local group has excess
	 * capacity (i.e. nr_running < group_capacity) and the busiest group
	 * does not have any capacity, we force a load balance to pull tasks
	 * to the local group. In this case, we skip past checks 3, 4 and 5.
	 */
	/* Case 1. */
	if (balance && !(*balance))
		goto ret;

	/* Case 2: no busiest group, or it has nothing runnable. */
	if (!sds.busiest || sds.busiest_nr_running == 0)
		goto out_balanced;

	/*
	 * SD_BALANCE_NEWIDLE trumps SMP nice when underutilized.
	 * Only reachable with idle == CPU_NEWLY_IDLE.
	 */
	if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
	    !sds.busiest_has_capacity)
		goto force_balance;

	/* Case 3: the local group is already at least as loaded as any other. */
	if (sds.this_load >= sds.max_load)
		goto out_balanced;

	/* Average load over the whole domain. */
	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;

	/* Case 4: the local group is at or above the domain average. */
	if (sds.this_load >= sds.avg_load)
		goto out_balanced;

	/*
	 * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
	 * And to check for busy balance use !idle_cpu instead of
	 * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
	 * even when they are idle.
	 */
	if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
		if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
			goto out_balanced;
	} else {
		/*
		 * This cpu is idle. If the busiest group load doesn't
		 * have more tasks than the number of available cpu's and
		 * there is no imbalance between this and busiest group
		 * wrt to idle cpu's, it is balanced.
		 */
		if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
		    sds.busiest_nr_running <= sds.busiest_group_weight)
			goto out_balanced;
	}

force_balance:
	/* Looks like there is an imbalance. Compute it */
	calculate_imbalance(&sds, this_cpu, imbalance);
	return sds.busiest;

out_balanced:
	/*
	 * There is no obvious imbalance. But check if we can do some balancing
	 * to save power.
	 */
	if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
		return sds.busiest;
ret:
	*imbalance = 0;
	return NULL;
}
/*
 * Gather the load-balancing statistics of a sched domain by visiting each
 * of its groups once.
 *
 * @sd:       sched domain whose groups are walked
 * @this_cpu: cpu on whose behalf balancing is done
 * @idle:     idle state of @this_cpu
 * @sd_idle:  forwarded to update_sg_lb_stats()
 * @cpus:     cpus that may be considered
 * @balance:  forwarded to update_sg_lb_stats(); if it is 0 after the local
 *            group was processed the walk stops early
 * @sds:      out; domain-wide statistics (caller zeroes it first)
 */
static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
			enum cpu_idle_type idle, int *sd_idle,
			const struct cpumask *cpus, int *balance,
			struct sd_lb_stats *sds)
{
	struct sched_domain *child = sd->child;
	struct sched_group *group = sd->groups;
	struct sg_lb_stats sgs;
	int load_idx, prefer_sibling = 0;

	if (child && child->flags & SD_PREFER_SIBLING)
		prefer_sibling = 1;

	init_sd_power_savings_stats(sd, sds, idle);
	load_idx = get_sd_load_idx(sd, idle);

	/* The groups form a circular list; visit each one exactly once. */
	do {
		int local_group;

		/* Is this the group that contains this_cpu? */
		local_group = cpumask_test_cpu(this_cpu,
					       sched_group_cpus(group));
		memset(&sgs, 0, sizeof(sgs));
		/* Compute the per-group statistics. */
		update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
				local_group, cpus, balance, &sgs);

		if (local_group && balance && !(*balance))
			return;

		/* Accumulate the group's load and cpu power into the domain totals. */
		sds->total_load += sgs.group_load;
		sds->total_pwr += group->cpu_power;

		/*
		 * In case the child domain prefers tasks go to siblings
		 * first, lower the group capacity to one so that we'll try
		 * and move all the excess tasks away. We lower the capacity
		 * of a group only if the local group has the capacity to fit
		 * these excess tasks, i.e. nr_running < group_capacity. The
		 * extra check prevents the case where you always pull from the
		 * heaviest group when it is already under-utilized (possible
		 * with a large weight task outweighs the tasks on the system).
		 */
		if (prefer_sibling && !local_group && sds->this_has_capacity)
			sgs.group_capacity = min(sgs.group_capacity, 1UL);

		if (local_group) {
			/* Record the local group's figures. */
			sds->this_load = sgs.avg_load;
			sds->this = group;
			sds->this_nr_running = sgs.sum_nr_running;
			sds->this_load_per_task = sgs.sum_weighted_load;
			sds->this_has_capacity = sgs.group_has_capacity;
			sds->this_idle_cpus = sgs.idle_cpus;
		} else if (sgs.avg_load > sds->max_load &&
			   (sgs.sum_nr_running > sgs.group_capacity ||
				sgs.group_imb)) {
			/*
			 * New busiest candidate: highest average load so far
			 * and either over capacity or internally imbalanced.
			 */
			sds->max_load = sgs.avg_load;
			sds->busiest = group;
			sds->busiest_nr_running = sgs.sum_nr_running;
			sds->busiest_idle_cpus = sgs.idle_cpus;
			sds->busiest_group_capacity = sgs.group_capacity;
			sds->busiest_group_weight = sgs.group_weight;
			sds->busiest_load_per_task = sgs.sum_weighted_load;
			sds->busiest_has_capacity = sgs.group_has_capacity;
			sds->group_imb = sgs.group_imb;
		}

		update_sd_power_savings_stats(group, sds, local_group, &sgs);
		group = group->next;
	} while (group != sd->groups);
}
/*
 * Gather the load-balancing statistics of one sched group.
 *
 * @sd:          sched domain the group belongs to
 * @group:       group whose cpus are tallied
 * @this_cpu:    cpu on whose behalf balancing is done
 * @idle:        idle state of @this_cpu
 * @load_idx:    load index handed to target_load()/source_load()
 * @sd_idle:     in/out; cleared as soon as a visited cpu has runnable tasks
 * @local_group: non-zero when @group contains @this_cpu
 * @cpus:        cpus that may be considered
 * @balance:     out; set to 0 when @this_cpu is not the cpu elected to
 *               balance for the local group (not done for CPU_NEWLY_IDLE)
 * @sgs:         out; per-group statistics (caller zeroes it first)
 */
static inline void update_sg_lb_stats(struct sched_domain *sd,
			struct sched_group *group, int this_cpu,
			enum cpu_idle_type idle, int load_idx, int *sd_idle,
			int local_group, const struct cpumask *cpus,
			int *balance, struct sg_lb_stats *sgs)
{
	unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
	int i;
	unsigned int balance_cpu = -1, first_idle_cpu = 0;
	unsigned long avg_load_per_task = 0;

	if (local_group) {
		/* Default balancer is the group's first cpu; it also refreshes group power. */
		balance_cpu = group_first_cpu(group);
		if (balance_cpu == this_cpu)
			update_group_power(sd, this_cpu);
	}

	/* Tally up the load of all CPUs in the group */
	max_cpu_load = 0;
	min_cpu_load = ~0UL;
	max_nr_running = 0;

	/* Visit the group's cpus that are also in @cpus (may be a single cpu). */
	for_each_cpu_and(i, sched_group_cpus(group), cpus) {
		struct rq *rq = cpu_rq(i);

		if (*sd_idle && rq->nr_running)
			*sd_idle = 0;

		/* Bias balancing toward cpus of our domain */
		if (local_group) {
			/* The first idle cpu of the local group becomes the balancer. */
			if (idle_cpu(i) && !first_idle_cpu) {
				first_idle_cpu = 1;
				balance_cpu = i;
			}

			/*
			 * NOTE(review): the article describes target_load() as
			 * the larger of current and historical load - confirm
			 * against its definition.
			 */
			load = target_load(i, load_idx);
		} else {
			/*
			 * NOTE(review): described by the article as the smaller
			 * of current and historical load - confirm.
			 */
			load = source_load(i, load_idx);
			if (load > max_cpu_load) {
				max_cpu_load = load;
				max_nr_running = rq->nr_running;
			}
			if (min_cpu_load > load)
				min_cpu_load = load;
		}

		/* Fold this cpu into the group totals. */
		sgs->group_load += load;
		sgs->sum_nr_running += rq->nr_running;
		sgs->sum_weighted_load += weighted_cpuload(i);
		if (idle_cpu(i))
			sgs->idle_cpus++;
	}

	/*
	 * First idle cpu or the first cpu(busiest) in this sched group
	 * is eligible for doing load balancing at this and above
	 * domains. In the newly idle case, we will allow all the cpu's
	 * to do the newly idle load balance.
	 */
	if (idle != CPU_NEWLY_IDLE && local_group &&
	    balance_cpu != this_cpu && balance) {
		*balance = 0;
		return;
	}

	/* Adjust by relative CPU power of the group */
	sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;

	/*
	 * Consider the group unbalanced when the imbalance is larger
	 * than the average weight of two tasks.
	 *
	 * APZ: with cgroup the avg task weight can vary wildly and
	 *      might not be a suitable number - should we keep a
	 *      normalized nr_running number somewhere that negates
	 *      the hierarchy?
	 */
	if (sgs->sum_nr_running)
		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;

	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
		sgs->group_imb = 1;

	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
	sgs->group_weight = group->group_weight;

	/* The group can still take more tasks. */
	if (sgs->group_capacity > sgs->sum_nr_running)
		sgs->group_has_capacity = 1;
}
负载均衡的第一种情况就简单的介绍到这里,很多细节也没说明白,请多多指教。
0 0
- Kernel调度器负载均衡(一)
- Kernel调度器负载均衡(二)
- 调度子系统4_负载均衡(一)
- linux 调度器负载均衡
- 负载均衡(一)
- 负载均衡(一)
- 负载均衡调度算法
- 负载均衡调度算法
- 负载均衡调度算法
- 负载均衡调度算法
- IIS负载均衡(一)
- linux kernel 负载均衡总结
- 负载均衡概述(一)DNS负载均衡技术
- 负载均衡概述(一)软件级负载均衡…
- 负载均衡(一)为什么需要负载均衡
- 负载均衡调度算法大全
- 负载均衡调度算法大全
- 负载均衡调度算法大全
- AI:**消灭程序员需要一百年吗?
- 字符编码GB2312、GBK、UTF-8的区别
- 运放的各个参数名词解释 (含英文)
- springMvc ajax 操作及配置
- 系统审计学习
- Kernel调度器负载均衡(一)
- C++中const与指针
- maven安装配置JAVA_HOME环境变量
- 关闭liunx防火墙
- CNN toolbox
- Instagram技术窥探,助你打造响应式App
- 数据库——视图
- java 单例模式
- HDU 1026 Ignatius and the Princess I - BFS + 优先队列