Kernel Scheduler Load Balancing (Part 1)

CPU load balancing in the kernel is an enhancement to the scheduler. On a multiprocessor system (SMP/NUMA), the scheduler must balance load across CPUs, which imposes two requirements: 1. CPU load should be shared as fairly as possible among all processors. 2. The kernel must be able to migrate a process from one CPU to another.

The kernel triggers scheduler load balancing at four points:
1. On each timer tick, the periodic scheduler scheduler_tick is invoked; at its end it calls trigger_load_balance, which may raise the load-balancing softirq.
2. In the main scheduler schedule, if the current CPU's runqueue has no runnable tasks, idle_balance is called to pull work over.
3. When a process calls exec, sched_exec rebalances; exec is a cheap migration point because the old address space is about to be replaced.
4. When a process is woken via try_to_wake_up, a balancing decision is made about which CPU it should run on.

The four cases are covered one by one below.
1. Periodic load balancing.
scheduler_tick calls trigger_load_balance, whose code is as follows:
static inline void trigger_load_balance(struct rq *rq, int cpu)
{
#ifdef CONFIG_NO_HZ
    /*
     * If we were in the nohz mode recently and busy at the current
     * scheduler tick, then check if we need to nominate new idle
     * load balancer.
     */
    // In nohz mode, an idle CPU's periodic tick is stopped to save power.
    if (rq->in_nohz_recently && !rq->idle_at_tick) {// this cpu was recently in nohz mode but is busy at this tick
        rq->in_nohz_recently = 0;

        if (atomic_read(&nohz.load_balancer) == cpu) {// this cpu was the idle load balancer; strip it of the role, since it is no longer idle
            cpumask_clear_cpu(cpu, nohz.cpu_mask);
            atomic_set(&nohz.load_balancer, -1);
        }

        if (atomic_read(&nohz.load_balancer) == -1) {// nominate a new idle load balancer and wake it up
            int ilb = find_new_ilb(cpu);

            if (ilb < nr_cpu_ids)
                resched_cpu(ilb);// make that cpu reschedule so it starts doing idle load balancing
        }
    }

    /*
     * If this cpu is idle and doing idle load balancing for all the
     * cpus with ticks stopped, is it time for that to stop?
     */
    if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
        cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {// this cpu is the idle load balancer and every online cpu is in nohz mode: the whole system is idle, so there is no point in balancing
        resched_cpu(cpu);// reschedule on this cpu so it stops doing the idle load balancing
        return;
    }

    /*
     * If this cpu is idle and the idle load balancing is done by
     * someone else, then no need raise the SCHED_SOFTIRQ
     */
    if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
        cpumask_test_cpu(cpu, nohz.cpu_mask))// this cpu is idle and some other cpu is the idle load balancer (it will balance on this cpu's behalf), so there is no need to raise the softirq
        return;
#endif
    /* Don't need to rebalance while attached to NULL domain */
    if (time_after_eq(jiffies, rq->next_balance) &&
        likely(!on_null_domain(cpu)))
        raise_softirq(SCHED_SOFTIRQ);// raise the load-balancing softirq; its handler is run_rebalance_domains
}
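
A side note on the final check: jiffies is an unsigned counter that eventually wraps around, and the kernel's time_after()/time_after_eq() macros compare via a signed subtraction so the test stays correct across the wrap. Here is a minimal user-space sketch of that trick (the macro definitions mirror include/linux/jiffies.h; the sample values are invented):

#include <limits.h>
#include <stdio.h>

/* same idea as the kernel's time_after()/time_after_eq() macros */
#define time_after(a, b)     ((long)((b) - (a)) < 0)
#define time_after_eq(a, b)  ((long)((a) - (b)) >= 0)

int main(void)
{
    unsigned long jiffies = ULONG_MAX - 0x0f;    /* just before wraparound */
    unsigned long next_balance = jiffies + 0x20; /* deadline lands after the wrap */

    /* a naive comparison claims the deadline already passed */
    printf("naive:  %d\n", jiffies >= next_balance);
    /* the signed-difference comparison correctly says it has not */
    printf("kernel: %d\n", time_after_eq(jiffies, next_balance));
    return 0;
}

The naive test prints 1 (wrong), the kernel-style test prints 0, which is why comparisons against rq->next_balance remain safe even near a jiffies wrap.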
Next, the code of run_rebalance_domains:
/*
 * run_rebalance_domains is triggered when needed from the scheduler tick.
 * In CONFIG_NO_HZ case, the idle load balance owner will do the
 * rebalancing for all the cpus for whom scheduler ticks are stopped.
 */
static void run_rebalance_domains(struct softirq_action *h)
{
    int this_cpu = smp_processor_id();
    struct rq *this_rq = cpu_rq(this_cpu);
    enum cpu_idle_type idle = this_rq->idle_at_tick ? CPU_IDLE : CPU_NOT_IDLE;// the balancing code picks different parameters depending on whether this cpu is idle or busy

    rebalance_domains(this_cpu, idle);// walk this cpu's scheduling domains, check for load imbalance, and balance if one exists; details below

#ifdef CONFIG_NO_HZ
    /*
     * If this cpu is the owner for idle load balancing, then do the
     * balancing on behalf of the other idle cpus whose ticks are
     * stopped.
     */
    if (this_rq->idle_at_tick &&
        atomic_read(&nohz.load_balancer) == this_cpu) {
        struct rq *rq;
        int balance_cpu;

        for_each_cpu(balance_cpu, nohz.cpu_mask) {
            if (balance_cpu == this_cpu)
                continue;

            /*
             * If this cpu gets work to do, stop the load balancing
             * work being done for other cpus. Next load
             * balancing owner will pick it up.
             */
            if (need_resched())// if this cpu now has work of its own, stop balancing for the others
                break;

            rebalance_domains(balance_cpu, CPU_IDLE);// balance on behalf of balance_cpu, i.e. migrate tasks from busy cpus onto balance_cpu if there is an imbalance

            rq = cpu_rq(balance_cpu);
            if (time_after(this_rq->next_balance, rq->next_balance))// pull this rq's next_balance forward to the earliest deadline among all the rqs it balances for, so the idle load balancer wakes up in time to service them
                this_rq->next_balance = rq->next_balance;
        }
    }
#endif
}
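
About that last update: keeping this_rq->next_balance at the earliest deadline among the proxied runqueues guarantees the idle load balance owner wakes no later than the first stopped-tick CPU needs servicing. A toy model of that min-tracking (the deadlines are invented):

#include <stdio.h>

#define time_after(a, b) ((long)((b) - (a)) < 0)

int main(void)
{
    /* hypothetical next_balance deadlines (jiffies) of the cpus we balance for */
    unsigned long proxied[] = { 1500, 1220, 1340 };
    unsigned long this_next_balance = 1600;

    for (int i = 0; i < 3; i++)
        if (time_after(this_next_balance, proxied[i]))
            this_next_balance = proxied[i];  /* pull our deadline forward */

    printf("owner must wake by jiffy %lu\n", this_next_balance);  /* 1220 */
    return 0;
}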

The code of rebalance_domains:
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
    int balance = 1;
    struct rq *rq = cpu_rq(cpu);
    unsigned long interval;
    struct sched_domain *sd;
    /* Earliest time when we have to do rebalance again */
    unsigned long next_balance = jiffies + 60*HZ;
    int update_next_balance = 0;
    int need_serialize;

    for_each_domain(cpu, sd) {// walk all of this cpu's scheduling domains, from the lowest level up to the highest
        if (!(sd->flags & SD_LOAD_BALANCE))// this domain is marked as not participating in load balancing
            continue;

        interval = sd->balance_interval;// interval (in ms) between balancing runs
        if (idle != CPU_IDLE)// a busy cpu balances less often: stretch the interval by sd->busy_factor
            interval *= sd->busy_factor;

        /* scale ms to jiffies */
        interval = msecs_to_jiffies(interval);
        if (unlikely(!interval))
            interval = 1;
        if (interval > HZ*NR_CPUS/10)// the balancing interval is capped
            interval = HZ*NR_CPUS/10;

        need_serialize = sd->flags & SD_SERIALIZE;

        if (need_serialize) {// balancing in this domain must be serialized; take the lock
            if (!spin_trylock(&balancing))
                goto out;
        }

        if (time_after_eq(jiffies, sd->last_balance + interval)) {// time for the next balancing run
            if (load_balance(cpu, rq, sd, idle, &balance)) {// check whether this cpu's domain at this level is imbalanced; if it is, this cpu takes over some tasks from the most loaded cpu. Details below.
                /*
                 * We've pulled tasks over so either we're no
                 * longer idle, or one of our SMT siblings is
                 * not idle.
                 */
                idle = CPU_NOT_IDLE;
            }
            sd->last_balance = jiffies;
        }
        if (need_serialize)
            spin_unlock(&balancing);// release the serialization lock
out:
        if (time_after(next_balance, sd->last_balance + interval)) {
            next_balance = sd->last_balance + interval;
            update_next_balance = 1;
        }

        /*
         * Stop the load balance at this level. There is another
         * CPU in our sched group which is doing load balancing more
         * actively.
         */
        if (!balance)// another cpu in our group balances more actively at this level; stop walking up
            break;
    }

    /*
     * next_balance will be updated only when there is a need.
     * When the cpu is attached to null domain for ex, it will not be
     * updated.
     */
    if (likely(update_next_balance))
        rq->next_balance = next_balance;
}
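
The interval arithmetic at the top of the loop is easiest to follow with numbers. Here is a user-space sketch, assuming HZ=1000, NR_CPUS=8, and typical sd values of balance_interval=64ms and busy_factor=32 (msecs_to_jiffies is approximated for HZ values that divide 1000):

#include <stdio.h>

#define HZ       1000  /* assumed tick rate */
#define NR_CPUS  8     /* assumed cpu count */

/* rough stand-in for the kernel helper */
static unsigned long msecs_to_jiffies(unsigned long m)
{
    return m * HZ / 1000;
}

int main(void)
{
    unsigned long balance_interval = 64;  /* ms; stand-in for sd->balance_interval */
    unsigned int busy_factor = 32;        /* stand-in for sd->busy_factor */
    int cpu_idle = 0;                     /* this cpu is busy */

    unsigned long interval = balance_interval;
    if (!cpu_idle)                        /* busy cpus balance less often */
        interval *= busy_factor;

    interval = msecs_to_jiffies(interval);
    if (!interval)                        /* never allow a zero interval */
        interval = 1;
    if (interval > HZ * NR_CPUS / 10)     /* cap the interval */
        interval = HZ * NR_CPUS / 10;

    printf("balance every %lu jiffies\n", interval);  /* 800 here, the cap */
    return 0;
}

A busy CPU would otherwise balance every 64 jiffies; the busy_factor stretch pushes that out to 2048, and the HZ*NR_CPUS/10 cap clamps it back to 800.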
The code of load_balance:
/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */
static int load_balance(int this_cpu, struct rq *this_rq,
            struct sched_domain *sd, enum cpu_idle_type idle,
            int *balance)
{
    int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
    struct sched_group *group;
    unsigned long imbalance;
    struct rq *busiest;
    unsigned long flags;
    struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);

    cpumask_copy(cpus, cpu_active_mask);

    /*
     * When power savings policy is enabled for the parent domain, idle
     * sibling can pick up load irrespective of busy siblings. In this case,
     * let the state of idle sibling percolate up as CPU_IDLE, instead of
     * portraying it as CPU_NOT_IDLE.
     */
    if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
        !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
        sd_idle = 1;

    schedstat_inc(sd, lb_count[idle]);

redo:
    update_shares(sd);
    // find_busiest_group: if this domain is imbalanced, return its most loaded group; if the domain is balanced, power-savings balancing may instead pick the least loaded group so that its load can be moved elsewhere and its cpus put to sleep
    group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, cpus, balance);

    if (*balance == 0)// this cpu is not the appropriate one to balance at this level
        goto out_balanced;

    if (!group) {// no busiest group: the domain is balanced
        schedstat_inc(sd, lb_nobusyg[idle]);
        goto out_balanced;
    }

    busiest = find_busiest_queue(group, idle, imbalance, cpus);// find the most loaded runqueue inside the busiest group
    if (!busiest) {// no busiest runqueue found
        schedstat_inc(sd, lb_nobusyq[idle]);
        goto out_balanced;
    }

    BUG_ON(busiest == this_rq);

    schedstat_add(sd, lb_imbalance[idle], imbalance);

    ld_moved = 0;
    if (busiest->nr_running > 1) {// the busiest runqueue has more than one runnable task
        /*
         * Attempt to move tasks. If find_busiest_group has found
         * an imbalance but busiest->nr_running <= 1, the group is
         * still unbalanced. ld_moved simply stays zero, so it is
         * correctly treated as an imbalance.
         */
        local_irq_save(flags);
        double_rq_lock(this_rq, busiest);
        ld_moved = move_tasks(this_rq, this_cpu, busiest,
                      imbalance, sd, idle, &all_pinned);// pull tasks from the busiest runqueue onto this cpu's runqueue, moving at most imbalance worth of weighted load
        double_rq_unlock(this_rq, busiest);
        local_irq_restore(flags);

        /*
         * some other cpu did the load balance for us.
         */
        if (ld_moved && this_cpu != smp_processor_id())// some other cpu balanced on this_cpu's behalf; this_cpu now has work to do, so make it reschedule
            resched_cpu(this_cpu);

        /* All tasks on this runqueue were pinned by CPU affinity */
        if (unlikely(all_pinned)) {// every task on the busiest runqueue is pinned to its cpu and cannot move; drop that cpu from consideration and, if other cpus remain in the domain, look for the busiest queue again
            cpumask_clear_cpu(cpu_of(busiest),cpus);
            if (!cpumask_empty(cpus))
                goto redo;
            goto out_balanced;
        }
    }

    if (!ld_moved) {// no tasks could be moved
        schedstat_inc(sd, lb_failed[idle]);
        /*
         * Increment the failure counter only on periodic balance.
         * We do not want newidle balance, which can be very
         * frequent, pollute the failure counter causing
         * excessive cache_hot migrations and active balances.
         */
        if (idle != CPU_NEWLY_IDLE)
            sd->nr_balance_failed++;

        if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {

            spin_lock_irqsave(&busiest->lock, flags);

            /* don't kick the migration_thread, if the curr
             * task on busiest cpu can't be moved to this_cpu
             */
            if (!cpumask_test_cpu(this_cpu,
                          &busiest->curr->cpus_allowed)) {
                spin_unlock_irqrestore(&busiest->lock, flags);
                all_pinned = 1;
                goto out_one_pinned;
            }

            if (!busiest->active_balance) {// arrange active balancing: the busiest cpu's migration_thread will "push" a task instead of us pulling one
                busiest->active_balance = 1;
                busiest->push_cpu = this_cpu;// push the task to this cpu
                active_balance = 1;
            }
            spin_unlock_irqrestore(&busiest->lock, flags);
            if (active_balance)
                wake_up_process(busiest->migration_thread);// wake up the migration thread

            /*
             * We've kicked active balancing, reset the failure
             * counter.
             */
            sd->nr_balance_failed = sd->cache_nice_tries+1;
        }
    } else
        sd->nr_balance_failed = 0;

    if (likely(!active_balance)) {
        /* We were unbalanced, so reset the balancing interval */
        sd->balance_interval = sd->min_interval;
    } else {
        /*
         * If we've begun active balancing, start to back off. This
         * case may not be covered by the all_pinned logic if there
         * is only 1 task on the busy runqueue (because we don't call
         * move_tasks).
         */
        if (sd->balance_interval < sd->max_interval)
            sd->balance_interval *= 2;
    }

    if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
        !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
        ld_moved = -1;

    goto out;

out_balanced:
    schedstat_inc(sd, lb_balanced[idle]);

    sd->nr_balance_failed = 0;

out_one_pinned:
    /* tune up the balancing interval */
    if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
            (sd->balance_interval < sd->max_interval))
        sd->balance_interval *= 2;

    if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
        !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
        ld_moved = -1;
    else
        ld_moved = 0;
out:
    if (ld_moved)
        update_shares(sd);
    return ld_moved;
}
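
The interval tuning at the end is an exponential backoff: a clean balance resets sd->balance_interval to min_interval, while active balancing (and, at out_one_pinned, pinned tasks) doubles it up to max_interval, so hopeless domains get probed less often. A simplified model of that policy (the interval bounds are invented):

#include <stdio.h>

struct sd_tune {
    unsigned long balance_interval;  /* current interval, ms */
    unsigned long min_interval;      /* reset target on a clean balance */
    unsigned long max_interval;      /* backoff ceiling */
};

static void tune_interval(struct sd_tune *sd, int active_balance)
{
    if (!active_balance)
        sd->balance_interval = sd->min_interval;  /* balanced normally: probe eagerly again */
    else if (sd->balance_interval < sd->max_interval)
        sd->balance_interval *= 2;                /* back off */
}

int main(void)
{
    struct sd_tune sd = { 64, 64, 1024 };

    for (int i = 0; i < 5; i++) {   /* five rounds that ended in active balancing */
        tune_interval(&sd, 1);
        printf("round %d: interval = %lu ms\n", i + 1, sd.balance_interval);
    }
    tune_interval(&sd, 0);          /* a clean balance resets it */
    printf("after clean balance: interval = %lu ms\n", sd.balance_interval);
    return 0;
}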

static struct sched_group *find_busiest_group(struct sched_domain *sd, int this_cpu,
           unsigned long *imbalance, enum cpu_idle_type idle,
           int *sd_idle, const struct cpumask *cpus, int *balance)
{
    struct sd_lb_stats sds;

    memset(&sds, 0, sizeof(sds));

    /*
     * Compute the various statistics relavent for load balancing at
     * this level.
     */
    update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, balance, &sds);// gather this domain's load-balancing statistics; details below

    /* Cases where imbalance does not exist from POV of this_cpu */
    /* 1) this_cpu is not the appropriate cpu to perform load balancing
     *    at this level.
     * 2) There is no busy sibling group to pull from.
     * 3) This group is the busiest group.
     * 4) This group is more busy than the avg busieness at this
     *    sched_domain.
     * 5) The imbalance is within the specified limit.
     *
     * Note: when doing newidle balance, if the local group has excess
     * capacity (i.e. nr_running < group_capacity) and the busiest group
     * does not have any capacity, we force a load balance to pull tasks
     * to the local group. In this case, we skip past checks 3, 4 and 5.
     */
    if (balance && !(*balance))// this cpu is not the appropriate one to balance at this level
        goto ret;

    if (!sds.busiest || sds.busiest_nr_running == 0)// no busiest group in this domain, or the busiest group has no runnable tasks
        goto out_balanced;

    /*  SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
    if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
            !sds.busiest_has_capacity)// only balancing initiated from the main scheduler can take this branch (idle == CPU_NEWLY_IDLE)
        goto force_balance;

    if (sds.this_load >= sds.max_load)// this_cpu's group is at least as loaded as every other group in the domain; it is already the busiest, so there is nothing for it to pull
        goto out_balanced;

    sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;// compute the domain's average load

    if (sds.this_load >= sds.avg_load)// this_cpu's group is already above the domain average, so it is not a suitable target for more load
        goto out_balanced;

    /*
     * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
     * And to check for busy balance use !idle_cpu instead of
     * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
     * even when they are idle.
     */
    if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
        if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
            goto out_balanced;
    } else {
        /*
         * This cpu is idle. If the busiest group load doesn't
         * have more tasks than the number of available cpu's and
         * there is no imbalance between this and busiest group
         * wrt to idle cpu's, it is balanced.
         */
        if ((sds.this_idle_cpus  <= sds.busiest_idle_cpus + 1) &&
            sds.busiest_nr_running <= sds.busiest_group_weight)
            goto out_balanced;
    }

force_balance:
    /* Looks like there is an imbalance. Compute it */
    calculate_imbalance(&sds, this_cpu, imbalance);
    return sds.busiest;

out_balanced:
    /*
     * There is no obvious imbalance. But check if we can do some balancing
     * to save power.
     */
    if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
        return sds.busiest;
ret:
    *imbalance = 0;
    return NULL;
}
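
The percentage test near the end is the core damping mechanism for busy CPUs: with a typical imbalance_pct of 125, 100 * max_load <= 125 * this_load treats anything up to 25% heavier than the local group as balanced, so tasks are not bounced around for marginal gains. Worked numerically (the loads are invented):

#include <stdio.h>

int main(void)
{
    unsigned int imbalance_pct = 125;  /* typical sd->imbalance_pct */
    unsigned long this_load = 1000;    /* local group's normalized load */
    unsigned long cases[] = { 1200, 1300 };  /* busiest group: under/over the margin */

    for (int i = 0; i < 2; i++) {
        unsigned long max_load = cases[i];
        if (100 * max_load <= (unsigned long)imbalance_pct * this_load)
            printf("max_load=%lu: within the margin, treated as balanced\n", max_load);
        else
            printf("max_load=%lu: imbalance, try to pull\n", max_load);
    }
    return 0;
}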

static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
            enum cpu_idle_type idle, int *sd_idle,
            const struct cpumask *cpus, int *balance,
            struct sd_lb_stats *sds)
{
    struct sched_domain *child = sd->child;
    struct sched_group *group = sd->groups;
    struct sg_lb_stats sgs;
    int load_idx, prefer_sibling = 0;

    if (child && child->flags & SD_PREFER_SIBLING)
        prefer_sibling = 1;

    init_sd_power_savings_stats(sd, sds, idle);
    load_idx = get_sd_load_idx(sd, idle);

    do {// walk every scheduling group in this domain
        int local_group;

        local_group = cpumask_test_cpu(this_cpu,
                           sched_group_cpus(group));// does this_cpu belong to the group currently being examined?
        memset(&sgs, 0, sizeof(sgs));
        update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
                local_group, cpus, balance, &sgs);// compute this group's load-balancing statistics

        if (local_group && balance && !(*balance))
            return;

        sds->total_load += sgs.group_load;// accumulate the group's load into the domain total
        sds->total_pwr += group->cpu_power;// accumulate the group's cpu processing power

        /*
         * In case the child domain prefers tasks go to siblings
         * first, lower the group capacity to one so that we'll try
         * and move all the excess tasks away. We lower the capacity
         * of a group only if the local group has the capacity to fit
         * these excess tasks, i.e. nr_running < group_capacity. The
         * extra check prevents the case where you always pull from the
         * heaviest group when it is already under-utilized (possible
         * with a large weight task outweighs the tasks on the system).
         */
        if (prefer_sibling && !local_group && sds->this_has_capacity)
            sgs.group_capacity = min(sgs.group_capacity, 1UL);

        if (local_group) {// this_cpu's own group is recorded separately from the other groups in the domain; see update_sg_lb_stats for where these fields come from
            sds->this_load = sgs.avg_load;
            sds->this = group;
            sds->this_nr_running = sgs.sum_nr_running;
            sds->this_load_per_task = sgs.sum_weighted_load;
            sds->this_has_capacity = sgs.group_has_capacity;
            sds->this_idle_cpus = sgs.idle_cpus;
        } else if (sgs.avg_load > sds->max_load &&
               (sgs.sum_nr_running > sgs.group_capacity ||
                sgs.group_imb)) {
            sds->max_load = sgs.avg_load;
            sds->busiest = group;
            sds->busiest_nr_running = sgs.sum_nr_running;
            sds->busiest_idle_cpus = sgs.idle_cpus;
            sds->busiest_group_capacity = sgs.group_capacity;
            sds->busiest_group_weight = sgs.group_weight;
            sds->busiest_load_per_task = sgs.sum_weighted_load;
            sds->busiest_has_capacity = sgs.group_has_capacity;
            sds->group_imb = sgs.group_imb;
        }

        update_sd_power_savings_stats(group, sds, local_group, &sgs);
        group = group->next;
    } while (group != sd->groups);
}
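
Note the loop shape: a domain's sched_groups form a circular singly linked list, so the do/while walks until it returns to sd->groups. A stripped-down model of that traversal and of how total_load and total_pwr later yield the domain average (sample numbers invented):

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

struct group {
    unsigned long load;       /* stand-in for sgs.group_load */
    unsigned long cpu_power;  /* stand-in for group->cpu_power */
    struct group *next;       /* circular, like sched_group->next */
};

int main(void)
{
    struct group g2 = { 512, 1024, NULL };
    struct group g1 = { 3072, 2048, &g2 };
    g2.next = &g1;  /* close the ring */

    unsigned long total_load = 0, total_pwr = 0;
    struct group *first = &g1, *group = first;
    do {  /* same shape as the kernel loop */
        total_load += group->load;
        total_pwr  += group->cpu_power;
        group = group->next;
    } while (group != first);

    /* the domain-wide average later computed in find_busiest_group */
    printf("avg_load = %lu\n", SCHED_LOAD_SCALE * total_load / total_pwr);
    return 0;
}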

static inline void update_sg_lb_stats(struct sched_domain *sd,
            struct sched_group *group, int this_cpu,
            enum cpu_idle_type idle, int load_idx, int *sd_idle,
            int local_group, const struct cpumask *cpus,
            int *balance, struct sg_lb_stats *sgs)
{
    unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
    int i;
    unsigned int balance_cpu = -1, first_idle_cpu = 0;
    unsigned long avg_load_per_task = 0;

    if (local_group) {
        balance_cpu = group_first_cpu(group);
        if (balance_cpu == this_cpu)
            update_group_power(sd, this_cpu);
    }

    /* Tally up the load of all CPUs in the group */
    max_cpu_load = 0;
    min_cpu_load = ~0UL;
    max_nr_running = 0;

    for_each_cpu_and(i, sched_group_cpus(group), cpus) {// walk every cpu in this group (there may be only one)
        struct rq *rq = cpu_rq(i);

        if (*sd_idle && rq->nr_running)
            *sd_idle = 0;

        /* Bias balancing toward cpus of our domain */
        if (local_group) {
            if (idle_cpu(i) && !first_idle_cpu) {// remember the first idle cpu in this_cpu's group
                first_idle_cpu = 1;
                balance_cpu = i;
            }

            load = target_load(i, load_idx);// take the max of this runqueue's current load and its historical load; load_idx was chosen in update_sd_lb_stats based on this_cpu's idle state
        } else {
            load = source_load(i, load_idx);// take the min of this runqueue's current load and its historical load
            if (load > max_cpu_load) {
                max_cpu_load = load;
                max_nr_running = rq->nr_running;
            }
            if (min_cpu_load > load)
                min_cpu_load = load;
        }
        // accumulate this group's load-balancing statistics
        sgs->group_load += load;
        sgs->sum_nr_running += rq->nr_running;
        sgs->sum_weighted_load += weighted_cpuload(i);
        if (idle_cpu(i))
            sgs->idle_cpus++;
    }

    /*
     * First idle cpu or the first cpu(busiest) in this sched group
     * is eligible for doing load balancing at this and above
     * domains. In the newly idle case, we will allow all the cpu's
     * to do the newly idle load balance.
     */
    if (idle != CPU_NEWLY_IDLE && local_group &&
        balance_cpu != this_cpu && balance) {
        *balance = 0;
        return;
    }

    /* Adjust by relative CPU power of the group */
    sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;

    /*
     * Consider the group unbalanced when the imbalance is larger
     * than the average weight of two tasks.
     *
     * APZ: with cgroup the avg task weight can vary wildly and
     *      might not be a suitable number - should we keep a
     *      normalized nr_running number somewhere that negates
     *      the hierarchy?
     */
    if (sgs->sum_nr_running)
        avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;

    if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
        sgs->group_imb = 1;

    sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
    sgs->group_weight = group->group_weight;

    if (sgs->group_capacity > sgs->sum_nr_running)
        sgs->group_has_capacity = 1;
}
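
The closing statistics are plain arithmetic and read best with numbers. A worked sketch, assuming SCHED_LOAD_SCALE = 1024 and an invented dual-core group (cpu_power 2048) running three tasks:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

int main(void)
{
    /* invented per-group inputs */
    unsigned long group_load = 3072, cpu_power = 2048;
    unsigned long sum_weighted_load = 3072, sum_nr_running = 3;
    unsigned long max_cpu_load = 2048, min_cpu_load = 1024, max_nr_running = 2;

    /* load normalized by the group's processing power */
    unsigned long avg_load = group_load * SCHED_LOAD_SCALE / cpu_power;
    unsigned long avg_load_per_task = sum_weighted_load / sum_nr_running;
    /* a dual-core group has capacity for about two tasks */
    unsigned long group_capacity = DIV_ROUND_CLOSEST(cpu_power, SCHED_LOAD_SCALE);

    int group_imb = (max_cpu_load - min_cpu_load) > 2 * avg_load_per_task &&
                    max_nr_running > 1;
    int group_has_capacity = group_capacity > sum_nr_running;

    printf("avg_load=%lu capacity=%lu imb=%d has_capacity=%d\n",
           avg_load, group_capacity, group_imb, group_has_capacity);  /* 1536 2 0 0 */
    return 0;
}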
That wraps up a brief look at the first load-balancing case; many details remain unexplained, and corrections are welcome.