Kernel Scheduler Load Balancing (Part 1)

CPU load balancing in the kernel is an enhancement to the scheduler. On a multiprocessor system (SMP/NUMA), the scheduler must balance load across CPUs, which imposes two requirements: 1. CPU load should be shared as fairly as possible among all processors. 2. The kernel must be able to migrate a process from one CPU to another.

The kernel triggers scheduler load balancing at four points:
1. On each timer tick, the periodic scheduler scheduler_tick is invoked; at its end it calls trigger_load_balance, which may raise the load-balancing softirq.
2. In the main scheduler schedule, if the current CPU's runqueue has no runnable tasks, idle_balance is called to pull work over.
3. When a process calls exec, sched_exec rebalances; exec is a cheap migration point because the old address space is about to be replaced.
4. When a process is woken via try_to_wake_up, a balancing decision is made about which CPU it should run on.

The four cases are covered one by one below.
1. Periodic load balancing.
scheduler_tick calls trigger_load_balance, whose code is as follows:
static inline void trigger_load_balance(struct rq *rq, int cpu)
{
#ifdef CONFIG_NO_HZ
    /*
     * If we were in the nohz mode recently and busy at the current
     * scheduler tick, then check if we need to nominate new idle
     * load balancer.
     */
    // In nohz mode, an idle CPU's periodic tick is stopped to save power.
    if (rq->in_nohz_recently && !rq->idle_at_tick) {// this cpu was recently in nohz mode but is busy at this tick
        rq->in_nohz_recently = 0;

        if (atomic_read(&nohz.load_balancer) == cpu) {// this cpu was the idle load balancer; strip it of the role, since it is no longer idle
            cpumask_clear_cpu(cpu, nohz.cpu_mask);
            atomic_set(&nohz.load_balancer, -1);
        }

        if (atomic_read(&nohz.load_balancer) == -1) {// nominate a new idle load balancer and wake it up
            int ilb = find_new_ilb(cpu);

            if (ilb < nr_cpu_ids)
                resched_cpu(ilb);// make that cpu reschedule so it starts doing idle load balancing
        }
    }

    /*
     * If this cpu is idle and doing idle load balancing for all the
     * cpus with ticks stopped, is it time for that to stop?
     */
    if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
        cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {// this cpu is the idle load balancer and every online cpu is in nohz mode: the whole system is idle, so there is no point in balancing
        resched_cpu(cpu);// reschedule on this cpu so it stops doing the idle load balancing
        return;
    }

    /*
     * If this cpu is idle and the idle load balancing is done by
     * someone else, then no need raise the SCHED_SOFTIRQ
     */
    if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
        cpumask_test_cpu(cpu, nohz.cpu_mask))// this cpu is idle and some other cpu is the idle load balancer (it will balance on this cpu's behalf), so there is no need to raise the softirq
        return;
#endif
    /* Don't need to rebalance while attached to NULL domain */
    if (time_after_eq(jiffies, rq->next_balance) &&
        likely(!on_null_domain(cpu)))
        raise_softirq(SCHED_SOFTIRQ);// raise the load-balancing softirq; its handler is run_rebalance_domains
}
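
A side note on the final check: jiffies is an unsigned counter that eventually wraps around, and the kernel's time_after()/time_after_eq() macros compare via a signed subtraction so the test stays correct across the wrap. Here is a minimal user-space sketch of that trick (the macro definitions mirror include/linux/jiffies.h; the sample values are invented):

#include <limits.h>
#include <stdio.h>

/* same idea as the kernel's time_after()/time_after_eq() macros */
#define time_after(a, b)     ((long)((b) - (a)) < 0)
#define time_after_eq(a, b)  ((long)((a) - (b)) >= 0)

int main(void)
{
    unsigned long jiffies = ULONG_MAX - 0x0f;    /* just before wraparound */
    unsigned long next_balance = jiffies + 0x20; /* deadline lands after the wrap */

    /* a naive comparison claims the deadline already passed */
    printf("naive:  %d\n", jiffies >= next_balance);
    /* the signed-difference comparison correctly says it has not */
    printf("kernel: %d\n", time_after_eq(jiffies, next_balance));
    return 0;
}

The naive test prints 1 (wrong), the kernel-style test prints 0, which is why comparisons against rq->next_balance remain safe even near a jiffies wrap.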
Next, the code of run_rebalance_domains:
/*
 * run_rebalance_domains is triggered when needed from the scheduler tick.
 * In CONFIG_NO_HZ case, the idle load balance owner will do the
 * rebalancing for all the cpus for whom scheduler ticks are stopped.
 */
static void run_rebalance_domains(struct softirq_action *h)
{
    int this_cpu = smp_processor_id();
    struct rq *this_rq = cpu_rq(this_cpu);
    enum cpu_idle_type idle = this_rq->idle_at_tick ? CPU_IDLE : CPU_NOT_IDLE;// the balancing code picks different parameters depending on whether this cpu is idle or busy

    rebalance_domains(this_cpu, idle);// walk this cpu's scheduling domains, check for load imbalance, and balance if one exists; details below

#ifdef CONFIG_NO_HZ
    /*
     * If this cpu is the owner for idle load balancing, then do the
     * balancing on behalf of the other idle cpus whose ticks are
     * stopped.
     */
    if (this_rq->idle_at_tick &&
        atomic_read(&nohz.load_balancer) == this_cpu) {
        struct rq *rq;
        int balance_cpu;

        for_each_cpu(balance_cpu, nohz.cpu_mask) {
            if (balance_cpu == this_cpu)
                continue;

            /*
             * If this cpu gets work to do, stop the load balancing
             * work being done for other cpus. Next load
             * balancing owner will pick it up.
             */
            if (need_resched())// if this cpu now has work of its own, stop balancing for the others
                break;

            rebalance_domains(balance_cpu, CPU_IDLE);// balance on behalf of balance_cpu, i.e. migrate tasks from busy cpus onto balance_cpu if there is an imbalance

            rq = cpu_rq(balance_cpu);
            if (time_after(this_rq->next_balance, rq->next_balance))// pull this rq's next_balance forward to the earliest deadline among all the rqs it balances for, so the idle load balancer wakes up in time to service them
                this_rq->next_balance = rq->next_balance;
        }
    }
#endif
}
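
About that last update: keeping this_rq->next_balance at the earliest deadline among the proxied runqueues guarantees the idle load balance owner wakes no later than the first stopped-tick CPU needs servicing. A toy model of that min-tracking (the deadlines are invented):

#include <stdio.h>

#define time_after(a, b) ((long)((b) - (a)) < 0)

int main(void)
{
    /* hypothetical next_balance deadlines (jiffies) of the cpus we balance for */
    unsigned long proxied[] = { 1500, 1220, 1340 };
    unsigned long this_next_balance = 1600;

    for (int i = 0; i < 3; i++)
        if (time_after(this_next_balance, proxied[i]))
            this_next_balance = proxied[i];  /* pull our deadline forward */

    printf("owner must wake by jiffy %lu\n", this_next_balance);  /* 1220 */
    return 0;
}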

The code of rebalance_domains:
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
    int balance = 1;
    struct rq *rq = cpu_rq(cpu);
    unsigned long interval;
    struct sched_domain *sd;
    /* Earliest time when we have to do rebalance again */
    unsigned long next_balance = jiffies + 60*HZ;
    int update_next_balance = 0;
    int need_serialize;

    for_each_domain(cpu, sd) {// walk all of this cpu's scheduling domains, from the lowest level up to the highest
        if (!(sd->flags & SD_LOAD_BALANCE))// this domain is marked as not participating in load balancing
            continue;

        interval = sd->balance_interval;// interval (in ms) between balancing runs
        if (idle != CPU_IDLE)// a busy cpu balances less often: stretch the interval by sd->busy_factor
            interval *= sd->busy_factor;

        /* scale ms to jiffies */
        interval = msecs_to_jiffies(interval);
        if (unlikely(!interval))
            interval = 1;
        if (interval > HZ*NR_CPUS/10)// the balancing interval is capped
            interval = HZ*NR_CPUS/10;

        need_serialize = sd->flags & SD_SERIALIZE;

        if (need_serialize) {// balancing in this domain must be serialized; take the lock
            if (!spin_trylock(&balancing))
                goto out;
        }

        if (time_after_eq(jiffies, sd->last_balance + interval)) {// time for the next balancing run
            if (load_balance(cpu, rq, sd, idle, &balance)) {// check whether this cpu's domain at this level is imbalanced; if it is, this cpu takes over some tasks from the most loaded cpu. Details below.
                /*
                 * We've pulled tasks over so either we're no
                 * longer idle, or one of our SMT siblings is
                 * not idle.
                 */
                idle = CPU_NOT_IDLE;
            }
            sd->last_balance = jiffies;
        }
        if (need_serialize)
            spin_unlock(&balancing);// release the serialization lock
out:
        if (time_after(next_balance, sd->last_balance + interval)) {
            next_balance = sd->last_balance + interval;
            update_next_balance = 1;
        }

        /*
         * Stop the load balance at this level. There is another
         * CPU in our sched group which is doing load balancing more
         * actively.
         */
        if (!balance)// another cpu in our group balances more actively at this level; stop walking up
            break;
    }

    /*
     * next_balance will be updated only when there is a need.
     * When the cpu is attached to null domain for ex, it will not be
     * updated.
     */
    if (likely(update_next_balance))
        rq->next_balance = next_balance;
}
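
The interval arithmetic at the top of the loop is easiest to follow with numbers. Here is a user-space sketch, assuming HZ=1000, NR_CPUS=8, and typical sd values of balance_interval=64ms and busy_factor=32 (msecs_to_jiffies is approximated for HZ values that divide 1000):

#include <stdio.h>

#define HZ       1000  /* assumed tick rate */
#define NR_CPUS  8     /* assumed cpu count */

/* rough stand-in for the kernel helper */
static unsigned long msecs_to_jiffies(unsigned long m)
{
    return m * HZ / 1000;
}

int main(void)
{
    unsigned long balance_interval = 64;  /* ms; stand-in for sd->balance_interval */
    unsigned int busy_factor = 32;        /* stand-in for sd->busy_factor */
    int cpu_idle = 0;                     /* this cpu is busy */

    unsigned long interval = balance_interval;
    if (!cpu_idle)                        /* busy cpus balance less often */
        interval *= busy_factor;

    interval = msecs_to_jiffies(interval);
    if (!interval)                        /* never allow a zero interval */
        interval = 1;
    if (interval > HZ * NR_CPUS / 10)     /* cap the interval */
        interval = HZ * NR_CPUS / 10;

    printf("balance every %lu jiffies\n", interval);  /* 800 here, the cap */
    return 0;
}

A busy CPU would otherwise balance every 64 jiffies; the busy_factor stretch pushes that out to 2048, and the HZ*NR_CPUS/10 cap clamps it back to 800.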
The code of load_balance:
/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */
static int load_balance(int this_cpu, struct rq *this_rq,
            struct sched_domain *sd, enum cpu_idle_type idle,
            int *balance)
{
    int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
    struct sched_group *group;
    unsigned long imbalance;
    struct rq *busiest;
    unsigned long flags;
    struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);

    cpumask_copy(cpus, cpu_active_mask);

    /*
     * When power savings policy is enabled for the parent domain, idle
     * sibling can pick up load irrespective of busy siblings. In this case,
     * let the state of idle sibling percolate up as CPU_IDLE, instead of
     * portraying it as CPU_NOT_IDLE.
     */
    if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
        !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
        sd_idle = 1;

    schedstat_inc(sd, lb_count[idle]);

redo:
    update_shares(sd);
    // find_busiest_group: if this domain is imbalanced, return its most loaded group; if the domain is balanced, power-savings balancing may instead pick the least loaded group so that its load can be moved elsewhere and its cpus put to sleep
    group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, cpus, balance);

    if (*balance == 0)// this cpu is not the appropriate one to balance at this level
        goto out_balanced;

    if (!group) {// no busiest group: the domain is balanced
        schedstat_inc(sd, lb_nobusyg[idle]);
        goto out_balanced;
    }

    busiest = find_busiest_queue(group, idle, imbalance, cpus);// find the most loaded runqueue inside the busiest group
    if (!busiest) {// no busiest runqueue found
        schedstat_inc(sd, lb_nobusyq[idle]);
        goto out_balanced;
    }

    BUG_ON(busiest == this_rq);

    schedstat_add(sd, lb_imbalance[idle], imbalance);

    ld_moved = 0;
    if (busiest->nr_running > 1) {// the busiest runqueue has more than one runnable task
        /*
         * Attempt to move tasks. If find_busiest_group has found
         * an imbalance but busiest->nr_running <= 1, the group is
         * still unbalanced. ld_moved simply stays zero, so it is
         * correctly treated as an imbalance.
         */
        local_irq_save(flags);
        double_rq_lock(this_rq, busiest);
        ld_moved = move_tasks(this_rq, this_cpu, busiest,
                      imbalance, sd, idle, &all_pinned);// pull tasks from the busiest runqueue onto this cpu's runqueue, moving at most imbalance worth of weighted load
        double_rq_unlock(this_rq, busiest);
        local_irq_restore(flags);

        /*
         * some other cpu did the load balance for us.
         */
        if (ld_moved && this_cpu != smp_processor_id())// some other cpu balanced on this_cpu's behalf; this_cpu now has work to do, so make it reschedule
            resched_cpu(this_cpu);

        /* All tasks on this runqueue were pinned by CPU affinity */
        if (unlikely(all_pinned)) {// every task on the busiest runqueue is pinned to its cpu and cannot move; drop that cpu from consideration and, if other cpus remain in the domain, look for the busiest queue again
            cpumask_clear_cpu(cpu_of(busiest),cpus);
            if (!cpumask_empty(cpus))
                goto redo;
            goto out_balanced;
        }
    }

    if (!ld_moved) {// no tasks could be moved
        schedstat_inc(sd, lb_failed[idle]);
        /*
         * Increment the failure counter only on periodic balance.
         * We do not want newidle balance, which can be very
         * frequent, pollute the failure counter causing
         * excessive cache_hot migrations and active balances.
         */
        if (idle != CPU_NEWLY_IDLE)
            sd->nr_balance_failed++;

        if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {

            spin_lock_irqsave(&busiest->lock, flags);

            /* don't kick the migration_thread, if the curr
             * task on busiest cpu can't be moved to this_cpu
             */
            if (!cpumask_test_cpu(this_cpu,
                          &busiest->curr->cpus_allowed)) {
                spin_unlock_irqrestore(&busiest->lock, flags);
                all_pinned = 1;
                goto out_one_pinned;
            }

            if (!busiest->active_balance) {// arrange active balancing: the busiest cpu's migration_thread will "push" a task instead of us pulling one
                busiest->active_balance = 1;
                busiest->push_cpu = this_cpu;// push the task to this cpu
                active_balance = 1;
            }
            spin_unlock_irqrestore(&busiest->lock, flags);
            if (active_balance)
                wake_up_process(busiest->migration_thread);// wake up the migration thread

            /*
             * We've kicked active balancing, reset the failure
             * counter.
             */
            sd->nr_balance_failed = sd->cache_nice_tries+1;
        }
    } else
        sd->nr_balance_failed = 0;

    if (likely(!active_balance)) {
        /* We were unbalanced, so reset the balancing interval */
        sd->balance_interval = sd->min_interval;
    } else {
        /*
         * If we've begun active balancing, start to back off. This
         * case may not be covered by the all_pinned logic if there
         * is only 1 task on the busy runqueue (because we don't call
         * move_tasks).
         */
        if (sd->balance_interval < sd->max_interval)
            sd->balance_interval *= 2;
    }

    if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
        !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
        ld_moved = -1;

    goto out;

out_balanced:
    schedstat_inc(sd, lb_balanced[idle]);

    sd->nr_balance_failed = 0;

out_one_pinned:
    /* tune up the balancing interval */
    if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
            (sd->balance_interval < sd->max_interval))
        sd->balance_interval *= 2;

    if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
        !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
        ld_moved = -1;
    else
        ld_moved = 0;
out:
    if (ld_moved)
        update_shares(sd);
    return ld_moved;
}
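
The interval tuning at the end is an exponential backoff: a clean balance resets sd->balance_interval to min_interval, while active balancing (and, at out_one_pinned, pinned tasks) doubles it up to max_interval, so hopeless domains get probed less often. A simplified model of that policy (the interval bounds are invented):

#include <stdio.h>

struct sd_tune {
    unsigned long balance_interval;  /* current interval, ms */
    unsigned long min_interval;      /* reset target on a clean balance */
    unsigned long max_interval;      /* backoff ceiling */
};

static void tune_interval(struct sd_tune *sd, int active_balance)
{
    if (!active_balance)
        sd->balance_interval = sd->min_interval;  /* balanced normally: probe eagerly again */
    else if (sd->balance_interval < sd->max_interval)
        sd->balance_interval *= 2;                /* back off */
}

int main(void)
{
    struct sd_tune sd = { 64, 64, 1024 };

    for (int i = 0; i < 5; i++) {   /* five rounds that ended in active balancing */
        tune_interval(&sd, 1);
        printf("round %d: interval = %lu ms\n", i + 1, sd.balance_interval);
    }
    tune_interval(&sd, 0);          /* a clean balance resets it */
    printf("after clean balance: interval = %lu ms\n", sd.balance_interval);
    return 0;
}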

static struct sched_group *find_busiest_group(struct sched_domain *sd, int this_cpu,
           unsigned long *imbalance, enum cpu_idle_type idle,
           int *sd_idle, const struct cpumask *cpus, int *balance)
{
    struct sd_lb_stats sds;

    memset(&sds, 0, sizeof(sds));

    /*
     * Compute the various statistics relavent for load balancing at
     * this level.
     */
    update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, balance, &sds);// gather this domain's load-balancing statistics; details below

    /* Cases where imbalance does not exist from POV of this_cpu */
    /* 1) this_cpu is not the appropriate cpu to perform load balancing
     *    at this level.
     * 2) There is no busy sibling group to pull from.
     * 3) This group is the busiest group.
     * 4) This group is more busy than the avg busieness at this
     *    sched_domain.
     * 5) The imbalance is within the specified limit.
     *
     * Note: when doing newidle balance, if the local group has excess
     * capacity (i.e. nr_running < group_capacity) and the busiest group
     * does not have any capacity, we force a load balance to pull tasks
     * to the local group. In this case, we skip past checks 3, 4 and 5.
     */
    if (balance && !(*balance))// this cpu is not the appropriate one to balance at this level
        goto ret;

    if (!sds.busiest || sds.busiest_nr_running == 0)// no busiest group in this domain, or the busiest group has no runnable tasks
        goto out_balanced;

    /*  SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
    if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
            !sds.busiest_has_capacity)// only balancing initiated from the main scheduler can take this branch (idle == CPU_NEWLY_IDLE)
        goto force_balance;

    if (sds.this_load >= sds.max_load)// this_cpu's group is at least as loaded as every other group in the domain; it is already the busiest, so there is nothing for it to pull
        goto out_balanced;

    sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;// compute the domain's average load

    if (sds.this_load >= sds.avg_load)// this_cpu's group is already above the domain average, so it is not a suitable target for more load
        goto out_balanced;

    /*
     * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
     * And to check for busy balance use !idle_cpu instead of
     * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
     * even when they are idle.
     */
    if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
        if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
            goto out_balanced;
    } else {
        /*
         * This cpu is idle. If the busiest group load doesn't
         * have more tasks than the number of available cpu's and
         * there is no imbalance between this and busiest group
         * wrt to idle cpu's, it is balanced.
         */
        if ((sds.this_idle_cpus  <= sds.busiest_idle_cpus + 1) &&
            sds.busiest_nr_running <= sds.busiest_group_weight)
            goto out_balanced;
    }

force_balance:
    /* Looks like there is an imbalance. Compute it */
    calculate_imbalance(&sds, this_cpu, imbalance);
    return sds.busiest;

out_balanced:
    /*
     * There is no obvious imbalance. But check if we can do some balancing
     * to save power.
     */
    if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
        return sds.busiest;
ret:
    *imbalance = 0;
    return NULL;
}
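
The percentage test near the end is the core damping mechanism for busy CPUs: with a typical imbalance_pct of 125, 100 * max_load <= 125 * this_load treats anything up to 25% heavier than the local group as balanced, so tasks are not bounced around for marginal gains. Worked numerically (the loads are invented):

#include <stdio.h>

int main(void)
{
    unsigned int imbalance_pct = 125;  /* typical sd->imbalance_pct */
    unsigned long this_load = 1000;    /* local group's normalized load */
    unsigned long cases[] = { 1200, 1300 };  /* busiest group: under/over the margin */

    for (int i = 0; i < 2; i++) {
        unsigned long max_load = cases[i];
        if (100 * max_load <= (unsigned long)imbalance_pct * this_load)
            printf("max_load=%lu: within the margin, treated as balanced\n", max_load);
        else
            printf("max_load=%lu: imbalance, try to pull\n", max_load);
    }
    return 0;
}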

static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
            enum cpu_idle_type idle, int *sd_idle,
            const struct cpumask *cpus, int *balance,
            struct sd_lb_stats *sds)
{
    struct sched_domain *child = sd->child;
    struct sched_group *group = sd->groups;
    struct sg_lb_stats sgs;
    int load_idx, prefer_sibling = 0;

    if (child && child->flags & SD_PREFER_SIBLING)
        prefer_sibling = 1;

    init_sd_power_savings_stats(sd, sds, idle);
    load_idx = get_sd_load_idx(sd, idle);

    do {// walk every scheduling group in this domain
        int local_group;

        local_group = cpumask_test_cpu(this_cpu,
                           sched_group_cpus(group));// does this_cpu belong to the group currently being examined?
        memset(&sgs, 0, sizeof(sgs));
        update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
                local_group, cpus, balance, &sgs);// compute this group's load-balancing statistics

        if (local_group && balance && !(*balance))
            return;

        sds->total_load += sgs.group_load;// accumulate the group's load into the domain total
        sds->total_pwr += group->cpu_power;// accumulate the group's cpu processing power

        /*
         * In case the child domain prefers tasks go to siblings
         * first, lower the group capacity to one so that we'll try
         * and move all the excess tasks away. We lower the capacity
         * of a group only if the local group has the capacity to fit
         * these excess tasks, i.e. nr_running < group_capacity. The
         * extra check prevents the case where you always pull from the
         * heaviest group when it is already under-utilized (possible
         * with a large weight task outweighs the tasks on the system).
         */
        if (prefer_sibling && !local_group && sds->this_has_capacity)
            sgs.group_capacity = min(sgs.group_capacity, 1UL);

        if (local_group) {// this_cpu's own group is recorded separately from the other groups in the domain; see update_sg_lb_stats for where these fields come from
            sds->this_load = sgs.avg_load;
            sds->this = group;
            sds->this_nr_running = sgs.sum_nr_running;
            sds->this_load_per_task = sgs.sum_weighted_load;
            sds->this_has_capacity = sgs.group_has_capacity;
            sds->this_idle_cpus = sgs.idle_cpus;
        } else if (sgs.avg_load > sds->max_load &&
               (sgs.sum_nr_running > sgs.group_capacity ||
                sgs.group_imb)) {
            sds->max_load = sgs.avg_load;
            sds->busiest = group;
            sds->busiest_nr_running = sgs.sum_nr_running;
            sds->busiest_idle_cpus = sgs.idle_cpus;
            sds->busiest_group_capacity = sgs.group_capacity;
            sds->busiest_group_weight = sgs.group_weight;
            sds->busiest_load_per_task = sgs.sum_weighted_load;
            sds->busiest_has_capacity = sgs.group_has_capacity;
            sds->group_imb = sgs.group_imb;
        }

        update_sd_power_savings_stats(group, sds, local_group, &sgs);
        group = group->next;
    } while (group != sd->groups);
}
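
Note the loop shape: a domain's sched_groups form a circular singly linked list, so the do/while walks until it returns to sd->groups. A stripped-down model of that traversal and of how total_load and total_pwr later yield the domain average (sample numbers invented):

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

struct group {
    unsigned long load;       /* stand-in for sgs.group_load */
    unsigned long cpu_power;  /* stand-in for group->cpu_power */
    struct group *next;       /* circular, like sched_group->next */
};

int main(void)
{
    struct group g2 = { 512, 1024, NULL };
    struct group g1 = { 3072, 2048, &g2 };
    g2.next = &g1;  /* close the ring */

    unsigned long total_load = 0, total_pwr = 0;
    struct group *first = &g1, *group = first;
    do {  /* same shape as the kernel loop */
        total_load += group->load;
        total_pwr  += group->cpu_power;
        group = group->next;
    } while (group != first);

    /* the domain-wide average later computed in find_busiest_group */
    printf("avg_load = %lu\n", SCHED_LOAD_SCALE * total_load / total_pwr);
    return 0;
}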

static inline void update_sg_lb_stats(struct sched_domain *sd,
            struct sched_group *group, int this_cpu,
            enum cpu_idle_type idle, int load_idx, int *sd_idle,
            int local_group, const struct cpumask *cpus,
            int *balance, struct sg_lb_stats *sgs)
{
    unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
    int i;
    unsigned int balance_cpu = -1, first_idle_cpu = 0;
    unsigned long avg_load_per_task = 0;

    if (local_group) {
        balance_cpu = group_first_cpu(group);
        if (balance_cpu == this_cpu)
            update_group_power(sd, this_cpu);
    }

    /* Tally up the load of all CPUs in the group */
    max_cpu_load = 0;
    min_cpu_load = ~0UL;
    max_nr_running = 0;

    for_each_cpu_and(i, sched_group_cpus(group), cpus) {// walk every cpu in this group (there may be only one)
        struct rq *rq = cpu_rq(i);

        if (*sd_idle && rq->nr_running)
            *sd_idle = 0;

        /* Bias balancing toward cpus of our domain */
        if (local_group) {
            if (idle_cpu(i) && !first_idle_cpu) {// remember the first idle cpu in this_cpu's group
                first_idle_cpu = 1;
                balance_cpu = i;
            }

            load = target_load(i, load_idx);// take the max of this runqueue's current load and its historical load; load_idx was chosen in update_sd_lb_stats based on this_cpu's idle state
        } else {
            load = source_load(i, load_idx);// take the min of this runqueue's current load and its historical load
            if (load > max_cpu_load) {
                max_cpu_load = load;
                max_nr_running = rq->nr_running;
            }
            if (min_cpu_load > load)
                min_cpu_load = load;
        }
        // accumulate this group's load-balancing statistics
        sgs->group_load += load;
        sgs->sum_nr_running += rq->nr_running;
        sgs->sum_weighted_load += weighted_cpuload(i);
        if (idle_cpu(i))
            sgs->idle_cpus++;
    }

    /*
     * First idle cpu or the first cpu(busiest) in this sched group
     * is eligible for doing load balancing at this and above
     * domains. In the newly idle case, we will allow all the cpu's
     * to do the newly idle load balance.
     */
    if (idle != CPU_NEWLY_IDLE && local_group &&
        balance_cpu != this_cpu && balance) {
        *balance = 0;
        return;
    }

    /* Adjust by relative CPU power of the group */
    sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;

    /*
     * Consider the group unbalanced when the imbalance is larger
     * than the average weight of two tasks.
     *
     * APZ: with cgroup the avg task weight can vary wildly and
     *      might not be a suitable number - should we keep a
     *      normalized nr_running number somewhere that negates
     *      the hierarchy?
     */
    if (sgs->sum_nr_running)
        avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;

    if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
        sgs->group_imb = 1;

    sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
    sgs->group_weight = group->group_weight;

    if (sgs->group_capacity > sgs->sum_nr_running)
        sgs->group_has_capacity = 1;
}
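
The closing statistics are plain arithmetic and read best with numbers. A worked sketch, assuming SCHED_LOAD_SCALE = 1024 and an invented dual-core group (cpu_power 2048) running three tasks:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

int main(void)
{
    /* invented per-group inputs */
    unsigned long group_load = 3072, cpu_power = 2048;
    unsigned long sum_weighted_load = 3072, sum_nr_running = 3;
    unsigned long max_cpu_load = 2048, min_cpu_load = 1024, max_nr_running = 2;

    /* load normalized by the group's processing power */
    unsigned long avg_load = group_load * SCHED_LOAD_SCALE / cpu_power;
    unsigned long avg_load_per_task = sum_weighted_load / sum_nr_running;
    /* a dual-core group has capacity for about two tasks */
    unsigned long group_capacity = DIV_ROUND_CLOSEST(cpu_power, SCHED_LOAD_SCALE);

    int group_imb = (max_cpu_load - min_cpu_load) > 2 * avg_load_per_task &&
                    max_nr_running > 1;
    int group_has_capacity = group_capacity > sum_nr_running;

    printf("avg_load=%lu capacity=%lu imb=%d has_capacity=%d\n",
           avg_load, group_capacity, group_imb, group_has_capacity);  /* 1536 2 0 0 */
    return 0;
}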
That wraps up a brief look at the first load-balancing case; many details remain unexplained, and corrections are welcome.