Linux scheduler 5: EAS (Energy-Aware Scheduling)


5. EAS (Energy-Aware Scheduling)

5.1 SMP rebalance

Search the code for the keyword "energy_aware()" to see how EAS affects SMP load balancing.

The EAS balancing policy is: when the system is overutilized, the traditional SMP/HMP load-balancing path is used; when it is not overutilized, the EAS path is used.

EAS-based load balancing differs from the original code in several ways:

  • 1. With EAS enabled and the system not overutilized, HMP load balancing does not run;
  • 2. With EAS enabled and the system not overutilized, SMP load balancing does not run;
  • 3. With EAS enabled and the system not overutilized, EAS does most of its work when a task is woken up or created, by choosing the target CPU in select_task_rq_fair().

5.1.1 rebalance_domains()

  • 1. With EAS enabled and the system not overutilized, HMP load balancing is bypassed:
static void run_rebalance_domains(struct softirq_action *h)
{
    struct rq *this_rq = this_rq();
    enum cpu_idle_type idle = this_rq->idle_balance ?
                        CPU_IDLE : CPU_NOT_IDLE;
    int this_cpu = smp_processor_id();

    /* bypass load balance of HMP if EAS consideration */
    if ((!energy_aware() && sched_feat(SCHED_HMP)) ||
            (hybrid_support() && cpu_rq(this_cpu)->rd->overutilized))
        hmp_force_up_migration(this_cpu);

    /*
     * If this cpu has a pending nohz_balance_kick, then do the
     * balancing on behalf of the other idle cpus whose ticks are
     * stopped. Do nohz_idle_balance *before* rebalance_domains to
     * give the idle cpus a chance to load balance. Else we may
     * load balance only within the local sched_domain hierarchy
     * and abort nohz_idle_balance altogether if we pull some load.
     */
    nohz_idle_balance(this_rq, idle);
    rebalance_domains(this_rq, idle);
}
  • 2. In load_balance() -> find_busiest_group(), with EAS enabled and the system not overutilized, the regular SMP balancing is skipped:
static struct sched_group *find_busiest_group(struct lb_env *env)
{
    if (energy_aware() && !env->dst_rq->rd->overutilized && !same_clus)
        goto out_balanced;

out_balanced:
    env->imbalance = 0;
    return NULL;
}

5.1.2 select_task_rq_fair()

See Section 4.1.2.3, select_task_rq_fair(), for the detailed description.

5.2 cpufreq_sched/schedutil governor

Compared with the traditional interactive governor, the sched governor has the following advantages:

  • 1. It reacts to load changes faster. interactive samples the load every 20 ms, while the sched governor updates the load in scheduler_tick(), and the tick interval is shorter;
  • 2. interactive's load calculation is problematic: the history load is not aged, and scaling by (historical frequency / current frequency) is not reasonable.

The main idea of the sched governor is to combine the rt and cfs loads, judge whether the capacity available at the current frequency meets the demand, and decide whether a frequency change is needed.
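The subsections below walk through the real code. As a compact reference, here is a standalone sketch of the same idea; the function name pick_frequency() and the margin value are illustrative assumptions, not kernel code:

#include <stdint.h>

#define SCHED_CAPACITY_SHIFT    10
#define SCHED_CAPACITY_SCALE    (1 << SCHED_CAPACITY_SHIFT)

/*
 * cfs_cap / rt_cap: per-class capacity requests in 0..1024
 * capacity_margin:  headroom factor in 1024ths (analogous to
 *                   capacity_margin_dvfs in the code below), assumed value
 * max_freq_khz:     the policy's maximum frequency
 */
static uint32_t pick_frequency(uint32_t cfs_cap, uint32_t rt_cap,
                               uint32_t capacity_margin, uint32_t max_freq_khz)
{
    /* total capacity request = (cfs + rt) scaled up by the margin */
    uint32_t req = (cfs_cap + rt_cap) * capacity_margin / SCHED_CAPACITY_SCALE;

    if (req > SCHED_CAPACITY_SCALE)
        req = SCHED_CAPACITY_SCALE;

    /* map the relative capacity request onto an absolute frequency */
    return (uint32_t)(((uint64_t)req * max_freq_khz) >> SCHED_CAPACITY_SHIFT);
}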


5.2.1 rt request

With CONFIG_CPU_FREQ_GOV_SCHED, rt has three key computation paths:

  • 1. Accumulating the rt load (rq->rt_avg): scheduler_tick() -> task_tick_rt() -> update_curr_rt() -> sched_rt_avg_update()

rq->rt_avg = accumulated time component * current-frequency component (at most 1024)

static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
    rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
}
  • 2. Aging the rt load: scheduler_tick() -> __update_cpu_load() -> sched_avg_update()
    or scheduler_tick() -> task_tick_rt() -> sched_rt_update_capacity_req() -> sched_avg_update()

Aging rq->rt_avg is simple: it is halved every period.

void sched_avg_update(struct rq *rq)
{
    /* (1) the default aging period is 1s/2 = 500ms */
    s64 period = sched_avg_period();

    while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
        /*
         * Inline assembly required to prevent the compiler
         * optimising this loop into a divmod call.
         * See __iter_div_u64_rem() for another example of this.
         */
        asm("" : "+rm" (rq->age_stamp));
        rq->age_stamp += period;
        /* (2) in each aging period the load decays to half of its value */
        rq->rt_avg /= 2;
        rq->dl_avg /= 2;
    }
}

|→

static inline u64 sched_avg_period(void)
{
    /* (1.1) aging period = sysctl_sched_time_avg/2 = 500ms */
    return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
}
  • 3. Updating the rt request: scheduler_tick() -> task_tick_rt() -> sched_rt_update_capacity_req() -> set_rt_cpu_capacity()

The rt request calculation is somewhat rough: request = rt_avg / (sched_avg_period() + delta), and rt_avg does not include the load of the trailing delta interval (a worked example follows the code below).

static void sched_rt_update_capacity_req(struct rq *rq)
{
    u64 total, used, age_stamp, avg;
    s64 delta;

    if (!sched_freq())
        return;

    /* (1) age the latest load */
    sched_avg_update(rq);
    /*
     * Since we're reading these variables without serialization make sure
     * we read them once before doing sanity checks on them.
     */
    age_stamp = READ_ONCE(rq->age_stamp);
    /* (2) avg = the aged load */
    avg = READ_ONCE(rq->rt_avg);
    delta = rq_clock(rq) - age_stamp;

    if (unlikely(delta < 0))
        delta = 0;

    /* (3) total time = one aging period + the remainder since the last aging step */
    total = sched_avg_period() + delta;

    /* (4) request = avg/total (the maximum frequency corresponds to 1024) */
    used = div_u64(avg, total);
    if (unlikely(used > SCHED_CAPACITY_SCALE))
        used = SCHED_CAPACITY_SCALE;

    /* (5) update the request */
    set_rt_cpu_capacity(rq->cpu, true, (unsigned long)(used), SCHE_ONESHOT);
}

|→

static inline void set_rt_cpu_capacity(int cpu, bool request,
                       unsigned long capacity,
                       int type)
{
#ifdef CONFIG_CPU_FREQ_SCHED_ASSIST
    if (true) {
#else
    if (per_cpu(cpu_sched_capacity_reqs, cpu).rt != capacity) {
#endif
        /* (5.1) record the RT load into per_cpu(cpu_sched_capacity_reqs, cpu).rt */
        per_cpu(cpu_sched_capacity_reqs, cpu).rt = capacity;
        update_cpu_capacity_request(cpu, request, type);
    }
}
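To make the scale concrete, here is a worked example with assumed numbers (250 ms of rt runtime at the highest OPP, sampled 100 ms after the last aging step); the figures are illustrative, not measured:

  • accumulation: 250 ms of rt runtime at the highest OPP adds 250 ms * 1024 to rq->rt_avg;
  • aging: each elapsed 500 ms period halves rq->rt_avg (two elapsed periods would leave a quarter);
  • request: with delta = 100 ms since the last aging step, total = 500 ms + 100 ms = 600 ms, so used = (250 ms * 1024) / 600 ms ≈ 426, i.e. the rt class requests about 426/1024 ≈ 42% of the CPU's maximum capacity.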

5.2.2 cfs request

Likewise, cfs has three key computation paths:

  • 1. Accumulating the cfs load (the PELT util behind cpu_util()): scheduler_tick() -> task_tick_fair() -> entity_tick() -> update_load_avg()
  • 2. Aging the cfs load: scheduler_tick() -> task_tick_fair() -> entity_tick() -> update_load_avg()
  • 3. Updating the cfs request: scheduler_tick() -> sched_freq_tick() -> set_cfs_cpu_capacity() (code below, followed by a worked example of the OPP selection)
static void sched_freq_tick(int cpu)
{
    struct sched_capacity_reqs *scr;
    unsigned long capacity_orig, capacity_curr;
    unsigned long capacity_req;
    struct sched_domain *sd = rcu_dereference(per_cpu(sd_ea, cpu));

    if (!sched_freq())
        return;

    capacity_orig = capacity_orig_of(cpu);
    capacity_curr = capacity_curr_of(cpu);

    /* (1) if the current frequency is already the highest one, return immediately.
           Does this mean only upward frequency changes are handled here? */
    if (capacity_curr == capacity_orig)
        return;

    /*
     * To make free room for a task that is building up its "real"
     * utilization and to harm its performance the least, request
     * a jump to bigger OPP as soon as the margin of free capacity is
     * impacted (specified by capacity_margin).
     */
    scr = &per_cpu(cpu_sched_capacity_reqs, cpu);

    /* (2) compute the latest (cfs capacity + rt capacity) * (1126/1024),
           scaled up a little; this is the capacity request.
           NOTE: this calculation looks questionable: cpu_util(cpu) is
           capacity-scaled while scr->rt is not, so can they be added directly? */
    /* capacity_req which includes RT loading & capacity_margin */
    capacity_req = sum_capacity_reqs(cpu_util(cpu), scr);

    /* (3) if the capacity request exceeds the capacity of the current frequency */
    if (capacity_curr <= capacity_req) {
        if (sd) {
            const struct sched_group_energy *const sge = sd->groups->sge;
            int nr_cap_states = sge->nr_cap_states;
            int idx, tmp_idx;
            int opp_jump_step;

            for (idx = 0; idx < nr_cap_states; idx++) {
                if (sge->cap_states[idx].cap > capacity_curr+1)
                    break;
            }

            /* (4) try to pick a reasonable OPP that satisfies the capacity request */
            if (idx < nr_cap_states/3)
                opp_jump_step = 2; /* far step */
            else
                opp_jump_step = 1; /* near step */

            tmp_idx = idx + (opp_jump_step - 1);
            idx = tmp_idx > (nr_cap_states - 1) ?
                (nr_cap_states - 1) : tmp_idx;

            if (idx)
                capacity_req = (sge->cap_states[idx].cap +
                        sge->cap_states[idx-1].cap)/2;
            else
                /* should not arrive here!*/
                capacity_req = sge->cap_states[idx].cap + 2;
        }

        /* (5) remove the capacity scaling from the request, converting it to a frequency scale */
        /* convert scale-invariant capacity */
        capacity_req = capacity_req * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);

        /* (6) update the request.
               NOTE: this also looks questionable: capacity_req was computed from
               rt+cfs combined, yet the result is stored into scr->cfs. */
        /*
         * If free room ~5% impact, jump to 1 more index hihger OPP.
         * Whatever it should be better than capacity_max.
         */
        set_cfs_cpu_capacity(cpu, true, capacity_req, SCHE_ONESHOT);
    }
}

|→

static inline void set_cfs_cpu_capacity(int cpu, bool request,
                    unsigned long capacity, int type)
{
#ifdef CONFIG_CPU_FREQ_SCHED_ASSIST
    if (true) {
#else
    if (per_cpu(cpu_sched_capacity_reqs, cpu).cfs != capacity) {
#endif
        /* (6.1) record the capacity request into per_cpu(cpu_sched_capacity_reqs, cpu).cfs */
        per_cpu(cpu_sched_capacity_reqs, cpu).cfs = capacity;
        update_cpu_capacity_request(cpu, request, type);
    }
}
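A worked pass through step (4), assuming a hypothetical 5-entry capacity table cap_states[].cap = {290, 420, 598, 800, 1024} and capacity_curr = 420:

  • the scan stops at the first state with cap > capacity_curr + 1, i.e. idx = 2 (cap 598);
  • nr_cap_states/3 = 1 and idx (2) is not below it, so opp_jump_step = 1 (near step) and idx stays at 2; had idx landed in the lowest third of the table, opp_jump_step = 2 would skip one extra OPP;
  • capacity_req = (cap_states[2].cap + cap_states[1].cap) / 2 = (598 + 420) / 2 = 509, which step (5) then converts to a frequency-scale value via capacity_req * 1024 / capacity_orig_of(cpu).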

5.2.3 freq target

void update_cpu_capacity_request(int cpu, bool request, int type){    unsigned long new_capacity;    struct sched_capacity_reqs *scr;    /* The rq lock serializes access to the CPU's sched_capacity_reqs. */    lockdep_assert_held(&cpu_rq(cpu)->lock);    scr = &per_cpu(cpu_sched_capacity_reqs, cpu);    /* (1) 综合rt、cfs的request */    new_capacity = scr->cfs + scr->rt;    new_capacity = new_capacity * capacity_margin_dvfs        / SCHED_CAPACITY_SCALE;    new_capacity += scr->dl;#ifndef CONFIG_CPU_FREQ_SCHED_ASSIST    if (new_capacity == scr->total)        return;#endif    scr->total = new_capacity;    if (request)        update_fdomain_capacity_request(cpu, type);}|→static void update_fdomain_capacity_request(int cpu, int type){    unsigned int freq_new, cpu_tmp;    struct gov_data *gd;    unsigned long capacity = 0;#ifdef CONFIG_CPU_FREQ_SCHED_ASSIST    int cid = arch_get_cluster_id(cpu);    struct cpumask cls_cpus;#endif    struct cpufreq_policy *policy = NULL;    /*     * Avoid grabbing the policy if possible. A test is still     * required after locking the CPU's policy to avoid racing     * with the governor changing.     */    if (!per_cpu(enabled, cpu))        return;#ifdef CONFIG_CPU_FREQ_SCHED_ASSIST    gd = g_gd[cid];    /* bail early if we are throttled */    if (ktime_before(ktime_get(), gd->throttle))        goto out;    arch_get_cluster_cpus(&cls_cpus, cid);    /* find max capacity requested by cpus in this policy */    for_each_cpu(cpu_tmp, &cls_cpus) {        struct sched_capacity_reqs *scr;        if (!cpu_online(cpu_tmp))            continue;        scr = &per_cpu(cpu_sched_capacity_reqs, cpu_tmp);        capacity = max(capacity, scr->total);    }    freq_new = capacity * arch_scale_get_max_freq(cpu) >> SCHED_CAPACITY_SHIFT;#else    if (likely(cpu_online(cpu)))        policy = cpufreq_cpu_get(cpu);    if (IS_ERR_OR_NULL(policy))        return;    if (policy->governor != &cpufreq_gov_sched ||        !policy->governor_data)        goto out;    gd = policy->governor_data;    /* bail early if we are throttled */    if (ktime_before(ktime_get(), gd->throttle))        goto out;    /* (1) 选择policy cpus中最大的capacity */    /* find max capacity requested by cpus in this policy */    for_each_cpu(cpu_tmp, policy->cpus) {        struct sched_capacity_reqs *scr;        scr = &per_cpu(cpu_sched_capacity_reqs, cpu_tmp);        capacity = max(capacity, scr->total);    }    /* (2) 把相对capacity转换成绝对freq */    /* Convert the new maximum capacity request into a cpu frequency */    freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT;    if (freq_new == gd->requested_freq)        goto out;#endif /* !CONFIG_CPU_FREQ_SCHED_ASSIST */    gd->requested_freq = freq_new;    gd->target_cpu = cpu;    /* (3) 使用irq_work或者直接配置的方式来配置新的频率         直接在schedule_tick()中配置频率的方式估计不会使用,因为这样会阻塞中断     */    /*     * Throttling is not yet supported on platforms with fast cpufreq     * drivers.     */    if (cpufreq_driver_slow)        irq_work_queue_on(&gd->irq_work, cpu);    else        cpufreq_sched_try_driver_target(cpu, policy, freq_new, type);out:    if (policy)        cpufreq_cpu_put(policy);}
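A quick worked conversion through update_cpu_capacity_request()/update_fdomain_capacity_request(), with assumed numbers (scr->cfs + scr->rt = 340, capacity_margin_dvfs = 1280, scr->dl = 0, policy->max = 2,000,000 kHz):

  • new_capacity = 340 * 1280 / 1024 = 425, stored in scr->total;
  • freq_new = 425 * 2,000,000 >> SCHED_CAPACITY_SHIFT ≈ 830,078 kHz, i.e. the governor asks cpufreq for roughly 0.83 GHz;
  • the request is then handed to the cpufreq driver either directly or via irq_work, depending on cpufreq_driver_slow.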

5.3 WALT (Window Assisted Load Tracking)

Qualcomm's 8998 uses WALT as its load-tracking method, together with its own load-balancing algorithm that consumes the WALT load. In the code, CONFIG_SCHED_HMP marks Qualcomm's own load-balancing scheme.

5.3.1 WALT load computation

In essence, WALT is also a time-window based measure: the window time is scaled by a frequency component and a capacity (efficiency) component to produce a relative load value. Let us first look at how these components are computed.

  • 1. cluster->efficiency: read from the dts; as shown below, the four little cores have efficiency 1024 and the four big cores 1638;
static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus){    cluster->efficiency = arch_get_cpu_efficiency(cpumask_first(cpus));    if (cluster->efficiency > max_possible_efficiency)        max_possible_efficiency = cluster->efficiency;    if (cluster->efficiency < min_possible_efficiency)        min_possible_efficiency = cluster->efficiency;}unsigned long arch_get_cpu_efficiency(int cpu){    return per_cpu(cpu_efficiency, cpu);}static void __init parse_dt_cpu_power(void){        /*         * The CPU efficiency value passed from the device tree         * overrides the value defined in the table_efficiency[]         */        if (of_property_read_u32(cn, "efficiency", &efficiency) < 0) {        }        per_cpu(cpu_efficiency, cpu) = efficiency;}从 arch/arm64/boot/dts/qcom/sdm660.dtsi读到"efficiency"配置:    cpus {        #address-cells = <2>;        #size-cells = <0>;        CPU0: cpu@0 {            efficiency = <1024>;        };        CPU1: cpu@1 {            efficiency = <1024>;        };        CPU2: cpu@2 {            efficiency = <1024>;        };        CPU3: cpu@3 {            efficiency = <1024>;        };        CPU4: cpu@100 {            efficiency = <1638>;        };        CPU5: cpu@101 {            efficiency = <1638>;        };        CPU6: cpu@102 {            efficiency = <1638>;        };        CPU7: cpu@103 {            efficiency = <1638>;        };        cpu-map {            cluster0 {                core0 {                    cpu = <&CPU0>;                };                core1 {                    cpu = <&CPU1>;                };                core2 {                    cpu = <&CPU2>;                };                core3 {                    cpu = <&CPU3>;                };            };            cluster1 {                core0 {                    cpu = <&CPU4>;                };                core1 {                    cpu = <&CPU5>;                };                core2 {                    cpu = <&CPU6>;                };                core3 {                    cpu = <&CPU7>;                };            };        };    }
  • 2. cluster->capacity: proportional to the smallest reference: capacity = 1024 * (cluster->efficiency * cluster_max_freq(cluster)) / (min_possible_efficiency * min_max_freq)
  • 3. cluster->max_possible_capacity: proportional to the smallest reference: max_possible_capacity = 1024 * (cluster->efficiency * cluster->max_possible_freq) / (min_possible_efficiency * min_max_freq)
  • 4. cluster->load_scale_factor: inversely proportional to the largest reference: lsf = 1024 * (max_possible_efficiency * max_possible_freq) / (cluster->efficiency * cluster_max_freq(cluster))
  • 5. cluster->exec_scale_factor: proportional to the largest reference: exec_scale_factor = 1024 * cluster->efficiency / max_possible_efficiency (worked numbers follow the code below)
static void update_all_clusters_stats(void){    struct sched_cluster *cluster;    u64 highest_mpc = 0, lowest_mpc = U64_MAX;    pre_big_task_count_change(cpu_possible_mask);    for_each_sched_cluster(cluster) {        u64 mpc;        /* (1) 计算cluster->capacity:capacity = efficiency * cluster_max_freq            最小值:min_possible_efficiency*min_max_freq = 1024,            计算和最小值的正比:capacity = 1024 * (cluster->efficiency*cluster_max_freq(cluster)) / (min_possible_efficiency*min_max_freq)         */        cluster->capacity = compute_capacity(cluster);        /* (2) 计算cluster->max_possible_capacity:capacity = efficiency * cluster_max_freq            最小值:min_possible_efficiency*min_max_freq = 1024,            计算和最小值的正比:capacity = 1024 * (cluster->efficiency*cluster->max_possible_freq) / (min_possible_efficiency*min_max_freq)         */        mpc = cluster->max_possible_capacity =            compute_max_possible_capacity(cluster);        /* (3) 计算cluster->load_scale_factor: lsf = efficiency * cluster_max_freq            最大值:max_possible_efficiency*max_possible_freq = 1024            计算和最大值的反比:lsf = 1024 * (max_possible_efficiency*max_possible_freq) / (cluster->efficiency*cluster_max_freq(cluster))         */        cluster->load_scale_factor = compute_load_scale_factor(cluster);        /* (4) 计算cluster->exec_scale_factor:            最大值:max_possible_efficiency = 1024            计算和最大值的正比:exec_scale_factor = 1024 * cluster->efficiency / max_possible_efficiency         */        cluster->exec_scale_factor =            DIV_ROUND_UP(cluster->efficiency * 1024,                     max_possible_efficiency);        if (mpc > highest_mpc)            highest_mpc = mpc;        if (mpc < lowest_mpc)            lowest_mpc = mpc;    }    max_possible_capacity = highest_mpc;    min_max_possible_capacity = lowest_mpc;    __update_min_max_capacity();    sched_update_freq_max_load(cpu_possible_mask);    post_big_task_count_change(cpu_possible_mask);}|→static int compute_capacity(struct sched_cluster *cluster){    int capacity = 1024;    capacity *= capacity_scale_cpu_efficiency(cluster);    capacity >>= 10;    capacity *= capacity_scale_cpu_freq(cluster);    capacity >>= 10;    return capacity;}||→/* * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that * least efficient cpu gets capacity of 1024 */static unsigned longcapacity_scale_cpu_efficiency(struct sched_cluster *cluster){    return (1024 * cluster->efficiency) / min_possible_efficiency;}||→/* * Return 'capacity' of a cpu in reference to cpu with lowest max_freq * (min_max_freq), such that one with lowest max_freq gets capacity of 1024. */static unsigned long capacity_scale_cpu_freq(struct sched_cluster *cluster){    return (1024 * cluster_max_freq(cluster)) / min_max_freq;}|→static int compute_load_scale_factor(struct sched_cluster *cluster){    int load_scale = 1024;    /*     * load_scale_factor accounts for the fact that task load     * is in reference to "best" performing cpu. Task's load will need to be     * scaled (up) by a factor to determine suitability to be placed on a     * (little) cpu.     
*/    load_scale *= load_scale_cpu_efficiency(cluster);    load_scale >>= 10;    load_scale *= load_scale_cpu_freq(cluster);    load_scale >>= 10;    return load_scale;}||→/* * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so * that "most" efficient cpu gets a load_scale_factor of 1 */static inline unsigned longload_scale_cpu_efficiency(struct sched_cluster *cluster){    return DIV_ROUND_UP(1024 * max_possible_efficiency,                cluster->efficiency);}||→/* * Return load_scale_factor of a cpu in reference to cpu with best max_freq * (max_possible_freq), so that one with best max_freq gets a load_scale_factor * of 1. */static inline unsigned long load_scale_cpu_freq(struct sched_cluster *cluster){    return DIV_ROUND_UP(1024 * max_possible_freq,               cluster_max_freq(cluster));}
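Plugging in the dts efficiencies (1024 for the little cluster, 1638 for the big cluster) and assumed maximum frequencies of 1.8 GHz (little) and 2.2 GHz (big), the scale factors come out as:

  • little: capacity = 1024 (it is the reference), load_scale_factor = 1024 * (1638 * 2.2G) / (1024 * 1.8G) ≈ 2002, exec_scale_factor = DIV_ROUND_UP(1024 * 1024, 1638) = 641;
  • big: capacity = 1024 * (1638 * 2.2G) / (1024 * 1.8G) ≈ 2002, load_scale_factor = 1024, exec_scale_factor = 1024.

The frequencies are illustrative; only the efficiency values come from the dts above.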
  • 6. cluster->max_power_cost: the cluster's maximum power cost, where power = voltage^2 * frequency, evaluated at the demand of a maximally loaded task;
  • 7. cluster->min_power_cost: the cluster's minimum power cost, the same formula evaluated at zero demand (a worked example follows the code below);
static void sort_clusters(void){    for_each_sched_cluster(cluster) {        cluster->max_power_cost = power_cost(cluster_first_cpu(cluster),                                   max_task_load());        cluster->min_power_cost = power_cost(cluster_first_cpu(cluster),                                   0);        if (cluster->max_power_cost > tmp_max)            tmp_max = cluster->max_power_cost;    }    max_power_cost = tmp_max;}|→unsigned int power_cost(int cpu, u64 demand){    int first, mid, last;    struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats();    struct cpu_pstate_pwr *costs;    struct freq_max_load *max_load;    int total_static_pwr_cost = 0;    struct rq *rq = cpu_rq(cpu);    unsigned int pc;    if (!per_cpu_info || !per_cpu_info[cpu].ptable)        /*         * When power aware scheduling is not in use, or CPU         * power data is not available, just use the CPU         * capacity as a rough stand-in for real CPU power         * numbers, assuming bigger CPUs are more power         * hungry.         */        return cpu_max_possible_capacity(cpu);    rcu_read_lock();    max_load = rcu_dereference(per_cpu(freq_max_load, cpu));    if (!max_load) {        pc = cpu_max_possible_capacity(cpu);        goto unlock;    }    costs = per_cpu_info[cpu].ptable;    if (demand <= max_load->freqs[0].hdemand) {        pc = costs[0].power;        goto unlock;    } else if (demand > max_load->freqs[max_load->length - 1].hdemand) {        pc = costs[max_load->length - 1].power;        goto unlock;    }    first = 0;    last = max_load->length - 1;    mid = (last - first) >> 1;    while (1) {        if (demand <= max_load->freqs[mid].hdemand)            last = mid;        else            first = mid;        if (last - first == 1)            break;        mid = first + ((last - first) >> 1);    }    pc = costs[last].power;unlock:    rcu_read_unlock();    if (idle_cpu(cpu) && rq->cstate) {        total_static_pwr_cost += rq->static_cpu_pwr_cost;        if (rq->cluster->dstate)            total_static_pwr_cost +=                rq->cluster->static_cluster_pwr_cost;    }    return pc + total_static_pwr_cost;}/* qualcom的power的计算公式 = voltage^2 * frequence */static int msm_get_power_values(int cpu, struct cpu_static_info *sp){    int i = 0, j;    int ret = 0;    uint64_t power;    /* Calculate dynamic power spent for every frequency using formula:     * Power = V * V * f     * where V = voltage for frequency     *       f = frequency     * */    sp->power = allocate_2d_array_uint32_t(sp->num_of_freqs);    if (IS_ERR_OR_NULL(sp->power))        return PTR_ERR(sp->power);    for (i = 0; i < TEMP_DATA_POINTS; i++) {        for (j = 0; j < sp->num_of_freqs; j++) {            power = sp->voltage[j] *                        sp->table[j].frequency;            do_div(power, 1000);            do_div(power, 1000);            power *= sp->voltage[j];            do_div(power, 1000);            sp->power[i][j] = power;        }    }    return ret;}
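To see the scale of the values msm_get_power_values() produces, a worked example with hypothetical inputs (voltage = 900 mV, frequency = 1,800,000 kHz), following the division steps in the code:

power = 900 * 1,800,000 = 1,620,000,000
power / 1000 / 1000 = 1,620
power * 900 = 1,458,000; / 1000 = 1,458

So the table entry is 1458, proportional to V^2 * f (0.9 V^2 * 1.8 GHz = 1.458).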

5.3.1.1 update_task_ravg()

WALT computes per-task load as follows:

  • 1. Time is divided into windows. When time is accumulated, it is scaled by both the frequency and the efficiency components (i.e. by capacity): delta = delta_time * (curr_freq/max_possible_freq) * (cluster->efficiency/max_possible_efficiency) (a worked example follows the code below);
static inline u64 scale_exec_time(u64 delta, struct rq *rq)
{
    u32 freq;

    /* curr_freq / max_possible_freq */
    freq = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time);
    delta = DIV64_U64_ROUNDUP(delta * freq, max_possible_freq);

    /* exec_scale_factor = cluster->efficiency / max_possible_efficiency */
    delta *= rq->cluster->exec_scale_factor;
    delta >>= 10;

    return delta;
}
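A worked example with assumed numbers: a little-cluster CPU (exec_scale_factor = 641, as in the example in 5.3.1) whose measured cycles/time ratio corresponds to half of max_possible_freq:

delta = 10 ms * (freq / max_possible_freq) = 10 ms * 1/2 = 5 ms
delta = 5 ms * 641 >> 10 ≈ 3.1 ms

So 10 ms of wall-clock runtime on that CPU contributes only about 3.1 ms of scaled window time, while the same 10 ms on the big cluster at its highest OPP would contribute the full 10 ms.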
  • 2. Only busy (running/runnable) time is accounted: account_busy_for_task_demand() filters out time spent in other states;
static int account_busy_for_task_demand(struct task_struct *p, int event)
{
    /*
     * No need to bother updating task demand for exiting tasks
     * or the idle task.
     */
    /* (3.1.1) exiting and idle tasks are not accounted */
    if (exiting_task(p) || is_idle_task(p))
        return 0;

    /*
     * When a task is waking up it is completing a segment of non-busy
     * time. Likewise, if wait time is not treated as busy time, then
     * when a task begins to run or is migrated, it is not running and
     * is completing a segment of non-busy time.
     */
    /* (3.1.2) when a task is woken up, the preceding sleep time is not accounted.
       SCHED_ACCOUNT_WAIT_TIME controls whether runnable wait time is accounted
       (it is by default). */
    if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME &&
             (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
        return 0;

    return 1;
}
  • 3. When accounting the time, three cases can occur relative to the window boundaries: the busy interval lies entirely within the current window; it spans the previous and the current window; or it additionally covers one or more complete windows in between;


  • 4. While a window is still open, time keeps accumulating into p->ravg.sum. When a window completes, its load is pushed into p->ravg.sum_history[RAVG_HIST_SIZE_MAX] (5 slots). According to sched_window_stats_policy (RECENT, MAX, AVG, MAX_RECENT_AVG), a suitable value is picked from sum_history[] as the task load p->ravg.demand; a predicted load p->ravg.pred_demand is also derived from the history (a simplified sketch of the policy selection follows this list);

  • 5. The WALT task-level load is p->ravg.demand; the CPU-level load is rq->hmp_stats.cumulative_runnable_avg.
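The history/policy step of update_history() can be captured by a small standalone sketch (simplified to one sample at a time and without the prediction part; the 5-slot history and the policy names follow the code below, everything else is illustrative):

#include <stdint.h>

#define RAVG_HIST_SIZE  5

enum window_stats_policy {
    WINDOW_STATS_RECENT,            /* use the latest window only */
    WINDOW_STATS_MAX,               /* use the largest window in history */
    WINDOW_STATS_AVG,               /* use the average of the history */
    WINDOW_STATS_MAX_RECENT_AVG,    /* use max(average, latest window) */
};

/* Push the newest window into the history and pick the demand value. */
static uint32_t pick_demand(uint32_t hist[RAVG_HIST_SIZE], uint32_t runtime,
                            enum window_stats_policy policy)
{
    uint64_t sum = 0;
    uint32_t max = 0, avg;
    int i;

    /* shift the history and insert the newest sample at slot 0 */
    for (i = RAVG_HIST_SIZE - 1; i > 0; i--)
        hist[i] = hist[i - 1];
    hist[0] = runtime;

    for (i = 0; i < RAVG_HIST_SIZE; i++) {
        sum += hist[i];
        if (hist[i] > max)
            max = hist[i];
    }
    avg = (uint32_t)(sum / RAVG_HIST_SIZE);

    switch (policy) {
    case WINDOW_STATS_RECENT:
        return runtime;
    case WINDOW_STATS_MAX:
        return max;
    case WINDOW_STATS_AVG:
        return avg;
    default:    /* WINDOW_STATS_MAX_RECENT_AVG */
        return avg > runtime ? avg : runtime;
    }
}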

The detailed walkthrough of update_task_ravg() follows:

scheduler_tick() -> update_task_ravg()↓/* Reflect task activity on its demand and cpu's busy time statistics */void update_task_ravg(struct task_struct *p, struct rq *rq, int event,                        u64 wallclock, u64 irqtime){    u64 runtime;    if (!rq->window_start || sched_disable_window_stats ||        p->ravg.mark_start == wallclock)        return;    lockdep_assert_held(&rq->lock);    /* (1) 根据wallclock更新rq->window_start */    update_window_start(rq, wallclock);    if (!p->ravg.mark_start) {        update_task_cpu_cycles(p, cpu_of(rq));        goto done;    }    /* (2) 更新cycle、walltime的差值,用来计算cpu的当前freq */    update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime);    /* (3) 更新task的负载demand */    runtime = update_task_demand(p, rq, event, wallclock);    if (runtime)        update_task_burst(p, rq, event, runtime);    /* (4) 更新cpu的busy时间 */    update_cpu_busy_time(p, rq, event, wallclock, irqtime);    /* (5) 更新task的负载预测pred_demand */    update_task_pred_demand(rq, p, event);done:    trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime,                     rq->cc.cycles, rq->cc.time,                     p->grp ? &rq->grp_time : NULL);    /* (6) 更新task的时间更新点:p->ravg.mark_start */    p->ravg.mark_start = wallclock;}|→static u64 update_task_demand(struct task_struct *p, struct rq *rq,                   int event, u64 wallclock){    u64 mark_start = p->ravg.mark_start;    u64 delta, window_start = rq->window_start;    int new_window, nr_full_windows;    u32 window_size = sched_ravg_window;    u64 runtime;    new_window = mark_start < window_start;    /* (3.1) 这是一个关键点,非runnable状态的统计需要在这里异常返回 */    if (!account_busy_for_task_demand(p, event)) {        if (new_window)            /*             * If the time accounted isn't being accounted as             * busy time, and a new window started, only the             * previous window need be closed out with the             * pre-existing demand. Multiple windows may have             * elapsed, but since empty windows are dropped,             * it is not necessary to account those.             */            update_history(rq, p, p->ravg.sum, 1, event);        return 0;    }    /* (3.2) 第一种情况:还在原窗口内,简单继续累加p->ravg.sum */    if (!new_window) {        /*         * The simple case - busy time contained within the existing         * window.         */        return add_to_task_demand(rq, p, wallclock - mark_start);    }    /* (3.3) 第二、三种情况:原窗口已经填满 */    /*     * Busy time spans at least two windows. Temporarily rewind     * window_start to first window boundary after mark_start.     */    delta = window_start - mark_start;    nr_full_windows = div64_u64(delta, window_size);    window_start -= (u64)nr_full_windows * (u64)window_size;    /* (3.4.1) 补全第一个窗口 */    /* Process (window_start - mark_start) first */    runtime = add_to_task_demand(rq, p, window_start - mark_start);    /* (3.4.2) 把第一个窗口更新到进程task负载history中,         更新p->ravg.demand、p->ravg.pred_demand     */    /* Push new sample(s) into task's demand history */    update_history(rq, p, p->ravg.sum, 1, event);    /* (3.5) 如果中间有几个完整窗口,更新负载,更新history */    if (nr_full_windows) {        u64 scaled_window = scale_exec_time(window_size, rq);        update_history(rq, p, scaled_window, nr_full_windows, event);        runtime += nr_full_windows * scaled_window;    }    /* (3.6) 最后一个没有完成的窗口,只是简单累加时间,不更新history */    /*     * Roll window_start back to current to process any remainder     * in current window.     
*/    window_start += (u64)nr_full_windows * (u64)window_size;    /* Process (wallclock - window_start) next */    mark_start = window_start;    runtime += add_to_task_demand(rq, p, wallclock - mark_start);    return runtime;}||→static int account_busy_for_task_demand(struct task_struct *p, int event){    /*     * No need to bother updating task demand for exiting tasks     * or the idle task.     */    /* (3.1.1) exit、idle任务不计入统计 */    if (exiting_task(p) || is_idle_task(p))        return 0;    /*     * When a task is waking up it is completing a segment of non-busy     * time. Likewise, if wait time is not treated as busy time, then     * when a task begins to run or is migrated, it is not running and     * is completing a segment of non-busy time.     */    /* (3.1.2) 任务被wakeup,之前的等待时间不计入统计         SCHED_ACCOUNT_WAIT_TIME用来控制ruannable的等待时间是否计入统计,默认是计入的     */    if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME &&             (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))        return 0;    return 1;}||→static void add_to_task_demand(struct rq *rq, struct task_struct *p,                u64 delta){    /* (3.4.1) 累加窗口的时间值 */    delta = scale_exec_time(delta, rq);    p->ravg.sum += delta;    if (unlikely(p->ravg.sum > walt_ravg_window))        p->ravg.sum = walt_ravg_window;}static inline u64 scale_exec_time(u64 delta, struct rq *rq){    u32 freq;    /* curr_freq / max_possible_freq */    freq = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time);    delta = DIV64_U64_ROUNDUP(delta * freq, max_possible_freq);    /* exec_scale_factor = cluster->efficiency / max_possible_efficiency */    delta *= rq->cluster->exec_scale_factor;    delta >>= 10;    return delta;}||→static void update_history(struct rq *rq, struct task_struct *p,             u32 runtime, int samples, int event){    u32 *hist = &p->ravg.sum_history[0];    int ridx, widx;    u32 max = 0, avg, demand, pred_demand;    u64 sum = 0;    /* (3.4.2.1) 不活跃的进程不进行更新 */    /* Ignore windows where task had no activity */    if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)        goto done;    /* (3.4.2.2) 把新窗口的runtime推送到history stack中 */    /* Push new 'runtime' value onto stack */    widx = sched_ravg_hist_size - 1;    ridx = widx - samples;    for (; ridx >= 0; --widx, --ridx) {        hist[widx] = hist[ridx];        sum += hist[widx];        if (hist[widx] > max)            max = hist[widx];    }    for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) {        hist[widx] = runtime;        sum += hist[widx];        if (hist[widx] > max)            max = hist[widx];    }    p->ravg.sum = 0;    /* (3.4.2.3) 根据sched_window_stats_policy策略(RECENT、MAX、AVG、MAX_RECENT_AVG),        从sum_history[]中选择合适的值作为进程负载p->ravg.demand     */    if (sched_window_stats_policy == WINDOW_STATS_RECENT) {        demand = runtime;    } else if (sched_window_stats_policy == WINDOW_STATS_MAX) {        demand = max;    } else {        avg = div64_u64(sum, sched_ravg_hist_size);        if (sched_window_stats_policy == WINDOW_STATS_AVG)            demand = avg;        else            demand = max(avg, runtime);    }    /* (3.4.2.4) 计算进程的预测负载 */    pred_demand = predict_and_update_buckets(rq, p, runtime);    /*     * A throttled deadline sched class task gets dequeued without     * changing p->on_rq. Since the dequeue decrements hmp stats     * avoid decrementing it here again.     
*/    /* (3.4.2.5) 更新进程负载(p->ravg.demand)到cpu负载(rq->hmp_stats.cumulative_runnable_avg)中         cfs中p->sched_class->fixup_hmp_sched_stats对应函数fixup_hmp_sched_stats_fair()     */    if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||                        !p->dl.dl_throttled))        p->sched_class->fixup_hmp_sched_stats(rq, p, demand,                              pred_demand);    p->ravg.demand = demand;    p->ravg.pred_demand = pred_demand;done:    trace_sched_update_history(rq, p, runtime, samples, event);}|||→static inline u32 predict_and_update_buckets(struct rq *rq,            struct task_struct *p, u32 runtime) {    int bidx;    u32 pred_demand;    /* (3.4.2.4.1) 把window负载转换成bucket index(最大10) */    bidx = busy_to_bucket(runtime);    /* (3.4.2.4.2) 根据index,找到历史曾经达到过的更大值,取历史的值作为预测值 */    pred_demand = get_pred_busy(rq, p, bidx, runtime);    /* (3.4.2.4.3) 对bucket[]中本次index权重进行增加,其他权重减少 */    bucket_increase(p->ravg.busy_buckets, bidx);    return pred_demand;}|||→static voidfixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,               u32 new_task_load, u32 new_pred_demand){    /* (3.4.2.5.1) 计算task负载和预测的变化值delta */    s64 task_load_delta = (s64)new_task_load - task_load(p);    s64 pred_demand_delta = PRED_DEMAND_DELTA;    /* (3.4.2.5.2) 将进程级别的delta计入cpu级别的负载统计(rq->hmp_stats)中 */    fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta,                      pred_demand_delta);    /* (3.4.2.5.3) 更新cpu级别big_task的数量 */    fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);}static inline voidfixup_cumulative_runnable_avg(struct hmp_sched_stats *stats,                  struct task_struct *p, s64 task_load_delta,                  s64 pred_demand_delta){    if (sched_disable_window_stats)        return;    stats->cumulative_runnable_avg += task_load_delta;    BUG_ON((s64)stats->cumulative_runnable_avg < 0);    stats->pred_demands_sum += pred_demand_delta;    BUG_ON((s64)stats->pred_demands_sum < 0);}void fixup_nr_big_tasks(struct hmp_sched_stats *stats,                struct task_struct *p, s64 delta){    u64 new_task_load;    u64 old_task_load;    if (sched_disable_window_stats)        return;    /* task_load按照capacity反比放大,让所有cpu处在同一级别 */    old_task_load = scale_load_to_cpu(task_load(p), task_cpu(p));    new_task_load = scale_load_to_cpu(delta + task_load(p), task_cpu(p));    /* 如果进程负载 > 最大负载 * 80% (sysctl_sched_upmigrate_pct)        该任务为big_task     */    if (__is_big_task(p, old_task_load) && !__is_big_task(p, new_task_load))        stats->nr_big_tasks--;    else if (!__is_big_task(p, old_task_load) &&         __is_big_task(p, new_task_load))        stats->nr_big_tasks++;    BUG_ON(stats->nr_big_tasks < 0);}
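A worked pass through update_task_demand(), assuming window_size = 20 ms, mark_start 15 ms into window N, and the task staying busy until 12 ms into window N+2 (all deltas additionally go through scale_exec_time()):

  • delta = window_start - mark_start = 25 ms, so nr_full_windows = 1 and window_start is temporarily rewound to the start of window N+1;
  • the 5 ms that completes window N is added to p->ravg.sum, then update_history() pushes that window into sum_history[];
  • the one full window (N+1) contributes a full scaled window (scale_exec_time(20 ms)) and is pushed into the history as one more sample;
  • the remaining 12 ms of the still-open window N+2 is only accumulated into p->ravg.sum; the history is not updated until that window closes.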

Now let us look at the CPU-level busy-time accounting in more detail:

static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,                 int event, u64 wallclock, u64 irqtime){    int new_window, full_window = 0;    int p_is_curr_task = (p == rq->curr);    u64 mark_start = p->ravg.mark_start;    u64 window_start = rq->window_start;    u32 window_size = sched_ravg_window;    u64 delta;    u64 *curr_runnable_sum = &rq->curr_runnable_sum;    u64 *prev_runnable_sum = &rq->prev_runnable_sum;    u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;    u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;    bool new_task;    struct related_thread_group *grp;    int cpu = rq->cpu;    u32 old_curr_window = p->ravg.curr_window;    new_window = mark_start < window_start;    if (new_window) {        full_window = (window_start - mark_start) >= window_size;        if (p->ravg.active_windows < USHRT_MAX)            p->ravg.active_windows++;    }    new_task = is_new_task(p);    /*     * Handle per-task window rollover. We don't care about the idle     * task or exiting tasks.     */    /* (1) 如果有新window,滚动进程窗口:p->ravg.prev_window、p->ravg.curr_window */    if (!is_idle_task(p) && !exiting_task(p)) {        if (new_window)            rollover_task_window(p, full_window);    }    /* (2) 如果有新window且进程是rq的当前进程,        cpu级别的窗口滚动:rq->prev_runnable_sum、rq->curr_runnable_sum        cpu级别的进程统计窗口滚动:rq->top_tasks[prev_table]、rq->top_tasks[curr_table]     */    if (p_is_curr_task && new_window) {        rollover_cpu_window(rq, full_window);        rollover_top_tasks(rq, full_window);    }    /* (3) 判断哪些情况可以统计进cpu time */    if (!account_busy_for_cpu_time(rq, p, irqtime, event))        goto done;    grp = p->grp;    if (grp && sched_freq_aggregate) {        struct group_cpu_time *cpu_time = &rq->grp_time;        curr_runnable_sum = &cpu_time->curr_runnable_sum;        prev_runnable_sum = &cpu_time->prev_runnable_sum;        nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;        nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;    }    /* (4) 如果时间没有达到新window,        在cpu级别的当前负载上累加:rq->curr_runnable_sum        在进程级别的基础上累加:p->ravg.curr_window     */    if (!new_window) {        /*         * account_busy_for_cpu_time() = 1 so busy time needs         * to be accounted to the current window. No rollover         * since we didn't start a new window. An example of this is         * when a task starts execution and then sleeps within the         * same window.         */        if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))            delta = wallclock - mark_start;        else            delta = irqtime;        delta = scale_exec_time(delta, rq);        *curr_runnable_sum += delta;        if (new_task)            *nt_curr_runnable_sum += delta;        if (!is_idle_task(p) && !exiting_task(p)) {            p->ravg.curr_window += delta;            p->ravg.curr_window_cpu[cpu] += delta;        }        goto done;    }    /* (5) 如果时间达到新window,但是进程不是rq的当前进程        在进程级别的基础上累加:p->ravg.prev_window、p->ravg.curr_window        在cpu级别的当前负载上累加:rq->prev_runnable_sum、rq->curr_runnable_sum     */    if (!p_is_curr_task) {        /*         * account_busy_for_cpu_time() = 1 so busy time needs         * to be accounted to the current window. A new window         * has also started, but p is not the current task, so the         * window is not rolled over - just split up and account         * as necessary into curr and prev. The window is only         * rolled over when a new window is processed for the current         * task.         
*         * Irqtime can't be accounted by a task that isn't the         * currently running task.         */        if (!full_window) {            /*             * A full window hasn't elapsed, account partial             * contribution to previous completed window.             */            delta = scale_exec_time(window_start - mark_start, rq);            if (!exiting_task(p)) {                p->ravg.prev_window += delta;                p->ravg.prev_window_cpu[cpu] += delta;            }        } else {            /*             * Since at least one full window has elapsed,             * the contribution to the previous window is the             * full window (window_size).             */            delta = scale_exec_time(window_size, rq);            if (!exiting_task(p)) {                p->ravg.prev_window = delta;                p->ravg.prev_window_cpu[cpu] = delta;            }        }        *prev_runnable_sum += delta;        if (new_task)            *nt_prev_runnable_sum += delta;        /* Account piece of busy time in the current window. */        delta = scale_exec_time(wallclock - window_start, rq);        *curr_runnable_sum += delta;        if (new_task)            *nt_curr_runnable_sum += delta;        if (!exiting_task(p)) {            p->ravg.curr_window = delta;            p->ravg.curr_window_cpu[cpu] = delta;        }        goto done;    }    /* (6) 如果时间达到新window,且进程是rq的当前进程        在进程级别的基础上累加:p->ravg.prev_window、p->ravg.curr_window        在cpu级别的当前负载上累加:rq->prev_runnable_sum、rq->curr_runnable_sum     */    if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {        /*         * account_busy_for_cpu_time() = 1 so busy time needs         * to be accounted to the current window. A new window         * has started and p is the current task so rollover is         * needed. If any of these three above conditions are true         * then this busy time can't be accounted as irqtime.         *         * Busy time for the idle task or exiting tasks need not         * be accounted.         *         * An example of this would be a task that starts execution         * and then sleeps once a new window has begun.         */        if (!full_window) {            /*             * A full window hasn't elapsed, account partial             * contribution to previous completed window.             */            delta = scale_exec_time(window_start - mark_start, rq);            if (!is_idle_task(p) && !exiting_task(p)) {                p->ravg.prev_window += delta;                p->ravg.prev_window_cpu[cpu] += delta;            }        } else {            /*             * Since at least one full window has elapsed,             * the contribution to the previous window is the             * full window (window_size).             */            delta = scale_exec_time(window_size, rq);            if (!is_idle_task(p) && !exiting_task(p)) {                p->ravg.prev_window = delta;                p->ravg.prev_window_cpu[cpu] = delta;            }        }        /*         * Rollover is done here by overwriting the values in         * prev_runnable_sum and curr_runnable_sum.         */        *prev_runnable_sum += delta;        if (new_task)            *nt_prev_runnable_sum += delta;        /* Account piece of busy time in the current window. 
*/        delta = scale_exec_time(wallclock - window_start, rq);        *curr_runnable_sum += delta;        if (new_task)            *nt_curr_runnable_sum += delta;        if (!is_idle_task(p) && !exiting_task(p)) {            p->ravg.curr_window = delta;            p->ravg.curr_window_cpu[cpu] = delta;        }        goto done;    }    if (irqtime) {        /*         * account_busy_for_cpu_time() = 1 so busy time needs         * to be accounted to the current window. A new window         * has started and p is the current task so rollover is         * needed. The current task must be the idle task because         * irqtime is not accounted for any other task.         *         * Irqtime will be accounted each time we process IRQ activity         * after a period of idleness, so we know the IRQ busy time         * started at wallclock - irqtime.         */        BUG_ON(!is_idle_task(p));        mark_start = wallclock - irqtime;        /*         * Roll window over. If IRQ busy time was just in the current         * window then that is all that need be accounted.         */        if (mark_start > window_start) {            *curr_runnable_sum = scale_exec_time(irqtime, rq);            return;        }        /*         * The IRQ busy time spanned multiple windows. Process the         * busy time preceding the current window start first.         */        delta = window_start - mark_start;        if (delta > window_size)            delta = window_size;        delta = scale_exec_time(delta, rq);        *prev_runnable_sum += delta;        /* Process the remaining IRQ busy time in the current window. */        delta = wallclock - window_start;        rq->curr_runnable_sum = scale_exec_time(delta, rq);        return;    }done:    /* (7) 更新cpu上的top task */    if (!is_idle_task(p) && !exiting_task(p))        update_top_tasks(p, rq, old_curr_window,                    new_window, full_window);}|→static void update_top_tasks(struct task_struct *p, struct rq *rq,        u32 old_curr_window, int new_window, bool full_window){    u8 curr = rq->curr_table;    u8 prev = 1 - curr;    u8 *curr_table = rq->top_tasks[curr];    u8 *prev_table = rq->top_tasks[prev];    int old_index, new_index, update_index;    u32 curr_window = p->ravg.curr_window;    u32 prev_window = p->ravg.prev_window;    bool zero_index_update;    if (old_curr_window == curr_window && !new_window)        return;    /* (1) 把就进程p的"当前window负载""旧的当前window负载"转换成index(NUM_LOAD_INDICES=1000) */    old_index = load_to_index(old_curr_window);    new_index = load_to_index(curr_window);    /* (2) 如果没有新window         更新当前top表rq->top_tasks[curr][]中新旧index的计数        根据index的计数是否为0,更新rq->top_tasks_bitmap[curr] bitmap中对应index的值     */    if (!new_window) {        zero_index_update = !old_curr_window && curr_window;        if (old_index != new_index || zero_index_update) {            if (old_curr_window)                curr_table[old_index] -= 1;            if (curr_window)                curr_table[new_index] += 1;            if (new_index > rq->curr_top)                rq->curr_top = new_index;        }        if (!curr_table[old_index])            __clear_bit(NUM_LOAD_INDICES - old_index - 1,                rq->top_tasks_bitmap[curr]);        if (curr_table[new_index] == 1)            __set_bit(NUM_LOAD_INDICES - new_index - 1,                rq->top_tasks_bitmap[curr]);        return;    }    /*     * The window has rolled over for this task. By the time we get     * here, curr/prev swaps would has already occurred. 
So we need     * to use prev_window for the new index.     */    update_index = load_to_index(prev_window);    if (full_window) {        /*         * Two cases here. Either 'p' ran for the entire window or         * it didn't run at all. In either case there is no entry         * in the prev table. If 'p' ran the entire window, we just         * need to create a new entry in the prev table. In this case         * update_index will be correspond to sched_ravg_window         * so we can unconditionally update the top index.         */        if (prev_window) {            prev_table[update_index] += 1;            rq->prev_top = update_index;        }        if (prev_table[update_index] == 1)            __set_bit(NUM_LOAD_INDICES - update_index - 1,                rq->top_tasks_bitmap[prev]);    } else {        zero_index_update = !old_curr_window && prev_window;        if (old_index != update_index || zero_index_update) {            if (old_curr_window)                prev_table[old_index] -= 1;            prev_table[update_index] += 1;            if (update_index > rq->prev_top)                rq->prev_top = update_index;            if (!prev_table[old_index])                __clear_bit(NUM_LOAD_INDICES - old_index - 1,                        rq->top_tasks_bitmap[prev]);            if (prev_table[update_index] == 1)                __set_bit(NUM_LOAD_INDICES - update_index - 1,                        rq->top_tasks_bitmap[prev]);        }    }    if (curr_window) {        curr_table[new_index] += 1;        if (new_index > rq->curr_top)            rq->curr_top = new_index;        if (curr_table[new_index] == 1)            __set_bit(NUM_LOAD_INDICES - new_index - 1,                rq->top_tasks_bitmap[curr]);    }}

5.3.2 Load balancing based on WALT

5.3.2.1 load_balance()

Everything else matches the mainline algorithm; only the parts specific to Qualcomm's HMP scheme are highlighted here. In the balancing path, WALT is used to pick the CPU, but when the load is actually migrated, is it still evaluated with PELT?

  • In find_busiest_queue(): instead of picking the CPU with the largest cfs_rq->runnable_load_avg * capacity, Qualcomm's HMP picks the CPU with the heaviest WALT runnable load (rq->hmp_stats.cumulative_runnable_avg).
run_rebalance_domains() -> rebalance_domains() -> load_balance() -> find_busiest_queue() -> find_busiest_queue_hmp()↓static struct rq *find_busiest_queue_hmp(struct lb_env *env,                     struct sched_group *group){    struct rq *busiest = NULL, *busiest_big = NULL;    u64 max_runnable_avg = 0, max_runnable_avg_big = 0;    int max_nr_big = 0, nr_big;    bool find_big = !!(env->flags & LBF_BIG_TASK_ACTIVE_BALANCE);    int i;    cpumask_t cpus;    cpumask_andnot(&cpus, sched_group_cpus(group), cpu_isolated_mask);    /* (1) 遍历sg中的cpu */    for_each_cpu(i, &cpus) {        struct rq *rq = cpu_rq(i);        u64 cumulative_runnable_avg =                rq->hmp_stats.cumulative_runnable_avg;        if (!cpumask_test_cpu(i, env->cpus))            continue;        /* (2) 考虑big_task,找出big_task最重的cpu */        if (find_big) {            nr_big = nr_big_tasks(rq);            if (nr_big > max_nr_big ||                (nr_big > 0 && nr_big == max_nr_big &&                 cumulative_runnable_avg > max_runnable_avg_big)) {                max_runnable_avg_big = cumulative_runnable_avg;                busiest_big = rq;                max_nr_big = nr_big;                continue;            }        }        /* (3) 找出walt runnable负载(rq->hmp_stats.cumulative_runnable_avg)最重的cpu */        if (cumulative_runnable_avg > max_runnable_avg) {            max_runnable_avg = cumulative_runnable_avg;            busiest = rq;        }    }    if (busiest_big)        return busiest_big;    env->flags &= ~LBF_BIG_TASK_ACTIVE_BALANCE;    return busiest;}

5.3.2.2 nohz_idle_balance()

  • _nohz_kick_needed():
scheduler_tick() -> trigger_load_balance() -> nohz_kick_needed() -> _nohz_kick_needed() -> nohz_kick_needed_hmp()↓static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type){    struct sched_domain *sd;    int i;    if (rq->nr_running < 2)        return 0;    /* (1) 如果是SCHED_BOOST_ON_ALL,返回true */    if (!sysctl_sched_restrict_cluster_spill ||            sched_boost_policy() == SCHED_BOOST_ON_ALL)        return 1;    /* (2) 如果当前cpu是max cpu,返回true */    if (cpu_max_power_cost(cpu) == max_power_cost)        return 1;    rcu_read_lock();    sd = rcu_dereference_check_sched_domain(rq->sd);    if (!sd) {        rcu_read_unlock();        return 0;    }    for_each_cpu(i, sched_domain_span(sd)) {        if (cpu_load(i) < sched_spill_load &&                cpu_rq(i)->nr_running <                sysctl_sched_spill_nr_run) {            /* Change the kick type to limit to CPUs that             * are of equal or lower capacity.             */            *type = NOHZ_KICK_RESTRICT;            break;        }    }    rcu_read_unlock();    return 1;}
  • find_new_hmp_ilb(): instead of taking the first CPU in nohz.idle_cpus_mask as the ilb CPU, Qualcomm's HMP tries to find an idle CPU in nohz.idle_cpus_mask whose max power cost is no higher than the current CPU's.
scheduler_tick() -> trigger_load_balance() -> nohz_balancer_kick() -> find_new_ilb()↓static inline int find_new_hmp_ilb(int type){    int call_cpu = raw_smp_processor_id();    struct sched_domain *sd;    int ilb;    rcu_read_lock();    /* Pick an idle cpu "closest" to call_cpu */    for_each_domain(call_cpu, sd) {        for_each_cpu_and(ilb, nohz.idle_cpus_mask,                        sched_domain_span(sd)) {            /* (1) 尝试找到一个max power小于当前power的cpu作为ilb cpu */            if (idle_cpu(ilb) && (type != NOHZ_KICK_RESTRICT ||                    cpu_max_power_cost(ilb) <=                    cpu_max_power_cost(call_cpu))) {                rcu_read_unlock();                reset_balance_interval(ilb);                return ilb;            }        }    }    rcu_read_unlock();    return nr_cpu_ids;}

5.3.2.3 select_task_rq_fair()

  • select_task_rq_fair(): uses Qualcomm's own algorithm, which weighs capacity, power and idle state to pick a best CPU.
select_task_rq_fair() -> select_best_cpu()↓/* return cheapest cpu that can fit this task */static int select_best_cpu(struct task_struct *p, int target, int reason,               int sync){    struct sched_cluster *cluster, *pref_cluster = NULL;    struct cluster_cpu_stats stats;    struct related_thread_group *grp;    unsigned int sbc_flag = 0;    int cpu = raw_smp_processor_id();    bool special;    struct cpu_select_env env = {        .p          = p,        .reason         = reason,        .need_idle      = wake_to_idle(p),        .need_waker_cluster = 0,        .sync           = sync,        .prev_cpu       = target,        .rtg            = NULL,        .sbc_best_flag      = 0,        .sbc_best_cluster_flag  = 0,        .pack_task              = false,    };    rcu_read_lock();    env.boost_policy = task_sched_boost(p) ?            sched_boost_policy() : SCHED_BOOST_NONE;    bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS);    bitmap_zero(env.backup_list, NR_CPUS);    cpumask_and(&env.search_cpus, tsk_cpus_allowed(p), cpu_active_mask);    cpumask_andnot(&env.search_cpus, &env.search_cpus, cpu_isolated_mask);    init_cluster_cpu_stats(&stats);    special = env_has_special_flags(&env);    grp = task_related_thread_group(p);    if (grp && grp->preferred_cluster) {        pref_cluster = grp->preferred_cluster;        if (!cluster_allowed(&env, pref_cluster))            clear_bit(pref_cluster->id, env.candidate_list);        else            env.rtg = grp;    } else if (!special) {        cluster = cpu_rq(cpu)->cluster;        if (wake_to_waker_cluster(&env)) {            if (bias_to_waker_cpu(&env, cpu)) {                target = cpu;                sbc_flag = SBC_FLAG_WAKER_CLUSTER |                       SBC_FLAG_WAKER_CPU;                goto out;            } else if (cluster_allowed(&env, cluster)) {                env.need_waker_cluster = 1;                bitmap_zero(env.candidate_list, NR_CPUS);                __set_bit(cluster->id, env.candidate_list);                env.sbc_best_cluster_flag =                            SBC_FLAG_WAKER_CLUSTER;            }        } else if (bias_to_prev_cpu(&env, &stats)) {            sbc_flag = SBC_FLAG_PREV_CPU;            goto out;        }    }    if (!special && is_short_burst_task(p)) {        env.pack_task = true;        sbc_flag = SBC_FLAG_PACK_TASK;    }retry:    /* (1) 从低到高找到一个power最低,且capacity能满足task_load的cluster */    cluster = select_least_power_cluster(&env);    if (!cluster)        goto out;    /*     * 'cluster' now points to the minimum power cluster which can satisfy     * task's perf goals. Walk down the cluster list starting with that     * cluster. 
For non-small tasks, skip clusters that don't have     * mostly_idle/idle cpus     */    do {        /* (2) 全方位统计:capacity spare、cost、idle */        find_best_cpu_in_cluster(cluster, &env, &stats);    } while ((cluster = next_best_cluster(cluster, &env, &stats)));    /* (3) 从idle角度给出best cpu */    if (env.need_idle) {        if (stats.best_idle_cpu >= 0) {            target = stats.best_idle_cpu;            sbc_flag |= SBC_FLAG_IDLE_CSTATE;        } else if (stats.least_loaded_cpu >= 0) {            target = stats.least_loaded_cpu;            sbc_flag |= SBC_FLAG_IDLE_LEAST_LOADED;        }    /* (4) 从综合角度给出best cpu */    } else if (stats.best_cpu >= 0) {        if (stats.best_sibling_cpu >= 0 &&                stats.best_cpu != task_cpu(p) &&                stats.min_cost == stats.best_sibling_cpu_cost) {            stats.best_cpu = stats.best_sibling_cpu;            sbc_flag |= SBC_FLAG_BEST_SIBLING;        }        sbc_flag |= env.sbc_best_flag;        target = stats.best_cpu;    } else {        if (env.rtg && env.boost_policy == SCHED_BOOST_NONE) {            env.rtg = NULL;            goto retry;        }        /*         * With boost_policy == SCHED_BOOST_ON_BIG, we reach here with         * backup_list = little cluster, candidate_list = none and         * stats->best_capacity_cpu points the best spare capacity         * CPU among the CPUs in the big cluster.         */        if (env.boost_policy == SCHED_BOOST_ON_BIG &&            stats.best_capacity_cpu >= 0)            sbc_flag |= SBC_FLAG_BOOST_CLUSTER;        else            find_backup_cluster(&env, &stats);        if (stats.best_capacity_cpu >= 0) {            target = stats.best_capacity_cpu;            sbc_flag |= SBC_FLAG_BEST_CAP_CPU;        }    }    p->last_cpu_selected_ts = sched_ktime_clock();out:    sbc_flag |= env.sbc_best_cluster_flag;    rcu_read_unlock();    trace_sched_task_load(p, sched_boost_policy() && task_sched_boost(p),        env.reason, env.sync, env.need_idle, sbc_flag, target);    return target;}

5.3.2.4 Interactive governor & sched_load

Qualcomm reworked the interactive governor so that it can consume sched_load.


  • 1. The interactive governor registers a callback to receive sched_load change events;
static ssize_t store_use_sched_load(            struct cpufreq_interactive_tunables *tunables,            const char *buf, size_t count){    int ret;    unsigned long val;    ret = kstrtoul(buf, 0, &val);    if (ret < 0)        return ret;    if (tunables->use_sched_load == (bool) val)        return count;    tunables->use_sched_load = val;    if (val)        ret = cpufreq_interactive_enable_sched_input(tunables);    else        ret = cpufreq_interactive_disable_sched_input(tunables);    if (ret) {        tunables->use_sched_load = !val;        return ret;    }    return count;}|→static int cpufreq_interactive_enable_sched_input(            struct cpufreq_interactive_tunables *tunables){    int rc = 0, j;    struct cpufreq_interactive_tunables *t;    mutex_lock(&sched_lock);    set_window_count++;    if (set_window_count > 1) {        for_each_possible_cpu(j) {            if (!per_cpu(polinfo, j))                continue;            t = per_cpu(polinfo, j)->cached_tunables;            if (t && t->use_sched_load) {                tunables->timer_rate = t->timer_rate;                tunables->io_is_busy = t->io_is_busy;                break;            }        }    } else {        /* (1) 设置walt窗口大小 */        rc = set_window_helper(tunables);        if (rc) {            pr_err("%s: Failed to set sched window\n", __func__);            set_window_count--;            goto out;        }        sched_set_io_is_busy(tunables->io_is_busy);    }    if (!tunables->use_migration_notif)        goto out;    migration_register_count++;    if (migration_register_count > 1)        goto out;    else        /* (2) 注册sched_load变化的回调函数 */        atomic_notifier_chain_register(&load_alert_notifier_head,                        &load_notifier_block);out:    mutex_unlock(&sched_lock);    return rc;}||→static inline int set_window_helper(            struct cpufreq_interactive_tunables *tunables){    /* 设置默认窗口size为DEFAULT_TIMER_RATE(20ms) */    return sched_set_window(round_to_nw_start(get_jiffies_64(), tunables),             usecs_to_jiffies(tunables->timer_rate));}static struct notifier_block load_notifier_block = {    .notifier_call = load_change_callback,};
  • 2. sched_load changes are delivered to the interactive governor through that callback (a minimal consumer sketch follows the code below);
check_for_freq_change() -> load_alert_notifier_head -> load_change_callback()↓static int load_change_callback(struct notifier_block *nb, unsigned long val,                void *data){    unsigned long cpu = (unsigned long) data;    struct cpufreq_interactive_policyinfo *ppol = per_cpu(polinfo, cpu);    struct cpufreq_interactive_tunables *tunables;    unsigned long flags;    if (!ppol || ppol->reject_notification)        return 0;    if (!down_read_trylock(&ppol->enable_sem))        return 0;    if (!ppol->governor_enabled)        goto exit;    tunables = ppol->policy->governor_data;    if (!tunables->use_sched_load || !tunables->use_migration_notif)        goto exit;    spin_lock_irqsave(&ppol->target_freq_lock, flags);    ppol->notif_pending = true;    ppol->notif_cpu = cpu;    spin_unlock_irqrestore(&ppol->target_freq_lock, flags);    if (!hrtimer_is_queued(&ppol->notif_timer))        hrtimer_start(&ppol->notif_timer, ms_to_ktime(1),                  HRTIMER_MODE_REL);exit:    up_read(&ppol->enable_sem);    return 0;}
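The notification plumbing is the standard atomic notifier chain API. A minimal standalone sketch of a consumer (names such as my_load_cb are illustrative, not the driver's code; only load_alert_notifier_head comes from the code above):

#include <linux/module.h>
#include <linux/notifier.h>

/* the chain the scheduler publishes load alerts on (declared in the sched code) */
extern struct atomic_notifier_head load_alert_notifier_head;

static int my_load_cb(struct notifier_block *nb, unsigned long val, void *data)
{
    unsigned long cpu = (unsigned long)data;

    /* react to the load change on 'cpu', e.g. kick a frequency re-evaluation */
    (void)cpu;
    return NOTIFY_OK;
}

static struct notifier_block my_load_nb = {
    .notifier_call = my_load_cb,
};

static int __init my_load_listener_init(void)
{
    /* same registration call that cpufreq_interactive_enable_sched_input() makes */
    return atomic_notifier_chain_register(&load_alert_notifier_head, &my_load_nb);
}
module_init(my_load_listener_init);
MODULE_LICENSE("GPL");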
  • 3、Besides the event notification, the interactive governor also polls sched_load from its 20ms timer to decide whether a frequency change is needed.
static void cpufreq_interactive_timer(unsigned long data)
{
    s64 now;
    unsigned int delta_time;
    u64 cputime_speedadj;
    int cpu_load;
    int pol_load = 0;
    struct cpufreq_interactive_policyinfo *ppol = per_cpu(polinfo, data);
    struct cpufreq_interactive_tunables *tunables =
        ppol->policy->governor_data;
    struct sched_load *sl = ppol->sl;
    struct cpufreq_interactive_cpuinfo *pcpu;
    unsigned int new_freq;
    unsigned int prev_laf = 0, t_prevlaf;
    unsigned int pred_laf = 0, t_predlaf = 0;
    unsigned int prev_chfreq, pred_chfreq, chosen_freq;
    unsigned int index;
    unsigned long flags;
    unsigned long max_cpu;
    int cpu, i;
    int new_load_pct = 0;
    int prev_l, pred_l = 0;
    struct cpufreq_govinfo govinfo;
    bool skip_hispeed_logic, skip_min_sample_time;
    bool jump_to_max_no_ts = false;
    bool jump_to_max = false;
    bool start_hyst = true;

    if (!down_read_trylock(&ppol->enable_sem))
        return;
    if (!ppol->governor_enabled)
        goto exit;

    now = ktime_to_us(ktime_get());

    spin_lock_irqsave(&ppol->target_freq_lock, flags);
    spin_lock(&ppol->load_lock);

    skip_hispeed_logic =
        tunables->ignore_hispeed_on_notif && ppol->notif_pending;
    skip_min_sample_time = tunables->fast_ramp_down && ppol->notif_pending;
    ppol->notif_pending = false;
    now = ktime_to_us(ktime_get());
    ppol->last_evaluated_jiffy = get_jiffies_64();

    /* (1) In sched_load mode, query the latest sched_load */
    if (tunables->use_sched_load)
        sched_get_cpus_busy(sl, ppol->policy->cpus);
    max_cpu = cpumask_first(ppol->policy->cpus);
    i = 0;
    for_each_cpu(cpu, ppol->policy->cpus) {
        pcpu = &per_cpu(cpuinfo, cpu);

        /* (2) In sched_load mode, use sched_load to compute the load change */
        if (tunables->use_sched_load) {
            /* (2.1) Derive the current value from the previous window's load */
            t_prevlaf = sl_busy_to_laf(ppol, sl[i].prev_load);
            prev_l = t_prevlaf / ppol->target_freq;

            /* (2.2) Derive the predicted value from the previous window's prediction */
            if (tunables->enable_prediction) {
                t_predlaf = sl_busy_to_laf(ppol,
                        sl[i].predicted_load);
                pred_l = t_predlaf / ppol->target_freq;
            }
            if (sl[i].prev_load)
                new_load_pct = sl[i].new_task_load * 100 /
                            sl[i].prev_load;
            else
                new_load_pct = 0;
        /* (3) In legacy mode, compute the load change from time * freq */
        } else {
            now = update_load(cpu);
            delta_time = (unsigned int)
                (now - pcpu->cputime_speedadj_timestamp);
            if (WARN_ON_ONCE(!delta_time))
                continue;
            cputime_speedadj = pcpu->cputime_speedadj;
            do_div(cputime_speedadj, delta_time);
            t_prevlaf = (unsigned int)cputime_speedadj * 100;
            prev_l = t_prevlaf / ppol->target_freq;
        }

        /* find max of loadadjfreq inside policy */
        if (t_prevlaf > prev_laf) {
            prev_laf = t_prevlaf;
            max_cpu = cpu;
        }
        pred_laf = max(t_predlaf, pred_laf);

        cpu_load = max(prev_l, pred_l);
        pol_load = max(pol_load, cpu_load);
        trace_cpufreq_interactive_cpuload(cpu, cpu_load, new_load_pct,
                          prev_l, pred_l);

        /* save loadadjfreq for notification */
        pcpu->loadadjfreq = max(t_prevlaf, t_predlaf);

        /* detect heavy new task and jump to policy->max */
        if (prev_l >= tunables->go_hispeed_load &&
            new_load_pct >= NEW_TASK_RATIO) {
            skip_hispeed_logic = true;
            jump_to_max = true;
        }
        i++;
    }
    spin_unlock(&ppol->load_lock);

    tunables->boosted = tunables->boost_val ||
        now < tunables->boostpulse_endtime;

    /* (4) Use the larger of the measured and predicted values as the target */
    prev_chfreq = choose_freq(ppol, prev_laf);
    pred_chfreq = choose_freq(ppol, pred_laf);
    chosen_freq = max(prev_chfreq, pred_chfreq);

    if (prev_chfreq < ppol->policy->max && pred_chfreq >= ppol->policy->max)
        if (!jump_to_max)
            jump_to_max_no_ts = true;

    if (now - ppol->max_freq_hyst_start_time <
        tunables->max_freq_hysteresis &&
        pol_load >= tunables->go_hispeed_load &&
        ppol->target_freq < ppol->policy->max) {
        skip_hispeed_logic = true;
        skip_min_sample_time = true;
        if (!jump_to_max)
            jump_to_max_no_ts = true;
    }

    new_freq = chosen_freq;
    if (jump_to_max_no_ts || jump_to_max) {
        new_freq = ppol->policy->cpuinfo.max_freq;
    } else if (!skip_hispeed_logic) {
        if (pol_load >= tunables->go_hispeed_load ||
            tunables->boosted) {
            if (ppol->target_freq < tunables->hispeed_freq)
                new_freq = tunables->hispeed_freq;
            else
                new_freq = max(new_freq,
                           tunables->hispeed_freq);
        }
    }

    if (now - ppol->max_freq_hyst_start_time <
        tunables->max_freq_hysteresis) {
        if (new_freq < ppol->policy->max &&
                ppol->policy->max <= tunables->hispeed_freq)
            start_hyst = false;
        new_freq = max(tunables->hispeed_freq, new_freq);
    }

    if (!skip_hispeed_logic &&
        ppol->target_freq >= tunables->hispeed_freq &&
        new_freq > ppol->target_freq &&
        now - ppol->hispeed_validate_time <
        freq_to_above_hispeed_delay(tunables, ppol->target_freq)) {
        trace_cpufreq_interactive_notyet(
            max_cpu, pol_load, ppol->target_freq,
            ppol->policy->cur, new_freq);
        spin_unlock_irqrestore(&ppol->target_freq_lock, flags);
        goto rearm;
    }

    ppol->hispeed_validate_time = now;

    if (cpufreq_frequency_table_target(&ppol->p_nolim, ppol->freq_table,
                       new_freq, CPUFREQ_RELATION_L,
                       &index)) {
        spin_unlock_irqrestore(&ppol->target_freq_lock, flags);
        goto rearm;
    }

    new_freq = ppol->freq_table[index].frequency;

    /*
     * Do not scale below floor_freq unless we have been at or above the
     * floor frequency for the minimum sample time since last validated.
     */
    if (!skip_min_sample_time && new_freq < ppol->floor_freq) {
        if (now - ppol->floor_validate_time <
                tunables->min_sample_time) {
            trace_cpufreq_interactive_notyet(
                max_cpu, pol_load, ppol->target_freq,
                ppol->policy->cur, new_freq);
            spin_unlock_irqrestore(&ppol->target_freq_lock, flags);
            goto rearm;
        }
    }

    /*
     * Update the timestamp for checking whether speed has been held at
     * or above the selected frequency for a minimum of min_sample_time,
     * if not boosted to hispeed_freq.  If boosted to hispeed_freq then we
     * allow the speed to drop as soon as the boostpulse duration expires
     * (or the indefinite boost is turned off). If policy->max is restored
     * for max_freq_hysteresis, don't extend the timestamp. Otherwise, it
     * could incorrectly extend the duration of max_freq_hysteresis by
     * min_sample_time.
     */
    if ((!tunables->boosted || new_freq > tunables->hispeed_freq)
        && !jump_to_max_no_ts) {
        ppol->floor_freq = new_freq;
        ppol->floor_validate_time = now;
    }

    if (start_hyst && new_freq >= ppol->policy->max && !jump_to_max_no_ts)
        ppol->max_freq_hyst_start_time = now;

    if (ppol->target_freq == new_freq &&
            ppol->target_freq <= ppol->policy->cur) {
        trace_cpufreq_interactive_already(
            max_cpu, pol_load, ppol->target_freq,
            ppol->policy->cur, new_freq);
        spin_unlock_irqrestore(&ppol->target_freq_lock, flags);
        goto rearm;
    }

    trace_cpufreq_interactive_target(max_cpu, pol_load, ppol->target_freq,
                     ppol->policy->cur, new_freq);

    ppol->target_freq = new_freq;
    spin_unlock_irqrestore(&ppol->target_freq_lock, flags);
    spin_lock_irqsave(&speedchange_cpumask_lock, flags);
    cpumask_set_cpu(max_cpu, &speedchange_cpumask);
    spin_unlock_irqrestore(&speedchange_cpumask_lock, flags);
    wake_up_process_no_notif(speedchange_task);

rearm:
    if (!timer_pending(&ppol->policy_timer))
        cpufreq_interactive_timer_resched(data, false);

    /*
     * Send govinfo notification.
     * Govinfo notification could potentially wake up another thread
     * managed by its clients. Thread wakeups might trigger a load
     * change callback that executes this function again. Therefore
     * no spinlock could be held when sending the notification.
     */
    for_each_cpu(i, ppol->policy->cpus) {
        pcpu = &per_cpu(cpuinfo, i);
        govinfo.cpu = i;
        govinfo.load = pcpu->loadadjfreq / ppol->policy->max;
        govinfo.sampling_rate_us = tunables->timer_rate;
        atomic_notifier_call_chain(&cpufreq_govinfo_notifier_list,
                       CPUFREQ_LOAD_CHANGE, &govinfo);
    }

exit:
    up_read(&ppol->enable_sem);
    return;
}

|→void sched_get_cpus_busy(struct sched_load *busy,
             const struct cpumask *query_cpus)
{
    unsigned long flags;
    struct rq *rq;
    const int cpus = cpumask_weight(query_cpus);
    u64 load[cpus], group_load[cpus];
    u64 nload[cpus], ngload[cpus];
    u64 pload[cpus];
    unsigned int max_freq[cpus];
    int notifier_sent = 0;
    int early_detection[cpus];
    int cpu, i = 0;
    unsigned int window_size;
    u64 max_prev_sum = 0;
    int max_busy_cpu = cpumask_first(query_cpus);
    u64 total_group_load = 0, total_ngload = 0;
    bool aggregate_load = false;
    struct sched_cluster *cluster = cpu_cluster(cpumask_first(query_cpus));

    if (unlikely(cpus == 0))
        return;

    local_irq_save(flags);

    /*
     * This function could be called in timer context, and the
     * current task may have been executing for a long time. Ensure
     * that the window stats are current by doing an update.
     */
    for_each_cpu(cpu, query_cpus)
        raw_spin_lock_nested(&cpu_rq(cpu)->lock, cpu);

    window_size = sched_ravg_window;

    /*
     * We don't really need the cluster lock for this entire for loop
     * block. However, there is no advantage in optimizing this as rq
     * locks are held regardless and would prevent migration anyways
     */
    raw_spin_lock(&cluster->load_lock);

    for_each_cpu(cpu, query_cpus) {
        rq = cpu_rq(cpu);

        update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(),
                 0);

        account_load_subtractions(rq);

        /* (1) Fetch:
            the cpu's load in the previous window: rq->prev_runnable_sum
            the cpu's new-task load in the previous window: rq->nt_prev_runnable_sum
            the cpu's predicted load for the previous window: rq->hmp_stats.pred_demands_sum
         */
        load[i] = rq->prev_runnable_sum;
        nload[i] = rq->nt_prev_runnable_sum;
        pload[i] = rq->hmp_stats.pred_demands_sum;
        rq->old_estimated_time = pload[i];

        if (load[i] > max_prev_sum) {
            max_prev_sum = load[i];
            max_busy_cpu = cpu;
        }

        /*
         * sched_get_cpus_busy() is called for all CPUs in a
         * frequency domain. So the notifier_sent flag per
         * cluster works even when a frequency domain spans
         * more than 1 cluster.
         */
        if (rq->cluster->notifier_sent) {
            notifier_sent = 1;
            rq->cluster->notifier_sent = 0;
        }
        early_detection[i] = (rq->ed_task != NULL);
        max_freq[i] = cpu_max_freq(cpu);
        i++;
    }

    raw_spin_unlock(&cluster->load_lock);

    group_load_in_freq_domain(
            &cpu_rq(max_busy_cpu)->freq_domain_cpumask,
            &total_group_load, &total_ngload);
    aggregate_load = !!(total_group_load > sched_freq_aggregate_threshold);

    i = 0;
    for_each_cpu(cpu, query_cpus) {
        group_load[i] = 0;
        ngload[i] = 0;

        if (early_detection[i])
            goto skip_early;

        rq = cpu_rq(cpu);
        if (aggregate_load) {
            if (cpu == max_busy_cpu) {
                group_load[i] = total_group_load;
                ngload[i] = total_ngload;
            }
        } else {
            group_load[i] = rq->grp_time.prev_runnable_sum;
            ngload[i] = rq->grp_time.nt_prev_runnable_sum;
        }

        load[i] += group_load[i];
        nload[i] += ngload[i];

        load[i] = freq_policy_load(rq, load[i]);
        rq->old_busy_time = load[i];

        /*
         * Scale load in reference to cluster max_possible_freq.
         *
         * Note that scale_load_to_cpu() scales load in reference to
         * the cluster max_freq.
         */
        load[i] = scale_load_to_cpu(load[i], cpu);
        nload[i] = scale_load_to_cpu(nload[i], cpu);
        pload[i] = scale_load_to_cpu(pload[i], cpu);
skip_early:
        i++;
    }

    for_each_cpu(cpu, query_cpus)
        raw_spin_unlock(&(cpu_rq(cpu))->lock);

    local_irq_restore(flags);

    i = 0;
    for_each_cpu(cpu, query_cpus) {
        rq = cpu_rq(cpu);

        if (early_detection[i]) {
            busy[i].prev_load = div64_u64(sched_ravg_window,
                            NSEC_PER_USEC);
            busy[i].new_task_load = 0;
            busy[i].predicted_load = 0;
            goto exit_early;
        }

        load[i] = scale_load_to_freq(load[i], max_freq[i],
                cpu_max_possible_freq(cpu));
        nload[i] = scale_load_to_freq(nload[i], max_freq[i],
                cpu_max_possible_freq(cpu));

        pload[i] = scale_load_to_freq(pload[i], max_freq[i],
                         rq->cluster->max_possible_freq);

        /* (2) After scaling, the loads are handed back through busy:
            the cpu's load in the previous window: busy[i].prev_load
            the cpu's new-task load in the previous window: busy[i].new_task_load
            the cpu's predicted load for the previous window: busy[i].predicted_load
         */
        busy[i].prev_load = div64_u64(load[i], NSEC_PER_USEC);
        busy[i].new_task_load = div64_u64(nload[i], NSEC_PER_USEC);
        busy[i].predicted_load = div64_u64(pload[i], NSEC_PER_USEC);

exit_early:
        trace_sched_get_busy(cpu, busy[i].prev_load,
                     busy[i].new_task_load,
                     busy[i].predicted_load,
                     early_detection[i],
                     aggregate_load &&
                      cpu == max_busy_cpu);
        i++;
    }
}
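To make the numbers concrete, here is a rough worked example of the arithmetic used above. It is an illustrative user-space model with assumed default tunables, not the kernel's exact sl_busy_to_laf()/choose_freq() helpers: a CPU busy 60% of the last window while targeting 1.2 GHz produces cpu_load = 60, below the usual go_hispeed_load of 99, and the target-load search would settle around 800 MHz for a target_load of 90 (before snapping to the nearest table entry with CPUFREQ_RELATION_L).

#include <stdio.h>
#include <stdint.h>

/*
 * Illustrative model of the governor's load arithmetic: it tracks a
 * "load-adjusted frequency" laf = busy_pct * freq, compares
 * laf / target_freq against go_hispeed_load, and looks for the lowest
 * frequency f with laf / f <= target_load.
 */
int main(void)
{
    uint64_t busy_pct = 60;               /* 60% busy in the last window */
    uint64_t cur_freq = 1200000;          /* kHz, frequency while busy   */
    uint64_t target_freq = 1200000;       /* kHz, current target         */
    uint64_t target_load = 90;            /* assumed default target_load */

    uint64_t laf = busy_pct * cur_freq;   /* load-adjusted frequency     */
    uint64_t cpu_load = laf / target_freq;          /* = 60              */
    uint64_t wanted = laf / target_load;  /* ~800000 kHz before rounding */

    printf("cpu_load=%llu, ideal freq=%llu kHz\n",
           (unsigned long long)cpu_load, (unsigned long long)wanted);
    return 0;
}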