Linux schedule 4: Load Balancing
4. Load Balancing
4.1 SMP Load Balancing
4.1.1 Scheduling Domains
4.1.1.1 The Concept of Scheduling Domains
Borrowing the description from the "Linux Scheduling Domains" write-up, the concept of scheduling domains can be explained as follows.
A complex high-end system can be decomposed, from the top down, like this:
- 1. It is a NUMA system: each node accesses different regions of system memory at different speeds.
- 2. It is at the same time an SMP system, made up of multiple physical CPUs (physical packages). These physical CPUs share all of the system's memory, but each has its own independent cache.
- 3. Each physical CPU consists of multiple cores, i.e. multi-core technology, also called chip-level multiprocessing (CMP). The cores are integrated on a single die; each usually has a private L1 cache but may share the L2 cache.
- 4. Each core may in turn implement several hardware threads (virtual CPUs) through technologies such as SMT (e.g. Intel Hyper-Threading). Logically each hardware thread looks like a CPU, but the threads share almost everything, including the L1 cache and even the ALUs and the power budget.
So CPUs are organized in multiple levels, and the closer two CPUs are in this hierarchy, the more resources they share. Migrating a task between CPUs therefore has a cost: from a performance point of view, the more levels the migration crosses, the larger the loss. Migration cost also has to be considered from a power point of view, which is exactly what EAS takes into account.
4.1.1.2 arm64 cpu_topology
On the arm64 architecture the CPU topology is stored in the cpu_topology[] array:
```c
/*
 * cpu topology table
 */
struct cpu_topology cpu_topology[NR_CPUS];

struct cpu_topology {
	int thread_id;
	int core_id;
	int cluster_id;             /* the cluster this cpu belongs to */
	unsigned int partno;
	cpumask_t thread_sibling;
	cpumask_t core_sibling;     /* sibling cpus at the MC level, i.e. in the same cluster */
};
```
cpu_topology[] is populated by parse_dt_topology(), which parses the topology information from the dts:
kernel_init() -> kernel_init_freeable() -> smp_prepare_cpus() -> init_cpu_topology() -> parse_dt_topology()↓static int __init parse_dt_topology(void){ struct device_node *cn, *map; int ret = 0; int cpu; /* (1) 找到dts中cpu topology的根节点"/cpus"" */ cn = of_find_node_by_path("/cpus"); if (!cn) { pr_err("No CPU information found in DT\n"); return 0; } /* * When topology is provided cpu-map is essentially a root * cluster with restricted subnodes. */ /* (2) 找到"cpu-map"节点 */ map = of_get_child_by_name(cn, "cpu-map"); if (!map) goto out; /* (3) 解析"cpu-map"中的cluster */ ret = parse_cluster(map, 0); if (ret != 0) goto out_map; /* * Check that all cores are in the topology; the SMP code will * only mark cores described in the DT as possible. */ for_each_possible_cpu(cpu) if (cpu_topology[cpu].cluster_id == -1) ret = -EINVAL;out_map: of_node_put(map);out: of_node_put(cn); return ret;}|→static int __init parse_cluster(struct device_node *cluster, int depth){ char name[10]; bool leaf = true; bool has_cores = false; struct device_node *c; static int cluster_id __initdata; int core_id = 0; int i, ret; /* * First check for child clusters; we currently ignore any * information about the nesting of clusters and present the * scheduler with a flat list of them. */ i = 0; /* (3.1) 如果有多级cluster,继续递归搜索 */ do { snprintf(name, sizeof(name), "cluster%d", i); c = of_get_child_by_name(cluster, name); if (c) { leaf = false; ret = parse_cluster(c, depth + 1); of_node_put(c); if (ret != 0) return ret; } i++; } while (c); /* Now check for cores */ i = 0; do { /* (3.2) 或者core层次的节点 */ snprintf(name, sizeof(name), "core%d", i); c = of_get_child_by_name(cluster, name); if (c) { has_cores = true; if (depth == 0) { pr_err("%s: cpu-map children should be clusters\n", c->full_name); of_node_put(c); return -EINVAL; } if (leaf) { /* (3.3) 如果是叶子cluster节点,继续遍历core中的cpu节点 */ ret = parse_core(c, cluster_id, core_id++); } else { pr_err("%s: Non-leaf cluster with core %s\n", cluster->full_name, name); ret = -EINVAL; } of_node_put(c); if (ret != 0) return ret; } i++; } while (c); if (leaf && !has_cores) pr_warn("%s: empty cluster\n", cluster->full_name); if (leaf) cluster_id++; return 0;}||→static int __init parse_core(struct device_node *core, int cluster_id, int core_id){ char name[10]; bool leaf = true; int i = 0; int cpu; struct device_node *t; do { /* (3.3.1) 如果存在thread层级,解析thread和cpu层级 */ snprintf(name, sizeof(name), "thread%d", i); t = of_get_child_by_name(core, name); if (t) { leaf = false; cpu = get_cpu_for_node(t); if (cpu >= 0) { cpu_topology[cpu].cluster_id = cluster_id; cpu_topology[cpu].core_id = core_id; cpu_topology[cpu].thread_id = i; } else { pr_err("%s: Can't get CPU for thread\n", t->full_name); of_node_put(t); return -EINVAL; } of_node_put(t); } i++; } while (t); /* (3.3.2) 否则直接解析cpu层级 */ cpu = get_cpu_for_node(core); if (cpu >= 0) { if (!leaf) { pr_err("%s: Core has both threads and CPU\n", core->full_name); return -EINVAL; } /* (3.3.3) 得到了cpu的cluster_id/core_id */ cpu_topology[cpu].cluster_id = cluster_id; cpu_topology[cpu].core_id = core_id; } else if (leaf) { pr_err("%s: Can't get CPU for leaf core\n", core->full_name); return -EINVAL; } return 0;}|||→static int __init get_cpu_for_node(struct device_node *node){ struct device_node *cpu_node; int cpu; cpu_node = of_parse_phandle(node, "cpu", 0); if (!cpu_node) return -1; for_each_possible_cpu(cpu) { if (of_get_cpu_node(cpu, NULL) == cpu_node) { of_node_put(cpu_node); return cpu; } } pr_crit("Unable to find CPU node for %s\n", cpu_node->full_name); 
of_node_put(cpu_node); return -1;}
The same-level sibling relationships, cpu_topology[cpu].core_sibling and cpu_topology[cpu].thread_sibling, are updated in update_siblings_masks():
```c
kernel_init() -> kernel_init_freeable() -> smp_prepare_cpus() -> store_cpu_topology() -> update_siblings_masks()
↓
static void update_siblings_masks(unsigned int cpuid)
{
	struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
	int cpu;

	/* update core and thread sibling masks */
	for_each_possible_cpu(cpu) {
		cpu_topo = &cpu_topology[cpu];

		if (cpuid_topo->cluster_id != cpu_topo->cluster_id)
			continue;

		cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
		if (cpu != cpuid)
			cpumask_set_cpu(cpu, &cpuid_topo->core_sibling);

		if (cpuid_topo->core_id != cpu_topo->core_id)
			continue;

		cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling);
		if (cpu != cpuid)
			cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling);
	}
}
```
Take mt6799 as an example: its topology is "4*A35 + 4*A53 + 2*A73", defined in the dts as follows:
mt6799.dtsi:cpus { #address-cells = <1>; #size-cells = <0>; cpu0: cpu@0 { device_type = "cpu"; compatible = "arm,cortex-a35"; reg = <0x000>; enable-method = "psci"; cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>, <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>; cpu-release-addr = <0x0 0x40000200>; clock-frequency = <1248000000>; }; cpu1: cpu@001 { device_type = "cpu"; compatible = "arm,cortex-a35"; reg = <0x001>; enable-method = "psci"; cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>, <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>; cpu-release-addr = <0x0 0x40000200>; clock-frequency = <1248000000>; }; cpu2: cpu@002 { device_type = "cpu"; compatible = "arm,cortex-a35"; reg = <0x002>; enable-method = "psci"; cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>, <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>; cpu-release-addr = <0x0 0x40000200>; clock-frequency = <1248000000>; }; cpu3: cpu@003 { device_type = "cpu"; compatible = "arm,cortex-a35"; reg = <0x003>; enable-method = "psci"; cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>, <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>; cpu-release-addr = <0x0 0x40000200>; clock-frequency = <1248000000>; }; cpu4: cpu@100 { device_type = "cpu"; compatible = "arm,cortex-a53"; reg = <0x100>; enable-method = "psci"; cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>, <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>; cpu-release-addr = <0x0 0x40000200>; clock-frequency = <1378000000>; }; cpu5: cpu@101 { device_type = "cpu"; compatible = "arm,cortex-a53"; reg = <0x101>; enable-method = "psci"; cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>, <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>; cpu-release-addr = <0x0 0x40000200>; clock-frequency = <1378000000>; }; cpu6: cpu@102 { device_type = "cpu"; compatible = "arm,cortex-a53"; reg = <0x102>; enable-method = "psci"; cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>, <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>; cpu-release-addr = <0x0 0x40000200>; clock-frequency = <1378000000>; }; cpu7: cpu@103 { device_type = "cpu"; compatible = "arm,cortex-a53"; reg = <0x103>; enable-method = "psci"; cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>, <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>; cpu-release-addr = <0x0 0x40000200>; clock-frequency = <1378000000>; }; cpu8: cpu@200 { device_type = "cpu"; compatible = "arm,cortex-a73"; reg = <0x200>; enable-method = "psci"; cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>, <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>; cpu-release-addr = <0x0 0x40000200>; clock-frequency = <1638000000>; }; cpu9: cpu@201 { device_type = "cpu"; compatible = "arm,cortex-a73"; reg = <0x201>; enable-method = "psci"; cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>, <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>; cpu-release-addr = <0x0 0x40000200>; clock-frequency = <1638000000>; }; cpu-map { cluster0 { core0 { cpu = <&cpu0>; }; core1 { cpu = <&cpu1>; }; core2 { cpu = <&cpu2>; }; core3 { cpu = <&cpu3>; }; }; cluster1 { core0 { cpu = <&cpu4>; }; core1 { cpu = <&cpu5>; }; core2 { cpu = <&cpu6>; }; core3 { cpu = <&cpu7>; }; }; cluster2 { core0 { cpu = <&cpu8>; }; core1 { cpu = <&cpu9>; }; }; };
After parse_dt_topology() and update_siblings_masks() have run, the resulting cpu_topology[] values are:
cpu 0 cluster_id = 0, core_id = 0, core_sibling = 0xf
cpu 1 cluster_id = 0, core_id = 1, core_sibling = 0xf
cpu 2 cluster_id = 0, core_id = 2, core_sibling = 0xf
cpu 3 cluster_id = 0, core_id = 3, core_sibling = 0xf
cpu 4 cluster_id = 1, core_id = 0, core_sibling = 0xf0
cpu 5 cluster_id = 1, core_id = 1, core_sibling = 0xf0
cpu 6 cluster_id = 1, core_id = 2, core_sibling = 0xf0
cpu 7 cluster_id = 1, core_id = 3, core_sibling = 0xf0
cpu 8 cluster_id = 2, core_id = 0, core_sibling = 0x300
cpu 9 cluster_id = 2, core_id = 1, core_sibling = 0x300
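These masks can be cross-checked from user space through the standard sysfs CPU-topology files (on arm64, physical_package_id is backed by cpu_topology[cpu].cluster_id). The following is a minimal sketch, not part of the kernel code above; the hard-coded core count and the lack of error handling are simplifications for illustration:

```c
#include <stdio.h>

/* Dump package id and core_siblings for each CPU from sysfs,
 * to compare against the cpu_topology[] values listed above. */
int main(void)
{
	char path[128], buf[64];

	for (int cpu = 0; cpu < 10; cpu++) {	/* mt6799: 10 cores */
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/cpu%d/topology/physical_package_id", cpu);
		f = fopen(path, "r");
		if (!f)
			continue;		/* cpu not present or offline */
		if (fgets(buf, sizeof(buf), f))
			printf("cpu%d package_id=%s", cpu, buf);
		fclose(f);

		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/cpu%d/topology/core_siblings", cpu);
		f = fopen(path, "r");
		if (f) {
			if (fgets(buf, sizeof(buf), f))
				printf("      core_siblings=%s", buf);
			fclose(f);
		}
	}
	return 0;
}
```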
4.1.1.3 Scheduling Domain Initialization
In kernel_init_freeable(), smp_prepare_cpus() is called to set up the CPU topology, smp_init() then brings the CPUs online, and right after that sched_init_smp() is called to initialize the system's scheduling domains.
By default there are three possible topology levels: SMT/MC/DIE. ARM does not currently support hardware multithreading, so only two levels are used: MC/DIE.
```c
/*
 * Topology list, bottom-up.
 */
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};
```
The SDTL (sched_domain_topology_level) table used by arm64 is:
```c
static struct sched_domain_topology_level arm64_topology[] = {
#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) },
	{ NULL, },
};
```
The scheduling-domain initialization code is analyzed in detail below:
kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> init_sched_domains(cpu_active_mask):↓static int init_sched_domains(const struct cpumask *cpu_map){ int err; arch_update_cpu_topology(); /* (1) 当前只有一个schedule domain需要初始化 */ ndoms_cur = 1; doms_cur = alloc_sched_domains(ndoms_cur); if (!doms_cur) doms_cur = &fallback_doms; /* (2) 按照传入的cpu_active_mask,构造sched_domains */ cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); err = build_sched_domains(doms_cur[0], NULL); /* (3) 注册“/proc/sys/kernel/sched_domain/” */ register_sched_domain_sysctl(); return err;}|→static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr){ enum s_alloc alloc_state; struct sched_domain *sd; struct s_data d; struct rq *rq = NULL; int i, ret = -ENOMEM; /* (2.1) 在每个tl层次,给每个cpu分配sd、sg、sgc空间 */ alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) goto error; /* Set up domains for cpus specified by the cpu_map. */ for_each_cpu(i, cpu_map) { struct sched_domain_topology_level *tl; sd = NULL; for_each_sd_topology(tl) { /* (2.2) 初始化sd 构造其不同tl之间的sd的parent、cild关系 按照SDTL传入的tl->mask()函数,给sd->span[]赋值 */ sd = build_sched_domain(tl, cpu_map, attr, sd, i); /* (2.2.1) 将最底层tl的sd赋值给d.sd */ if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) sd->flags |= SD_OVERLAP; if (cpumask_equal(cpu_map, sched_domain_span(sd))) break; } } /* Build the groups for the domains */ for_each_cpu(i, cpu_map) { for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { /* (2.3) 给sd->span_weight赋值 */ sd->span_weight = cpumask_weight(sched_domain_span(sd)); if (sd->flags & SD_OVERLAP) { if (build_overlap_sched_groups(sd, i)) goto error; } else { /* (2.4) 按照span,构造每个tl层次中,sd、sg之间的关系 */ if (build_sched_groups(sd, i)) goto error; } } } /* Calculate CPU capacity for physical packages and nodes */ for (i = nr_cpumask_bits-1; i >= 0; i--) { struct sched_domain_topology_level *tl = sched_domain_topology; if (!cpumask_test_cpu(i, cpu_map)) continue; for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) { /* (2.5) 初始化sg->sge对应的energy表 */ init_sched_energy(i, sd, tl->energy); /* (2.6) 对有人引用的sd、sg、sgc进行标识, 无人引用的sd、sg、sgc在__free_domain_allocs()中会被释放 */ claim_allocations(i, sd); /* (2.7) 初始化每个tl层级的sgc->capacity */ init_sched_groups_capacity(i, sd); } } /* Attach the domains */ rcu_read_lock(); /* (2.8) 将d.rd赋值给rq->sd 将d.rd赋值给rq->rd */ for_each_cpu(i, cpu_map) { rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); cpu_attach_domain(sd, d.rd, i); } rcu_read_unlock(); ret = 0;error: /* (2.9) free掉分配失败/分配成功多余的内存 */ __free_domain_allocs(&d, alloc_state, cpu_map); return ret;}||→static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map){ memset(d, 0, sizeof(*d)); /* (2.1.1) 每个tl层次,给每个cpu都分配sd、sg、sgc, tl->data->sd、l->data->sg、l->data->sgc */ if (__sdt_alloc(cpu_map)) return sa_sd_storage; /* (2.1.2) 分配d->sd指针空间 实际d->sd会指向最底层tl的tl->data->sd */ d->sd = alloc_percpu(struct sched_domain *); if (!d->sd) return sa_sd_storage; /* (2.1.3) 分配d->rd的指针空间和实际空间 rd = root_domain */ d->rd = alloc_rootdomain(); if (!d->rd) return sa_sd; return sa_rootdomain;}||→struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu){ struct sched_domain *sd = sd_init(tl, cpu); if (!sd) return child; /* (2.2.1) 根据tl->mask()初始化sd->sapn[] */ cpumask_and(sched_domain_span(sd), cpu_map, 
tl->mask(cpu)); if (child) { sd->level = child->level + 1; sched_domain_level_max = max(sched_domain_level_max, sd->level); /* (2.2.2) 如果有多层tl,建立起sd之间的parent/child关系, 对arm来说:MC层tl->data->sd是child,DIE层tl->data->sd是parent */ child->parent = sd; sd->child = child; if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) { pr_err("BUG: arch topology borken\n");#ifdef CONFIG_SCHED_DEBUG pr_err(" the %s domain not a subset of the %s domain\n", child->name, sd->name);#endif /* Fixup, ensure @sd has at least @child cpus. */ cpumask_or(sched_domain_span(sd), sched_domain_span(sd), sched_domain_span(child)); } } set_domain_attribute(sd, attr); return sd;}||→static intbuild_sched_groups(struct sched_domain *sd, int cpu){ struct sched_group *first = NULL, *last = NULL; struct sd_data *sdd = sd->private; const struct cpumask *span = sched_domain_span(sd); struct cpumask *covered; int i; /* (2.4.1) 根据sd->span[]建立起sd、sg之间的关系 , 如果sd没有child,每个cpu的sd、sg之间建立链接 如果sd有child,每个cpu的sd和span中第一个cpu的sg建立链接 */ get_group(cpu, sdd, &sd->groups); atomic_inc(&sd->groups->ref); if (cpu != cpumask_first(span)) return 0; lockdep_assert_held(&sched_domains_mutex); covered = sched_domains_tmpmask; cpumask_clear(covered); /* (2.4.2) 挑选有sd链接的sg,给其中的sg->cpumask[]成员赋值 */ for_each_cpu(i, span) { struct sched_group *sg; int group, j; if (cpumask_test_cpu(i, covered)) continue; group = get_group(i, sdd, &sg); cpumask_setall(sched_group_mask(sg)); for_each_cpu(j, span) { if (get_group(j, sdd, NULL) != group) continue; cpumask_set_cpu(j, covered); cpumask_set_cpu(j, sched_group_cpus(sg)); } /* (2.4.3) 挑选有sd链接的sg,将同一层级sg链接成链表, */ if (!first) first = sg; if (last) last->next = sg; last = sg; } last->next = first; return 0;}||→static void init_sched_energy(int cpu, struct sched_domain *sd, sched_domain_energy_f fn){ if (!(fn && fn(cpu))) return; if (cpu != group_balance_cpu(sd->groups)) return; if (sd->child && !sd->child->groups->sge) { pr_err("BUG: EAS setup broken for CPU%d\n", cpu);#ifdef CONFIG_SCHED_DEBUG pr_err(" energy data on %s but not on %s domain\n", sd->name, sd->child->name);#endif return; } check_sched_energy_data(cpu, fn, sched_group_cpus(sd->groups)); /* (2.5.1) 不同层级tl,按照tl->energy()给sg->sge赋值 */ sd->groups->sge = fn(cpu);}||→static void claim_allocations(int cpu, struct sched_domain *sd){ struct sd_data *sdd = sd->private; /* (2.6.1) 对有人使用的tl->data->sd、tl->data->sg、tl->data->sgc置空, 无人使用的空间,将会在__free_domain_allocs()中被释放 */ WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); *per_cpu_ptr(sdd->sd, cpu) = NULL; if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) *per_cpu_ptr(sdd->sg, cpu) = NULL; if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) *per_cpu_ptr(sdd->sgc, cpu) = NULL;}||→static void init_sched_groups_capacity(int cpu, struct sched_domain *sd){ struct sched_group *sg = sd->groups; WARN_ON(!sg); do { /* (2.7.1) 更新sg->group_weight的值 */ sg->group_weight = cpumask_weight(sched_group_cpus(sg)); sg = sg->next; } while (sg != sd->groups); if (cpu != group_balance_cpu(sg)) return; /* (2.7.2) 更新sgc->capacity的值 */ update_group_capacity(sd, cpu); /* (2.7.3) 更新sgc->nr_busy_cpus的值 */ atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);}|||→void update_group_capacity(struct sched_domain *sd, int cpu){ struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; unsigned long capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); interval = clamp(interval, 1UL, max_load_balance_interval); sdg->sgc->next_update = jiffies + interval; if (!child) { /* (2.7.2.1) 
如果sd没有child是最底层tl, 则调用arch_scale_cpu_capacity()获取最大运算能力,并减去rt进程的消耗rq->rt_avg, 得到本sd的sg->sgc->capacity */ update_cpu_capacity(sd, cpu); return; } capacity = 0; if (child->flags & SD_OVERLAP) { /* * SD_OVERLAP domains cannot assume that child groups * span the current group. */ for_each_cpu(cpu, sched_group_cpus(sdg)) { struct sched_group_capacity *sgc; struct rq *rq = cpu_rq(cpu); /* * build_sched_domains() -> init_sched_groups_capacity() * gets here before we've attached the domains to the * runqueues. * * Use capacity_of(), which is set irrespective of domains * in update_cpu_capacity(). * * This avoids capacity from being 0 and * causing divide-by-zero issues on boot. */ if (unlikely(!rq->sd)) { capacity += capacity_of(cpu); continue; } sgc = rq->sd->groups->sgc; capacity += sgc->capacity; } } else { /* * !SD_OVERLAP domains can assume that child groups * span the current group. */ /* (2.7.2.2) 如果sd有child不是最底层tl, 则sgc->capacity等于所有child sg的group->sgc->capacity的和 */ group = child->groups; do { capacity += group->sgc->capacity; group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity;}||||→static void update_cpu_capacity(struct sched_domain *sd, int cpu){ unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); struct sched_group *sdg = sd->groups; struct max_cpu_capacity *mcc; unsigned long max_capacity; int max_cap_cpu; unsigned long flags; /* (2.7.2.1.1) 根据arch_scale_cpu_capacity获取到本cpu最大/orig capacity */ cpu_rq(cpu)->cpu_capacity_orig = capacity; mcc = &cpu_rq(cpu)->rd->max_cpu_capacity; raw_spin_lock_irqsave(&mcc->lock, flags); max_capacity = mcc->val; max_cap_cpu = mcc->cpu; if ((max_capacity > capacity && max_cap_cpu == cpu) || (max_capacity < capacity)) { mcc->val = capacity; mcc->cpu = cpu;#ifdef CONFIG_SCHED_DEBUG raw_spin_unlock_irqrestore(&mcc->lock, flags); /* pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity); */ goto skip_unlock;#endif } raw_spin_unlock_irqrestore(&mcc->lock, flags);skip_unlock: __attribute__ ((unused)); /* (2.7.2.1.2) 减去rt消耗的capacity, rq->rt_avg/(sched_avg_period() + delta)是rt进程占用cpu的比例, 剩下就为cfs可用的capacity */ capacity *= scale_rt_capacity(cpu); capacity >>= SCHED_CAPACITY_SHIFT; if (!capacity) capacity = 1; cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity;}
init_sched_domains() builds the sched_domains at system boot. When CPU hotplug changes which CPUs are online, partition_sched_domains() is called to rebuild the system's sched_domains.
cpu_up() -> _cpu_up() -> __raw_notifier_call_chain() -> cpuset_cpu_active() -> cpuset_update_active_cpus() -> partition_sched_domains() -> build_sched_domains();void __init sched_init_smp(void){ hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);}static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu){ switch (action) { case CPU_ONLINE_FROZEN: case CPU_DOWN_FAILED_FROZEN: /* * num_cpus_frozen tracks how many CPUs are involved in suspend * resume sequence. As long as this is not the last online * operation in the resume sequence, just build a single sched * domain, ignoring cpusets. */ num_cpus_frozen--; if (likely(num_cpus_frozen)) { partition_sched_domains(1, NULL, NULL); break; } /* * This is the last CPU online operation. So fall through and * restore the original sched domains by considering the * cpuset configurations. */ case CPU_ONLINE: cpuset_update_active_cpus(true); break; default: return NOTIFY_DONE; } return NOTIFY_OK;}static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu){ unsigned long flags; long cpu = (long)hcpu; struct dl_bw *dl_b; bool overflow; int cpus; switch (action) { case CPU_DOWN_PREPARE: rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); cpus = dl_bw_cpus(cpu); overflow = __dl_overflow(dl_b, cpus, 0, 0); raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); if (overflow) return notifier_from_errno(-EBUSY); cpuset_update_active_cpus(false); break; case CPU_DOWN_PREPARE_FROZEN: num_cpus_frozen++; partition_sched_domains(1, NULL, NULL); break; default: return NOTIFY_DONE; } return NOTIFY_OK;}
4.1.1.4 Scheduling Domains on mt6799
At boot "maxcpus=8" is passed on the cmdline, so setup_max_cpus = 8 and SMP only brings up 8 cores; the other 2 big cores of mt6799 are started later. Let's look at what the scheduling domains look like when the system has booted with these 8 cores.
During boot, every topology level (tl) allocates sd, sg and sgc memory for every cpu, but once the effective links are built some of the sg/sgc structures are never used. The unused memory is marked in claim_allocations() and then freed by __free_domain_allocs() before build_sched_domains() returns.
kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> init_sched_domains() -> build_sched_domains() -> __visit_domain_allocation_hell() -> __sdt_alloc():[__sdt_alloc][tl MC] cpu0, &sd = 0xffffffc15663c600, &sg = 0xffffffc156062600, &sgc = 0xffffffc156062780 [__sdt_alloc][tl MC] cpu1, &sd = 0xffffffc15608f000, &sg = 0xffffffc156056780, &sgc = 0xffffffc156090000 [__sdt_alloc][tl MC] cpu2, &sd = 0xffffffc15608fc00, &sg = 0xffffffc156090d80, &sgc = 0xffffffc156090180 [__sdt_alloc][tl MC] cpu3, &sd = 0xffffffc15608f300, &sg = 0xffffffc156090c00, &sgc = 0xffffffc156090300 [__sdt_alloc][tl MC] cpu4, &sd = 0xffffffc15608f900, &sg = 0xffffffc156090a80, &sgc = 0xffffffc156090480 [__sdt_alloc][tl MC] cpu5, &sd = 0xffffffc15608f600, &sg = 0xffffffc156090900, &sgc = 0xffffffc156090600 [__sdt_alloc][tl MC] cpu6, &sd = 0xffffffc156091000, &sg = 0xffffffc156090780, &sgc = 0xffffffc156092000 [__sdt_alloc][tl MC] cpu7, &sd = 0xffffffc156091c00, &sg = 0xffffffc156092d80, &sgc = 0xffffffc156092180 [__sdt_alloc][tl DIE] cpu0, &sd = 0xffffffc156091300, &sg = 0xffffffc156092c00, &sgc = 0xffffffc156092300 [__sdt_alloc][tl DIE] cpu1, &sd = 0xffffffc156091900, &sg = 0xffffffc156092a80, &sgc = 0xffffffc156092480 [__sdt_alloc][tl DIE] cpu2, &sd = 0xffffffc156091600, &sg = 0xffffffc156092900, &sgc = 0xffffffc156092600 [__sdt_alloc][tl DIE] cpu3, &sd = 0xffffffc156093000, &sg = 0xffffffc156092780, &sgc = 0xffffffc156094000 [__sdt_alloc][tl DIE] cpu4, &sd = 0xffffffc156093c00, &sg = 0xffffffc156094d80, &sgc = 0xffffffc156094180 [__sdt_alloc][tl DIE] cpu5, &sd = 0xffffffc156093300, &sg = 0xffffffc156094c00, &sgc = 0xffffffc156094300 [__sdt_alloc][tl DIE] cpu6, &sd = 0xffffffc156093900, &sg = 0xffffffc156094a80, &sgc = 0xffffffc156094480 [__sdt_alloc][tl DIE] cpu7, &sd = 0xffffffc156093600, &sg = 0xffffffc156094900, &sgc = 0xffffffc156094600
After the links have been built, the sd/sg relationships at each tl level are:
kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> init_sched_domains() -> build_sched_domains() -> build_sched_groups():[build_sched_domains][tl MC] cpu0, sd->groups=0xffffffc156062600, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf[build_sched_domains][tl MC] cpu0, sg->sgc=0xffffffc156062780, sg->next=0xffffffc156056780, sg->group_weight=0, sg->cpumask[]=0x1[build_sched_domains][tl MC] cpu0, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff[build_sched_domains][tl MC] cpu0, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0 [build_sched_domains][tl MC] cpu0, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|[build_sched_domains][tl MC] cpu1, sd->groups=0xffffffc156056780, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf[build_sched_domains][tl MC] cpu1, sg->sgc=0xffffffc156090000, sg->next=0xffffffc156090d80, sg->group_weight=0, sg->cpumask[]=0x2[build_sched_domains][tl MC] cpu1, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff[build_sched_domains][tl MC] cpu1, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0 [build_sched_domains][tl MC] cpu1, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|[build_sched_domains][tl MC] cpu2, sd->groups=0xffffffc156090d80, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf[build_sched_domains][tl MC] cpu2, sg->sgc=0xffffffc156090180, sg->next=0xffffffc156090c00, sg->group_weight=0, sg->cpumask[]=0x4[build_sched_domains][tl MC] cpu2, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff[build_sched_domains][tl MC] cpu2, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0 [build_sched_domains][tl MC] cpu2, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|[build_sched_domains][tl MC] cpu3, sd->groups=0xffffffc156090c00, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf[build_sched_domains][tl MC] cpu3, sg->sgc=0xffffffc156090300, sg->next=0xffffffc156062600, sg->group_weight=0, sg->cpumask[]=0x8[build_sched_domains][tl MC] cpu3, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff[build_sched_domains][tl MC] cpu3, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0 [build_sched_domains][tl MC] cpu3, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|[build_sched_domains][tl MC] 
cpu4, sd->groups=0xffffffc156090a80, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf0[build_sched_domains][tl MC] cpu4, sg->sgc=0xffffffc156090480, sg->next=0xffffffc156090900, sg->group_weight=0, sg->cpumask[]=0x10[build_sched_domains][tl MC] cpu4, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff[build_sched_domains][tl MC] cpu4, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0 [build_sched_domains][tl MC] cpu4, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|[build_sched_domains][tl MC] cpu5, sd->groups=0xffffffc156090900, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf0[build_sched_domains][tl MC] cpu5, sg->sgc=0xffffffc156090600, sg->next=0xffffffc156090780, sg->group_weight=0, sg->cpumask[]=0x20[build_sched_domains][tl MC] cpu5, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff[build_sched_domains][tl MC] cpu5, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0 [build_sched_domains][tl MC] cpu5, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|[build_sched_domains][tl MC] cpu6, sd->groups=0xffffffc156090780, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf0[build_sched_domains][tl MC] cpu6, sg->sgc=0xffffffc156092000, sg->next=0xffffffc156092d80, sg->group_weight=0, sg->cpumask[]=0x40[build_sched_domains][tl MC] cpu6, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff[build_sched_domains][tl MC] cpu6, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0 [build_sched_domains][tl MC] cpu6, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|[build_sched_domains][tl MC] cpu7, sd->groups=0xffffffc156092d80, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf0[build_sched_domains][tl MC] cpu7, sg->sgc=0xffffffc156092180, sg->next=0xffffffc156090a80, sg->group_weight=0, sg->cpumask[]=0x80[build_sched_domains][tl MC] cpu7, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff[build_sched_domains][tl MC] cpu7, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0 [build_sched_domains][tl MC] cpu7, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|[build_sched_domains][tl DIE] cpu0, sd->groups=0xffffffc156092c00, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff[build_sched_domains][tl DIE] cpu0, 
sg->sgc=0xffffffc156092300, sg->next=0xffffffc156094d80, sg->group_weight=0, sg->cpumask[]=0xf[build_sched_domains][tl DIE] cpu0, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff[build_sched_domains][tl DIE] cpu0, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1 [build_sched_domains][tl DIE] cpu0, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|[build_sched_domains][tl DIE] cpu1, sd->groups=0xffffffc156092c00, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff[build_sched_domains][tl DIE] cpu1, sg->sgc=0x0, sg->next=0xffffffc156092a80, sg->group_weight=0, sg->cpumask[]=0x0[build_sched_domains][tl DIE] cpu1, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0[build_sched_domains][tl DIE] cpu1, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1 [build_sched_domains][tl DIE] cpu1, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|[build_sched_domains][tl DIE] cpu2, sd->groups=0xffffffc156092c00, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff[build_sched_domains][tl DIE] cpu2, sg->sgc=0x0, sg->next=0xffffffc156092900, sg->group_weight=0, sg->cpumask[]=0x0[build_sched_domains][tl DIE] cpu2, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0[build_sched_domains][tl DIE] cpu2, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1 [build_sched_domains][tl DIE] cpu2, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|[build_sched_domains][tl DIE] cpu3, sd->groups=0xffffffc156092c00, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff[build_sched_domains][tl DIE] cpu3, sg->sgc=0x0, sg->next=0xffffffc156092780, sg->group_weight=0, sg->cpumask[]=0x0[build_sched_domains][tl DIE] cpu3, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0[build_sched_domains][tl DIE] cpu3, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1 [build_sched_domains][tl DIE] cpu3, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|[build_sched_domains][tl DIE] cpu4, sd->groups=0xffffffc156094d80, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff[build_sched_domains][tl DIE] cpu4, sg->sgc=0xffffffc156094180, sg->next=0xffffffc156092c00, sg->group_weight=0, sg->cpumask[]=0xf0[build_sched_domains][tl DIE] cpu4, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff[build_sched_domains][tl DIE] cpu4, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, 
sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1 [build_sched_domains][tl DIE] cpu4, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|[build_sched_domains][tl DIE] cpu5, sd->groups=0xffffffc156094d80, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff[build_sched_domains][tl DIE] cpu5, sg->sgc=0x0, sg->next=0xffffffc156094c00, sg->group_weight=0, sg->cpumask[]=0x0[build_sched_domains][tl DIE] cpu5, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0[build_sched_domains][tl DIE] cpu5, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1 [build_sched_domains][tl DIE] cpu5, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|[build_sched_domains][tl DIE] cpu6, sd->groups=0xffffffc156094d80, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff[build_sched_domains][tl DIE] cpu6, sg->sgc=0x0, sg->next=0xffffffc156094a80, sg->group_weight=0, sg->cpumask[]=0x0[build_sched_domains][tl DIE] cpu6, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0[build_sched_domains][tl DIE] cpu6, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1 [build_sched_domains][tl DIE] cpu6, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|[build_sched_domains][tl DIE] cpu7, sd->groups=0xffffffc156094d80, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff[build_sched_domains][tl DIE] cpu7, sg->sgc=0x0, sg->next=0xffffffc156094900, sg->group_weight=0, sg->cpumask[]=0x0[build_sched_domains][tl DIE] cpu7, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0[build_sched_domains][tl DIE] cpu7, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1 [build_sched_domains][tl DIE] cpu7, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|
Expressed graphically, the relationships look as follows (figure omitted here).
The parameters inside each sched_domain are also very important. They are initialized in sd_init(), and the SMP load balancer uses these parameters and flags constantly:
| sd parameter | tl MC level | tl DIE level |
|---|---|---|
| sd->min_interval | 4 | 8 |
| sd->max_interval | 8 | 16 |
| sd->busy_factor | 32 | 32 |
| sd->imbalance_pct | 117 | 125 |
| sd->cache_nice_tries | 1 | 1 |
| sd->busy_idx | 2 | 2 |
| sd->idle_idx | 0 | 1 |
| sd->newidle_idx | 0 | 0 |
| sd->wake_idx | 0 | 0 |
| sd->forkexec_idx | 0 | 0 |
| sd->span_weight | 4 | 8 |
| sd->balance_interval | 4 | 8 |
| sd->level | 0 | 1 |
| sd->flags | 0x832f: SD_LOAD_BALANCE, SD_BALANCE_NEWIDLE, SD_BALANCE_EXEC, SD_BALANCE_FORK, SD_WAKE_AFFINE, SD_SHARE_POWERDOMAIN, SD_SHARE_PKG_RESOURCES, SD_SHARE_CAP_STATES | 0x102f: SD_LOAD_BALANCE, SD_BALANCE_NEWIDLE, SD_BALANCE_EXEC, SD_BALANCE_FORK, SD_WAKE_AFFINE, SD_PREFER_SIBLING |

update_top_cache_domain() additionally caches several commonly used sd pointers; from the debug prints we can see which level of sd each cached pointer actually corresponds to:
| cached sd | definition | value |
|---|---|---|
| sd_busy | per_cpu(sd_busy, cpu) | this cpu's tl DIE level sd |
| sd_llc | per_cpu(sd_llc, cpu) | this cpu's tl MC level sd |
| sd_llc_size | per_cpu(sd_llc_size, cpu) | 4 |
| sd_llc_id | per_cpu(sd_llc_id, cpu) | 0/4 |
| sd_numa | per_cpu(sd_numa, cpu) | 0 |
| sd_asym | per_cpu(sd_asym, cpu) | 0 |
| sd_ea | per_cpu(sd_ea, cpu) | this cpu's tl DIE level sd |
| sd_scs | per_cpu(sd_scs, cpu) | this cpu's tl MC level sd |

```c
static void update_top_cache_domain(int cpu)
{
	struct sched_domain *sd;
	struct sched_domain *busy_sd = NULL, *ea_sd = NULL;
	int id = cpu;
	int size = 1;

	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
	if (sd) {
		id = cpumask_first(sched_domain_span(sd));
		size = cpumask_weight(sched_domain_span(sd));
		busy_sd = sd->parent; /* sd_busy */
	}
	rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
	per_cpu(sd_llc_size, cpu) = size;
	per_cpu(sd_llc_id, cpu) = id;

	sd = lowest_flag_domain(cpu, SD_NUMA);
	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);

	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
	rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);

	for_each_domain(cpu, sd) {
		if (sd->groups->sge)
			ea_sd = sd;
		else
			break;
	}
	rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);

	sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES);
	rcu_assign_pointer(per_cpu(sd_scs, cpu), sd);
}
```
[update_top_cache_domain] cpu0, sd_busy=0xffffffc156091300, sd_llc=0xffffffc15663c600, sd_llc_size=4, sd_llc_id=0, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156091300, sd_scs=0xffffffc15663c600
[update_top_cache_domain] cpu1, sd_busy=0xffffffc156091900, sd_llc=0xffffffc15608f000, sd_llc_size=4, sd_llc_id=0, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156091900, sd_scs=0xffffffc15608f000
[update_top_cache_domain] cpu2, sd_busy=0xffffffc156091600, sd_llc=0xffffffc15608fc00, sd_llc_size=4, sd_llc_id=0, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156091600, sd_scs=0xffffffc15608fc00
[update_top_cache_domain] cpu3, sd_busy=0xffffffc156093000, sd_llc=0xffffffc15608f300, sd_llc_size=4, sd_llc_id=0, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093000, sd_scs=0xffffffc15608f300
[update_top_cache_domain] cpu4, sd_busy=0xffffffc156093c00, sd_llc=0xffffffc15608f900, sd_llc_size=4, sd_llc_id=4, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093c00, sd_scs=0xffffffc15608f900
[update_top_cache_domain] cpu5, sd_busy=0xffffffc156093300, sd_llc=0xffffffc15608f600, sd_llc_size=4, sd_llc_id=4, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093300, sd_scs=0xffffffc15608f600
[update_top_cache_domain] cpu6, sd_busy=0xffffffc156093900, sd_llc=0xffffffc156091000, sd_llc_size=4, sd_llc_id=4, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093900, sd_scs=0xffffffc156091000
[update_top_cache_domain] cpu7, sd_busy=0xffffffc156093600, sd_llc=0xffffffc156091c00, sd_llc_size=4, sd_llc_id=4, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093600, sd_scs=0xffffffc156091c00
The tables mt6799 uses when computing energy and compute capacity are as follows:
kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> init_sched_domains() -> build_sched_domains() -> init_sched_energy()/init_sched_groups_capacity();/* v1 FY */struct upower_tbl_info upower_tbl_infos_FY[NR_UPOWER_BANK] = { INIT_UPOWER_TBL_INFOS(UPOWER_BANK_LL, upower_tbl_ll_1_FY), INIT_UPOWER_TBL_INFOS(UPOWER_BANK_L, upower_tbl_l_1_FY), INIT_UPOWER_TBL_INFOS(UPOWER_BANK_B, upower_tbl_b_1_FY), INIT_UPOWER_TBL_INFOS(UPOWER_BANK_CLS_LL, upower_tbl_cluster_ll_1_FY), INIT_UPOWER_TBL_INFOS(UPOWER_BANK_CLS_L, upower_tbl_cluster_l_1_FY), INIT_UPOWER_TBL_INFOS(UPOWER_BANK_CLS_B, upower_tbl_cluster_b_1_FY), INIT_UPOWER_TBL_INFOS(UPOWER_BANK_CCI, upower_tbl_cci_1_FY),};/* ver1 *//* FY table */struct upower_tbl upower_tbl_ll_1_FY = { .row = { {.cap = 100, .volt = 75000, .dyn_pwr = 9994, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} }, {.cap = 126, .volt = 75000, .dyn_pwr = 12585, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} }, {.cap = 148, .volt = 75000, .dyn_pwr = 14806, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} }, {.cap = 167, .volt = 75000, .dyn_pwr = 16656, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} }, {.cap = 189, .volt = 75000, .dyn_pwr = 18877, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} }, {.cap = 212, .volt = 75000, .dyn_pwr = 21098, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} }, {.cap = 230, .volt = 75700, .dyn_pwr = 23379, .lkg_pwr = {13936, 13936, 13936, 13936, 13936, 13936} }, {.cap = 245, .volt = 78100, .dyn_pwr = 26490, .lkg_pwr = {14811, 14811, 14811, 14811, 14811, 14811} }, {.cap = 263, .volt = 81100, .dyn_pwr = 30729, .lkg_pwr = {15958, 15958, 15958, 15958, 15958, 15958} }, {.cap = 278, .volt = 83500, .dyn_pwr = 34409, .lkg_pwr = {16949, 16949, 16949, 16949, 16949, 16949} }, {.cap = 293, .volt = 86000, .dyn_pwr = 38447, .lkg_pwr = {18036, 18036, 18036, 18036, 18036, 18036} }, {.cap = 304, .volt = 88400, .dyn_pwr = 42166, .lkg_pwr = {19159, 19159, 19159, 19159, 19159, 19159} }, {.cap = 319, .volt = 90800, .dyn_pwr = 46657, .lkg_pwr = {20333, 20333, 20333, 20333, 20333, 20333} }, {.cap = 334, .volt = 93200, .dyn_pwr = 51442, .lkg_pwr = {21605, 21605, 21605, 21605, 21605, 21605} }, {.cap = 345, .volt = 95000, .dyn_pwr = 55230, .lkg_pwr = {22560, 22560, 22560, 22560, 22560, 22560} }, {.cap = 356, .volt = 97400, .dyn_pwr = 59928, .lkg_pwr = {24002, 24002, 24002, 24002, 24002, 24002} }, }, .lkg_idx = DEFAULT_LKG_IDX, .row_num = UPOWER_OPP_NUM, .nr_idle_states = NR_UPOWER_CSTATES, .idle_states = { {{0}, {7321} }, {{0}, {7321} }, {{0}, {7321} }, {{0}, {7321} }, {{0}, {7321} }, {{0}, {7321} }, },};struct upower_tbl upower_tbl_cluster_ll_1_FY = { .row = { {.cap = 100, .volt = 75000, .dyn_pwr = 3656, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} }, {.cap = 126, .volt = 75000, .dyn_pwr = 4604, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} }, {.cap = 148, .volt = 75000, .dyn_pwr = 5417, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} }, {.cap = 167, .volt = 75000, .dyn_pwr = 6094, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} }, {.cap = 189, .volt = 75000, .dyn_pwr = 6906, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} }, {.cap = 212, .volt = 75000, .dyn_pwr = 7719, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} }, {.cap = 230, .volt = 75700, .dyn_pwr = 8553, .lkg_pwr = {22134, 22134, 22134, 22134, 22134, 22134} }, {.cap = 245, .volt = 78100, .dyn_pwr = 9692, .lkg_pwr = {23523, 23523, 23523, 23523, 23523, 23523} }, {.cap = 263, .volt = 81100, .dyn_pwr = 11242, .lkg_pwr = {25344, 
25344, 25344, 25344, 25344, 25344} }, {.cap = 278, .volt = 83500, .dyn_pwr = 12589, .lkg_pwr = {26919, 26919, 26919, 26919, 26919, 26919} }, {.cap = 293, .volt = 86000, .dyn_pwr = 14066, .lkg_pwr = {28646, 28646, 28646, 28646, 28646, 28646} }, {.cap = 304, .volt = 88400, .dyn_pwr = 15427, .lkg_pwr = {30430, 30430, 30430, 30430, 30430, 30430} }, {.cap = 319, .volt = 90800, .dyn_pwr = 17069, .lkg_pwr = {32293, 32293, 32293, 32293, 32293, 32293} }, {.cap = 334, .volt = 93200, .dyn_pwr = 18820, .lkg_pwr = {34314, 34314, 34314, 34314, 34314, 34314} }, {.cap = 345, .volt = 95000, .dyn_pwr = 20206, .lkg_pwr = {35830, 35830, 35830, 35830, 35830, 35830} }, {.cap = 356, .volt = 97400, .dyn_pwr = 21925, .lkg_pwr = {38121, 38121, 38121, 38121, 38121, 38121} }, }, .lkg_idx = DEFAULT_LKG_IDX, .row_num = UPOWER_OPP_NUM, .nr_idle_states = NR_UPOWER_CSTATES, .idle_states = { {{0}, {11628} }, {{0}, {11628} }, {{0}, {11628} }, {{0}, {11628} }, {{0}, {11628} }, {{0}, {11628} }, },};struct upower_tbl upower_tbl_l_1_FY = { .row = { {.cap = 116, .volt = 75000, .dyn_pwr = 16431, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} }, {.cap = 152, .volt = 75000, .dyn_pwr = 21486, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} }, {.cap = 179, .volt = 75000, .dyn_pwr = 25278, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} }, {.cap = 201, .volt = 75000, .dyn_pwr = 28437, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} }, {.cap = 228, .volt = 75000, .dyn_pwr = 32229, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} }, {.cap = 255, .volt = 75000, .dyn_pwr = 36021, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} }, {.cap = 282, .volt = 75700, .dyn_pwr = 40559, .lkg_pwr = {23423, 23423, 23423, 23423, 23423, 23423} }, {.cap = 304, .volt = 78100, .dyn_pwr = 46598, .lkg_pwr = {24968, 24968, 24968, 24968, 24968, 24968} }, {.cap = 331, .volt = 81100, .dyn_pwr = 54680, .lkg_pwr = {26999, 26999, 26999, 26999, 26999, 26999} }, {.cap = 349, .volt = 83500, .dyn_pwr = 61098, .lkg_pwr = {28760, 28760, 28760, 28760, 28760, 28760} }, {.cap = 371, .volt = 86000, .dyn_pwr = 68965, .lkg_pwr = {30698, 30698, 30698, 30698, 30698, 30698} }, {.cap = 393, .volt = 88400, .dyn_pwr = 77258, .lkg_pwr = {32706, 32706, 32706, 32706, 32706, 32706} }, {.cap = 416, .volt = 90800, .dyn_pwr = 86141, .lkg_pwr = {34808, 34808, 34808, 34808, 34808, 34808} }, {.cap = 438, .volt = 93200, .dyn_pwr = 95634, .lkg_pwr = {37097, 37097, 37097, 37097, 37097, 37097} }, {.cap = 452, .volt = 95000, .dyn_pwr = 102406, .lkg_pwr = {38814, 38814, 38814, 38814, 38814, 38814} }, {.cap = 474, .volt = 97400, .dyn_pwr = 112974, .lkg_pwr = {41424, 41424, 41424, 41424, 41424, 41424} }, }, .lkg_idx = DEFAULT_LKG_IDX, .row_num = UPOWER_OPP_NUM, .nr_idle_states = NR_UPOWER_CSTATES, .idle_states = { {{0}, {11926} }, {{0}, {11926} }, {{0}, {11926} }, {{0}, {11926} }, {{0}, {11926} }, {{0}, {11926} }, },};struct upower_tbl upower_tbl_cluster_l_1_FY = { .row = { {.cap = 116, .volt = 75000, .dyn_pwr = 2778, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} }, {.cap = 152, .volt = 75000, .dyn_pwr = 3633, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} }, {.cap = 179, .volt = 75000, .dyn_pwr = 4274, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} }, {.cap = 201, .volt = 75000, .dyn_pwr = 4808, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} }, {.cap = 228, .volt = 75000, .dyn_pwr = 5449, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} }, {.cap = 255, .volt = 75000, .dyn_pwr = 6090, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 
26537} }, {.cap = 282, .volt = 75700, .dyn_pwr = 6857, .lkg_pwr = {27058, 27058, 27058, 27058, 27058, 27058} }, {.cap = 304, .volt = 78100, .dyn_pwr = 7878, .lkg_pwr = {28843, 28843, 28843, 28843, 28843, 28843} }, {.cap = 331, .volt = 81100, .dyn_pwr = 9245, .lkg_pwr = {31188, 31188, 31188, 31188, 31188, 31188} }, {.cap = 349, .volt = 83500, .dyn_pwr = 10330, .lkg_pwr = {33223, 33223, 33223, 33223, 33223, 33223} }, {.cap = 371, .volt = 86000, .dyn_pwr = 11660, .lkg_pwr = {35461, 35461, 35461, 35461, 35461, 35461} }, {.cap = 393, .volt = 88400, .dyn_pwr = 13062, .lkg_pwr = {37781, 37781, 37781, 37781, 37781, 37781} }, {.cap = 416, .volt = 90800, .dyn_pwr = 14564, .lkg_pwr = {40209, 40209, 40209, 40209, 40209, 40209} }, {.cap = 438, .volt = 93200, .dyn_pwr = 16169, .lkg_pwr = {42854, 42854, 42854, 42854, 42854, 42854} }, {.cap = 452, .volt = 95000, .dyn_pwr = 17314, .lkg_pwr = {44837, 44837, 44837, 44837, 44837, 44837} }, {.cap = 474, .volt = 97400, .dyn_pwr = 19101, .lkg_pwr = {47852, 47852, 47852, 47852, 47852, 47852} }, }, .lkg_idx = DEFAULT_LKG_IDX, .row_num = UPOWER_OPP_NUM, .nr_idle_states = NR_UPOWER_CSTATES, .idle_states = { {{0}, {13776} }, {{0}, {13776} }, {{0}, {13776} }, {{0}, {13776} }, {{0}, {13776} }, {{0}, {13776} }, },};struct upower_tbl upower_tbl_b_1_FY = { .row = { {.cap = 211, .volt = 75000, .dyn_pwr = 61732, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} }, {.cap = 268, .volt = 75000, .dyn_pwr = 78352, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} }, {.cap = 317, .volt = 75000, .dyn_pwr = 92598, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} }, {.cap = 358, .volt = 75000, .dyn_pwr = 104469, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} }, {.cap = 406, .volt = 75000, .dyn_pwr = 118715, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} }, {.cap = 447, .volt = 75000, .dyn_pwr = 130587, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} }, {.cap = 504, .volt = 75700, .dyn_pwr = 149968, .lkg_pwr = {72438, 72438, 72438, 72438, 72438, 72438} }, {.cap = 561, .volt = 78100, .dyn_pwr = 177650, .lkg_pwr = {76806, 76806, 76806, 76806, 76806, 76806} }, {.cap = 634, .volt = 81100, .dyn_pwr = 216546, .lkg_pwr = {82521, 82521, 82521, 82521, 82521, 82521} }, {.cap = 691, .volt = 83500, .dyn_pwr = 250153, .lkg_pwr = {87447, 87447, 87447, 87447, 87447, 87447} }, {.cap = 748, .volt = 86000, .dyn_pwr = 287210, .lkg_pwr = {92841, 92841, 92841, 92841, 92841, 92841} }, {.cap = 805, .volt = 88400, .dyn_pwr = 326553, .lkg_pwr = {98397, 98397, 98397, 98397, 98397, 98397} }, {.cap = 861, .volt = 90800, .dyn_pwr = 368886, .lkg_pwr = {104190, 104190, 104190, 104190, 104190, 104190} }, {.cap = 918, .volt = 93200, .dyn_pwr = 414309, .lkg_pwr = {110456, 110456, 110456, 110456, 110456, 110456} }, {.cap = 959, .volt = 95000, .dyn_pwr = 449514, .lkg_pwr = {115156, 115156, 115156, 115156, 115156, 115156} }, {.cap = 1024, .volt = 97400, .dyn_pwr = 504548, .lkg_pwr = {122224, 122224, 122224, 122224, 122224, 122224} }, }, .lkg_idx = DEFAULT_LKG_IDX, .row_num = UPOWER_OPP_NUM, .nr_idle_states = NR_UPOWER_CSTATES, .idle_states = { {{0}, {38992} }, {{0}, {38992} }, {{0}, {38992} }, {{0}, {38992} }, {{0}, {38992} }, {{0}, {38992} }, },};struct upower_tbl upower_tbl_cluster_b_1_FY = { .row = { {.cap = 211, .volt = 75000, .dyn_pwr = 6408, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} }, {.cap = 268, .volt = 75000, .dyn_pwr = 8133, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} }, {.cap = 317, .volt = 75000, .dyn_pwr = 9612, .lkg_pwr = {27561, 27561, 27561, 27561, 
27561, 27561} }, {.cap = 358, .volt = 75000, .dyn_pwr = 10844, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} }, {.cap = 406, .volt = 75000, .dyn_pwr = 12323, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} }, {.cap = 447, .volt = 75000, .dyn_pwr = 13555, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} }, {.cap = 504, .volt = 75700, .dyn_pwr = 15567, .lkg_pwr = {28054, 28054, 28054, 28054, 28054, 28054} }, {.cap = 561, .volt = 78100, .dyn_pwr = 18440, .lkg_pwr = {29746, 29746, 29746, 29746, 29746, 29746} }, {.cap = 634, .volt = 81100, .dyn_pwr = 22478, .lkg_pwr = {31959, 31959, 31959, 31959, 31959, 31959} }, {.cap = 691, .volt = 83500, .dyn_pwr = 25966, .lkg_pwr = {33867, 33867, 33867, 33867, 33867, 33867} }, {.cap = 748, .volt = 86000, .dyn_pwr = 29813, .lkg_pwr = {35956, 35956, 35956, 35956, 35956, 35956} }, {.cap = 805, .volt = 88400, .dyn_pwr = 33897, .lkg_pwr = {38108, 38108, 38108, 38108, 38108, 38108} }, {.cap = 861, .volt = 90800, .dyn_pwr = 38291, .lkg_pwr = {40351, 40351, 40351, 40351, 40351, 40351} }, {.cap = 918, .volt = 93200, .dyn_pwr = 43006, .lkg_pwr = {42778, 42778, 42778, 42778, 42778, 42778} }, {.cap = 959, .volt = 95000, .dyn_pwr = 46661, .lkg_pwr = {44598, 44598, 44598, 44598, 44598, 44598} }, {.cap = 1024, .volt = 97400, .dyn_pwr = 52373, .lkg_pwr = {47335, 47335, 47335, 47335, 47335, 47335} }, }, .lkg_idx = DEFAULT_LKG_IDX, .row_num = UPOWER_OPP_NUM, .nr_idle_states = NR_UPOWER_CSTATES, .idle_states = { {{0}, {15101} }, {{0}, {15101} }, {{0}, {15101} }, {{0}, {15101} }, {{0}, {15101} }, {{0}, {15101} }, },};struct upower_tbl upower_tbl_cci_1_FY = { .row = { {.cap = 0, .volt = 75000, .dyn_pwr = 2708, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} }, {.cap = 0, .volt = 75000, .dyn_pwr = 3611, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} }, {.cap = 0, .volt = 75000, .dyn_pwr = 4288, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} }, {.cap = 0, .volt = 75000, .dyn_pwr = 5191, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} }, {.cap = 0, .volt = 75000, .dyn_pwr = 5868, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} }, {.cap = 0, .volt = 75000, .dyn_pwr = 6771, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} }, {.cap = 0, .volt = 75700, .dyn_pwr = 7588, .lkg_pwr = {16537, 16537, 16537, 16537, 16537, 16537} }, {.cap = 0, .volt = 78100, .dyn_pwr = 8811, .lkg_pwr = {17527, 17527, 17527, 17527, 17527, 17527} }, {.cap = 0, .volt = 81100, .dyn_pwr = 10292, .lkg_pwr = {18822, 18822, 18822, 18822, 18822, 18822} }, {.cap = 0, .volt = 83500, .dyn_pwr = 11750, .lkg_pwr = {19938, 19938, 19938, 19938, 19938, 19938} }, {.cap = 0, .volt = 86000, .dyn_pwr = 13354, .lkg_pwr = {21159, 21159, 21159, 21159, 21159, 21159} }, {.cap = 0, .volt = 88400, .dyn_pwr = 14737, .lkg_pwr = {22417, 22417, 22417, 22417, 22417, 22417} }, {.cap = 0, .volt = 90800, .dyn_pwr = 16540, .lkg_pwr = {23728, 23728, 23728, 23728, 23728, 23728} }, {.cap = 0, .volt = 93200, .dyn_pwr = 18472, .lkg_pwr = {25145, 25145, 25145, 25145, 25145, 25145} }, {.cap = 0, .volt = 95000, .dyn_pwr = 19916, .lkg_pwr = {26208, 26208, 26208, 26208, 26208, 26208} }, {.cap = 0, .volt = 97400, .dyn_pwr = 22077, .lkg_pwr = {27805, 27805, 27805, 27805, 27805, 27805} }, }, .lkg_idx = DEFAULT_LKG_IDX, .row_num = UPOWER_OPP_NUM, .nr_idle_states = NR_UPOWER_CSTATES, .idle_states = { {{0}, {8938} }, {{0}, {8938} }, {{0}, {8938} }, {{0}, {8938} }, {{0}, {8938} }, {{0}, {8938} }, },};
4.1.2 Implementation of SMP Load Balancing
Load balancing depends on many parameters; the most important ones are listed below:
| member | structure | meaning | updated/read by | calculation |
|---|---|---|---|---|
| rq->cpu_capacity_orig | rq | total compute capacity of this cpu | init_sched_groups_capacity()/update_sd_lb_stats() -> update_group_capacity() -> update_cpu_capacity() | capacity = arch_scale_cpu_capacity(sd, cpu) |
| rq->cpu_capacity | rq | cfs compute capacity of this cpu = total capacity - capacity consumed by rt | init_sched_groups_capacity()/update_sd_lb_stats() -> update_group_capacity() -> update_cpu_capacity() | capacity *= scale_rt_capacity(cpu) |
| rq->rd->max_cpu_capacity | rq->rd | largest cpu capacity in the root_domain | init_sched_groups_capacity()/update_sd_lb_stats() -> update_group_capacity() -> update_cpu_capacity() | |
| rq->rd->overutilized | rq->rd | | update_sd_lb_stats() | |
| rq->rd->overload | rq->rd | | update_sd_lb_stats() | |
| rq->rt_avg | rq | average rt load of this cpu | weighted_cpuload() -> cfs_rq_runnable_load_avg() | |
| rq->cfs.runnable_load_avg | rq->cfs (cfs_rq) | runnable average load of this cpu's cfs_rq | __update_load_avg(), cfs_rq_load_avg() | (runnable time * freq * weight) / LOAD_AVG_MAX |
| rq->cfs.avg.load_avg | rq->cfs.avg | runnable average load of this cpu's cfs_rq | __update_load_avg() | (runnable time * freq * weight) / LOAD_AVG_MAX |
| rq->cfs.avg.loadwop_avg | rq->cfs.avg | runnable average load of this cpu's cfs_rq, without weight | __update_load_avg() | (runnable time * freq) / LOAD_AVG_MAX |
| rq->cfs.avg.util_avg | rq->cfs.avg | running load of this cpu's cfs_rq | __update_load_avg(), cpu_util() -> __cpu_util() | (running time * freq * capacity) / LOAD_AVG_MAX |
| cfs_rq->nr_running | cfs_rq | number of runnable se at this cfs_rq level | enqueue_entity()/dequeue_entity() -> account_entity_enqueue() | |
| cfs_rq->h_nr_running | cfs_rq | sum of nr_running of this cfs_rq and all of its child cfs_rq | enqueue_task_fair()/dequeue_task_fair() | |
| rq->nr_running | rq | number of all runnable se on this cpu's rq, including all child cfs_rq | enqueue_task_fair()/dequeue_task_fair() -> add_nr_running() | |
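The load_avg/util_avg signals in the table are PELT-style geometrically decayed sums: each elapsed 1024us period is decayed by a factor y with y^32 = 0.5, and the running sum converges to LOAD_AVG_MAX (about 47742). The snippet below is a simplified floating-point illustration of that accumulation, not the kernel's actual __update_load_avg() (which works in fixed point and handles partial periods):

```c
#include <stdio.h>
#include <math.h>

/* Simplified PELT illustration: decay factor y with y^32 = 0.5,
 * contribution of 1024 per fully-runnable 1024us period. */
int main(void)
{
	const double y = pow(0.5, 1.0 / 32.0);	/* ~0.97857 */
	double sum = 0.0;

	for (int period = 1; period <= 345; period++) {
		sum = sum * y + 1024;		/* task runnable the whole period */
		if (period % 64 == 0)
			printf("after %3d periods: sum = %.0f\n", period, sum);
	}
	/* sum approaches 1024 / (1 - y) ~= 47742 == LOAD_AVG_MAX, so
	 * load_avg = sum * weight / LOAD_AVG_MAX saturates at the weight. */
	return 0;
}
```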
MTK defines three power modes around EAS: EAS mode (energy_aware()), HMP mode (sched_feat(SCHED_HMP)), and hybrid_support() (EAS and HMP coexisting).
In hybrid_support() mode, ordinary load balancing is left to EAS; once cpu_rq(cpu)->rd->overutilized signals that the load is already badly unbalanced, it is handed to HMP.
In scheduler_tick() the kernel periodically checks whether the SMP load-balance interval has expired; when it has, the SCHED_SOFTIRQ softirq is raised:
void scheduler_tick(void)
{
#ifdef CONFIG_SMP
	rq->idle_balance = idle_cpu(cpu);
	trigger_load_balance(rq);
#endif
}

|→

/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 */
void trigger_load_balance(struct rq *rq)
{
	/* Don't need to rebalance while attached to NULL domain */
	if (unlikely(on_null_domain(rq)))
		return;

	if (time_after_eq(jiffies, rq->next_balance))
		raise_softirq(SCHED_SOFTIRQ);

#ifdef CONFIG_NO_HZ_COMMON
	if (nohz_kick_needed(rq))
		nohz_balancer_kick();
#endif
}
The body of the SCHED_SOFTIRQ softirq is run_rebalance_domains():
__init void init_sched_fair_class(void)
{
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
}

/*
 * run_rebalance_domains is triggered when needed from the scheduler tick.
 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
 */
static void run_rebalance_domains(struct softirq_action *h)
{
	struct rq *this_rq = this_rq();
	enum cpu_idle_type idle = this_rq->idle_balance ?
						CPU_IDLE : CPU_NOT_IDLE;
	int this_cpu = smp_processor_id();

	/* bypass load balance of HMP if EAS consideration */
	if ((!energy_aware() && sched_feat(SCHED_HMP)) ||
			(hybrid_support() && cpu_rq(this_cpu)->rd->overutilized))
		hmp_force_up_migration(this_cpu);

	/*
	 * If this cpu has a pending nohz_balance_kick, then do the
	 * balancing on behalf of the other idle cpus whose ticks are
	 * stopped. Do nohz_idle_balance *before* rebalance_domains to
	 * give the idle cpus a chance to load balance. Else we may
	 * load balance only within the local sched_domain hierarchy
	 * and abort nohz_idle_balance altogether if we pull some load.
	 */
	nohz_idle_balance(this_rq, idle);
	rebalance_domains(this_rq, idle);
}
Now let's analyse the core function, rebalance_domains().
One point worth stressing up front: the scheduler tracks three kinds of load (load_avg, loadwop_avg, util_avg). rebalance_domains() mainly works with load_avg, converting it to a capacity-relative value by scaling with (SCHED_CAPACITY_SCALE/capacity).
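As a quick illustration of that conversion, a minimal sketch (norm_avg_load() and all numbers below are mine, not the kernel's): it mirrors the sgs->avg_load = group_load * SCHED_CAPACITY_SCALE / group_capacity step performed in update_sg_lb_stats()/find_busiest_group(), which is what makes a small-capacity cluster look "busier" than a big one at the same absolute load.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

/* Toy version of the capacity-relative load used by the balancer:
 * avg_load = group_load * SCHED_CAPACITY_SCALE / group_capacity */
static unsigned long norm_avg_load(unsigned long group_load,
				   unsigned long group_capacity)
{
	return group_load * SCHED_CAPACITY_SCALE / group_capacity;
}

int main(void)
{
	/* made-up numbers: little cluster capacity 512, big cluster 1024 */
	unsigned long little = norm_avg_load(400, 512);		/* -> 800 */
	unsigned long big    = norm_avg_load(500, 1024);	/* -> 500 */

	/* despite carrying less absolute load, the little cluster is "busier" */
	printf("little avg_load=%lu, big avg_load=%lu\n", little, big);
	return 0;
}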
- 1、Walk this cpu's sched_domains level by level and check whether each sd's balance interval has expired; if it has, run load_balance(). The intervals are listed below (a sketch of the interval computation follows this list);

| tl level | cpu busy? | sd->balance_interval | sd->busy_factor | sd balance interval |
|---|---|---|---|---|
| MC | idle | 4 | 1 | 4ms |
| MC | busy | 4 | 32 | 128ms |
| DIE | idle | 8 | 1 | 8ms |
| DIE | busy | 8 | 32 | 256ms |
| | | | | rq->next_balance = min(of the above) |
- 2、In load_balance(), should_we_balance() decides whether this cpu, at this sd level, is currently suitable to act as dst_cpu. It qualifies only if it is the first idle cpu of its sched_group, or the group's first cpu. The dst_cpu is the destination that pulls tasks from heavily loaded cpus; if this cpu does not qualify, the operation is aborted;
- 3、find_busiest_group() then searches the sched_group list for the most loaded sg; the core computation is in update_sd_lb_stats()/update_sg_lb_stats(). If the local_group containing dst_cpu is more loaded than the busiest sg, or more loaded than the sds average, the operation is aborted. Otherwise the load to migrate is computed as env->imbalance = min((sds->avg - local), (busiest - sds->avg));
- 3.1、Based on the current cpu's idle state, pick the index used when reading the cpu load from rq->cpu_load[]:
- 3.2、Compute each sg's statistics (sgs) and pick the sg with the largest sgs->avg_load as the busiest_group. The key values are computed as follows:
- 3.3、While computing the sg statistics, the key state flags are derived as follows:
The ratio parameters imbalance_pct and capacity_margin take the following values:
- 3.4、Compute env->imbalance, the amount of load the rebalance has to migrate (a small worked example follows the code listing below):
- 4、find_busiest_queue() then looks for the most loaded cpu inside the busiest sg.
- 4.1、It picks the cpu with the largest weighted_cpuload()/capacity_of() ratio, i.e. the highest load relative to its compute capacity:
- 5、Migrate load from the busiest cpu to the local dst cpu, up to env->imbalance worth of load: detach_tasks() -> attach_tasks();
- 6、Handle the cases where, due to task affinity, the busiest cpu cannot hand over enough tasks: LBF_DST_PINNED retries with another cpu of the local sg as dst_cpu; LBF_SOME_PINNED marks that this level could not fully balance and nudges the parent sd to balance; LBF_ALL_PINNED means not a single task could be moved, so the busiest cpu is dropped from the cpu mask and load_balance() is redone;
- 7、If after all these attempts not a single task has been migrated, make one final attempt with active_balance;
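The interval column in the table of step 1 can be reproduced in a few lines; sd_interval() below is my own simplified stand-in for the kernel's get_sd_balance_interval(), which additionally converts the result to jiffies and clamps it:

#include <stdio.h>

/* Simplified model of get_sd_balance_interval():
 * interval = sd->balance_interval, multiplied by sd->busy_factor
 * when the cpu is busy (the kernel then converts ms to jiffies
 * and clamps the result). */
static unsigned long sd_interval(unsigned long balance_interval_ms,
				 unsigned long busy_factor, int cpu_busy)
{
	unsigned long interval = balance_interval_ms;

	if (cpu_busy)
		interval *= busy_factor;

	return interval;
}

int main(void)
{
	printf("MC  idle: %lums\n", sd_interval(4, 32, 0));	/* 4ms   */
	printf("MC  busy: %lums\n", sd_interval(4, 32, 1));	/* 128ms */
	printf("DIE idle: %lums\n", sd_interval(8, 32, 0));	/* 8ms   */
	printf("DIE busy: %lums\n", sd_interval(8, 32, 1));	/* 256ms */
	return 0;
}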
/* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. * * Balancing parameters are set up in init_sched_domains. * Balance的参数是在sched_domains初始化时设置的 */static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle){ int continue_balancing = 1; int cpu = rq->cpu; unsigned long interval; struct sched_domain *sd; /* 默认本cpu rq下一次的balance时间为60s以后 */ /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; int need_serialize, need_decay = 0; u64 max_cost = 0; /* (1) 更新cpu rq中所有cfs_rq的最新负载 */ update_blocked_averages(cpu); rcu_read_lock(); /* (2) 对本cpu每个层次的schedule_domain进行扫描 */ for_each_domain(cpu, sd) { /* (3) 以1HZ的频率对sd->max_newidle_lb_cost进行老化, 老化公式: new = old * (253/256) */ /* * Decay the newidle max times here because this is a regular * visit to all the domains. Decay ~1% per second. */ if (time_after(jiffies, sd->next_decay_max_lb_cost)) { sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256; sd->next_decay_max_lb_cost = jiffies + HZ; need_decay = 1; } max_cost += sd->max_newidle_lb_cost; if (!(sd->flags & SD_LOAD_BALANCE)) continue;#ifndef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT /* nohz CPU need GTS balance to migrate tasks for more than 2 clusters*/ /* Don't consider GTS balance if hybrid support */ if (hybrid_support()) { if (sd->child || (!sd->child && (rcu_dereference(per_cpu(sd_scs, cpu)) == NULL))) continue; }#endif /* (4) 如果continue_balancing = 0,指示停止当前层级的load balance 因为shed_group中其他的cpu正在这个层次做load_balance */ /* * Stop the load balance at this level. There is another * CPU in our sched group which is doing load balancing more * actively. */ if (!continue_balancing) { if (need_decay) continue; break; } /* (5) 计算当前层次schedule_domain的balance间隔时间 */ interval = get_sd_balance_interval(sd, idle != CPU_IDLE); /* (6) 如果需要串行化(SD_SERIALIZE),做balance之前需要持锁 */ need_serialize = sd->flags & SD_SERIALIZE; if (need_serialize) { if (!spin_trylock(&balancing)) goto out; } /* (7) 如果本sd的balance间隔时间已到,进行实际的load_balance() */ if (time_after_eq(jiffies, sd->last_balance + interval)) { if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { /* * The LBF_DST_PINNED logic could have changed * env->dst_cpu, so we can't know our idle * state even if we migrated tasks. Update it. */ idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; } sd->last_balance = jiffies; interval = get_sd_balance_interval(sd, idle != CPU_IDLE); } if (need_serialize) spin_unlock(&balancing);out: /* (8) 如果sd下一次balance时间在,rq的balance时间之前,需要更新rq的balance时间 rq的下一次balance时间:next_balance (默认是60s后) 本sd的下一次balance时间:sd->last_balance + interval rq的下一次balance时间需要选取多个sd中时间最近的一个 */ if (time_after(next_balance, sd->last_balance + interval)) { next_balance = sd->last_balance + interval; update_next_balance = 1; } } if (need_decay) { /* * Ensure the rq-wide value also decays but keep it at a * reasonable floor to avoid funnies with rq->avg_idle. */ rq->max_idle_balance_cost = max((u64)sysctl_sched_migration_cost, max_cost); } rcu_read_unlock(); /* (8.1) 更新rq的balance时间 */ /* * next_balance will be updated only when there is a need. * When the cpu is attached to null domain for ex, it will not be * updated. */ if (likely(update_next_balance)) { rq->next_balance = next_balance;#ifdef CONFIG_NO_HZ_COMMON /* * If this CPU has been elected to perform the nohz idle * balance. Other idle CPUs have already rebalanced with * nohz_idle_balance() and nohz.next_balance has been * updated accordingly. 
This CPU is now running the idle load * balance for itself and we need to update the * nohz.next_balance accordingly. */ if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance)) nohz.next_balance = rq->next_balance;#endif }}|→static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *continue_balancing){ int ld_moved, cur_ld_moved, active_balance = 0; struct sched_domain *sd_parent = sd->parent; struct sched_group *group; struct rq *busiest; unsigned long flags; struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); /* (7.1) 构造Load_balance需要的数据结构: .sd = sd, //本cpu在本tl层次的sd .dst_cpu = this_cpu, // 目的cpu是本cpu .dst_rq = this_rq, // 目的rq是本cpu的rq // load_balance的目的是找出负载最重的cpu,并将一部分负载迁移到本cpu上 */ struct lb_env env = { .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, .dst_grpmask = sched_group_cpus(sd->groups), .idle = idle, .loop_break = sched_nr_migrate_break, .cpus = cpus, .fbq_type = all, .tasks = LIST_HEAD_INIT(env.tasks), }; /* * For NEWLY_IDLE load_balancing, we don't need to consider * other cpus in our group */ if (idle == CPU_NEWLY_IDLE) env.dst_grpmask = NULL; cpumask_copy(cpus, cpu_active_mask); schedstat_inc(sd, lb_count[idle]);redo: /* (7.2) check当前cpu是否适合作为dst_cpu(即light cpu,需要分担其他cpu的负载) */ if (!should_we_balance(&env)) { *continue_balancing = 0; goto out_balanced; } /* (7.3) 找出本层级sched_group链表中,负载最重的(busiest)的sched_group */ group = find_busiest_group(&env); if (!group) { schedstat_inc(sd, lb_nobusyg[idle]); goto out_balanced; } /* (7.4) 找出busiest sched_group中sched_group的rq,即负载最重cpu对应的rq */ busiest = find_busiest_queue(&env, group); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; } BUG_ON(busiest == env.dst_rq); schedstat_add(sd, lb_imbalance[idle], env.imbalance); env.src_cpu = busiest->cpu; env.src_rq = busiest; ld_moved = 0; /* (7.5) 判断busiest cpu rq中的runnable进程数 > 1? 至少有进程可以迁移走 */ if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found * an imbalance but busiest->nr_running <= 1, the group is * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ env.flags |= LBF_ALL_PINNED; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);more_balance: raw_spin_lock_irqsave(&busiest->lock, flags); /* (7.6) 从busiest rq中detach进程, env->imbalance:需要迁移的负载大小 cur_ld_moved:实际迁移的进程数 */ /* * cur_ld_moved - load moved in current iteration * ld_moved - cumulative load moved across iterations */ cur_ld_moved = detach_tasks(&env); /* (7.7) busiest cpu负载减轻后, 在sched_freq中判断cpu频率是否可以调低 */ /* * We want to potentially lower env.src_cpu's OPP. */ if (cur_ld_moved) update_capacity_of(env.src_cpu, SCHE_ONESHOT); /* * We've detached some tasks from busiest_rq. Every * task is masked "TASK_ON_RQ_MIGRATING", so we can safely * unlock busiest->lock, and we are able to be sure * that nobody can manipulate the tasks in parallel. * See task_rq_lock() family for the details. 
*/ raw_spin_unlock(&busiest->lock); /* (7.8) 把迁移过来的任务attack到dest_cpu上 */ if (cur_ld_moved) { attach_tasks(&env); ld_moved += cur_ld_moved; } local_irq_restore(flags); /* (7.9) LBF_NEED_BREAK设置,说明balance还没有完成,循环只是出来休息一下, 继续重新balance */ if (env.flags & LBF_NEED_BREAK) { env.flags &= ~LBF_NEED_BREAK; goto more_balance; } /* (7.10) 设置了LBF_DST_PINNED标志,并且env.imbalance > 0 说明src_cpu上有些进程因为affinity的原因不能迁移到dst_cpu但是能迁移到同sg的new_dst_cpu上 把dst_cpu更改为new_dst_cpu,重新开始balance流程 */ /* * Revisit (affine) tasks on src_cpu that couldn't be moved to * us and move them to an alternate dst_cpu in our sched_group * where they can run. The upper limit on how many times we * iterate on same src_cpu is dependent on number of cpus in our * sched_group. * * This changes load balance semantics a bit on who can move * load to a given_cpu. In addition to the given_cpu itself * (or a ilb_cpu acting on its behalf where given_cpu is * nohz-idle), we now have balance_cpu in a position to move * load to given_cpu. In rare situations, this may cause * conflicts (balance_cpu and given_cpu/ilb_cpu deciding * _independently_ and at _same_ time to move some load to * given_cpu) causing exceess load to be moved to given_cpu. * This however should not happen so much in practice and * moreover subsequent load balance cycles should correct the * excess load moved. */ if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { /* Prevent to re-select dst_cpu via env's cpus */ cpumask_clear_cpu(env.dst_cpu, env.cpus); env.dst_rq = cpu_rq(env.new_dst_cpu); env.dst_cpu = env.new_dst_cpu; env.flags &= ~LBF_DST_PINNED; env.loop = 0; env.loop_break = sched_nr_migrate_break; /* * Go back to "more_balance" rather than "redo" since we * need to continue with same src_cpu. */ goto more_balance; } /* (7.11) 设置了LBF_SOME_PINNED标志,说明有些进程因为affinity迁移失败, 设置当前sd的parent sd的 sgc->imbalance,让parent sd做rebalance的概率增高 */ /* * We failed to reach balance because of affinity. */ if (sd_parent) { int *group_imbalance = &sd_parent->groups->sgc->imbalance; if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) *group_imbalance = 1; } /* (7.12) 如果LBF_ALL_PINNED标志一直被置位, 说明busiest_cpu因为affinity没有一个进程迁移成功,哪怕迁移到dst_cpu同sg的其他cpu也没有一个成功 将busiest cpu从全局cpu mask去掉,重新做整个流程:find_busiest_group -> find_busiest_queue -> detach_tasks -> attach_tasks */ /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); if (!cpumask_empty(cpus)) { env.loop = 0; env.loop_break = sched_nr_migrate_break; goto redo; } goto out_all_pinned; } } /* (7.13) 经过几轮的努力尝试,最终迁移的进程数ld_moved还是0,说明balance失败 */ if (!ld_moved) { schedstat_inc(sd, lb_failed[idle]); /* * Increment the failure counter only on periodic balance. * We do not want newidle balance, which can be very * frequent, pollute the failure counter causing * excessive cache_hot migrations and active balances. */ if (idle != CPU_NEWLY_IDLE) if (env.src_grp_nr_running > 1) sd->nr_balance_failed++; /* (7.14) 最后一次尝试迁移一个进程 */ if (need_active_balance(&env)) { raw_spin_lock_irqsave(&busiest->lock, flags); /* (7.15) 如果当前cpu不在busiest->curr进程的affinity之内,返回失败 */ /* don't kick the active_load_balance_cpu_stop, * if the curr task on busiest cpu can't be * moved to this_cpu */ if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(busiest->curr))) { raw_spin_unlock_irqrestore(&busiest->lock, flags); env.flags |= LBF_ALL_PINNED; goto out_one_pinned; } /* * ->active_balance synchronizes accesses to * ->active_balance_work. 
Once set, it's cleared * only after active load balance is finished. */ if (!busiest->active_balance && !cpu_park(cpu_of(busiest))) { busiest->active_balance = 1; /* load_balance */ busiest->push_cpu = this_cpu; active_balance = 1; } raw_spin_unlock_irqrestore(&busiest->lock, flags); /* (7.16) 迁移busiest->curr进程当前期cpu */ if (active_balance) { if (stop_one_cpu_dispatch(cpu_of(busiest), active_load_balance_cpu_stop, busiest, &busiest->active_balance_work)) { raw_spin_lock_irqsave(&busiest->lock, flags); busiest->active_balance = 0; active_balance = 0; raw_spin_unlock_irqrestore(&busiest->lock, flags); } } /* * We've kicked active balancing, reset the failure * counter. */ sd->nr_balance_failed = sd->cache_nice_tries+1; } } else sd->nr_balance_failed = 0; if (likely(!active_balance)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; } else { /* * If we've begun active balancing, start to back off. This * case may not be covered by the all_pinned logic if there * is only 1 task on the busy runqueue (because we don't call * detach_tasks). */ if (sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; } goto out;out_balanced: /* * We reach balance although we may have faced some affinity * constraints. Clear the imbalance flag if it was set. */ if (sd_parent) { int *group_imbalance = &sd_parent->groups->sgc->imbalance; if (*group_imbalance) *group_imbalance = 0; }out_all_pinned: /* * We reach balance because all tasks are pinned at this level so * we can't migrate them. Let the imbalance flag set so parent level * can try to migrate them. */ schedstat_inc(sd, lb_balanced[idle]); sd->nr_balance_failed = 0;out_one_pinned: /* tune up the balancing interval */ if (((env.flags & LBF_ALL_PINNED) && sd->balance_interval < MAX_PINNED_INTERVAL) || (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; ld_moved = 0;out: return ld_moved;}||→static int should_we_balance(struct lb_env *env){ struct sched_group *sg = env->sd->groups; struct cpumask *sg_cpus, *sg_mask; int cpu, balance_cpu = -1; /* (7.2.1) 如果本cpu为CPU_NEWLY_IDLE,直接符合迁移条件 */ /* * In the newly idle case, we will allow all the cpu's * to do the newly idle load balance. */ if (env->idle == CPU_NEWLY_IDLE) return 1; sg_cpus = sched_group_cpus(sg); sg_mask = sched_group_mask(sg); /* (7.2.2) 本sched_group的第一个idle cpu适合做load_balance */ /* Try to find first idle cpu */ for_each_cpu_and(cpu, sg_cpus, env->cpus) { if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu)) continue; balance_cpu = cpu; break; } /* (7.2.3) 没有idle cpu,则选取本sched_group的第一个cpu做load_balance */ if (balance_cpu == -1) balance_cpu = group_balance_cpu(sg); /* (7.2.4) 不满足上述条件的cpu,不适合来启动load_balance */ /* * First idle cpu or the first cpu(busiest) in this sched group * is eligible for doing load balancing at this and above domains. */ return balance_cpu == env->dst_cpu;}||→static struct sched_group *find_busiest_group(struct lb_env *env){ struct sg_lb_stats *local, *busiest; struct sd_lb_stats sds; int local_cpu = 0, busiest_cpu = 0; struct cpumask *busiest_cpumask; int same_clus = 0; init_sd_lb_stats(&sds); /* (7.3.1) 更新本层级sched_group链表中,每个sched_group的负载, 并选出busiest的一个sched_group */ /* * Compute the various statistics relavent for load balancing at * this level. 
*/ update_sd_lb_stats(env, &sds); local = &sds.local_stat; busiest = &sds.busiest_stat; if (sds.busiest) { busiest_cpumask = sched_group_cpus(sds.busiest); local_cpu = env->dst_cpu; busiest_cpu = group_first_cpu(sds.busiest); same_clus = is_the_same_domain(local_cpu, busiest_cpu); mt_sched_printf(sched_lb, "%s: local_cpu=%d, busiest_cpu=%d, busiest_mask=%lu, same_cluster=%d", __func__, local_cpu, busiest_cpu, busiest_cpumask->bits[0], same_clus); } /* (7.3.2) 如果EAS使能,跨cluster的任务迁移使用EAS来做 */ if (energy_aware() && !env->dst_rq->rd->overutilized && !same_clus) goto out_balanced; /* (7.3.3) */ /* ASYM feature bypasses nice load balance check */ if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && check_asym_packing(env, &sds)) return sds.busiest; /* (7.3.4) busiest sg上没有负载,返回空 */ /* There is no busy sibling group to pull tasks from */ if (!sds.busiest || busiest->sum_nr_running == 0) { if (!sds.busiest) mt_sched_printf(sched_lb, "[%s] %d: fail no busiest ", __func__, env->src_cpu); else mt_sched_printf(sched_lb, "[%s] %d: fail busiest no task ", __func__, env->src_cpu); goto out_balanced; } /* (7.3.5) sg链表里的平均负载 */ sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) / sds.total_capacity; /* (7.3.6) 如果busiest sg低一级别的因为cpu affinity没有balance成功,设置了group_imbalanced标志 强制在当前级别上进行balance */ /* * If the busiest group is imbalanced the below checks don't * work because they assume all things are equal, which typically * isn't true due to cpus_allowed constraints and the like. */ if (busiest->group_type == group_imbalanced) goto force_balance; /* (7.3.7) 如果dest cpu/group很闲,busiest负载很重, 强制开展balance */ /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && busiest->group_no_capacity) goto force_balance; /* (7.3.8) 如果dest_cpu所在sg的负载都大于busiest sg的负载, 放弃balance */ /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. */ if (local->avg_load >= busiest->avg_load) goto out_balanced; /* (7.3.9) 如果dest_cpu所在sg的负载都大于sg链表的平均负载, 放弃balance */ /* * Don't pull any tasks if this group is already above the domain * average load. */ if (local->avg_load >= sds.avg_load) goto out_balanced; /* (7.3.10) 如果dest_cpu为idle,但是dest_cpu所在的sg idle cpu数量小于busiest sg的idle cpu数量 放弃balance */#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT if ((env->idle == CPU_IDLE) || (env->idle == CPU_NEWLY_IDLE)) { int i = (env->idle == CPU_IDLE) ? 1:0;#else if (env->idle == CPU_IDLE) {#endif /* * This cpu is idle. If the busiest group is not overloaded * and there is no imbalance between this and busiest group * wrt idle cpus, it is balanced. The imbalance becomes * significant if the diff is greater than 1 otherwise we * might end up to just move the imbalance on another group */#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT if ((busiest->group_type != group_overloaded) && (local->idle_cpus < (busiest->idle_cpus + i)))#else if ((busiest->group_type != group_overloaded) && (local->idle_cpus <= (busiest->idle_cpus + 1)))#endif goto out_balanced; } else { /* (7.3.11) busiest->avg_load大于local->avg_load的比例没有超过env->sd->imbalance_pct 放弃balance */ /* * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use * imbalance_pct to be conservative. */ if (100 * busiest->avg_load <= env->sd->imbalance_pct * local->avg_load) goto out_balanced; }force_balance: /* Looks like there is an imbalance. 
Compute it */ /* (7.3.12) 计算需要迁移的负载值env->imbalance */ calculate_imbalance(env, &sds);#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT env->imbalance = env->imbalance * SCHED_CAPACITY_SCALE / (sds.busiest->sgc->capacity / cpumask_weight(sched_group_cpus(sds.busiest)));#endif return sds.busiest;out_balanced: env->imbalance = 0; return NULL;}|||→static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds){ struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; bool overload = false, overutilized = false; if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; /* (7.3.1.1) 根据idle情况,选择计算cpu负载时的idx, idx:是CPU层级负载this_rq->cpu_load[i]数组的index值 */ load_idx = get_sd_load_idx(env->sd, env->idle); /* (7.3.1.2) 逐个轮询本层级sched_group链表中的每个sched_group */ do { struct sg_lb_stats *sgs = &tmp_sgs; int local_group; /* (7.3.1.3) 如果sg是当前cpu所在的sg,则本sg称为local_group 使用专门的数据结构来存储local_group的信息: sds->local = sg; // 使用sds->local来存储local_group sgs = &sds->local_stat; // 使用sds->local_stat来存储local_group的统计 */ local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); if (local_group) { sds->local = sg; sgs = &sds->local_stat; /* (7.3.1.4) 更新local_group的capacity,更新的周期为sd->balance_interval 主要目的是动态减去RT进程消耗的capacity */ if (env->idle != CPU_NEWLY_IDLE || time_after_eq(jiffies, sg->sgc->next_update)) update_group_capacity(env->sd, env->dst_cpu); } /* (7.3.1.5) 更新当前sched_group的负载统计 sgs:sg统计数据放到sgs当中 overload:rq中runnable的进程>1,那么肯定有进程在等待 overutilized:cpu的capacity < util,运算能力不足 */ update_sg_lb_stats(env, sg, load_idx, local_group, sgs, &overload, &overutilized); /* (7.3.1.6) local_group不参与busiest sg的计算 */ if (local_group) goto next_group; /* (7.3.1.7) 如果设置了SD_PREFER_SIBLING标志,说明local_group希望其他人迁移任务到它身上, 提高其他sg的迁移优先级 */ /* * In case the child domain prefers tasks go to siblings * first, lower the sg capacity so that we'll try * and move all the excess tasks away. We lower the capacity * of a group only if the local group has the capacity to fit * these excess tasks. The extra check prevents the case where * you always pull from the heaviest group when it is already * under-utilized (possible with a large weight task outweighs * the tasks on the system). */ if (prefer_sibling && sds->local && group_has_capacity(env, &sds->local_stat) && (sgs->sum_nr_running > 1)) { sgs->group_no_capacity = 1; sgs->group_type = group_classify(sg, sgs); } /* (7.3.1.8) 根据计算的sgs统计数据,找出busiest sg */ if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = *sgs; }next_group: /* (7.3.1.9) 更新sds中的负载、capacity统计 */ /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; sg = sg->next; } while (sg != env->sd->groups); if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); env->src_grp_nr_running = sds->busiest_stat.sum_nr_running; /* (7.3.1.10) 根据最后一个sg的overload、overutilized值 来更新dst_cpu rq->rd中的对应值 。 ooooo这里是怎么想的?不是local_group,也不是busiest_group,而是最后一个计算的sg!!! 
*/ if (!env->sd->parent) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload; /* Update over-utilization (tipping point, U >= 0) indicator */ if (env->dst_rq->rd->overutilized != overutilized) env->dst_rq->rd->overutilized = overutilized; } else { if (!env->dst_rq->rd->overutilized && overutilized) env->dst_rq->rd->overutilized = true; }}||||→static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs, bool *overload, bool *overutilized){ unsigned long load; int i; memset(sgs, 0, sizeof(*sgs)); /* (7.3.1.5.1) 遍历sched_group中的每个cpu */ for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); /* (7.3.1.5.2) 获取本cpu的负载rq->cpu_load[load_idx-1] */ /* Bias balancing toward cpus of our domain */ if (local_group) /* 如果是local_group,负载往小的取:min(rq->cpu_load[load_idx-1], weighted_cpuload(cpu)) */ load = target_load(i, load_idx); else /* 如果不是local_group,负载往大的取:max(rq->cpu_load[load_idx-1], weighted_cpuload(cpu)) */ load = source_load(i, load_idx);#ifdef CONFIG_MTK_SCHED_INTEROP /* (7.3.1.5.3) 因为rq->cpu_load[]只包含cfs的负载,mtk尝试加上rt部分的负载 ooooo但是rq->cpu_capacity中已经减去了rt的部分,这里是否还需要?? */ load += mt_rt_load(i);#endif /* (7.3.1.5.4) 累加sgs各项值: sgs->group_load // runnable负载带weight分量(cpu_rq(cpu)->cfs.avg.util_avg),经过rq->cpu_load[]计算 sgs->group_util // running负载(cpu_rq(cpu)->cfs.avg.load_avg/cpu_rq(cpu)->cfs.runnable_load_avg) sgs->sum_nr_running // rq中所有se的总和 sgs->sum_weighted_load // runnable负载带weight分量(cpu_rq(cpu)->cfs.avg.util_avg) sgs->idle_cpus // idle状态的cpu计数 */#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT sgs->group_load += (load * capacity_orig_of(i)) >> SCHED_CAPACITY_SHIFT;#else sgs->group_load += load;#endif sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; /* (7.3.1.5.5) 如果rq中进程数量>1,则就会有进程处于runnable状态, overload = true */ if (rq->nr_running > 1) *overload = true;#ifdef CONFIG_NUMA_BALANCING sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running;#endif sgs->sum_weighted_load += weighted_cpuload(i); if (idle_cpu(i)) sgs->idle_cpus++; /* (7.3.1.5.6) cpu的capacity小于cpu的running状态负载, overutilized = true */ if (cpu_overutilized(i)) *overutilized = true; } /* (7.3.1.5.7) 更新汇总后sgs的统计数据: sgs->group_capacity // sgs所有cpu capacity的累加 sgs->avg_load // 按照group_capacity,等比例放大group_load负载,capacity越小avg_load越大 sgs->load_per_task // sgs的平均每个进程的weight负载 sgs->group_weight // sgs的online cpu个数 sgs->group_no_capacity // sgs的capacity已经不够用,赶不上util sgs->group_type // 严重级别 group_overloaded > group_imbalanced > group_other // group_imbalanced: 下一等级的load_balance因为cpu_affinity的原因没有完成 */ /* Adjust by relative CPU capacity of the group */ sgs->group_capacity = group->sgc->capacity; sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; sgs->group_weight = group->group_weight; sgs->group_no_capacity = group_is_overloaded(env, sgs); sgs->group_type = group_classify(group, sgs);}||||→static bool update_sd_pick_busiest(struct lb_env *env, struct sd_lb_stats *sds, struct sched_group *sg, struct sg_lb_stats *sgs){ struct sg_lb_stats *busiest = &sds->busiest_stat;#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT if (sgs->sum_nr_running == 0) { mt_sched_printf(sched_lb_info, "[%s] sgs->sum_nr_running=%d", __func__, sgs->sum_nr_running); return false; }#endif /* (7.3.1.9.1) 如果新的sgs 
group_type大于旧的busiest sgs, 新的sgs更busy */ if (sgs->group_type > busiest->group_type) return true; /* (7.3.1.9.2) 如果新的sgs group_type小于旧的busiest sgs, 旧的sgs更busy */ if (sgs->group_type < busiest->group_type) return false; /* (7.3.1.9.3) 在group_type相同的情况下,比较sgs->avg_load sgs->avg_load = rq->cpu_load[load_idx-1] * (group_load*SCHED_CAPACITY_SCALE / sgs->group_capacity) */ if (sgs->avg_load <= busiest->avg_load) return false; /* (7.3.1.9.4) 如果SD_ASYM_PACKING标志没有置位, 在group_type相同的情况下,sgs->avg_load值较大的为busiest sg */ /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) return true; /* (7.3.1.9.5) ASYM_PACKING的意思是会把负载移到最低序号的cpu上, 如果sg的frist cpu序号 > dst_cpu,则busiest 对个sg的frist cpu序号 > dst_cpu,选择序号小的sg */ /* * ASYM_PACKING needs to move all the work to the lowest * numbered CPUs in the group, therefore mark all groups * higher than ourself as busy. */ if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) { if (!sds->busiest) return true; if (group_first_cpu(sds->busiest) > group_first_cpu(sg)) return true; } /* (7.3.1.9.6) 设置了ASYM_PACKING,且如果sg的frist cpu序号 <= dst_cpu, 返回false */ return false;}|||→static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds){ unsigned long max_pull, load_above_capacity = ~0UL; struct sg_lb_stats *local, *busiest; /* (7.3.12.1) local sgs和busiest sgs */ local = &sds->local_stat; busiest = &sds->busiest_stat; if (busiest->group_type == group_imbalanced) { /* * In the group_imb case we cannot rely on group-wide averages * to ensure cpu-load equilibrium, look at wider averages. XXX */ busiest->load_per_task = min(busiest->load_per_task, sds->avg_load); } /* (7.3.12.2) */ /* * In the presence of smp nice balancing, certain scenarios can have * max load less than avg load(as we skip the groups at or below * its cpu_capacity, while calculating max_load..) */ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { env->imbalance = 0; return fix_small_imbalance(env, sds); } /* (7.3.12.3) */ /* * If there aren't any idle cpus, avoid creating some. */ if (busiest->group_type == group_overloaded && local->group_type == group_overloaded) { load_above_capacity = busiest->sum_nr_running * SCHED_LOAD_SCALE; if (load_above_capacity > busiest->group_capacity) load_above_capacity -= busiest->group_capacity; else load_above_capacity = ~0UL; } /* (7.3.12.4) env->imbalance的值等于min((sds->avg - local), (busiest - sds->avg)) 在local和sds平均值,busiest和sds平均值,两个差值之间选择最小值 */ /* * We're trying to get all the cpus to the average_load, so we don't * want to push ourselves above the average load, nor do we wish to * reduce the max loaded cpu below the average load. At the same time, * we also don't want to reduce the group load below the group capacity * (so that we can implement power-savings policies etc). Thus we look * for the minimum possible imbalance. 
*/ max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); /* How much load to actually move to equalise the imbalance */ env->imbalance = min( max_pull * busiest->group_capacity, (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; /* * if *imbalance is less than the average load per runnable task * there is no guarantee that any tasks will be moved so we'll have * a think about bumping its value to force at least one task to be * moved */ if (env->imbalance < busiest->load_per_task) return fix_small_imbalance(env, sds);}||→static struct rq *find_busiest_queue(struct lb_env *env, struct sched_group *group){ struct rq *busiest = NULL, *rq; unsigned long busiest_load = 0, busiest_capacity = 1; int i; /* (7.4.1) 逐个遍历sg中的cpu */ for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { unsigned long capacity, wl; enum fbq_type rt; rq = cpu_rq(i); rt = fbq_classify_rq(rq); /* * We classify groups/runqueues into three groups: * - regular: there are !numa tasks * - remote: there are numa tasks that run on the 'wrong' node * - all: there is no distinction * * In order to avoid migrating ideally placed numa tasks, * ignore those when there's better options. * * If we ignore the actual busiest queue to migrate another * task, the next balance pass can still reduce the busiest * queue by moving tasks around inside the node. * * If we cannot move enough load due to this classification * the next pass will adjust the group classification and * allow migration of more tasks. * * Both cases only affect the total convergence complexity. */ if (rt > env->fbq_type) continue; /* (7.4.2) 计算出cpu的capacity和weight_load */ capacity = capacity_of(i); wl = weighted_cpuload(i);#ifdef CONFIG_MTK_SCHED_INTEROP wl += mt_rt_load(i);#endif /* * When comparing with imbalance, use weighted_cpuload() * which is not scaled with the cpu capacity. */ if (rq->nr_running == 1 && wl > env->imbalance && !check_cpu_capacity(rq, env->sd)) continue; /* (7.4.3) 选出相对负载最重的cpu */ /* * For the load comparisons with the other cpu's, consider * the weighted_cpuload() scaled with the cpu capacity, so * that the load can be moved away from the cpu that is * potentially running at a lower capacity. * * Thus we're looking for max(wl_i / capacity_i), crosswise * multiplication to rid ourselves of the division works out * to: wl_i * capacity_j > wl_j * capacity_i; where j is * our previous maximum. */ if (wl * busiest_capacity > busiest_load * capacity) { busiest_load = wl; busiest_capacity = capacity; busiest = rq; } } return busiest;}||→static int detach_tasks(struct lb_env *env){ struct list_head *tasks = &env->src_rq->cfs_tasks; struct task_struct *p; unsigned long load; int detached = 0; lockdep_assert_held(&env->src_rq->lock); if (env->imbalance <= 0) return 0; /* (7.6.1) 遍历busiest rq中的进程 */ while (!list_empty(tasks)) { /* (7.6.2) 如果dest cpu不是idle,不能将busiest cpu迁移到idle状态 */ /* * We don't want to steal all, otherwise we may be treated likewise, * which could at worst lead to a livelock crash. 
*/ if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) break; p = list_first_entry(tasks, struct task_struct, se.group_node); /* (7.6.3) 遍历任务最多不超过sysctl_sched_nr_migrate(32) */ env->loop++; /* We've more or less seen every task there is, call it quits */ if (env->loop > env->loop_max) break; /* (7.6.4) 每sched_nr_migrate_break个任务遍历需要跳出休息一下, 如果没有达到env->loop_max,后面会重来 */ /* take a breather every nr_migrate tasks */ if (env->loop > env->loop_break) { env->loop_break += sched_nr_migrate_break; env->flags |= LBF_NEED_BREAK; break; } /* (7.6.5) 判断任务是否支持迁移? */ if (!can_migrate_task(p, env)) goto next; /* (7.6.6) 获取p进程相对顶层cfs_rq的负载, 根据负载判断进程是否适合迁移 */ load = task_h_load(p); if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) goto next; if ((load / 2) > env->imbalance) goto next; /* (7.6.7) detach 进程 */ detach_task(p, env); list_add(&p->se.group_node, &env->tasks); detached++; env->imbalance -= load;#ifdef CONFIG_PREEMPT /* * NEWIDLE balancing is a source of latency, so preemptible * kernels will stop after the first task is detached to minimize * the critical section. */ if (env->idle == CPU_NEWLY_IDLE) break;#endif /* * We only want to steal up to the prescribed amount of * weighted load. */ if (env->imbalance <= 0) break; continue;next: list_move_tail(&p->se.group_node, tasks); } /* * Right now, this is one of only two places we collect this stat * so we can safely collect detach_one_task() stats here rather * than inside detach_one_task(). */ schedstat_add(env->sd, lb_gained[env->idle], detached); return detached;}|||→staticint can_migrate_task(struct task_struct *p, struct lb_env *env){ int tsk_cache_hot; lockdep_assert_held(&env->src_rq->lock); /* * We do not migrate tasks that are: * 1) throttled_lb_pair, or * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) running (obviously), or * 4) are cache-hot on their current CPU. */ /* (7.6.5.1) 如果达到bandwith限制,返回失败 */ if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; /* (7.6.5.2) 如果p进程的cpu affinity不允许迁移到dst_cpu,进一步处理 */ if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { int cpu; schedstat_inc(p, se.statistics.nr_failed_migrations_affine); /* (7.6.5.3) LBF_SOME_PINNED标志,记录有些进程迁移失败 */ env->flags |= LBF_SOME_PINNED; /* (7.6.5.5) 如果已经有其他的LBF_DST_PINNED动作,直接返回失败 */ /* * Remember if this task can be migrated to any other cpu in * our sched_group. We may want to revisit it if we couldn't * meet load balance goals by pulling other tasks on src_cpu. * * Also avoid computing new_dst_cpu if we have already computed * one in current iteration. */ if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) return 0; /* (7.6.5.4) 如果dst_cpu同一sched_group中的其他cpu符合p的affinity,尝试更改dst_cpu, 设置LBF_DST_PINNED标志 */ /* Prevent to re-select dst_cpu via env's cpus */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { env->flags |= LBF_DST_PINNED; env->new_dst_cpu = cpu; break; } } return 0; } /* (7.6.5.6) 有任何符合affinity条件的p,清除LBF_ALL_PINNED标志 */ /* Record that we found atleast one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; /* (7.6.5.7) 如果p在running状态,返回失败 */ if (task_running(env->src_rq, p)) { schedstat_inc(p, se.statistics.nr_failed_migrations_running); return 0; } /* (7.6.5.8) NUMA 相关的一些判断 */ /* * Aggressive migration if: * 1) destination numa is preferred * 2) task is cache cold, or * 3) too many balance attempts have failed. 
*/ tsk_cache_hot = migrate_degrades_locality(p, env); if (tsk_cache_hot == -1) tsk_cache_hot = task_hot(p, env); if (tsk_cache_hot <= 0 || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (tsk_cache_hot == 1) { schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } return 1; } schedstat_inc(p, se.statistics.nr_failed_migrations_hot); return 0;}|||→static unsigned long task_h_load(struct task_struct *p){ struct cfs_rq *cfs_rq = task_cfs_rq(p); update_cfs_rq_h_load(cfs_rq); /* (7.6.6.1) task_h_load的目的是在task_group使能时,rq中有多个层次的cfs_rq 如果进程p挂载在底层的cfs_rq中,把p的负载转换成顶层cfs_rq的相对负载 */ return div64_ul(p->se.avg.load_avg * cfs_rq->h_load, cfs_rq_load_avg(cfs_rq) + 1);}static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq){ struct rq *rq = rq_of(cfs_rq); struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; u64 now = sched_clock_cpu(cpu_of(rq)); unsigned long load; /* sched: change to jiffies */ now = now * HZ >> 30; if (cfs_rq->last_h_load_update == now) return; /* 从底层cfs_rq到顶层cfs_rq建立起层次关系 */ cfs_rq->h_load_next = NULL; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_load_next = se; if (cfs_rq->last_h_load_update == now) break; } if (!se) { cfs_rq->h_load = cfs_rq_load_avg(cfs_rq); cfs_rq->last_h_load_update = now; } /* 使用建立的关系,从顶层cfs_rq开始计算每个层次cfs_rq的相对顶层负载h_load */ while ((se = cfs_rq->h_load_next) != NULL) { load = cfs_rq->h_load; load = div64_ul(load * se->avg.load_avg, cfs_rq_load_avg(cfs_rq) + 1); cfs_rq = group_cfs_rq(se); cfs_rq->h_load = load; cfs_rq->last_h_load_update = now; }}
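To make step 3.4 concrete, here is a standalone example of the env->imbalance formula used by calculate_imbalance() above (all loads and capacities below are invented; the loads are already normalised to SCHED_CAPACITY_SCALE):

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* invented capacity-relative loads of the local and busiest groups */
	unsigned long local_avg = 400, busiest_avg = 900, sds_avg = 600;
	unsigned long local_cap = 2048, busiest_cap = 2048;
	unsigned long load_above_capacity = ~0UL;	/* effectively unlimited here */

	/* max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity) */
	unsigned long max_pull = min_ul(busiest_avg - sds_avg, load_above_capacity);

	/*
	 * env->imbalance = min(max_pull * busiest_capacity,
	 *                      (sds->avg_load - local->avg_load) * local_capacity)
	 *                  / SCHED_CAPACITY_SCALE
	 */
	unsigned long imbalance = min_ul(max_pull * busiest_cap,
					 (sds_avg - local_avg) * local_cap)
				  / SCHED_CAPACITY_SCALE;

	printf("imbalance = %lu\n", imbalance);	/* min(300, 200) * 2048 / 1024 = 400 */
	return 0;
}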
4.1.2.2、nohz_idle_balance()
Each cpu normally triggers its own load balancing from its tick, in scheduler_tick(). When a cpu enters nohz mode its tick is stopped, so it gets no chance to run rebalance_domains(). To cover this, the kernel provides nohz_idle_balance(): a running cpu checks whether the nohz-idle cpus are due for rebalancing and, if so, kicks one idle cpu to perform load balancing on behalf of all the nohz-idle cpus.
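The bookkeeping that the steps below manipulate lives in a small file-scope global in kernel/sched/fair.c; roughly the following (field list quoted from memory, it may differ slightly between kernel versions):

/*
 * nohz idle-balance bookkeeping: filled in by nohz_balance_enter_idle(),
 * cleared on exit, and consumed by nohz_kick_needed()/nohz_balancer_kick().
 */
static struct {
	cpumask_var_t idle_cpus_mask;	/* cpus currently in nohz idle */
	atomic_t nr_cpus;		/* how many cpus are in the mask */
	unsigned long next_balance;	/* earliest next balance time, in jiffies */
} nohz ____cacheline_aligned;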
nohz_idle_balance() runs right before rebalance_domains(); when the conditions are met it lets one idle cpu do the idle load balancing. The mechanism works as follows:
- 1、When a cpu enters the nohz idle state, it sets the corresponding flags:
tick_nohz_idle_enter() -> set_cpu_sd_state_idle():↓void set_cpu_sd_state_idle(void){ struct sched_domain *sd; int cpu = smp_processor_id(); rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_busy, cpu)); if (!sd || sd->nohz_idle) goto unlock; /* (1.1) 进入nohz idle,设置sd->nohz_idle标志 */ sd->nohz_idle = 1; /* (1.2) 减少sgc->nr_busy_cpus的计数 */ atomic_dec(&sd->groups->sgc->nr_busy_cpus);unlock: rcu_read_unlock();}tick_nohz_idle_enter() -> __tick_nohz_idle_enter() -> tick_nohz_stop_sched_tick() -> nohz_balance_enter_idle():↓void nohz_balance_enter_idle(int cpu){ /* * If this cpu is going down, then nothing needs to be done. */ if (!cpu_active(cpu)) return; if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) return; /* * If we're a completely isolated CPU, we don't play. */ if (on_null_domain(cpu_rq(cpu))) return; /* (2.1) 进入idle状态,设置nohz.idle_cpus_mask中对应的bit */ cpumask_set_cpu(cpu, nohz.idle_cpus_mask); /* (2.2) 进入idle状态,增加nohz.nr_cpus计数 */ atomic_inc(&nohz.nr_cpus); /* (2.3) 设置cpu_rq(cpu)->nohz_flags中的NOHZ_TICK_STOPPED标志 */ set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));}
- 2、trigger_load_balance() then decides whether an idle load balance needs to be kicked right now:
void trigger_load_balance(struct rq *rq){ /* (1) 判断当前是否需要idle load balance */ if (nohz_kick_needed(rq)) /* (2) 选中一个idle cpu去做idle load balance */ nohz_balancer_kick();}|→/* * Current heuristic for kicking the idle load balancer in the presence * of an idle cpu in the system. * - This rq has more than one task. * - This rq has at least one CFS task and the capacity of the CPU is * significantly reduced because of RT tasks or IRQs. * - At parent of LLC scheduler domain level, this cpu's scheduler group has * multiple busy cpu. * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */static inline bool nohz_kick_needed(struct rq *rq){ unsigned long now = jiffies; struct sched_domain *sd; struct sched_group_capacity *sgc; int nr_busy, cpu = rq->cpu; bool kick = false; /* (1.1) 如果当前cpu为idle状态,失败退出 */ if (unlikely(rq->idle_balance)) return false; /* (1.2) 退出nohz状态:set_cpu_sd_state_busy()、nohz_balance_exit_idle(cpu) 是set_cpu_sd_state_idle()、nohz_balance_enter_idle()的反向操作 */ /* * We may be recently in ticked or tickless idle mode. At the first * busy tick after returning from idle, we will update the busy stats. */ set_cpu_sd_state_busy(); nohz_balance_exit_idle(cpu); /* (1.3) 如果进入nohz idle状态的cpu数量为0,失败退出 */ /* * None are in tickless mode and hence no need for NOHZ idle load * balancing. */ if (likely(!atomic_read(&nohz.nr_cpus))) return false; /* (1.4) nohz balance时间未到,失败退出 */ if (time_before(now, nohz.next_balance)) return false;#if !defined(CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT) && defined(CONFIG_HMP) /* for more than two clusters, still need wakup nohz CPUs and force balancing */ /* * Bail out if there are no nohz CPUs in our * HMP domain, since we will move tasks between * domains through wakeup and force balancing * as necessary based upon task load. */ if (sched_feat(SCHED_HMP) && cpumask_first_and(nohz.idle_cpus_mask, &((struct hmp_domain *)hmp_cpu_domain(cpu))->cpus) >= nr_cpu_ids) return false;#endif /* (1.5) 当前cpu的进程>=2,返回成功 */ if (rq->nr_running >= 2 && (!energy_aware() || cpu_overutilized(cpu))) return true; /* (1.6) sd所在sg的nr_busy_cpus>1,返回成功 */ rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_busy, cpu)); if (sd && !energy_aware()) { sgc = sd->groups->sgc; nr_busy = atomic_read(&sgc->nr_busy_cpus); if (nr_busy > 1) { kick = true; goto unlock; } } /* (1.7) 如果所有层次的se个数>=1,且capacity在减少,返回成功 */ sd = rcu_dereference(rq->sd); if (sd) { if ((rq->cfs.h_nr_running >= 1) && check_cpu_capacity(rq, sd)) { kick = true; goto unlock; } } /* (1.8) 如果本sd->span[]中第一个idle cpu < sd_asym,返回成功 */ sd = rcu_dereference(per_cpu(sd_asym, cpu)); if (sd && (cpumask_first_and(nohz.idle_cpus_mask, sched_domain_span(sd)) < cpu)) { kick = true; goto unlock; }unlock: rcu_read_unlock(); return kick;}|→static void nohz_balancer_kick(void){ int ilb_cpu; nohz.next_balance++; /* (2.1) 找到所有idle cpu中的第一个idle cpu */ ilb_cpu = find_new_ilb(); if (ilb_cpu >= nr_cpu_ids) return; /* (2.2) 给ilb_cpu的cpu_rq(cpu)->nohz_flags设置NOHZ_BALANCE_KICK标志位 */ if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) return; /* (2.3) 使用ipi中断来唤醒ilb_cpu执行idle load balance */ /* * Use smp_send_reschedule() instead of resched_cpu(). * This way we generate a sched IPI on the target cpu which * is idle. And the softirq performing nohz idle load balance * will be run before returning from the IPI. 
*/ smp_send_reschedule(ilb_cpu); return;}/* (2.3.1) ilb_cpu倍唤醒后处理IPI_RESCHEDULE, 会触发一个SCHED_SOFTIRQ软中断来启动run_rebalance_domains() */void handle_IPI(int ipinr, struct pt_regs *regs){ unsigned int cpu = smp_processor_id(); struct pt_regs *old_regs = set_irq_regs(regs); if ((unsigned)ipinr < NR_IPI) { trace_ipi_entry_rcuidle(ipi_types[ipinr]); __inc_irq_stat(cpu, ipi_irqs[ipinr]); } switch (ipinr) { case IPI_RESCHEDULE: scheduler_ipi(); break;}↓void scheduler_ipi(void){ /* * Check if someone kicked us for doing the nohz idle load balance. */ if (unlikely(got_nohz_idle_kick())) { this_rq()->idle_balance = 1; raise_softirq_irqoff(SCHED_SOFTIRQ); }}
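The find_new_ilb() helper used in step (2.1) above is trivial; paraphrased from memory, it just returns the first cpu in nohz.idle_cpus_mask that is still idle, or nr_cpu_ids when there is no candidate:

/* Paraphrase of find_new_ilb(): pick the first still-idle nohz cpu */
static inline int find_new_ilb(void)
{
	int ilb = cpumask_first(nohz.idle_cpus_mask);

	if (ilb < nr_cpu_ids && idle_cpu(ilb))
		return ilb;

	return nr_cpu_ids;	/* no candidate; the caller drops the kick */
}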
- 3、Once woken, the chosen ilb_cpu performs rebalance_domains() on behalf of all the other idle cpus:
static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle){ int this_cpu = this_rq->cpu; struct rq *rq; int balance_cpu; /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; /* (1) 判断当前cpu是不是被选中被唤醒的ilb_cpu */ if (idle != CPU_IDLE || !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) goto end; /* (2) 轮询所有进入onhz状态的cpu */ for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { /* (3) 只服务非本cpu,且还是idle状态的cpu ooooo本cpu也是idle状态,不需对本cpu做idle负载均衡? ooooo给其他idle cpu的rq做了负载均衡后,什么时候唤醒其他idle cpu? */ if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) continue; /* (4) 如果本cpu被设置了resched标志,说明有线程被唤醒,退出idle状态 */ /* * If this cpu gets work to do, stop the load balancing * work being done for other cpus. Next load * balancing owner will pick it up. */ if (need_resched()) break; /* (5) 需要做负载均衡的idle进程balance_cpu */ rq = cpu_rq(balance_cpu); /* (6) 如果balance_cpu的rq->next_balance时间已到,替其做rebalance_domains() */ /* * If time for next balance is due, * do the balance. */ if (time_after_eq(jiffies, rq->next_balance)) { raw_spin_lock_irq(&rq->lock); update_rq_clock(rq); /* (7) 更新idle cpu因为idle造成的负载衰减 */ update_idle_cpu_load(rq); raw_spin_unlock_irq(&rq->lock); /* (8) 对balance_cpu做负载均衡 ooooo做完负载均衡,什么时候唤醒balance_cpu?? */ rebalance_domains(rq, CPU_IDLE); } if (time_after(next_balance, rq->next_balance)) { next_balance = rq->next_balance; update_next_balance = 1; } } /* (9) 根据所有进入nohz idle cpu rq的最近的一次到期时间,更新nohz.next_balance */ /* * next_balance will be updated only when there is a need. * When the CPU is attached to null domain for ex, it will not be * updated. */ if (likely(update_next_balance)) nohz.next_balance = next_balance;end: clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));}
4.1.2.3、select_task_rq_fair()
Load balancing is not only done from scheduler_tick(). It also happens when a new task is forked or a sleeping task is woken up: the scheduler picks the most suitable cpu for the task, and the core function for that is select_task_rq_fair():
- 1、The EAS path is tried first; it is used when EAS is enabled and the system is not overutilized:
Again worth stressing: of the three kinds of load that are tracked (load_avg, loadwop_avg, util_avg), EAS mainly uses util_avg, combined with capacity.
- 1.1、EAS walks the clusters and cpus looking for a target_cpu that satisfies task p's affinity and can accommodate its utilization (util), belongs to the smallest-capacity cluster that is still sufficient, and has the most spare capacity within that cluster;
First find the smallest-capacity cluster that can hold task p's util:
Then, inside that cluster, find the cpu that is left with the most spare capacity after adding task p (see the spare-capacity sketch after this list):
prev_cpu (the cpu task p last ran on) acts as src_cpu and the target_cpu chosen above as dst_cpu; the kernel then estimates the energy difference of migrating p from prev_cpu to target_cpu:
- 1.2、Compute the power change, before and after the load moves, for target_cpu and prev_cpu. If power does not increase, return target_cpu; if it would increase, return prev_cpu;
The function that does this, energy_diff(), loops a lot and looks complex; boiled down, it computes the energy consumed, before and after the load change, over the MC-level sched_group list containing target_cpu/prev_cpu plus the DIE-level sched_group that contains them, and takes the difference:
energy_diff() computes this as follows (a simplified energy-model sketch follows this list):
- 2、If EAS is not applicable, fall back to the conventional load-balancing way of selecting the cpu:
- 2.1、find_idlest_group() -> find_idlest_cpu() to find the most suitable target_cpu;
- 2.2、As a last resort, select_idle_sibling() simply tries to find an idle cpu to use as target_cpu;
- 2.3、Once target_cpu is determined, hmp_select_task_rq_fair() is consulted as well to decide whether an HMP migration is needed;
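A toy model of the spare-capacity pick in step 1.1 (the helper name pick_max_spare() and every number are invented; the real helper is select_max_spare_capacity_cpu(), called from energy_aware_wake_cpu() in the listing below, which additionally honours affinity, online state and RT pressure):

#include <stdio.h>

/*
 * Toy model of step 1.1: among the allowed cpus, pick the one whose
 * spare capacity (capacity - (cpu_util + task_util)) is largest.
 */
static int pick_max_spare(const unsigned long *cap, const unsigned long *util,
			  int nr_cpus, unsigned long task_util)
{
	int i, best = -1;
	long best_spare = -1;

	for (i = 0; i < nr_cpus; i++) {
		long spare = (long)cap[i] - (long)(util[i] + task_util);

		if (spare > best_spare) {
			best_spare = spare;
			best = i;
		}
	}
	return best;
}

int main(void)
{
	/* invented capacities/utilisations for a 2 little + 2 big system */
	unsigned long cap[4]  = { 512, 512, 1024, 1024 };
	unsigned long util[4] = { 100, 400,  700,  200 };

	/* a task with util 150 lands on cpu3: spare = 1024 - 350 = 674 */
	printf("target_cpu = %d\n", pick_max_spare(cap, util, 4, 150));
	return 0;
}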
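And a simplified sketch of the energy model behind energy_diff()/sched_group_energy() from step 1.2: each sched_group contributes norm_util * P_busy + (1 - norm_util) * P_idle, where norm_util is the group utilisation normalised to the capacity of its chosen OPP. group_energy() is my own toy helper; the power numbers are loosely taken from the upower tables earlier in this section and the utilisation figures are invented:

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10
#define SCHED_CAPACITY_SCALE	(1UL << SCHED_CAPACITY_SHIFT)

/* E = norm_util * P_busy / 1024 + (1024 - norm_util) * P_idle / 1024 */
static unsigned long group_energy(unsigned long norm_util,
				  unsigned long busy_power,
				  unsigned long idle_power)
{
	unsigned long busy = (norm_util * busy_power) >> SCHED_CAPACITY_SHIFT;
	unsigned long idle = ((SCHED_CAPACITY_SCALE - norm_util) * idle_power)
						>> SCHED_CAPACITY_SHIFT;
	return busy + idle;
}

int main(void)
{
	/*
	 * energy_diff() in spirit: the same task looks like norm_util=800
	 * on a little group but only 400 on a big group (more capacity),
	 * yet the big group burns far more power per unit of utilisation.
	 */
	unsigned long before = group_energy(800, 10844, 300) +	/* task on little */
			       group_energy(  0, 52373, 500);	/* big group idle */
	unsigned long after  = group_energy(  0, 10844, 300) +	/* little idle    */
			       group_energy(400, 52373, 500);	/* task on big    */

	/* diff > 0: moving to the big group costs energy, so EAS keeps prev_cpu */
	printf("before=%lu after=%lu diff=%ld\n",
	       before, after, (long)after - (long)before);
	return 0;
}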
static intselect_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags){ struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); int new_cpu = prev_cpu; /* 默认new_cpu为prev_cpu */ int want_affine = 0; int sync = wake_flags & WF_SYNC; int policy = 0;#ifdef CONFIG_MTK_SCHED_VIP_TASKS /* mtk: If task is VIP task, prefer most efficiency idle cpu */ if (is_vip_task(p)) { int vip_idle_cpu; vip_idle_cpu = find_idle_vip_cpu(p); if (vip_idle_cpu >= 0) return vip_idle_cpu; }#endif /* (1) 优先使用EAS计算target cpu, mtk 对EAS定义了3种模式:EAS模式(energy_aware())、HMP模式(sched_feat(SCHED_HMP))、hybrid_support(EAS、HMP同时共存); hybrid_support()模式下:一般负载均衡交给EAS;如果cpu_rq(cpu)->rd->overutilized负载已经严重不均衡,交给HMP; */ /* * Consider EAS if only EAS enabled, but HMP * if hybrid enabled and system is over-utilized. */ if ((energy_aware() && !hybrid_support()) || (hybrid_support() && !cpu_rq(cpu)->rd->overutilized)) goto CONSIDER_EAS; /* (2) 非EAS情况,fork使用hmp balance */ /* HMP fork balance: * always put non-kernel forking tasks on a big domain */ if (sched_feat(SCHED_HMP) && p->mm && (sd_flag & SD_BALANCE_FORK)) { new_cpu = hmp_fork_balance(p, prev_cpu); /* to recover new_cpu value if something wrong */ if (new_cpu >= nr_cpu_ids) new_cpu = prev_cpu; else {#ifdef CONFIG_MTK_SCHED_TRACERS trace_sched_select_task_rq(p, (LB_FORK | new_cpu), prev_cpu, new_cpu);#endif return new_cpu; } }CONSIDER_EAS: /* (3) 如果唤醒flag中设置了SD_BALANCE_WAKE,优先使用唤醒cpu来运行进程p, 还需判断下面3个条件是否满足: !wake_wide(p) // 当前cpu的唤醒次数没有超标 task_fits_max(p, cpu) // 当前cpu的capacity能容纳进程p的util cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) // 当前cpu在进程在P的affinity中 EAS利用了want_affine这个标志,只要EAS使能,want_affine =1 */ if (sd_flag & SD_BALANCE_WAKE) want_affine = (!wake_wide(p) && task_fits_max(p, cpu) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) || energy_aware(); rcu_read_lock(); /* (4) 从下往上遍历当前cpu的sd,查询在哪个层次的sd进行负载均衡 */ for_each_domain(cpu, tmp) { /* (4.1 如果当前sd不支持负载均SD_LOAD_BALANCE,退出) */ if (!(tmp->flags & SD_LOAD_BALANCE)) break; /* (4.2) 优先找affine_sd,找到直接break; 需要符合以下3个条件: want_affine // (tmp->flags & SD_WAKE_AFFINE) // 当前sd支持SD_WAKE_AFFINE标志 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)) //当前sd->span[]中同时包含cpu、pre_cpu */ /* * If both cpu and prev_cpu are part of this domain, * cpu is a valid SD_WAKE_AFFINE target. */ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { affine_sd = tmp; break; } /* (4.3) 其次找一个符合sd_flag的sd */ if (tmp->flags & sd_flag) sd = tmp; /* (4.4) 如果以上都失败,直接跳出 */ else if (!want_affine) break; } /* (5) 如果affine_sd成功找到 */ if (affine_sd) { sd = NULL; /* Prefer wake_affine over balance flags */ if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) new_cpu = cpu; } /* (6) 没有找到符合sd_flag的sd */ if (!sd) { /* (6.1) EAS使能,且本cpu没有overutilized, 使用EAS负载均衡算法 */ if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) { new_cpu = energy_aware_wake_cpu(p, prev_cpu); policy |= LB_EAS; } /* (6.2) 如果不能使用EAS,且sd_flag中设置SD_BALANCE_WAKE标志 尝试在唤醒的cpu上运行p进程, ooooo前面辛苦计算的affine_sd没有派上用场? */ else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? 
*/ if (true) {#ifdef CONFIG_CGROUP_SCHEDTUNE bool prefer_idle = schedtune_prefer_idle(p) > 0;#else bool prefer_idle = true;#endif int idle_cpu; idle_cpu = find_best_idle_cpu(p, prefer_idle); if (idle_cpu >= 0) { new_cpu = idle_cpu; policy |= LB_IDLEST; } else { new_cpu = select_max_spare_capacity_cpu(p, new_cpu); policy |= LB_SPARE; } } else /* (6.3) 不符合上述条件下的默认处理,尝试找一个idle cpu */ new_cpu = select_idle_sibling(p, new_cpu); } } else while (sd) { /* (7) 找到符合sd_flag的sd */ struct sched_group *group; int weight; policy |= LB_SMP; /* (7.1) */ if (!(sd->flags & sd_flag)) { sd = sd->child; continue; } /* (7.2) */ group = find_idlest_group(sd, p, cpu, sd_flag); if (!group) { sd = sd->child; continue; } /* (7.3) */ new_cpu = find_idlest_cpu(group, p, cpu); if (new_cpu == -1 || new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; continue; } /* (7.4) */ /* Now try balancing at a lower domain level of new_cpu */ cpu = new_cpu; weight = sd->span_weight; sd = NULL; for_each_domain(cpu, tmp) { if (weight <= tmp->span_weight) break; if (tmp->flags & sd_flag) sd = tmp; } /* while loop will break here if sd == NULL */ }#ifdef CONFIG_MTK_SCHED_TRACERS policy |= (new_cpu << LB_SMP_SHIFT);#endif rcu_read_unlock(); /* (8) 在EAS不能运行的情况下,在做一次HMP的select操作: 判断进程p是否符合hmp的迁移条件,如果符合一次迁移到位,避免后续hmp的操作 */ /* Consider hmp if no EAS or over-utiled in hybrid mode. */ if ((!energy_aware() && sched_feat(SCHED_HMP)) || (hybrid_support() && cpu_rq(cpu)->rd->overutilized)) { new_cpu = hmp_select_task_rq_fair(sd_flag, p, prev_cpu, new_cpu);#ifdef CONFIG_MTK_SCHED_TRACERS policy |= (new_cpu << LB_HMP_SHIFT);#endif policy |= LB_HMP; }#ifdef CONFIG_MTK_SCHED_TRACERS trace_sched_select_task_rq(p, policy, prev_cpu, new_cpu);#endif return new_cpu;}|→inline int hmp_fork_balance(struct task_struct *p, int prev_cpu){ int new_cpu = prev_cpu; int cpu = smp_processor_id(); /* (2.1) prev_cpu所在cluster是最快(fastest)的 */ if (hmp_cpu_is_fastest(prev_cpu)) { /* prev_cpu is fastest domain */ struct hmp_domain *hmpdom; __always_unused int lowest_ratio; hmpdom = list_entry( &hmp_cpu_domain(prev_cpu)->hmp_domains, struct hmp_domain, hmp_domains); /* (2.2) 尝试选出负载最小的cpu */ lowest_ratio = hmp_domain_min_load(hmpdom, &new_cpu); if (new_cpu < nr_cpu_ids && cpumask_test_cpu(new_cpu, tsk_cpus_allowed(p))) return new_cpu; new_cpu = cpumask_any_and(&hmp_faster_domain(cpu)->cpus, tsk_cpus_allowed(p)); if (new_cpu < nr_cpu_ids) return new_cpu; } else { /* (2.3) 尝试选出prev_cpu所在cluster中负载最小的cpu */ /* prev_cpu is not fastest domain */ new_cpu = hmp_select_faster_cpu(p, prev_cpu); if (new_cpu < nr_cpu_ids) return new_cpu; } return new_cpu;}|→static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync){ s64 this_load, load; s64 this_eff_load, prev_eff_load; int idx, this_cpu, prev_cpu; struct task_group *tg; unsigned long weight; int balanced; idx = sd->wake_idx; this_cpu = smp_processor_id(); prev_cpu = task_cpu(p); load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); /* (5.1) */ /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load * of the current CPU: */ if (sync) { tg = task_group(current); weight = current->se.avg.load_avg; this_load += effective_load(tg, this_cpu, -weight, -weight); load += effective_load(tg, prev_cpu, 0, -weight); } tg = task_group(p); weight = p->se.avg.load_avg; /* * In low-load situations, where prev_cpu is idle and this_cpu is idle * due to the sync cause above having dropped this_load to 0, we'll * always have 
an imbalance, but there's really nothing you can do * about that, so that's good too. * * Otherwise check if either cpus are near enough in load to allow this * task to be woken on this_cpu. */ this_eff_load = 100; this_eff_load *= capacity_of(prev_cpu); prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; prev_eff_load *= capacity_of(this_cpu); if (this_load > 0) { this_eff_load *= this_load + effective_load(tg, this_cpu, weight, weight); prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); } balanced = this_eff_load <= prev_eff_load; schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); if (!balanced) return 0; schedstat_inc(sd, ttwu_move_affine); schedstat_inc(p, se.statistics.nr_wakeups_affine); return 1;}|→static int energy_aware_wake_cpu(struct task_struct *p, int target){ int target_max_cap = INT_MAX; int target_cpu = task_cpu(p); unsigned long min_util; unsigned long new_util; int i, cpu; bool is_tiny = false; int nrg_diff = 0; int cluster_id = 0; struct cpumask cluster_cpus; int max_cap_cpu = 0; int best_cpu = 0; /* (6.1.1) 遍历cluster和cpu,找出一个capacity最小的cpu能容纳下util(p)为best_cpu */ /* * Find group with sufficient capacity. We only get here if no cpu is * overutilized. We may end up overutilizing a cpu by adding the task, * but that should not be any worse than select_idle_sibling(). * load_balance() should sort it out later as we get above the tipping * point. */ cluster_id = arch_get_nr_clusters(); for (i = 0; i < cluster_id; i++) { arch_get_cluster_cpus(&cluster_cpus, i); max_cap_cpu = cpumask_first(&cluster_cpus); /* Assuming all cpus are the same in group */ for_each_cpu(cpu, &cluster_cpus) { if (!cpu_online(cpu)) continue; if (capacity_of(max_cap_cpu) < target_max_cap && task_fits_max(p, max_cap_cpu)) { best_cpu = cpu; target_max_cap = capacity_of(max_cap_cpu); } break; } } if (task_util(p) < TINY_TASK_THRESHOLD) is_tiny = true; /* Find cpu with sufficient capacity */ min_util = boosted_task_util(p); if (!is_tiny) /* (6.1.2) 根据best_cpu所在的cluster和进程p的affinity, 找出加上util(p)以后,剩余capacity最大的cpu:target_cpu */ target_cpu = select_max_spare_capacity_cpu(p, best_cpu); else /* (6.1.3) 根据cluster和进程p的affinity, 找出加上util(p)以后,当前freq的capacity能满足的第一个cpu:target_cpu */ for_each_cpu_and(i, tsk_cpus_allowed(p), &cluster_cpus) { if (!cpu_online(i)) continue; /* * p's blocked utilization is still accounted for on prev_cpu * so prev_cpu will receive a negative bias due to the double * accounting. However, the blocked utilization may be zero. */ new_util = cpu_util(i) + task_util(p); /* * Ensure minimum capacity to grant the required boost. * The target CPU can be already at a capacity level higher * than the one required to boost the task. 
*/ new_util = max(min_util, new_util);#ifdef CONFIG_MTK_SCHED_INTEROP if (cpu_rq(i)->rt.rt_nr_running && likely(!is_rt_throttle(i))) continue;#endif if (new_util > capacity_orig_of(i)) continue; if (new_util < capacity_curr_of(i)) { target_cpu = i; if (cpu_rq(i)->nr_running) break; } /* cpu has capacity at higher OPP, keep it as fallback */ if (target_cpu == task_cpu(p)) target_cpu = i; } /* (6.1.4) 如果pre_cpu和target_cpu是同一个cluster,直接成功返回 */ /* no need energy calculation if the same domain */ if (is_the_same_domain(task_cpu(p), target_cpu)) return target_cpu; /* no energy comparison if the same cluster */ if (target_cpu != task_cpu(p)) { /* (6.1.5) 构造需要迁移的环境变量 */ struct energy_env eenv = { .util_delta = task_util(p), .src_cpu = task_cpu(p), .dst_cpu = target_cpu, .task = p, }; /* Not enough spare capacity on previous cpu */ if (cpu_overutilized(task_cpu(p))) { trace_energy_aware_wake_cpu(p, task_cpu(p), target_cpu, (int)task_util(p), nrg_diff, true, is_tiny); return target_cpu; } /* (6.1.6) 计算进程p从pre_cpu迁移到target_cpu后的功耗差值nrg_diff, 如果功耗增加,nrg_diff >= 0,返回pre_cpu即task_cpu(p), 如果功耗减少,返回新的target_cpu */ nrg_diff = energy_diff(&eenv); if (nrg_diff >= 0) { trace_energy_aware_wake_cpu(p, task_cpu(p), target_cpu, (int)task_util(p), nrg_diff, false, is_tiny); return task_cpu(p); } } trace_energy_aware_wake_cpu(p, task_cpu(p), target_cpu, (int)task_util(p), nrg_diff, false, is_tiny); return target_cpu;}||→static inline intenergy_diff(struct energy_env *eenv){ unsigned int boost; int nrg_delta; /* Conpute "absolute" energy diff */ __energy_diff(eenv); /* Return energy diff when boost margin is 0 */#ifdef CONFIG_CGROUP_SCHEDTUNE boost = schedtune_task_boost(eenv->task);#else boost = get_sysctl_sched_cfs_boost();#endif if (boost == 0) return eenv->nrg.diff; /* Compute normalized energy diff */ nrg_delta = normalize_energy(eenv->nrg.diff); eenv->nrg.delta = nrg_delta; eenv->payoff = schedtune_accept_deltas( eenv->nrg.delta, eenv->cap.delta, eenv->task); /* * When SchedTune is enabled, the energy_diff() function will return * the computed energy payoff value. Since the energy_diff() return * value is expected to be negative by its callers, this evaluation * function return a negative value each time the evaluation return a * positive payoff, which is the condition for the acceptance of * a scheduling decision */ return -eenv->payoff;}static int __energy_diff(struct energy_env *eenv){ struct sched_domain *sd; struct sched_group *sg; int sd_cpu = -1, energy_before = 0, energy_after = 0; /* (6.1.6.1) 构造迁移前的环境变量 */ struct energy_env eenv_before = { .util_delta = 0, .src_cpu = eenv->src_cpu, .dst_cpu = eenv->dst_cpu, .nrg = { 0, 0, 0, 0}, .cap = { 0, 0, 0 }, };#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT int i;#endif if (eenv->src_cpu == eenv->dst_cpu) return 0;#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT /* To get max opp index of every cluster for power estimation of share buck */ for (i = 0; i < arch_get_nr_clusters(); i++) { /* for energy before */ eenv_before.opp_idx[i] = mtk_cluster_capacity_idx(i, &eenv_before); /* for energy after */ eenv->opp_idx[i] = mtk_cluster_capacity_idx(i, eenv); mt_sched_printf(sched_eas_energy_calc, "cid=%d, before max_opp:%d, after max_opp:%d\n", i, eenv_before.opp_idx[i], eenv->opp_idx[i]); }#endif /* (6.1.6.2) sd来至于cache sd_ea,是cpu对应的顶层sd(tl DIE层) */ sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu; sd = rcu_dereference(per_cpu(sd_ea, sd_cpu)); if (!sd) return 0; /* Error */ mt_sched_printf(sched_eas_energy_calc, "0. 
%s: move task from src=%d to dst=%d util=%d", __func__, eenv->src_cpu, eenv->dst_cpu, eenv->util_delta); sg = sd->groups; /* (6.1.6.3) 遍历sg所在sg链表,找到符合条件的sg, 累加计算eenv_before、eenv相关sg的功耗 */ do { /* (6.1.6.4) 如果当前sg包含src_cpu或者dst_cpu,计算 */ if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) { /* (6.1.6.5) 当前顶层sg为eenv的sg_top */ eenv_before.sg_top = eenv->sg_top = sg; mt_sched_printf(sched_eas_energy_calc, "1. %s: src=%d dst=%d mask=0x%lx (before)", __func__, eenv_before.src_cpu, eenv_before.dst_cpu, sg->cpumask[0]); /* (6.1.6.6) 计算eenv_before负载下sg的power */ if (sched_group_energy(&eenv_before)) return 0; /* Invalid result abort */ energy_before += eenv_before.energy; /* Keep track of SRC cpu (before) capacity */ eenv->cap.before = eenv_before.cap.before; eenv->cap.delta = eenv_before.cap.delta; mt_sched_printf(sched_eas_energy_calc, "2. %s: src=%d dst=%d mask=0x%lx (after)", __func__, eenv->src_cpu, eenv->dst_cpu, sg->cpumask[0]); /* (6.1.6.7) 计算eenv负载下sg的power */ if (sched_group_energy(eenv)) return 0; /* Invalid result abort */ energy_after += eenv->energy; } } while (sg = sg->next, sg != sd->groups); /* (6.1.6.8) 计算energy_after - energy_before */ eenv->nrg.before = energy_before; eenv->nrg.after = energy_after; eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; eenv->payoff = 0; trace_sched_energy_diff(eenv->task, eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, eenv->cap.before, eenv->cap.after, eenv->cap.delta, eenv->nrg.delta, eenv->payoff); mt_sched_printf(sched_eas_energy_calc, "5. %s: nrg.diff=%d cap.delta=%d", __func__, eenv->nrg.diff, eenv->cap.delta); return eenv->nrg.diff;}|||→static int sched_group_energy(struct energy_env *eenv){ struct sched_domain *sd; int cpu, total_energy = 0; struct cpumask visit_cpus; struct sched_group *sg;#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT int only_lv1_sd = 0;#endif WARN_ON(!eenv->sg_top->sge); cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top)); /* (6.1.6.6.1) 根据sg_top顶层sd,找到需要计算的cpu集合visit_cpus,逐个遍历其中每一个cpu ooooo这一套复杂的循环算法计算下来,其实就计算了几个power,以cpu0-cpu3为例: 4个底层sg的power + 1个顶层sg的power */ while (!cpumask_empty(&visit_cpus)) { struct sched_group *sg_shared_cap = NULL; /* (6.1.6.6.2) 选取visit_cpus中的第一个cpu */ cpu = cpumask_first(&visit_cpus); sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); if (!sd) { /* a corner racing with hotplug? sd doesn't exist in this cpu. */ return -EINVAL; } /* * Is the group utilization affected by cpus outside this * sched_group? */ sd = rcu_dereference(per_cpu(sd_scs, cpu));#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT /* Try to handle one CPU in this cluster by hotplug. * In it there is only lv-1 sched_domain exist which having * no share_cap_states. */ if (!sd) { sd = rcu_dereference(per_cpu(sd_ea, cpu)); only_lv1_sd = 1; }#endif if (!sd) { /* * We most probably raced with hotplug; returning a * wrong energy estimation is better than entering an * infinite loop. */ return -EINVAL; } if (sd->parent) sg_shared_cap = sd->parent->groups; /* (6.1.6.6.3) 从底层到顶层逐个遍历cpu所在的sd */ for_each_domain(cpu, sd) { sg = sd->groups; /* (6.1.6.6.4) 如果是顶层sd,只会计算一个sg */ /* Has this sched_domain already been visited? 
*/ if (sd->child && group_first_cpu(sg) != cpu) break; /* (6.1.6.6.5) 逐个遍历该层次sg链表所在sg */ do { unsigned long group_util; int sg_busy_energy, sg_idle_energy; int cap_idx, idle_idx; if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight) eenv->sg_cap = sg_shared_cap; else eenv->sg_cap = sg; /* (6.1.6.6.6) 根据eenv指示的负载变化,找出满足该sg中最大负载cpu的capacity_index */ cap_idx = find_new_capacity(eenv, sg->sge); if (sg->group_weight == 1) { /* Remove capacity of src CPU (before task move) */ if (eenv->util_delta == 0 && cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) { eenv->cap.before = sg->sge->cap_states[cap_idx].cap; eenv->cap.delta -= eenv->cap.before; } /* Add capacity of dst CPU (after task move) */ if (eenv->util_delta != 0 && cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) { eenv->cap.after = sg->sge->cap_states[cap_idx].cap; eenv->cap.delta += eenv->cap.after; } } /* (6.1.6.6.7) 找出sg所有cpu中最小的idle index */ idle_idx = group_idle_state(sg); /* (6.1.6.6.8) 累加sg中所有cpu的相对负载, 最大负载为sg->sge->cap_states[eenv->cap_idx].cap */ group_util = group_norm_util(eenv, sg); /* (6.1.6.6.9) 计算power = busy_power + idle_power */#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT /* * To support power estimation for MTK soc. * Consider share buck for dynamic power and SPARK/MCDI for static power. */ sg_busy_energy = (group_util * sg->sge->busy_power(group_first_cpu(sg), eenv, (sd->child) ? 1 : 0)) >> SCHED_CAPACITY_SHIFT; sg_idle_energy = ((SCHED_LOAD_SCALE - group_util) * sg->sge->idle_power(idle_idx, group_first_cpu(sg), eenv, (sd->child) ? 1 : 0)) >> SCHED_CAPACITY_SHIFT;#else /* Power value had been separated to static + dynamic here */ sg_busy_energy = (group_util * (sg->sge->cap_states[cap_idx].dyn_pwr + sg->sge->cap_states[cap_idx].lkg_pwr[sg->sge->lkg_idx])) >> SCHED_CAPACITY_SHIFT; sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) * sg->sge->idle_states[idle_idx].power) >> SCHED_CAPACITY_SHIFT;#endif total_energy += sg_busy_energy + sg_idle_energy; mt_sched_printf(sched_eas_energy_calc, "busy_energy=%d idle_eneryg=%d (cost=%d)", sg_busy_energy, sg_idle_energy, total_energy); /* (6.1.6.6.10) 如果遍历了底层sd,从visit_cpus中去掉对应的sg cpu */ if (!sd->child) cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT /* * We try to get correct energy estimation while racing with hotplug * and avoid entering a infinite loop. 
*/ if (only_lv1_sd) { eenv->energy = total_energy; return 0; }#endif if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top))) goto next_cpu; } while (sg = sg->next, sg != sd->groups); } /* (6.1.6.6.11) 如果遍历了cpu的底层到顶层sd,从visit_cpus中去掉对应的cpu */next_cpu: cpumask_clear_cpu(cpu, &visit_cpus); continue; } eenv->energy = total_energy; return 0;}|→static struct sched_group *find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag){ struct sched_group *idlest = NULL, *group = sd->groups; struct sched_group *fit_group = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; unsigned long fit_capacity = ULONG_MAX; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; /* (7.2.1) 选择load_idx */ if (sd_flag & SD_BALANCE_WAKE) load_idx = sd->wake_idx; /* (7.2.2) 当前cpu所在sd层次的sg,遍历sg所在的sg链表,选出负载最轻的idlest sg */ do { unsigned long load, avg_load; int local_group; int i; /* (7.2.3) 略过不符合p进程affinity的sg */ /* Skip over this group if it has no CPUs allowed */ if (!cpumask_intersects(sched_group_cpus(group), tsk_cpus_allowed(p))) continue; /* (7.2.4) local_group等于本cpu所在的sg */ local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); /* Tally up the load of all CPUs in the group */ avg_load = 0; /* (7.2.5) 遍历sg中的所有cpu,累加负载 */ for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); else load = target_load(i, load_idx);#ifdef CONFIG_MTK_SCHED_INTEROP load += mt_rt_load(i);#endif avg_load += load; /* (7.2.6) 如果EAS使能,找到能最小满足进程p的capacity sg */ /* * Look for most energy-efficient group that can fit * that can fit the task. */ if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) { fit_capacity = capacity_of(i); fit_group = group; } } /* (7.2.7) 用累计的负载计算相对负载 */ /* Adjust by relative CPU capacity of the group */ avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity; /* (7.2.8) 计算idlest sg */ if (local_group) { this_load = avg_load; } else if (avg_load < min_load) { min_load = avg_load; idlest = group; } } while (group = group->next, group != sd->groups); /* (7.2.9) EAS使能,返回fit_group */ if (energy_aware() && fit_group) return fit_group; if (!idlest || 100*this_load < imbalance*min_load) return NULL; /* (7.2.11) 否则,返回idlest */ return idlest;}|→static intfind_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu){ unsigned long load, min_load = ULONG_MAX; unsigned int min_exit_latency = UINT_MAX; u64 latest_idle_timestamp = 0; int least_loaded_cpu = this_cpu; int shallowest_idle_cpu = -1; int i; /* (7.3.1) 遍历sg中符合p进程affinity的cpu */ /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { /* (7.3.2) 如果cpu的剩余capacity能容纳下p进程的load */ if (task_fits_spare(p, i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); /* (7.3.2.1) 优先选出idle状态,且退出idle开销最小的cpu */ if (idle && idle->exit_latency < min_exit_latency) { /* * We give priority to a CPU whose idle state * has the smallest exit latency irrespective * of any idle timestamp. */ min_exit_latency = idle->exit_latency; latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; } else if (idle_cpu(i) && (!idle || idle->exit_latency == min_exit_latency) && rq->idle_stamp > latest_idle_timestamp) { /* * If equal or no active idle state, then * the most recently idled CPU might have * a warmer cache. 
*/ latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; } else if (shallowest_idle_cpu == -1) { /* * If we haven't found an idle CPU yet * pick a non-idle one that can fit the task as * fallback. */ shallowest_idle_cpu = i; } /* (7.3.3) cpu的剩余capacity容纳不下进程p,选出负载最轻的cpu */ } else if (shallowest_idle_cpu == -1) { load = weighted_cpuload(i);#ifdef CONFIG_MTK_SCHED_INTEROP load += mt_rt_load(i);#endif if (load < min_load || (load == min_load && i == this_cpu)) { min_load = load; least_loaded_cpu = i; } } } return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;}|→static int hmp_select_task_rq_fair(int sd_flag, struct task_struct *p, int prev_cpu, int new_cpu){ struct list_head *pos; struct sched_entity *se = &p->se; struct cpumask fast_cpu_mask, slow_cpu_mask;#ifdef CONFIG_HMP_TRACER int cpu = 0; for_each_online_cpu(cpu) trace_sched_cfs_runnable_load(cpu, cfs_load(cpu), cfs_length(cpu));#endif /* error handling */ if (prev_cpu >= num_possible_cpus()) return new_cpu; /* * Skip all the checks if only one CPU is online. * Otherwise, select the most appropriate CPU from cluster. */ if (num_online_cpus() == 1) goto out; /* (8.1) 找出fastest hmp_domain,只有一个, 找出slow hmp_domain,有多个, 在一个fast_cpu_mask和多个slow_cpu_mask之间,逐个尝试hmp_select_task_migration() p进程是否会满足hmp迁移 */ cpumask_clear(&fast_cpu_mask); cpumask_clear(&slow_cpu_mask); /* order: fast to slow hmp domain */ list_for_each(pos, &hmp_domains) { struct hmp_domain *domain = list_entry(pos, struct hmp_domain, hmp_domains); if (!cpumask_empty(&domain->cpus)) { if (cpumask_empty(&fast_cpu_mask)) { cpumask_copy(&fast_cpu_mask, &domain->possible_cpus); } else { cpumask_copy(&slow_cpu_mask, &domain->possible_cpus); new_cpu = hmp_select_task_migration(sd_flag, p, prev_cpu, new_cpu, &fast_cpu_mask, &slow_cpu_mask); } } }out: /* it happens when num_online_cpus=1 */ if (new_cpu >= nr_cpu_ids) { /* BUG_ON(1); */ new_cpu = prev_cpu; } cfs_nr_pending(new_cpu)++; cfs_pending_load(new_cpu) += se_load(se); return new_cpu;}||→static int hmp_select_task_migration(int sd_flag, struct task_struct *p, int prev_cpu, int new_cpu, struct cpumask *fast_cpu_mask, struct cpumask *slow_cpu_mask){ int step = 0; struct sched_entity *se = &p->se; int B_target = num_possible_cpus(); int L_target = num_possible_cpus(); struct clb_env clbenv; /* (8.1.1) 找出fast_cpu_mask中负载最轻的cpu B_target,且符合p进程的affinity */ B_target = hmp_select_cpu(HMP_SELECT_RQ, p, fast_cpu_mask, prev_cpu, 0); /* (8.1.2) 找出slow_cpu_mask中负载最轻的cpu L_target,且符合p进程的affinity */ L_target = hmp_select_cpu(HMP_SELECT_RQ, p, slow_cpu_mask, prev_cpu, 1); /* * Only one cluster exists or only one cluster is allowed for this task * Case 1: return the runqueue whose load is minimum * Case 2: return original CFS runqueue selection result */ if (B_target >= num_possible_cpus() && L_target >= num_possible_cpus()) goto out; if (B_target >= num_possible_cpus()) goto select_slow; if (L_target >= num_possible_cpus()) goto select_fast; /* * Two clusters exist and both clusters are allowed for this task * Step 1: Move newly created task to the cpu where no tasks are running * Step 2: Migrate heavy-load task to big * Step 3: Migrate light-load task to LITTLE * Step 4: Make sure the task stays in its previous hmp domain */ step = 1; if (task_created(sd_flag) && !task_low_priority(p->prio)) { if (!rq_length(B_target)) goto select_fast; if (!rq_length(L_target)) goto select_slow; } /* (8.1.3) 计算如果L_target和B_target发生hmp迁移,各种负载和thershold的计算 */ memset(&clbenv, 0, sizeof(clbenv)); clbenv.flags |= HMP_SELECT_RQ; 
cpumask_copy(&clbenv.lcpus, slow_cpu_mask); cpumask_copy(&clbenv.bcpus, fast_cpu_mask); clbenv.ltarget = L_target; clbenv.btarget = B_target; sched_update_clbstats(&clbenv); /* (8.1.4) 判断进程p从L_target up到 B_target的可行性 */ step = 2; if (hmp_up_migration(L_target, &B_target, se, &clbenv)) goto select_fast; /* (8.1.5) 判断进程p从B_target down到 L_target的可行性 */ step = 3; if (hmp_down_migration(B_target, &L_target, se, &clbenv)) goto select_slow; /* (8.1.6) 如果prev_cpu是slowest */ step = 4; if (hmp_cpu_is_slowest(prev_cpu)) goto select_slow; goto select_fast; /* (8.1.7) 返回 B_target */select_fast: new_cpu = B_target; cpumask_clear(slow_cpu_mask); goto out; /* (8.1.8) 返回 L_target */select_slow: new_cpu = L_target; cpumask_copy(fast_cpu_mask, slow_cpu_mask); cpumask_clear(slow_cpu_mask); goto out;out:#ifdef CONFIG_HMP_TRACER trace_sched_hmp_load(clbenv.bstats.load_avg, clbenv.lstats.load_avg);#endif return new_cpu;}
4.2、HMP load balancing
Besides the SMP load_balance() balancing, we also want a rule to hold across the SMP clusters: heavy tasks run on big cores and light tasks run on little cores, so the system converges quickly to a reasonable load distribution. This scheme is called HMP load balancing. EAS considers load, performance and power in a unified way, so once EAS is enabled HMP is disabled.
HMP load balancing performs two kinds of operation:
- 1、up: heavy tasks are migrated from little cpus to big cpus, handled by hmp_force_up_migration();
- 2、down: light tasks are migrated from big cpus to little cpus, handled by hmp_force_down_migration();
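Both operations boil down to comparing a task's weight-free load (loadwop_avg, 0..1023) with two dynamic thresholds that are recomputed before every decision (see 4.2.2 below). A minimal userspace sketch of that comparison, using made-up threshold values rather than real adj_threshold() output:

```c
#include <stdio.h>

/*
 * Toy decision helper mirroring the idea of hmp_up_migration() /
 * hmp_down_migration(): compare a task's weight-free load (loadwop_avg,
 * 0..1023) against the dynamic up/down thresholds.  All values here are
 * illustrative, not taken from a real adj_threshold() run.
 */
static const char *hmp_decide(unsigned int task_load,
			      unsigned int up_threshold,
			      unsigned int down_threshold,
			      int on_little)
{
	if (on_little && task_load > up_threshold)
		return "up-migrate to big";
	if (!on_little && task_load <= down_threshold)
		return "down-migrate to LITTLE";
	return "stay where it is";
}

int main(void)
{
	unsigned int up = 700, down = 300;	/* assumed threshold values */

	printf("load=800 on LITTLE -> %s\n", hmp_decide(800, up, down, 1));
	printf("load=200 on big    -> %s\n", hmp_decide(200, up, down, 0));
	printf("load=500 on big    -> %s\n", hmp_decide(500, up, down, 0));
	return 0;
}
```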
4.2.1、hmp domain initialization
During initialization HMP allocates one hmp_domain per cluster and adds them all to the global list hmp_domains. Once the list is built, the hmp_domain closest to the list head hmp_domains belongs to the fastest cluster, and the further an hmp_domain is from the head the slower its cluster. The reason is that the domains are added in cluster-id order and each is inserted at the head of the list, so the fastest cluster's hmp_domain is added last and therefore ends up nearest the head.
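A small standalone sketch of that ordering property (hypothetical toy_domain type, not the kernel's list_head API): because each cluster is inserted at the head of the list, the last one added, i.e. the fastest cluster, ends up nearest the head.

```c
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for struct hmp_domain: one node per cluster. */
struct toy_domain {
	int cluster_id;
	struct toy_domain *next;
};

/* Head insertion, the same effect list_add() has on hmp_domains. */
static struct toy_domain *add_head(struct toy_domain *head, int id)
{
	struct toy_domain *d = malloc(sizeof(*d));

	d->cluster_id = id;
	d->next = head;
	return d;
}

int main(void)
{
	struct toy_domain *head = NULL;
	int id;

	/* Clusters are added in cluster-id order; assume, as on this SoC,
	 * that the larger id is the faster (big) cluster. */
	for (id = 0; id < 2; id++)
		head = add_head(head, id);

	/* Walking from the head therefore visits fastest to slowest: 1, 0. */
	for (struct toy_domain *d = head; d; d = d->next)
		printf("cluster %d\n", d->cluster_id);
	return 0;
}
```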
static int __init hmp_cpu_mask_setup(void){ struct hmp_domain *domain; struct list_head *pos; int dc, cpu; pr_warn("Initializing HMP scheduler:\n"); /* Initialize hmp_domains using platform code */ /* (1) 调用arch相关的hmp_domains初始化函数 */ arch_get_hmp_domains(&hmp_domains); if (list_empty(&hmp_domains)) { pr_warn("HMP domain list is empty!\n"); return 0; } /* Print hmp_domains */ dc = 0; list_for_each(pos, &hmp_domains) { domain = list_entry(pos, struct hmp_domain, hmp_domains); for_each_cpu(cpu, &domain->possible_cpus) { /* (2) 给per_cpu变量hmp_cpu_domain赋值 */ per_cpu(hmp_cpu_domain, cpu) = domain; } dc++; } return 1;}|→void __init arch_get_hmp_domains(struct list_head *hmp_domains_list){ struct hmp_domain *domain; struct cpumask cpu_mask; int id, maxid; cpumask_clear(&cpu_mask); maxid = arch_get_nr_clusters(); /* * Initialize hmp_domains * Must be ordered with respect to compute capacity. * Fastest domain at head of list. */ /* (1.1) 按照cluster id初始化对应的hmp_domain */ for (id = 0; id < maxid; id++) { arch_get_cluster_cpus(&cpu_mask, id); domain = (struct hmp_domain *) kmalloc(sizeof(struct hmp_domain), GFP_KERNEL); cpumask_copy(&domain->possible_cpus, &cpu_mask); cpumask_and(&domain->cpus, cpu_online_mask, &domain->possible_cpus); /* (1.2) 将hmp_domain加入到全局链表hmp_domains_list即hmp_domains中 */ list_add(&domain->hmp_domains, hmp_domains_list); }}
4.2.2、hmp_force_up_migration()
hmp_force_up_migration() goes through the following main steps:
One point worth highlighting: load tracking maintains three kinds of load (load_avg, loadwop_avg, util_avg); the HMP balancing triggered from rebalance_domains mainly uses loadwop_avg.
- 1、Based on the current cpu, pick fast_cpu_mask and slow_cpu_mask;
hmp_force_up_migration() tries to migrate heavy tasks from a slow cpu to a fast cpu; which masks count as slow and fast depends on where the current cpu sits in the hmp_domains list (see the annotated code below);
- 2、Pick the heaviest task on the current cpu as the migration candidate p; the code does not walk every task on the cpu, it only inspects the curr task and up to 5 tasks from the cfs_rq (hmp_get_heaviest_task());
- 3、From fast_cpu_mask, pick the least-loaded cpu as the target cpu;
- 4、Compute the load statistics for the source cpu (curr_cpu) and the destination cpu (target_cpu);
The key quantities and how they are computed:
| Field | Structure | Meaning | Updated in | Computation |
| --- | --- | --- | --- | --- |
| clbenv->bstats.cpu_power | clbenv->bstats | absolute compute capacity of the B (big) cluster | sched_update_clbstats() | arch_scale_cpu_capacity(NULL, clbenv->btarget) |
| clbenv->lstats.cpu_power | clbenv->lstats | absolute compute capacity of the L (LITTLE) cluster | sched_update_clbstats() | arch_scale_cpu_capacity(NULL, clbenv->ltarget) |
| clbenv->lstats.cpu_capacity | clbenv->lstats | relative compute capacity of the L cluster, equal to 1024 | sched_update_clbstats() | SCHED_CAPACITY_SCALE |
| clbenv->bstats.cpu_capacity | clbenv->bstats | relative compute capacity of the B cluster, greater than 1024 | sched_update_clbstats() | SCHED_CAPACITY_SCALE * clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1) |
| clbs->ncpu | clbenv->bstats / clbenv->lstats | number of online cpus in the L/B cluster | collect_cluster_stats() | if (cpu_online(cpu)) clbs->ncpu++; |
| clbs->ntask | clbenv->bstats / clbenv->lstats | total number of runnable CFS tasks (all group levels) on the online cpus of the L/B cluster | collect_cluster_stats() | clbs->ntask += cpu_rq(cpu)->cfs.h_nr_running; |
| clbs->load_avg | clbenv->bstats / clbenv->lstats | average runnable load of the online cpus in the L/B cluster, without weight | collect_cluster_stats() | sum(cpu_rq(cpu)->cfs.avg.loadwop_avg) / clbs->ncpu |
| clbs->scaled_acap | clbenv->bstats / clbenv->lstats | remaining compute capacity of the L/B target cpu | collect_cluster_stats() | hmp_scale_down(clbs->cpu_capacity - cpu_rq(target)->cfs.avg.loadwop_avg) |
| clbs->scaled_atask | clbenv->bstats / clbenv->lstats | remaining task space of the L/B target cpu | collect_cluster_stats() | hmp_scale_down(clbs->cpu_capacity - cpu_rq(target)->cfs.h_nr_running * cpu_rq(target)->cfs.avg.loadwop_avg) |
| clbenv->bstats.threshold | clbenv->bstats | load threshold a task must exceed to up-migrate to the B cluster | adj_threshold() | HMP_MAX_LOAD - HMP_MAX_LOAD * b_nacap * b_natask / ((b_nacap + l_nacap) * (b_natask + l_natask) + 1); b_nacap and b_natask are first scaled by b_cpu_power/l_cpu_power, similar to the cpu_capacity computation |
| clbenv->lstats.threshold | clbenv->lstats | load threshold a task must be below to down-migrate to the L cluster | adj_threshold() | HMP_MAX_LOAD * l_nacap * l_natask / ((b_nacap + l_nacap) * (b_natask + l_natask) + 1); b_nacap and b_natask are first scaled by b_cpu_power/l_cpu_power, similar to the cpu_capacity computation |

- 5、Based on the computed statistics, decide whether task p satisfies the up-migration conditions (se_load(se) > B->threshold, among others);
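To make the two threshold formulas concrete, the following standalone sketch (plain userspace C with illustrative input values) reproduces the adj_threshold() arithmetic: the B cluster's spare capacity and task space are first rescaled into the LITTLE scale by b_cpu_power/l_cpu_power, and the up/down thresholds are then derived from the relative emptiness of the two clusters.

```c
#include <stdio.h>

#define HMP_MAX_LOAD 1023
#define POSITIVE(x) ((int)(x) < 0 ? 0 : (x))

int main(void)
{
	/* Illustrative inputs: cpu_power of each cluster and the scaled
	 * spare capacity / task space from collect_cluster_stats(). */
	int b_cpu_power = 1740, l_cpu_power = 1024;
	int b_scaled_acap = 40, b_scaled_atask = 30;
	int l_scaled_acap = 10, l_scaled_atask = 8;

	/* Rescale the B-cluster values into the LITTLE scale, as
	 * adj_threshold() does. */
	int b_nacap = POSITIVE(b_scaled_acap * b_cpu_power / (l_cpu_power + 1));
	int b_natask = POSITIVE(b_scaled_atask * b_cpu_power / (l_cpu_power + 1));
	int l_nacap = POSITIVE(l_scaled_acap);
	int l_natask = POSITIVE(l_scaled_atask);

	int up = HMP_MAX_LOAD - HMP_MAX_LOAD * b_nacap * b_natask /
		 ((b_nacap + l_nacap) * (b_natask + l_natask) + 1);
	int down = HMP_MAX_LOAD * l_nacap * l_natask /
		   ((b_nacap + l_nacap) * (b_natask + l_natask) + 1);

	/* The emptier the big cluster, the lower the up threshold (easier
	 * to up-migrate); the emptier the LITTLE cluster, the higher the
	 * down threshold (easier to down-migrate). */
	printf("up-threshold=%d down-threshold=%d\n", up, down);
	return 0;
}
```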
List of up-migration conditions (hmp_up_migration()):
条件 含义 计算方法 计算解析 [1] Migration stabilizing 如果target cpu刚做过up迁移,不适合再进行迁移 if (!hmp_up_stable(*target_cpu)) check->result = 0; (((now - hmp_last_up_migration(cpu)) >> 10)static void run_rebalance_domains(struct softirq_action *h){ struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; int this_cpu = smp_processor_id(); /* bypass load balance of HMP if EAS consideration */ /* (1) 在EAS不使能的情况下,尝试进行HMP负载均衡 */ if ((!energy_aware() && sched_feat(SCHED_HMP)) || (hybrid_support() && cpu_rq(this_cpu)->rd->overutilized)) hmp_force_up_migration(this_cpu); /* * If this cpu has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle cpus whose ticks are * stopped. Do nohz_idle_balance *before* rebalance_domains to * give the idle cpus a chance to load balance. Else we may * load balance only within the local sched_domain hierarchy * and abort nohz_idle_balance altogether if we pull some load. */ nohz_idle_balance(this_rq, idle); rebalance_domains(this_rq, idle);}|→static void hmp_force_up_migration(int this_cpu){ int curr_cpu, target_cpu;a struct sched_entity *se; struct rq *target; unsigned long flags; unsigned int force = 0; struct task_struct *p; struct clb_env clbenv;#ifdef CONFIG_SCHED_HMP_PLUS struct sched_entity *orig;#endif if (!spin_trylock(&hmp_force_migration)) return;#ifdef CONFIG_HMP_TRACER for_each_online_cpu(curr_cpu) trace_sched_cfs_runnable_load(curr_cpu, cfs_load(curr_cpu), cfs_length(curr_cpu));#endif /* Migrate heavy task from LITTLE to big */ /* (1.1) 逐个online cpu尝试进行heavy task从little cpu到big cpu的迁移 */ for_each_online_cpu(curr_cpu) { struct hmp_domain *hmp_domain = NULL; struct cpumask fast_cpu_mask, slow_cpu_mask; cpumask_clear(&fast_cpu_mask); cpumask_clear(&slow_cpu_mask); /* (1.2) 如果当前cpu不属于速度最快(fastest)的domain, 则尝试进行up操作 */ if (!hmp_cpu_is_fastest(curr_cpu)) { /* current cpu is slow_cpu_mask*/ /* (1.2.1) 当前cpu所在的hmp_domain为slow_cpu_mask */ hmp_domain = hmp_cpu_domain(curr_cpu); cpumask_copy(&slow_cpu_mask, &hmp_domain->possible_cpus); /* (1.2.2) 最fastest且online的hmp_domain为fast_cpu_mask */ while (&hmp_domain->hmp_domains != hmp_domains.next) { struct list_head *pos = &hmp_domain->hmp_domains; hmp_domain = list_entry(pos->prev, struct hmp_domain, hmp_domains); if (!cpumask_empty(&hmp_domain->cpus)) { cpumask_copy(&fast_cpu_mask, &hmp_domain->possible_cpus); break; } } } else { /* (1.3) 如果当前cpu属于速度最快(fastest)的domain, 则直接进行down操作 */ hmp_force_down_migration(this_cpu); continue; } if (!hmp_domain || hmp_domain == hmp_cpu_domain(curr_cpu)) continue; if (cpumask_empty(&fast_cpu_mask) || cpumask_empty(&slow_cpu_mask)) continue; force = 0; /* (1.4) 取出当前cpu的当前cfs进程 */ target = cpu_rq(curr_cpu); raw_spin_lock_irqsave(&target->lock, flags); se = target->cfs.curr; if (!se) { raw_spin_unlock_irqrestore(&target->lock, flags); continue; } /* Find task entity */ if (!entity_is_task(se)) { struct cfs_rq *cfs_rq; cfs_rq = group_cfs_rq(se); while (cfs_rq) { se = cfs_rq->curr; cfs_rq = group_cfs_rq(se); } }#ifdef CONFIG_SCHED_HMP_PLUS orig = se; /* (1.5) 或者取出当前cpu前5个cfs进程中,负载最重(heaviest)的进程 */ se = hmp_get_heaviest_task(se, -1); if (!se) { raw_spin_unlock_irqrestore(&target->lock, flags); continue; } if (!entity_is_task(se)) p = task_of(orig); else#endif p = task_of(se); /* (1.6) 选择fast_cpu_mask domain中,负载最少的cpu */ target_cpu = hmp_select_cpu(HMP_GB, p, &fast_cpu_mask, -1, 0); if (target_cpu >= num_possible_cpus()) { raw_spin_unlock_irqrestore(&target->lock, flags); continue; } /* Collect cluster information */ /* (1.7) 
up操作的对象已经选择好: 源little cpu:curr_cpu 目的big cpu:target_cpu */ memset(&clbenv, 0, sizeof(clbenv)); clbenv.flags |= HMP_GB; clbenv.ltarget = curr_cpu; clbenv.btarget = target_cpu; cpumask_copy(&clbenv.lcpus, &slow_cpu_mask); cpumask_copy(&clbenv.bcpus, &fast_cpu_mask); /* (1.8) up操作前的数据计算 */ sched_update_clbstats(&clbenv); /* Check migration threshold */ /* (1.9) 根据计算的数据,判断up操作的可行性 */ if (!target->active_balance && hmp_up_migration(curr_cpu, &target_cpu, se, &clbenv) && !cpu_park(cpu_of(target))) { if (p->state != TASK_DEAD) { /* 准备从target rq中迁移进程p到target_cpu, 设置rq正在处理负载balance标志active_balance */ get_task_struct(p); target->active_balance = 1; /* force up */ target->push_cpu = target_cpu; target->migrate_task = p; force = 1; trace_sched_hmp_migrate(p, target->push_cpu, 1); hmp_next_up_delay(&p->se, target->push_cpu); } } raw_spin_unlock_irqrestore(&target->lock, flags); /* (1.10) 判断结果是可以进行up操作, 则调用hmp_force_up_cpu_stop()进行实际的up操作 */ if (force) { if (stop_one_cpu_dispatch(cpu_of(target), hmp_force_up_cpu_stop, target, &target->active_balance_work)) { /* 迁移完成,清除标志 */ put_task_struct(p); /* out of rq->lock */ raw_spin_lock_irqsave(&target->lock, flags); target->active_balance = 0; force = 0; raw_spin_unlock_irqrestore(&target->lock, flags); } } else /* (1.11) 否则,再尝试进行down操作 */ hmp_force_down_migration(this_cpu); }#ifdef CONFIG_HMP_TRACER trace_sched_hmp_load(clbenv.bstats.load_avg, clbenv.lstats.load_avg);#endif spin_unlock(&hmp_force_migration);}||→static const int hmp_max_tasks = 5;static struct sched_entity *hmp_get_heaviest_task( struct sched_entity *se, int target_cpu){ int num_tasks = hmp_max_tasks; struct sched_entity *max_se = se; unsigned long int max_ratio = se->avg.loadwop_avg; const struct cpumask *hmp_target_mask = NULL; struct hmp_domain *hmp; /* (1.5.1) 如果本cpu是fastest cpu,则不用查找直接返回, 因为本函数的目的是找little cpu中的heaviest进程 */ if (hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq))) return max_se; /* (1.5.2) 获取比本cpu fater一级cpu的hmp_domain,作为进程亲和力判断的mask */ hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq)); hmp_target_mask = &hmp->cpus; /* (1.5.3) 传入参数target_cpu = -1, 所以hmp_target_mask使用的是源cpu hmp_domain的hmp->cpus */ if (target_cpu >= 0) { /* idle_balance gets run on a CPU while * it is in the middle of being hotplugged * out. Bail early in that case. */ if (!cpumask_test_cpu(target_cpu, hmp_target_mask)) return NULL; hmp_target_mask = cpumask_of(target_cpu); } /* The currently running task is not on the runqueue */ /* (1.5.4) 从当前cpu的cfs红黑树中,连续5个进程和curr进程比较,选出heaviest进程 比较使用的负载为se->avg.loadwop_avg,不带weight分量 */ se = __pick_first_entity(cfs_rq_of(se)); while (num_tasks && se) { if (entity_is_task(se) && se->avg.loadwop_avg > max_ratio && cpumask_intersects(hmp_target_mask, tsk_cpus_allowed(task_of(se)))) { max_se = se; max_ratio = se->avg.loadwop_avg; } se = __pick_next_entity(se); num_tasks--; } return max_se;}||→static unsigned int hmp_select_cpu(unsigned int caller, struct task_struct *p, struct cpumask *mask, int prev, int up){ int curr = 0; int target = num_possible_cpus(); unsigned long curr_wload = 0; unsigned long target_wload = 0; struct cpumask srcp; /* (1.6.1) 综合fast_cpu_mask、cpu_online_mask、tsk_cpus_allowed(p), 选取first cpu为target */ cpumask_and(&srcp, cpu_online_mask, mask); target = cpumask_any_and(&srcp, tsk_cpus_allowed(p)); if (target >= num_possible_cpus()) goto out; /* * RT class is taken into account because CPU load is multiplied * by the total number of CPU runnable tasks that includes RT tasks. 
*/ /* (1.6.2) 计算target cpu所对应的load, target_wload = (rq->cfs.avg.loadwop_avg + rq->cfs.avg.pending_load) * (rq->nr_running + rq->cfs.avg.nr_pending) 该负载会受RT进程的影响,因为rq->nr_running会统计包括RT进程的数量 */ target_wload = hmp_inc(cfs_load(target)); target_wload += cfs_pending_load(target); target_wload *= rq_length(target); for_each_cpu(curr, mask) { /* Check CPU status and task affinity */ if (!cpu_online(curr) || !cpumask_test_cpu(curr, tsk_cpus_allowed(p))) continue; /* For global load balancing, unstable CPU will be bypassed */ /* (1.6.3) 如果当前是up操作,如果cpu在短时间内进行了down操作,则不适合马上进行up操作 */ if (hmp_caller_is_gb(caller) && !hmp_cpu_stable(curr, up)) continue; curr_wload = hmp_inc(cfs_load(curr)); curr_wload += cfs_pending_load(curr); curr_wload *= rq_length(curr); /* (1.6.4) 选择load最小的作为target cpu */ if (curr_wload < target_wload) { target_wload = curr_wload; target = curr; /* (1.6.5) 在load同样小的情况下,选择prev cpu */ } else if (curr_wload == target_wload && curr == prev) { target = curr; } }out: return target;}||→static void sched_update_clbstats(struct clb_env *clbenv){ /* init cpu power and capacity */ /* (1.8.1) L族和B族的绝对运行能力和相对运算能力, .cpu_power = 绝对运算能力 .cpu_capacity = 相对运算能力 */ clbenv->bstats.cpu_power = (int) arch_scale_cpu_capacity(NULL, clbenv->btarget); clbenv->lstats.cpu_power = (int) arch_scale_cpu_capacity(NULL, clbenv->ltarget); clbenv->lstats.cpu_capacity = SCHED_CAPACITY_SCALE; clbenv->bstats.cpu_capacity = SCHED_CAPACITY_SCALE * clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1); /* (1.8.2) L族和B族的 */ collect_cluster_stats(&clbenv->bstats, &clbenv->bcpus, clbenv->btarget); collect_cluster_stats(&clbenv->lstats, &clbenv->lcpus, clbenv->ltarget); /* (1.8.3) L族和B族的 */ adj_threshold(clbenv);}|||→static void collect_cluster_stats(struct clb_stats *clbs, struct cpumask *cluster_cpus, int target){#define HMP_RESOLUTION_SCALING (4)#define hmp_scale_down(w) ((w) >> HMP_RESOLUTION_SCALING) /* Update cluster informatics */ int cpu; /* (1.8.2.1) 累加本族online cpu的值 */ for_each_cpu(cpu, cluster_cpus) { if (cpu_online(cpu)) { clbs->ncpu++; clbs->ntask += cpu_rq(cpu)->cfs.h_nr_running; clbs->load_avg += cpu_rq(cpu)->cfs.avg.loadwop_avg;#ifdef CONFIG_SCHED_HMP_PRIO_FILTER clbs->nr_normal_prio_task += cfs_nr_normal_prio(cpu); clbs->nr_dequeuing_low_prio += cfs_nr_dequeuing_low_prio(cpu);#endif } } if (!clbs->ncpu || target >= num_possible_cpus() || !cpumask_test_cpu(target, cluster_cpus)) return; /* * Calculate available CPU capacity * Calculate available task space * * Why load ratio should be multiplied by the number of task ? * The task is the entity of scheduling unit so that we should consider * it in scheduler. Only considering task load is not enough. * Thus, multiplying the number of tasks can adjust load ratio to a more * reasonable value. 
*/ /* (1.8.2.2) 计算本族剩余的cpu计算能力 capacity = 相对计算能力(clbs->cpu_capacity) - 本cpu的负载(rq->cfs.avg.loadwop_avg) :clbs->cpu_capacity是B族和L族相对的(L是1024,B大于1024),而负载(rq->cfs.avg.loadwop_avg)是相对自己的B族和L族的最大值都是1024 */ clbs->load_avg /= clbs->ncpu; clbs->acap = clbs->cpu_capacity - cpu_rq(target)->cfs.avg.loadwop_avg; clbs->scaled_acap = hmp_scale_down(clbs->acap); /* (1.8.2.3) 计算本族剩余的task空间 scaled_atask = 相对计算能力(clbs->cpu_capacity) - 本cpu的负载(rq->cfs.avg.loadwop_avg)*本cpu所有的进程数量(rq->cfs.h_nr_running) ooooo这里的计算也不是在同一纬度上的 */ clbs->scaled_atask = cpu_rq(target)->cfs.h_nr_running * cpu_rq(target)->cfs.avg.loadwop_avg; clbs->scaled_atask = clbs->cpu_capacity - clbs->scaled_atask; clbs->scaled_atask = hmp_scale_down(clbs->scaled_atask); mt_sched_printf(sched_log, "[%s] cpu/cluster:%d/%02lx load/len:%lu/%u stats:%d,%d,%d,%d,%d,%d,%d,%d\n", __func__, target, *cpumask_bits(cluster_cpus), cpu_rq(target)->cfs.avg.loadwop_avg, cpu_rq(target)->cfs.h_nr_running, clbs->ncpu, clbs->ntask, clbs->load_avg, clbs->cpu_capacity, clbs->acap, clbs->scaled_acap, clbs->scaled_atask, clbs->threshold);}|||→/* * Task Dynamic Migration Threshold Adjustment. * * If the workload between clusters is not balanced, adjust migration * threshold in an attempt to move task precisely. * * Diff. = Max Threshold - Min Threshold * * Dynamic UP-Threshold = * B_nacap B_natask * Max Threshold - Diff. x ----------------- x ------------------- * B_nacap + L_nacap B_natask + L_natask * * * Dynamic Down-Threshold = * L_nacap L_natask * Min Threshold + Diff. x ----------------- x ------------------- * B_nacap + L_nacap B_natask + L_natask */static void adj_threshold(struct clb_env *clbenv){#define POSITIVE(x) ((int)(x) < 0 ? 0 : (x)) unsigned long b_cap = 0, l_cap = 0; int b_nacap, l_nacap, b_natask, l_natask; b_cap = clbenv->bstats.cpu_power; l_cap = clbenv->lstats.cpu_power; /* (1.8.3.1) 把B族剩余cpu计算能力和task空间,转换成L族的相对值 */ b_nacap = POSITIVE(clbenv->bstats.scaled_acap * clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1)); b_natask = POSITIVE(clbenv->bstats.scaled_atask * clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1)); /* L族的值维持不变 */ l_nacap = POSITIVE(clbenv->lstats.scaled_acap); l_natask = POSITIVE(clbenv->lstats.scaled_atask); /* (1.8.3.2) 计算up的threshold, up-threshold = HMP_MAX_LOAD - HMP_MAX_LOAD*B族剩余 */ clbenv->bstats.threshold = HMP_MAX_LOAD - HMP_MAX_LOAD * b_nacap * b_natask / ((b_nacap + l_nacap) * (b_natask + l_natask) + 1); /* (1.8.3.3) 计算down的threshold, down-threshold = HMP_MAX_LOAD*L族剩余 */ clbenv->lstats.threshold = HMP_MAX_LOAD * l_nacap * l_natask / ((b_nacap + l_nacap) * (b_natask + l_natask) + 1); mt_sched_printf(sched_log, "[%s]\tup/dl:%4d/%4d L(%d:%4lu) b(%d:%4lu)\n", __func__, clbenv->bstats.threshold, clbenv->lstats.threshold, clbenv->ltarget, l_cap, clbenv->btarget, b_cap);}||→/* * Check whether this task should be migrated to big * Briefly summarize the flow as below; * 1) Migration stabilizing * 2) Filter low-priority task * 2.5) Keep all cpu busy * 3) Check CPU capacity * 4) Check dynamic migration threshold */static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se, struct clb_env *clbenv){ struct task_struct *p = task_of(se); struct clb_stats *L, *B; struct mcheck *check; int curr_cpu = cpu;#ifdef CONFIG_HMP_TRACER unsigned int caller = clbenv->flags;#endif L = &clbenv->lstats; B = &clbenv->bstats; check = &clbenv->mcheck; check->status = clbenv->flags; check->status |= HMP_TASK_UP_MIGRATION; check->result = 0; /* * No migration is needed if * 1) There is only one cluster * 2) Task is 
already in big cluster * 3) It violates task affinity */ if (!L->ncpu || !B->ncpu || cpumask_test_cpu(curr_cpu, &clbenv->bcpus) || !cpumask_intersects(&clbenv->bcpus, tsk_cpus_allowed(p))) goto out; /* (1.9.1) 如果目标cpu短时间内已经执行了up操作,则为up unstable状态,退出 */ /* * [1] Migration stabilizing * Let the task load settle before doing another up migration. * It can prevent a bunch of tasks from migrating to a unstable CPU. */ if (!hmp_up_stable(*target_cpu)) goto out; /* (1.9.2) 过滤掉优先级较低的进程,不进行迁移操作。具体有3个条件: (task_low_priority(p->prio) && \ // nice值大于5 (B->ntask >= B->ncpu || 0 != L->nr_normal_prio_task) && \ // B组进程大于cou数 || 正常优先级的进程不为0 (p->se.avg.loadwop_avg < 800)) // 平均负载小于800 */ /* [2] Filter low-priority task */#ifdef CONFIG_SCHED_HMP_PRIO_FILTER if (hmp_low_prio_task_up_rejected(p, B, L)) { check->status |= HMP_LOW_PRIORITY_FILTER; goto trace; }#endif /* (1.9.3) 如果B组的target cpu为idle,不用过多判断,直接准备迁移 */ /* [2.5]if big is idle, just go to big */ if (rq_length(*target_cpu) == 0) { check->status |= HMP_BIG_IDLE; check->status |= HMP_MIGRATION_APPROVED; check->result = 1; goto trace; } /* (1.9.4) 判断B族target cpu的capacity是否足够, (se_load(se) + cfs_load(cpu)) < (B->cpu_capacity - (B->cpu_capacity >> 2)) // target cpu负载 + 要迁移的se负载 是否小于 3/4 B族cpu的capacity */ /* * [3] Check CPU capacity * Forbid up-migration if big CPU can't handle this task */ if (!hmp_task_fast_cpu_afford(B, se, *target_cpu)) { check->status |= HMP_BIG_CAPACITY_INSUFFICIENT; goto trace; } /* (1.9.5) 判断se的负载是否已经大于up-threshold(B->threshold) */ /* * [4] Check dynamic migration threshold * Migrate task from LITTLE to big if load is greater than up-threshold */ if (se_load(se) > B->threshold) { check->status |= HMP_MIGRATION_APPROVED; check->result = 1; }trace:#ifdef CONFIG_HMP_TRACER if (check->result && hmp_caller_is_gb(caller)) hmp_stats.nr_force_up++; trace_sched_hmp_stats(&hmp_stats); trace_sched_dynamic_threshold(task_of(se), B->threshold, check->status, curr_cpu, *target_cpu, se_load(se), B, L); trace_sched_dynamic_threshold_draw(B->threshold, L->threshold);#endifout: return check->result;}||→static int hmp_force_up_cpu_stop(void *data){ /* (1.10.1) 执行进程迁移 */ return hmp_active_task_migration_cpu_stop(data);}|||→static int hmp_active_task_migration_cpu_stop(void *data){ struct rq *busiest_rq = data; struct task_struct *p = NULL; int busiest_cpu = cpu_of(busiest_rq); int target_cpu = busiest_rq->push_cpu; struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; raw_spin_lock_irq(&busiest_rq->lock); p = busiest_rq->migrate_task; /* make sure the requested cpu hasn't gone down in the meantime */ if (unlikely(busiest_cpu != smp_processor_id() || !busiest_rq->active_balance)) { goto out_unlock; } /* Is there any task to move? */ if (busiest_rq->nr_running <= 1) goto out_unlock; /* Are both target and busiest cpu online */ if (!cpu_online(busiest_cpu) || !cpu_online(target_cpu)) goto out_unlock; /* Task has migrated meanwhile, abort forced migration */ if ((!p) || (task_rq(p) != busiest_rq)) goto out_unlock; /* * This condition is "impossible", if it occurs * we need to fix it. Originally reported by * Bjorn Helgaas on a 128-cpu setup. */ WARN_ON(busiest_rq == target_rq); /* (1.10.1.1) 将源、目的rq lock住 */ /* move a task from busiest_rq to target_rq */ double_lock_balance(busiest_rq, target_rq); /* (1.10.1.2) 搜索target cpu所在的某一层次的sd,其sd->span[]即包含源cpu又包含目的cpu */ /* Search for an sd spanning us and the target CPU. 
*/ rcu_read_lock(); for_each_domain(target_cpu, sd) { if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) break; } /* (1.10.1.3) 构造数据,在同一sd下进行迁移 */ if (likely(sd)) { struct lb_env env = { .sd = sd, .dst_cpu = target_cpu, .dst_rq = target_rq, .src_cpu = busiest_rq->cpu, .src_rq = busiest_rq, .idle = CPU_IDLE, }; schedstat_inc(sd, alb_count); /* (1.10.1.4) 任务迁移 */ if (move_specific_task(&env, p)) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); } rcu_read_unlock(); double_unlock_balance(busiest_rq, target_rq);out_unlock: busiest_rq->active_balance = 0; raw_spin_unlock_irq(&busiest_rq->lock); put_task_struct(p); return 0;}||||→static int move_specific_task(struct lb_env *env, struct task_struct *pm){ struct task_struct *p, *n; /* (1.10.1.4.1) 从源rq->cfs_tasks逐个取出任务,直到查到pm */ list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { /* (1.10.1.4.2) task group的throttled判断 */ if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu)) continue; /* (1.10.1.4.3) 判断任务能否被迁移 */ if (!hmp_can_migrate_task(p, env)) continue; /* Check if we found the right task */ if (p != pm) continue; /* (1.10.1.4.4) 迁移 */ move_task(p, env); /* * Right now, this is only the third place move_task() * is called, so we can safely collect move_task() * stats here rather than inside move_task(). */ schedstat_inc(env->sd, lb_gained[env->idle]); return 1; } return 0;}|||||→static void move_task(struct task_struct *p, struct lb_env *env){ deactivate_task(env->src_rq, p, 0); set_task_cpu(p, env->dst_cpu); activate_task(env->dst_rq, p, 0); check_preempt_curr(env->dst_rq, p, 0);}
4.2.3、hmp_force_down_migration()
hmp_force_down_migration() goes through the following main steps:
- 1、Based on the current cpu, pick fast_cpu_mask and slow_cpu_mask;
hmp_force_down_migration() tries to migrate light tasks from a fast cpu to a slow cpu; which masks count as fast and slow depends on where the current cpu sits in the hmp_domains list (see the annotated code below);
- 2、Pick the lightest task on the current cpu as the migration candidate p; the code does not walk every task on the cpu, it only inspects the curr task and up to 5 tasks from the cfs_rq (hmp_get_lightest_task());
- 3、From slow_cpu_mask, pick the least-loaded cpu as the target cpu;
- 4、Compute the load statistics for the source cpu (curr_cpu) and the destination cpu (target_cpu); the quantities are the same as for hmp_force_up_migration(), see the previous section;
- 5、Based on the computed statistics, decide whether task p satisfies the down-migration conditions (L->threshold >= se_load(se), among others); besides the threshold, two capacity checks also apply, see the sketch below;
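Besides the threshold comparison in step 5, hmp_down_migration() applies two capacity checks (conditions [3] in the annotated code below): the big cpu must actually be oversubscribed, and the LITTLE target must have enough spare capacity for the task. A minimal sketch of that arithmetic with illustrative values:

```c
#include <stdio.h>

int main(void)
{
	/* Illustrative values: relative capacity of the big cluster and the
	 * current weight-free CFS load on the big cpu being drained. */
	unsigned int b_cpu_capacity = 1740;
	unsigned int big_cfs_load = 1500;

	/* LITTLE cluster spare capacity and the candidate task's load. */
	int l_acap = 300;
	unsigned int task_load = 250;

	/* [3a] Only force a down-migration if the big cpu is oversubscribed,
	 * i.e. its load is not below 3/4 of its relative capacity. */
	int big_oversubscribed =
		!(big_cfs_load < (b_cpu_capacity - (b_cpu_capacity >> 2)));

	/* [3b] The LITTLE target must have enough spare capacity left. */
	int little_affords = (l_acap > 0 && l_acap >= (int)task_load);

	printf("big oversubscribed: %d, LITTLE affords task: %d -> %s\n",
	       big_oversubscribed, little_affords,
	       (big_oversubscribed && little_affords) ?
			"capacity checks pass" : "keep task on big");
	return 0;
}
```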
List of down-migration conditions (hmp_down_migration()):
条件 含义 计算方法 计算解析 [1] Migration stabilizing 如果target cpu刚做过down迁移,不适合再进行迁移 if (!hmp_down_stable(*target_cpu)) check->result = 0; (((now - hmp_last_down_migration(cpu)) >> 10)static void hmp_force_down_migration(int this_cpu){ int target_cpu; struct sched_entity *se; struct rq *target; unsigned long flags; unsigned int force = 0; struct task_struct *p; struct clb_env clbenv;#ifdef CONFIG_SCHED_HMP_PLUS struct sched_entity *orig; int B_cpu;#endif struct hmp_domain *hmp_domain = NULL; struct cpumask fast_cpu_mask, slow_cpu_mask; cpumask_clear(&fast_cpu_mask); cpumask_clear(&slow_cpu_mask); /* Migrate light task from big to LITTLE */ /* (1) 如果当前cpu不是最慢的cpu(slowest),则尝试down操作 */ if (!hmp_cpu_is_slowest(this_cpu)) { /* (2) 当前cpu所在的hmp_domain为fast_cpu_mask */ hmp_domain = hmp_cpu_domain(this_cpu); cpumask_copy(&fast_cpu_mask, &hmp_domain->possible_cpus); /* (3) 查找相比当前最慢且online的hmp_domain作为slow_cpu_mask */ while (!list_is_last(&hmp_domain->hmp_domains, &hmp_domains)) { struct list_head *pos = &hmp_domain->hmp_domains; hmp_domain = list_entry(pos->next, struct hmp_domain, hmp_domains); if (!cpumask_empty(&hmp_domain->cpus)) { cpumask_copy(&slow_cpu_mask, &hmp_domain->possible_cpus); break; } } } if (!hmp_domain || hmp_domain == hmp_cpu_domain(this_cpu)) return; /* (4) 找不到可操作的fast_cpu_mask、slow_cpu_mask直接返回 */ if (cpumask_empty(&fast_cpu_mask) || cpumask_empty(&slow_cpu_mask)) return; /* (5) 源cpu = this_cpu,源rq = target */ force = 0; target = cpu_rq(this_cpu); raw_spin_lock_irqsave(&target->lock, flags); se = target->cfs.curr; if (!se) { raw_spin_unlock_irqrestore(&target->lock, flags); return; } /* (6) 首先尝试使用curr进程作为down迁移的进程 */ /* Find task entity */ if (!entity_is_task(se)) { struct cfs_rq *cfs_rq; cfs_rq = group_cfs_rq(se); while (cfs_rq) { se = cfs_rq->curr; cfs_rq = group_cfs_rq(se); } }#ifdef CONFIG_SCHED_HMP_PLUS /* (7) 在curr进程开始的5个进程中,挑负载最轻的进程作为down迁移进程 */ orig = se; se = hmp_get_lightest_task(orig, 1); if (!entity_is_task(se)) p = task_of(orig); else#endif p = task_of(se);#ifdef CONFIG_SCHED_HMP_PLUS /* (8) 找出B族中负载最轻的cpu,如果其为idle状态,则放弃down操作 因为load_balance中的idle_balance会重新把任务迁移回idle的big cpu,避免相互的乒乓操作 */ /* Don't offload to little if there is one idle big, let load balance to do it's work */ /* Also, to prevent idle_balance from leading to potential ping-pong */ B_cpu = hmp_select_cpu(HMP_GB, p, &fast_cpu_mask, this_cpu, 0); if (B_cpu < nr_cpu_ids && !rq_length(B_cpu)) { raw_spin_unlock_irqrestore(&target->lock, flags); return; }#endif /* (9) 找出L族中负载最轻的cpu作为target_cpu */ target_cpu = hmp_select_cpu(HMP_GB, p, &slow_cpu_mask, -1, 1); if (target_cpu >= num_possible_cpus()) { raw_spin_unlock_irqrestore(&target->lock, flags); return; } /* (10) 迁移前对B族、L族负载和threshold的计算 */ /* Collect cluster information */ memset(&clbenv, 0, sizeof(clbenv)); clbenv.flags |= HMP_GB; clbenv.btarget = this_cpu; clbenv.ltarget = target_cpu; cpumask_copy(&clbenv.lcpus, &slow_cpu_mask); cpumask_copy(&clbenv.bcpus, &fast_cpu_mask); sched_update_clbstats(&clbenv);#ifdef CONFIG_SCHED_HMP_PLUS if (cpu_rq(this_cpu)->cfs.h_nr_running < 2) { raw_spin_unlock_irqrestore(&target->lock, flags); return; }#endif /* (11) 检查down操作的迁移条件是否成立,hmp_down_migration() */ /* Check migration threshold */ if (!target->active_balance && hmp_down_migration(this_cpu, &target_cpu, se, &clbenv) && !cpu_park(cpu_of(target))) { if (p->state != TASK_DEAD) { get_task_struct(p); target->active_balance = 1; /* force down */ target->push_cpu = target_cpu; target->migrate_task = p; force = 1; trace_sched_hmp_migrate(p, target->push_cpu, 1); 
hmp_next_down_delay(&p->se, target->push_cpu); } } raw_spin_unlock_irqrestore(&target->lock, flags); /* (12) 条件成立进行实际的down迁移操作hmp_force_down_cpu_stop() */ if (force) { if (stop_one_cpu_dispatch(cpu_of(target), hmp_force_down_cpu_stop, target, &target->active_balance_work)) { put_task_struct(p); /* out of rq->lock */ raw_spin_lock_irqsave(&target->lock, flags); target->active_balance = 0; force = 0; raw_spin_unlock_irqrestore(&target->lock, flags); } }}|→static struct sched_entity *hmp_get_lightest_task( struct sched_entity *se, int migrate_down){ int num_tasks = hmp_max_tasks; struct sched_entity *min_se = se; unsigned long int min_ratio = se->avg.loadwop_avg; const struct cpumask *hmp_target_mask = NULL; if (migrate_down) { struct hmp_domain *hmp; /* (7.1) 如果cpu是最慢cpu(slowest)则直接退出, 因为本函数的目的是找出faster cpu中lightest进程 */ if (hmp_cpu_is_slowest(cpu_of(se->cfs_rq->rq))) return min_se; /* (7.2) 将更slow一级的hmp_domain作为进程cpu亲和力的mask */ hmp = hmp_slower_domain(cpu_of(se->cfs_rq->rq)); hmp_target_mask = &hmp->cpus; } /* The currently running task is not on the runqueue */ se = __pick_first_entity(cfs_rq_of(se)); /* (7.3) 从当前cpu的cfs红黑树中,连续5个进程和curr进程比较,选出lightest进程 比较使用的负载为se->avg.loadwop_avg,不带weight分量 */ while (num_tasks && se) { if (entity_is_task(se) && (se->avg.loadwop_avg < min_ratio && hmp_target_mask && cpumask_intersects(hmp_target_mask, tsk_cpus_allowed(task_of(se))))) { min_se = se; min_ratio = se->avg.loadwop_avg; } se = __pick_next_entity(se); num_tasks--; } return min_se;}|→/* * Check whether this task should be migrated to LITTLE * Briefly summarize the flow as below; * 1) Migration stabilizing * 1.5) Keep all cpu busy * 2) Filter low-priority task * 3) Check CPU capacity * 4) Check dynamic migration threshold */static unsigned int hmp_down_migration(int cpu, int *target_cpu, struct sched_entity *se, struct clb_env *clbenv){ struct task_struct *p = task_of(se); struct clb_stats *L, *B; struct mcheck *check; int curr_cpu = cpu; unsigned int caller = clbenv->flags; L = &clbenv->lstats; B = &clbenv->bstats; check = &clbenv->mcheck; check->status = caller; check->status |= HMP_TASK_DOWN_MIGRATION; check->result = 0; /* * No migration is needed if * 1) There is only one cluster * 2) Task is already in LITTLE cluster * 3) It violates task affinity */ if (!L->ncpu || !B->ncpu || cpumask_test_cpu(curr_cpu, &clbenv->lcpus) || !cpumask_intersects(&clbenv->lcpus, tsk_cpus_allowed(p))) goto out; /* (11.1) 目的little cpu target_cpu近期如果有做过down操作,不适合再做down迁移 */ /* * [1] Migration stabilizing * Let the task load settle before doing another down migration. * It can prevent a bunch of tasks from migrating to a unstable CPU. 
*/ if (!hmp_down_stable(*target_cpu)) goto out; /* (11.2) 如果big busy,little idle则不用进行threshold判断 */ /* [1.5]if big is busy and little is idle, just go to little */ if (rq_length(*target_cpu) == 0 && caller == HMP_SELECT_RQ && rq_length(curr_cpu) > 0) { struct rq *curr_rq = cpu_rq(curr_cpu); /* (11.2.1) 如果big cpu,curr进程不是heavy进程,但是p是heavy进程,直接准许down迁移 heavy进程的判断标准为:负载>=650 */ /* if current big core is not heavy task and wake up task is heavy task no go to little */ if (!(!is_heavy_task(curr_rq->curr) && is_heavy_task(p))) { check->status |= HMP_BIG_BUSY_LITTLE_IDLE; check->status |= HMP_MIGRATION_APPROVED; check->result = 1; goto trace; } } /* (11.3) 低优先级进程,如果满足以下条件,准许迁移: (task_low_priority(p->prio) && !B->nr_dequeuing_low_prio && \ // nice值大于5 B->ntask >= B->ncpu && 0 != L->nr_normal_prio_task && \ // B和L都不是特别空闲 (p->se.avg.loadwop_avg < 800)) // L上准备迁移的进程负载小于800 */ /* [2] Filter low-priority task */#ifdef CONFIG_SCHED_HMP_PRIO_FILTER if (hmp_low_prio_task_down_allowed(p, B, L)) { cfs_nr_dequeuing_low_prio(curr_cpu)++; check->status |= HMP_LOW_PRIORITY_FILTER; check->status |= HMP_MIGRATION_APPROVED; check->result = 1; goto trace; }#endif /* * [3] Check CPU capacity * Forbid down-migration if either of the following conditions is true * 1) big cpu is not oversubscribed (if big CPU seems to have spare * cycles, do not force this task to run on LITTLE CPU, but * keep it staying in its previous cluster instead) * 2) LITTLE cpu doesn't have available capacity for this new task */ /* (11.4) 如果big cpu有足够的空闲周期,不需要强制把light任务迁移到little cpu上 cfs_load(cpu) < (B->cpu_capacity - (B->cpu_capacity >> 2)) */ if (!hmp_fast_cpu_oversubscribed(caller, B, se, curr_cpu)) { check->status |= HMP_BIG_NOT_OVERSUBSCRIBED; goto trace; } /* (11.5) 判断L族cpu的capacity是否足够容纳需要迁移的进程, (L->acap > 0 && L->acap >= se_load(se)) */ if (!hmp_task_slow_cpu_afford(L, se)) { check->status |= HMP_LITTLE_CAPACITY_INSUFFICIENT; goto trace; } /* (11.6) 判断se的负载是否已经小于down-threshold(L->threshold) */ /* * [4] Check dynamic migration threshold * Migrate task from big to LITTLE if load ratio is less than * or equal to down-threshold */ if (L->threshold >= se_load(se)) { check->status |= HMP_MIGRATION_APPROVED; check->result = 1; }trace:#ifdef CONFIG_HMP_TRACER if (check->result && hmp_caller_is_gb(caller)) hmp_stats.nr_force_down++; trace_sched_hmp_stats(&hmp_stats); trace_sched_dynamic_threshold(task_of(se), L->threshold, check->status, curr_cpu, *target_cpu, se_load(se), B, L); trace_sched_dynamic_threshold_draw(B->threshold, L->threshold);#endifout: return check->result;}
4.2.4、hmp_select_task_rq_fair()
hmp_select_task_rq_fair() has already been listed above as part of the wake-up path: it walks hmp_domains from fast to slow and calls hmp_select_task_migration() to choose between the fast and slow candidate cpus.
4.3、cpu freq adjustment
The load-balancing techniques discussed so far all work by migrating load, moving it to the most idle or the most power-efficient cpu. The other knob is adjusting the cpu frequency, which changes the cpu's curr_capacity and can therefore be used to meet both performance and power requirements.
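The relation being exploited is roughly capacity_curr = capacity_orig * freq_curr / freq_max, which is what capacity_curr_of() amounts to; a tiny sketch with illustrative OPP values:

```c
#include <stdio.h>

/* Roughly what capacity_curr_of() amounts to: the original (max-frequency)
 * capacity scaled by the current OPP.  The values below are illustrative. */
static unsigned long capacity_at_freq(unsigned long capacity_orig,
				      unsigned long freq_cur_khz,
				      unsigned long freq_max_khz)
{
	return capacity_orig * freq_cur_khz / freq_max_khz;
}

int main(void)
{
	unsigned long cap_orig = 1024;	/* capacity at the highest OPP */
	unsigned long fmax = 2106000;	/* kHz */
	unsigned long freqs[] = { 598000, 1144000, 1703000, 2106000 };

	for (int i = 0; i < 4; i++)
		printf("%7lu kHz -> capacity %lu\n",
		       freqs[i], capacity_at_freq(cap_orig, freqs[i], fmax));
	return 0;
}
```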
cpu frequency adjustment is organised in three layers: cpufreq governor, cpufreq core and cpufreq driver.
- 1、The cpufreq governor implements the scaling policy: it computes the load and dynamically adjusts the frequency as the load changes;
- 2、The cpufreq core wraps the common infrastructure, for example the cpufreq_policy abstraction;
- 3、The cpufreq driver implements the low-level operations, such as initializing the freq_table and programming the target cpu frequency (a sketch of how a request flows through the layers follows this list);
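A hedged sketch of how one request flows through the three layers (hypothetical toy_* names; in the real kernel the governor calls __cpufreq_driver_target(), the core clamps the request to policy->min/max, and the driver programs the hardware):

```c
#include <stdio.h>

/* Hypothetical, simplified model of the three cpufreq layers. */
struct toy_policy {
	unsigned int min, max, cur;	/* kHz */
};

/* "driver" layer: would program the PLL/OPP; here it just records it. */
static void toy_driver_target(struct toy_policy *p, unsigned int freq)
{
	p->cur = freq;
	printf("driver: set %u kHz\n", freq);
}

/* "core" layer: clamp the governor's request to the policy limits. */
static void toy_core_target(struct toy_policy *p, unsigned int freq)
{
	if (freq < p->min)
		freq = p->min;
	if (freq > p->max)
		freq = p->max;
	toy_driver_target(p, freq);
}

int main(void)
{
	struct toy_policy policy = { .min = 598000, .max = 1703000, .cur = 0 };

	/* "governor" layer: picks a target frequency from the measured load. */
	toy_core_target(&policy, 2106000);	/* clamped to policy.max */
	toy_core_target(&policy, 300000);	/* clamped to policy.min */
	return 0;
}
```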
On MTK platforms the cpufreq driver has to accept frequency adjustments not only from the governor but also from PPM (registered via ppm_limit_callback in the probe code below).
4.3.1、cpufreq core & cpufreq driver
The heart of the cpufreq core layer is that each cpu has its own cpufreq_policy, stored in the per_cpu(cpufreq_cpu_data, cpu) variable. In practice there is one cpufreq_policy per cluster: on the current architecture all cpus in a cluster run at the same frequency, so the per_cpu(cpufreq_cpu_data, cpu) pointers of every cpu in a cluster point to the same cpufreq_policy.
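A minimal sketch of that sharing (a plain array standing in for the per_cpu(cpufreq_cpu_data, cpu) slots; toy_policy is a hypothetical stand-in for struct cpufreq_policy): all cpus of one cluster reference the same policy object, so a frequency change made through any of them is visible to the whole cluster.

```c
#include <stdio.h>

#define NR_CPUS 8

/* Hypothetical stand-in for struct cpufreq_policy. */
struct toy_policy {
	unsigned int cpu;	/* managing cpu */
	unsigned int cur;	/* kHz, shared by the whole cluster */
};

/* Stand-in for per_cpu(cpufreq_cpu_data, cpu). */
static struct toy_policy *cpufreq_cpu_data[NR_CPUS];

int main(void)
{
	static struct toy_policy little = { .cpu = 0, .cur = 1144000 };
	static struct toy_policy big    = { .cpu = 4, .cur = 1703000 };

	/* cpus 0-3 share the LITTLE policy, cpus 4-7 share the big policy. */
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		cpufreq_cpu_data[cpu] = (cpu < 4) ? &little : &big;

	/* Changing the cluster frequency through any cpu is seen by all. */
	cpufreq_cpu_data[2]->cur = 1352000;
	printf("cpu0 sees %u kHz, cpu3 sees %u kHz\n",
	       cpufreq_cpu_data[0]->cur, cpufreq_cpu_data[3]->cur);
	return 0;
}
```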
4.3.1.1、cpufreq_policy initialization
struct cpufreq_policy { /* CPUs sharing clock, require sw coordination */ cpumask_var_t cpus; /* Online CPUs only */ cpumask_var_t related_cpus; /* Online + Offline CPUs */ cpumask_var_t real_cpus; /* Related and present */ unsigned int shared_type; /* ACPI: ANY or ALL affected CPUs should set cpufreq */ unsigned int cpu; /* cpu managing this policy, must be online */ struct clk *clk; struct cpufreq_cpuinfo cpuinfo;/* see above */ unsigned int min; /* in kHz */ unsigned int max; /* in kHz */ unsigned int cur; /* in kHz, only needed if cpufreq * governors are used */ unsigned int restore_freq; /* = policy->cur before transition */ unsigned int suspend_freq; /* freq to set during suspend */ unsigned int policy; /* see above */ unsigned int last_policy; /* policy before unplug */ struct cpufreq_governor *governor; /* see below */ void *governor_data; bool governor_enabled; /* governor start/stop flag */ char last_governor[CPUFREQ_NAME_LEN]; /* last governor used */ struct work_struct update; /* if update_policy() needs to be * called, but you're in IRQ context */ struct cpufreq_user_policy user_policy; struct cpufreq_frequency_table *freq_table; struct list_head policy_list; struct kobject kobj; struct completion kobj_unregister; /* * The rules for this semaphore: * - Any routine that wants to read from the policy structure will * do a down_read on this semaphore. * - Any routine that will write to the policy structure and/or may take away * the policy altogether (eg. CPU hotplug), will hold this lock in write * mode before doing so. * * Additional rules: * - Lock should not be held across * __cpufreq_governor(data, CPUFREQ_GOV_POLICY_EXIT); */ struct rw_semaphore rwsem; /* Synchronization for frequency transitions */ bool transition_ongoing; /* Tracks transition status */ spinlock_t transition_lock; wait_queue_head_t transition_wait; struct task_struct *transition_task; /* Task which is doing the transition */ /* cpufreq-stats */ struct cpufreq_stats *stats; /* For cpufreq driver's internal use */ void *driver_data;}
The cpufreq_policy of every online cpu is initialized during system boot, and it is initialized again when a cpu comes back online through hotplug.
- 1、The MTK cpufreq driver probe function _mt_cpufreq_pdrv_probe() registers _mt_cpufreq_driver:
static int _mt_cpufreq_pdrv_probe(struct platform_device *pdev){ /* 注册cpufreq_driver */ cpufreq_register_driver(&_mt_cpufreq_driver); /* 注册ppm的回调 */ mt_ppm_register_client(PPM_CLIENT_DVFS, &ppm_limit_callback);}static struct cpufreq_driver _mt_cpufreq_driver = { .flags = CPUFREQ_ASYNC_NOTIFICATION, .verify = _mt_cpufreq_verify, .target = _mt_cpufreq_target, .init = _mt_cpufreq_init, .exit = _mt_cpufreq_exit, .get = _mt_cpufreq_get, .name = "mt-cpufreq", .attr = _mt_cpufreq_attr,};
- 2、During driver registration, cpufreq_register_driver() initializes the cpufreq_policy of each online cpu:
_mt_cpufreq_pdrv_probe() -> cpufreq_register_driver() -> subsys_interface_register() -> cpufreq_add_dev() -> cpufreq_online()↓static int cpufreq_online(unsigned int cpu){ struct cpufreq_policy *policy; bool new_policy; unsigned long flags; unsigned int j; int ret; pr_debug("%s: bringing CPU%u online\n", __func__, cpu); /* (1) 检查per_cpu(cpufreq_cpu_data, cpu)中的cpufreq_policy, 如果为NULL,重新分配空间 */ /* Check if this CPU already has a policy to manage it */ policy = per_cpu(cpufreq_cpu_data, cpu); if (policy) { WARN_ON(!cpumask_test_cpu(cpu, policy->related_cpus)); if (!policy_is_inactive(policy)) return cpufreq_add_policy_cpu(policy, cpu); /* This is the only online CPU for the policy. Start over. */ new_policy = false; down_write(&policy->rwsem); policy->cpu = cpu; policy->governor = NULL; up_write(&policy->rwsem); } else { new_policy = true; policy = cpufreq_policy_alloc(cpu); if (!policy) return -ENOMEM; } cpumask_copy(policy->cpus, cpumask_of(cpu)); /* (2) 调用cpufreq_driver的初始化函数来初始化cpufreq_policy, 这步比较重要,初始化了以下的数据: */ /* call driver. From then on the cpufreq must be able * to accept all calls to ->verify and ->setpolicy for this CPU */ ret = cpufreq_driver->init(policy); if (ret) { pr_debug("initialization failed\n"); goto out_free_policy; } down_write(&policy->rwsem); /* (3) 如果cpufreq_policy是新分配空间的, 做一些相应的初始化工作 */ if (new_policy) { /* related_cpus should at least include policy->cpus. */ cpumask_copy(policy->related_cpus, policy->cpus); /* Remember CPUs present at the policy creation time. */ cpumask_and(policy->real_cpus, policy->cpus, cpu_present_mask); /* Name and add the kobject */ ret = kobject_add(&policy->kobj, cpufreq_global_kobject, "policy%u", cpumask_first(policy->related_cpus)); if (ret) { pr_err("%s: failed to add policy->kobj: %d\n", __func__, ret); goto out_exit_policy; } } /* * affected cpus must always be the one, which are online. We aren't * managing offline cpus here. */ cpumask_and(policy->cpus, policy->cpus, cpu_online_mask); if (new_policy) { policy->user_policy.min = policy->min; policy->user_policy.max = policy->max; write_lock_irqsave(&cpufreq_driver_lock, flags); /* (3.1) 同一个cluster中所有cpu的per_cpu(cpufreq_cpu_data, j),共享同一个cpufreq_policy */ for_each_cpu(j, policy->related_cpus) per_cpu(cpufreq_cpu_data, j) = policy; write_unlock_irqrestore(&cpufreq_driver_lock, flags); } /* (4) 获取cpufreq_policy的当前频率 */ if (cpufreq_driver->get && !cpufreq_driver->setpolicy) { policy->cur = cpufreq_driver->get(policy->cpu); if (!policy->cur) { pr_err("%s: ->get() failed\n", __func__); goto out_exit_policy; } } /* * Sometimes boot loaders set CPU frequency to a value outside of * frequency table present with cpufreq core. In such cases CPU might be * unstable if it has to run on that frequency for long duration of time * and so its better to set it to a frequency which is specified in * freq-table. This also makes cpufreq stats inconsistent as * cpufreq-stats would fail to register because current frequency of CPU * isn't found in freq-table. * * Because we don't want this change to effect boot process badly, we go * for the next freq which is >= policy->cur ('cur' must be set by now, * otherwise we will end up setting freq to lowest of the table as 'cur' * is initialized to zero). * * We are passing target-freq as "policy->cur - 1" otherwise * __cpufreq_driver_target() would simply fail, as policy->cur will be * equal to target-freq. */ if ((cpufreq_driver->flags & CPUFREQ_NEED_INITIAL_FREQ_CHECK) && has_target()) { /* Are we running at unknown frequency ? 
*/ ret = cpufreq_frequency_table_get_index(policy, policy->cur); if (ret == -EINVAL) { /* Warn user and fix it */ pr_warn("%s: CPU%d: Running at unlisted freq: %u KHz\n", __func__, policy->cpu, policy->cur); ret = __cpufreq_driver_target(policy, policy->cur - 1, CPUFREQ_RELATION_L); /* * Reaching here after boot in a few seconds may not * mean that system will remain stable at "unknown" * frequency for longer duration. Hence, a BUG_ON(). */ BUG_ON(ret); pr_warn("%s: CPU%d: Unlisted initial frequency changed to: %u KHz\n", __func__, policy->cpu, policy->cur); } } blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_START, policy); if (new_policy) { ret = cpufreq_add_dev_interface(policy); if (ret) goto out_exit_policy; blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_CREATE_POLICY, policy); write_lock_irqsave(&cpufreq_driver_lock, flags); list_add(&policy->policy_list, &cpufreq_policy_list); write_unlock_irqrestore(&cpufreq_driver_lock, flags); } /* (5) 调用cpufreq governor的初始化函数,来初始化cpufreq_policy */ ret = cpufreq_init_policy(policy); if (ret) { pr_err("%s: Failed to initialize policy for cpu: %d (%d)\n", __func__, cpu, ret); /* cpufreq_policy_free() will notify based on this */ new_policy = false; goto out_exit_policy; } up_write(&policy->rwsem); kobject_uevent(&policy->kobj, KOBJ_ADD); /* Callback for handling stuff after policy is ready */ if (cpufreq_driver->ready) cpufreq_driver->ready(policy); pr_debug("initialization complete\n"); return 0;out_exit_policy: up_write(&policy->rwsem); if (cpufreq_driver->exit) cpufreq_driver->exit(policy);out_free_policy: cpufreq_policy_free(policy, !new_policy); return ret;}|→static int _mt_cpufreq_init(struct cpufreq_policy *policy){ int ret = -EINVAL; unsigned long flags; FUNC_ENTER(FUNC_LV_MODULE); policy->shared_type = CPUFREQ_SHARED_TYPE_ANY; cpumask_setall(policy->cpus); policy->cpuinfo.transition_latency = 1000; { enum mt_cpu_dvfs_id id = _get_cpu_dvfs_id(policy->cpu); struct mt_cpu_dvfs *p = id_to_cpu_dvfs(id); unsigned int lv = _mt_cpufreq_get_cpu_level(); struct opp_tbl_info *opp_tbl_info; struct opp_tbl_m_info *opp_tbl_m_info; struct opp_tbl_m_info *opp_tbl_m_cci_info; struct mt_cpu_dvfs *p_cci; cpufreq_ver("DVFS: _mt_cpufreq_init: %s(cpu_id = %d)\n", cpu_dvfs_get_name(p), p->cpu_id); opp_tbl_info = &opp_tbls[id][lv]; p->cpu_level = lv; /* (2.1) 给policy->freq_table赋值 给policy->cpus赋值 给policy->related_cpus赋值 */ ret = _mt_cpufreq_setup_freqs_table(policy, opp_tbl_info->opp_tbl, opp_tbl_info->size); /* (2.2) 给policy->cpuinfo.max_freq赋值 给policy->cpuinfo.min_freq赋值 */ policy->cpuinfo.max_freq = cpu_dvfs_get_max_freq(p); policy->cpuinfo.min_freq = cpu_dvfs_get_min_freq(p); opp_tbl_m_info = &opp_tbls_m[id][lv]; p->freq_tbl = opp_tbl_m_info->opp_tbl_m; cpufreq_lock(flags); /* Sync p */ if (_mt_cpufreq_sync_opp_tbl_idx(p) >= 0) if (p->idx_normal_max_opp == -1) p->idx_normal_max_opp = p->idx_opp_tbl; /* (2.3) 给policy->cur赋值 给policy->max赋值 给policy->min赋值 */ policy->cur = cpu_dvfs_get_cur_freq(p); /* use cur phy freq is better */ policy->max = cpu_dvfs_get_freq_by_idx(p, p->idx_opp_ppm_limit); policy->min = cpu_dvfs_get_freq_by_idx(p, p->idx_opp_ppm_base); p->mt_policy = policy; p->armpll_is_available = 1;#ifdef CONFIG_HYBRID_CPU_DVFS if (turbo_flag && cpu_dvfs_is(p, MT_CPU_DVFS_B) && !turbo_is_inited) { unsigned int turbo_f, turbo_v; turbo_f = ((cpu_dvfs_get_max_freq(p) * 104 / 100) / 13) * 13 / 1000; if (picachu_need_higher_volt(MT_PICACHU_DOMAIN2)) turbo_v = MAX_VPROC_VOLT; else turbo_v = MAX_VPROC_VOLT - 2000; /* 
turbo_v = p->opp_tbl[0].cpufreq_volt; */ cpuhvfs_set_turbo_scale(turbo_f * 1000, turbo_v); turbo_is_inited = 1; }#endif /* Sync cci */ if (cci_is_inited == 0) { p_cci = id_to_cpu_dvfs(MT_CPU_DVFS_CCI); /* init cci freq idx */ if (_mt_cpufreq_sync_opp_tbl_idx(p_cci) >= 0) if (p_cci->idx_normal_max_opp == -1) p_cci->idx_normal_max_opp = p_cci->idx_opp_tbl; opp_tbl_m_cci_info = &opp_tbls_m[MT_CPU_DVFS_CCI][lv]; p_cci->freq_tbl = opp_tbl_m_cci_info->opp_tbl_m; p_cci->mt_policy = NULL; p_cci->armpll_is_available = 1; cci_is_inited = 1; }#ifdef CONFIG_HYBRID_CPU_DVFS cpuhvfs_set_cluster_on_off(arch_get_cluster_id(p->cpu_id), 1);#endif cpufreq_unlock(flags); } if (ret) cpufreq_err("failed to setup frequency table\n"); FUNC_EXIT(FUNC_LV_MODULE); return ret;}||→static int _mt_cpufreq_setup_freqs_table(struct cpufreq_policy *policy, struct mt_cpu_freq_info *freqs, int num){ struct mt_cpu_dvfs *p; int ret = 0; FUNC_ENTER(FUNC_LV_LOCAL); p = id_to_cpu_dvfs(_get_cpu_dvfs_id(policy->cpu));#ifdef CONFIG_CPU_FREQ ret = cpufreq_frequency_table_cpuinfo(policy, p->freq_tbl_for_cpufreq); /* (2.1.1) 给policy->freq_table赋值 */ if (!ret) policy->freq_table = p->freq_tbl_for_cpufreq; /* (2.1.2) 根据cpu相同cluster中有哪些cpu 给policy->cpus赋值 给policy->related_cpus赋值 */ cpumask_copy(policy->cpus, topology_core_cpumask(policy->cpu)); cpumask_copy(policy->related_cpus, policy->cpus);#endif FUNC_EXIT(FUNC_LV_LOCAL); return 0;}
- 3、cpufreq_online()初始化完cpufreq_policy后,最后会调用cpufreq_init_policy()继续governor的初始化:
static int cpufreq_init_policy(struct cpufreq_policy *policy){ struct cpufreq_governor *gov = NULL; struct cpufreq_policy new_policy; memcpy(&new_policy, policy, sizeof(*policy)); /* (5.1) 使用last或者default的governor, 给new_policy.governor赋值 */ /* Update governor of new_policy to the governor used before hotplug */ gov = find_governor(policy->last_governor); if (gov) pr_debug("Restoring governor %s for cpu %d\n", policy->governor->name, policy->cpu); else gov = CPUFREQ_DEFAULT_GOVERNOR; new_policy.governor = gov; /* Use the default policy if there is no last_policy. */ if (cpufreq_driver->setpolicy) { if (policy->last_policy) new_policy.policy = policy->last_policy; else cpufreq_parse_governor(gov->name, &new_policy.policy, NULL); } /* (5.2) 启动governor来使用cpufreq_policy */ /* set default policy */ return cpufreq_set_policy(policy, &new_policy);}|→static int cpufreq_set_policy(struct cpufreq_policy *policy, struct cpufreq_policy *new_policy){ struct cpufreq_governor *old_gov; int ret; pr_debug("setting new policy for CPU %u: %u - %u kHz\n", new_policy->cpu, new_policy->min, new_policy->max); memcpy(&new_policy->cpuinfo, &policy->cpuinfo, sizeof(policy->cpuinfo)); /* (5.2.1) 对policy、new_policy的一堆合法性判断 */ /* * This check works well when we store new min/max freq attributes, * because new_policy is a copy of policy with one field updated. */ if (new_policy->min > new_policy->max) return -EINVAL; /* verify the cpu speed can be set within this limit */ ret = cpufreq_driver->verify(new_policy); if (ret) return ret; /* adjust if necessary - all reasons */ blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_ADJUST, new_policy); /* * verify the cpu speed can be set within this limit, which might be * different to the first one */ ret = cpufreq_driver->verify(new_policy); if (ret) return ret; /* notification of the new policy */ blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_NOTIFY, new_policy); scale_freq_capacity(new_policy, NULL); policy->min = new_policy->min; policy->max = new_policy->max; trace_cpu_frequency_limits(policy->max, policy->min, policy->cpu); pr_debug("new min and max freqs are %u - %u kHz\n", policy->min, policy->max); if (cpufreq_driver->setpolicy) { policy->policy = new_policy->policy; pr_debug("setting range\n"); return cpufreq_driver->setpolicy(new_policy); } if (new_policy->governor == policy->governor) goto out; pr_debug("governor switch\n"); /* (5.2.2) 如果旧的governor在工作中, 依次调用 CPUFREQ_GOV_STOP、CPUFREQ_GOV_POLICY_EXIT停止旧的governor */ /* save old, working values */ old_gov = policy->governor; /* end old governor */ if (old_gov) { ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP); if (ret) { /* This can happen due to race with other operations */ pr_debug("%s: Failed to Stop Governor: %s (%d)\n", __func__, old_gov->name, ret); return ret; } up_write(&policy->rwsem); ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT); down_write(&policy->rwsem); if (ret) { pr_err("%s: Failed to Exit Governor: %s (%d)\n", __func__, old_gov->name, ret); return ret; } } /* (5.2.3) 依次调用 CPUFREQ_GOV_POLICY_INIT、CPUFREQ_GOV_START让新的governor开工 */ /* start new governor */ policy->governor = new_policy->governor; ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT); if (!ret) { ret = __cpufreq_governor(policy, CPUFREQ_GOV_START); if (!ret) goto out; up_write(&policy->rwsem); __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT); down_write(&policy->rwsem); } /* new governor failed, so re-start old one */ pr_debug("starting governor %s failed\n", 
policy->governor->name); if (old_gov) { policy->governor = old_gov; if (__cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT)) policy->governor = NULL; else __cpufreq_governor(policy, CPUFREQ_GOV_START); } return ret; out: pr_debug("governor: change or update limits\n"); return __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);}||→static int __cpufreq_governor(struct cpufreq_policy *policy, unsigned int event){ /* __cpufreq_governor()调用的各种命令最后调用的都是governor的具体函数 */ ret = policy->governor->governor(policy, event);}
- 4、以interactive governor为例,说明policy->governor->governor()对CPUFREQ_GOV_POLICY_INIT、CPUFREQ_GOV_START、CPUFREQ_GOV_STOP、CPUFREQ_GOV_POLICY_EXIT这几个命令的实现:
struct cpufreq_governor cpufreq_gov_interactive = { .name = "interactive", .governor = cpufreq_governor_interactive, .max_transition_latency = 10000000, .owner = THIS_MODULE,};↓static int cpufreq_governor_interactive(struct cpufreq_policy *policy, unsigned int event){ int rc; unsigned int j; struct cpufreq_interactive_cpuinfo *pcpu; struct cpufreq_frequency_table *freq_table; struct cpufreq_interactive_tunables *tunables; unsigned long flags; if (have_governor_per_policy()) tunables = policy->governor_data; else tunables = common_tunables; WARN_ON(!tunables && (event != CPUFREQ_GOV_POLICY_INIT)); switch (event) { /* (1) CPUFREQ_GOV_POLICY_INIT命令的实现: 初始化tunables,tunables是interactive governor在计算时使用的各种参数 相关的sysfs注册 */ case CPUFREQ_GOV_POLICY_INIT: if (have_governor_per_policy()) { WARN_ON(tunables); } else if (tunables) { tunables->usage_count++; policy->governor_data = tunables; return 0; } tunables = kzalloc(sizeof(*tunables), GFP_KERNEL); if (!tunables) { pr_err("%s: POLICY_INIT: kzalloc failed\n", __func__); return -ENOMEM; } tunables->usage_count = 1; tunables->above_hispeed_delay = default_above_hispeed_delay; tunables->nabove_hispeed_delay = ARRAY_SIZE(default_above_hispeed_delay); tunables->go_hispeed_load = DEFAULT_GO_HISPEED_LOAD; tunables->target_loads = default_target_loads; tunables->ntarget_loads = ARRAY_SIZE(default_target_loads); tunables->min_sample_time = DEFAULT_MIN_SAMPLE_TIME; tunables->timer_rate = DEFAULT_TIMER_RATE; tunables->boostpulse_duration_val = DEFAULT_MIN_SAMPLE_TIME; tunables->timer_slack_val = DEFAULT_TIMER_SLACK; spin_lock_init(&tunables->target_loads_lock); spin_lock_init(&tunables->above_hispeed_delay_lock); policy->governor_data = tunables; if (!have_governor_per_policy()) { common_tunables = tunables; } rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr()); if (rc) { kfree(tunables); policy->governor_data = NULL; if (!have_governor_per_policy()) { common_tunables = NULL; } return rc; } if (!policy->governor->initialized) { idle_notifier_register(&cpufreq_interactive_idle_nb); cpufreq_register_notifier(&cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); } break; /* (2) CPUFREQ_GOV_POLICY_EXIT命令的实现: remove相关的sysfs */ case CPUFREQ_GOV_POLICY_EXIT: if (!--tunables->usage_count) { if (policy->governor->initialized == 1) { cpufreq_unregister_notifier(&cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); idle_notifier_unregister(&cpufreq_interactive_idle_nb); }#ifdef CONFIG_MEIZU_BSP }#else sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr()); kfree(tunables); common_tunables = NULL; } policy->governor_data = NULL;#endif //CONFIG_MEIZU_BSP break; /* (3) CPUFREQ_GOV_START命令的实现: 因为同一个cluster中的多个cpu是共享一个cpufreq_policy的, 所以使用同一个cpufreq_policy来初始化cluster中多个online cpu的per_cpu(cpuinfo, j)变量: pcpu->target_freq // 当前频率 pcpu->freq_table // 频率表 并且启动cpu上的interactive_timer=pcpu->cpu_timer: cpufreq_interactive_timer_start(tunables, j); */ case CPUFREQ_GOV_START: mutex_lock(&gov_lock); freq_table = cpufreq_frequency_get_table(policy->cpu); if (tunables && !tunables->hispeed_freq) tunables->hispeed_freq = policy->max; for_each_cpu(j, policy->cpus) { pcpu = &per_cpu(cpuinfo, j); pcpu->policy = policy; pcpu->target_freq = policy->cur; pcpu->freq_table = freq_table; pcpu->floor_freq = pcpu->target_freq; pcpu->pol_floor_val_time = ktime_to_us(ktime_get()); pcpu->loc_floor_val_time = pcpu->pol_floor_val_time; pcpu->pol_hispeed_val_time = pcpu->pol_floor_val_time; pcpu->loc_hispeed_val_time = pcpu->pol_floor_val_time; 
down_write(&pcpu->enable_sem); del_timer_sync(&pcpu->cpu_timer); del_timer_sync(&pcpu->cpu_slack_timer); cpufreq_interactive_timer_start(tunables, j); pcpu->governor_enabled = 1; up_write(&pcpu->enable_sem); } mutex_unlock(&gov_lock); break; /* (4) CPUFREQ_GOV_STOP命令的实现: 如果同一个cluster中的多个cpu都已经offline,停掉对应的governor: 停掉cpu上的interactive_timer=pcpu->cpu_timer */ case CPUFREQ_GOV_STOP: mutex_lock(&gov_lock); for_each_cpu(j, policy->cpus) { pcpu = &per_cpu(cpuinfo, j); down_write(&pcpu->enable_sem); pcpu->governor_enabled = 0; del_timer_sync(&pcpu->cpu_timer); del_timer_sync(&pcpu->cpu_slack_timer); up_write(&pcpu->enable_sem); } mutex_unlock(&gov_lock); break; case CPUFREQ_GOV_LIMITS: if (policy->max < policy->cur) __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); else if (policy->min > policy->cur) __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); for_each_cpu(j, policy->cpus) { pcpu = &per_cpu(cpuinfo, j); down_read(&pcpu->enable_sem); if (pcpu->governor_enabled == 0) { up_read(&pcpu->enable_sem); continue; } spin_lock_irqsave(&pcpu->target_freq_lock, flags); if (policy->max < pcpu->target_freq) pcpu->target_freq = policy->max; else if (policy->min > pcpu->target_freq) pcpu->target_freq = policy->min; spin_unlock_irqrestore(&pcpu->target_freq_lock, flags); up_read(&pcpu->enable_sem); } break; }
4.3.1.2、cpufreq的频率配置
cpufreq一个重要的作用就是能把用户需要的cpu频率配置下去,这部分的代码也需要cpufreq core和cpufreq driver的配合。频率调整也叫DVFS(Dynamic Voltage and Frequency Scaling),需要按照对应关系把电压和频率一起配置下去。
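为了更直观地说明“按照对应关系把电压和频率一起配置下去”的含义,下面给出一个脱离内核环境的OPP(Operating Performance Point)表查找小示例。注意这只是示意性的草图:表中的频率/电压数值、结构体和函数名都是假设的,并不是内核或MTK驱动的实际接口:

#include <stdio.h>

/* 假设的频率/电压对应表(OPP表),按频率降序排列,单位:KHz / uV,数值仅为示意 */
struct opp_entry {
	unsigned int freq_khz;
	unsigned int volt_uv;
};

static const struct opp_entry opp_table[] = {
	{ 1989000, 1000000 },
	{ 1846000,  975000 },
	{ 1014000,  850000 },
	{  507000,  750000 },
};

/* 选择"不低于目标频率的最低档位",并把频率和对应电压一起配置下去(这里仅打印示意) */
static void set_target_freq(unsigned int target_khz)
{
	int i, chosen = 0;	/* 找不到满足条件的更低档位时,退回最高档 */

	for (i = 0; i < (int)(sizeof(opp_table) / sizeof(opp_table[0])); i++) {
		if (opp_table[i].freq_khz >= target_khz)
			chosen = i;	/* 表是降序的,继续往下找更接近目标的档位 */
		else
			break;
	}

	/* 一般升频时先升电压再升频率,降频时则相反,这里省略时序细节 */
	printf("set volt = %u uV, freq = %u KHz\n",
	       opp_table[chosen].volt_uv, opp_table[chosen].freq_khz);
}

int main(void)
{
	set_target_freq(1200000);	/* 会选中 1846000 KHz / 975000 uV 这一档 */
	return 0;
}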
具体的代码解析如下:
int __cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation){ unsigned int old_target_freq = target_freq; int retval = -EINVAL; if (cpufreq_disabled()) return -ENODEV; /* (1) target目标频率在policy中的合法性检测 */ /* Make sure that target_freq is within supported range */ if (target_freq > policy->max) target_freq = policy->max; if (target_freq < policy->min) target_freq = policy->min; pr_debug("target for CPU %u: %u kHz, relation %u, requested %u kHz\n", policy->cpu, target_freq, relation, old_target_freq); /* (2) 如果当前频率就是target频率,不用调整直接返回 */ /* * This might look like a redundant call as we are checking it again * after finding index. But it is left intentionally for cases where * exactly same freq is called again and so we can save on few function * calls. */ if (target_freq == policy->cur) return 0; /* Save last value to restore later on errors */ policy->restore_freq = policy->cur; if (cpufreq_driver->target) /* (3) 调用实际的驱动target()函数来调整cpu频率 */ retval = cpufreq_driver->target(policy, target_freq, relation); else if (cpufreq_driver->target_index) { struct cpufreq_frequency_table *freq_table; int index; freq_table = cpufreq_frequency_get_table(policy->cpu); if (unlikely(!freq_table)) { pr_err("%s: Unable to find freq_table\n", __func__); goto out; } retval = cpufreq_frequency_table_target(policy, freq_table, target_freq, relation, &index); if (unlikely(retval)) { pr_err("%s: Unable to find matching freq\n", __func__); goto out; } if (freq_table[index].frequency == policy->cur) { retval = 0; goto out; } retval = __target_index(policy, freq_table, index); }out: return retval;}|→static int _mt_cpufreq_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation){ struct mt_cpu_dvfs *p; int ret; unsigned int new_opp_idx; p = id_to_cpu_dvfs(_get_cpu_dvfs_id(policy->cpu)); if (!p) return -EINVAL; /* (3.1) 驱动根据频率电压表,配置target频率和对应电压 */ ret = cpufreq_frequency_table_target(policy, p->freq_tbl_for_cpufreq, target_freq, relation, &new_opp_idx); if (ret || new_opp_idx >= p->nr_opp_tbl) return -EINVAL; if (dvfs_disable_flag || p->dvfs_disable_by_suspend || p->dvfs_disable_by_procfs) return -EPERM; _mt_cpufreq_dvfs_request_wrapper(p, new_opp_idx, MT_CPU_DVFS_NORMAL, NULL); return 0;}
4.3.2、interactive governor
在所有的cpufreq governor中最有名气的就是interactive governor了,因为几乎所有的Android系统都在使用它。
interactive的思想就是使用cpu的负载来调整cpu频率,核心就是:使用一个20ms的定时器来计算cpu占用率,根据cpu占用率的不同threshold来调整不同档位的频率。
interactive的负载计算方法大致如下(文字说明之后附有一个简化的数值计算示例):
- 1、计算cpu的累加负载。每20ms采样一次,每次采样统计增加的active_time和当前频率的乘积:cputime_speedadj += active_time * cur_freq;
- 2、计算cpu的占用率。当前cpu占用率 = (累加负载*100)/(累加时间*当前频率),即:cpu_load = (loadadjfreq*100)/(delta_time*cur_freq);
- 3、如果cpu_load达到高门限go_hispeed_load(99%)或者发生boost,直接调节频率到hispeed_freq(最高频率);
- 4、其他情况下使用choose_freq()公式计算新频率:new_freq = cur_freq*cpu_load/DEFAULT_TARGET_LOAD(90);再用new_freq = cpufreq_frequency_table_target(new_freq, CPUFREQ_RELATION_L)对齐到频率表中的实际档位;
- 5、如果当前频率已经达到hispeed_freq,还需要往上调整,必须在之前的频率上保持above_hispeed_delay(20ms);如果当前频率已经达到hispeed_freq,还需要往下调整,必须在之前的频率上保持min_sample_time(80ms);
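为了便于理解上面几步的计算,下面给出一个脱离内核环境的简化数值示例:假设每个20ms采样窗口内active_time=15ms、当前频率为1GHz,变量名仿照interactive的命名,数值和流程都只是示意,并非内核代码本身:

#include <stdio.h>

int main(void)
{
	unsigned long long cputime_speedadj = 0;	/* 累加负载:sum(active_time * cur_freq) */
	unsigned long long delta_time = 0;		/* 累加时间:active_time + idle_time,单位us */
	unsigned int cur_freq = 1000000;		/* 当前频率1GHz,单位KHz */
	int i;

	/* 模拟3个20ms的采样窗口,每个窗口内cpu active 15ms */
	for (i = 0; i < 3; i++) {
		unsigned int active_time = 15000;	/* us */

		cputime_speedadj += (unsigned long long)active_time * cur_freq;
		delta_time += 20000;
	}

	/* 累加负载/累加时间 = 平均频率;(平均频率*100)/当前频率 = cpu占用率 */
	unsigned int loadadjfreq = (unsigned int)(cputime_speedadj / delta_time) * 100;
	unsigned int cpu_load = loadadjfreq / cur_freq;		/* = 75,即75% */

	/* choose_freq()的核心:new_freq = 平均频率*100/DEFAULT_TARGET_LOAD(90) */
	unsigned int new_freq = loadadjfreq / 90;		/* = 833333 KHz */

	printf("cpu_load = %u%%, new_freq = %u KHz\n", cpu_load, new_freq);
	return 0;
}

计算出new_freq之后,interactive还会用CPUFREQ_RELATION_L把它对齐到频率表中的某个实际档位。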
interactive governor从原理上看,有以下问题:
- 1、20ms的采样时间过长,负载变化到频率调整的反应时间过长;
- 2、负载累加计算有问题,历史负载没有老化机制,历史负载的权重和当前一样,造成当前的负载变化不真实;
- 3、计算cpu占用率 = 总历史负载/(总时间*当前频率),算法不合理之处在于历史负载对当前影响太大。如果之前是高频率,现在变成低频率,那么cpu_load计算出来的值可能超过100%;如果之前是低频率,现在是高频率,那么cpu_load计算出来的值也会被大大拉低;
- 4、choose_freq()的计算公式有重大漏洞。比如假设cpu频率表={800M, 900M},当前cur_freq=800M、cur_load=100%,那么newfreq = (cur_freq*cur_load)/90 = 889M,再使用CPUFREQ_RELATION_L选择档位,选择到的还是800M,根本不能向高档位前进。这是算法的一个漏洞:如果cpu相邻档位的频率比值大于(100/90),那么正常往上调频是调不上去的,会被CPUFREQ_RELATION_L参数拦下来。所以实际的interactive调频,都是靠go_hispeed_load(99%)直接调到最高频率,再使用choose_freq()来降频。
所以interactive governor会逐渐地被基于调度器负载的cpufreq governor(如schedutil)所取代。
4.3.2.1、interactive governor的初始化
- 1、interactive的一部分初始化在cpufreq_interactive_init()当中:
static int __init cpufreq_interactive_init(void){ unsigned int i; struct cpufreq_interactive_cpuinfo *pcpu; struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; /* (1) 初始化percpu变量per_cpu(cpuinfo, i): 每个cpu创建负载计算定时器pcpu->cpu_timer 其他的锁 */ /* Initalize per-cpu timers */ for_each_possible_cpu(i) { pcpu = &per_cpu(cpuinfo, i); init_timer_deferrable(&pcpu->cpu_timer); pcpu->cpu_timer.function = cpufreq_interactive_timer; pcpu->cpu_timer.data = i; init_timer(&pcpu->cpu_slack_timer); pcpu->cpu_slack_timer.function = cpufreq_interactive_nop_timer; spin_lock_init(&pcpu->load_lock); spin_lock_init(&pcpu->target_freq_lock); init_rwsem(&pcpu->enable_sem); } spin_lock_init(&speedchange_cpumask_lock); mutex_init(&gov_lock); /* (2) 创建频率调整进程speedchange_task, 把耗时的频率调整工作单独放到一个进程中去做 */ speedchange_task = kthread_create(cpufreq_interactive_speedchange_task, NULL, "cfinteractive"); if (IS_ERR(speedchange_task)) return PTR_ERR(speedchange_task); sched_setscheduler_nocheck(speedchange_task, SCHED_FIFO, ¶m); get_task_struct(speedchange_task); /* NB: wake up so the thread does not look hung to the freezer */ wake_up_process(speedchange_task); return cpufreq_register_governor(&cpufreq_gov_interactive);}
- 2、interactive另一部分初始化在cpufreq_governor_interactive()中的CPUFREQ_GOV_POLICY_INIT、CPUFREQ_GOV_START命令,在cpu online时执行:
static int cpufreq_governor_interactive(struct cpufreq_policy *policy, unsigned int event){ switch (event) { /* (1) CPUFREQ_GOV_POLICY_INIT命令初始化interactive governor最核心的参数 */ case CPUFREQ_GOV_POLICY_INIT: if (have_governor_per_policy()) { WARN_ON(tunables); } else if (tunables) { tunables->usage_count++; policy->governor_data = tunables; return 0; } tunables = kzalloc(sizeof(*tunables), GFP_KERNEL); if (!tunables) { pr_err("%s: POLICY_INIT: kzalloc failed\n", __func__); return -ENOMEM; } tunables->usage_count = 1; tunables->above_hispeed_delay = default_above_hispeed_delay; tunables->nabove_hispeed_delay = ARRAY_SIZE(default_above_hispeed_delay); tunables->go_hispeed_load = DEFAULT_GO_HISPEED_LOAD; tunables->target_loads = default_target_loads; tunables->ntarget_loads = ARRAY_SIZE(default_target_loads); tunables->min_sample_time = DEFAULT_MIN_SAMPLE_TIME; tunables->timer_rate = DEFAULT_TIMER_RATE; // interactive负载计算timer默认时间为20ms tunables->boostpulse_duration_val = DEFAULT_MIN_SAMPLE_TIME; tunables->timer_slack_val = DEFAULT_TIMER_SLACK; spin_lock_init(&tunables->target_loads_lock); spin_lock_init(&tunables->above_hispeed_delay_lock); policy->governor_data = tunables; if (!have_governor_per_policy()) { common_tunables = tunables; } rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr()); if (rc) { kfree(tunables); policy->governor_data = NULL; if (!have_governor_per_policy()) { common_tunables = NULL; } return rc; } if (!policy->governor->initialized) { idle_notifier_register(&cpufreq_interactive_idle_nb); cpufreq_register_notifier(&cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); } break; /* (2) CPUFREQ_GOV_START命令启动interactive负载计算的timer */ case CPUFREQ_GOV_START: mutex_lock(&gov_lock); freq_table = cpufreq_frequency_get_table(policy->cpu); if (tunables && !tunables->hispeed_freq) tunables->hispeed_freq = policy->max; for_each_cpu(j, policy->cpus) { pcpu = &per_cpu(cpuinfo, j); pcpu->policy = policy; pcpu->target_freq = policy->cur; pcpu->freq_table = freq_table; pcpu->floor_freq = pcpu->target_freq; pcpu->pol_floor_val_time = ktime_to_us(ktime_get()); pcpu->loc_floor_val_time = pcpu->pol_floor_val_time; pcpu->pol_hispeed_val_time = pcpu->pol_floor_val_time; pcpu->loc_hispeed_val_time = pcpu->pol_floor_val_time; down_write(&pcpu->enable_sem); del_timer_sync(&pcpu->cpu_timer); del_timer_sync(&pcpu->cpu_slack_timer); cpufreq_interactive_timer_start(tunables, j); pcpu->governor_enabled = 1; up_write(&pcpu->enable_sem); } mutex_unlock(&gov_lock); break; }
4.3.2.2、interactive governor的算法
interactive governor的核心算法在20ms周期的timer处理函数cpufreq_interactive_timer()中:
static void cpufreq_interactive_timer(unsigned long data){ u64 now; unsigned int delta_time; u64 cputime_speedadj; int cpu_load; struct cpufreq_interactive_cpuinfo *pcpu = &per_cpu(cpuinfo, data); struct cpufreq_interactive_tunables *tunables = pcpu->policy->governor_data; unsigned int new_freq; unsigned int loadadjfreq; unsigned int index; unsigned long flags; u64 max_fvtime; int j; unsigned int max_t_freq = 0;#ifdef CPUDVFS_POWER_MODE /* default(normal), low power, just make, performance(sports) */ int min_sample_t[4] = { 80, 20, 20, 80 }; int ppb_idx;#endif if (!down_read_trylock(&pcpu->enable_sem)) return; if (!pcpu->governor_enabled) goto exit; spin_lock_irqsave(&pcpu->load_lock, flags); /* (1) 累加cpu上自从cpu_up()以来的负载, pcpu->cputime_speedadj += active_time * pcpu->policy->cur; pcpu->cputime_speedadj = (active_time * pcpu->policy->cur)samp1 + ... +(active_time * pcpu->policy->cur)sampn ; 每个采样周期为20mS,累加:第1个20ms中active_time*cur_cpu_freq + 第2个20ms中active_time*cur_cpu_freq +...+ 第n个20ms中active_time*cur_cpu_freq */ now = update_load(data); /* (2) 自从cpu_up()以来的总的时间 delta_time = active_time + ilde_time */ delta_time = (unsigned int)(now - pcpu->cputime_speedadj_timestamp); cputime_speedadj = pcpu->cputime_speedadj; spin_unlock_irqrestore(&pcpu->load_lock, flags); if (WARN_ON_ONCE(!delta_time)) goto rearm; spin_lock_irqsave(&pcpu->target_freq_lock, flags); /* (3) 总的负载/总时间 = 平均频率 */ do_div(cputime_speedadj, delta_time); /* (4) (平均频率 * 100)/当前频率 = 当前cpu的占用率 */ loadadjfreq = (unsigned int)cputime_speedadj * 100; cpu_load = loadadjfreq / pcpu->policy->cur; tunables->boosted = tunables->boost_val || now < tunables->boostpulse_endtime;#ifdef CPUDVFS_POWER_MODE ppb_idx = mt_cpufreq_get_ppb_state(); { unsigned int idx = mt_cpufreq_ppb_hispeed_freq(data, ppb_idx); tunables->hispeed_freq = pcpu->freq_table[idx].frequency; tunables->min_sample_time = min_sample_t[ppb_idx] * USEC_PER_MSEC; if (hispeed_freq_perf != 0) tunables->hispeed_freq = hispeed_freq_perf; if (min_sample_time_perf != 0) tunables->min_sample_time = min_sample_time_perf; }#endif /* (5) 如果cpu占用率达到go_hispeed_load(99%),或者在boost状态, 频率直接调整到最高频率hispeed_freq */ if (cpu_load >= tunables->go_hispeed_load || tunables->boosted) { if (pcpu->policy->cur < tunables->hispeed_freq) { new_freq = tunables->hispeed_freq; } else { new_freq = choose_freq(pcpu, loadadjfreq); if (new_freq < tunables->hispeed_freq) new_freq = tunables->hispeed_freq; } /* (6) 否则使用choose_freq()根据当前负载来计算对应的频率 */ } else { new_freq = choose_freq(pcpu, loadadjfreq); if (new_freq > tunables->hispeed_freq && pcpu->policy->cur < tunables->hispeed_freq) new_freq = tunables->hispeed_freq; } /* (7) 如果计算出的新频率 > hispeed_freq,不能马上调整, 在hispeed_freq以上的频率上必须待满above_hispeed_delay(20ms),才能继续往上调整频率 */ if (pcpu->policy->cur >= tunables->hispeed_freq && new_freq > pcpu->policy->cur && now - pcpu->pol_hispeed_val_time < freq_to_above_hispeed_delay(tunables, pcpu->policy->cur)) { trace_cpufreq_interactive_notyet( data, cpu_load, pcpu->target_freq, pcpu->policy->cur, new_freq); spin_unlock_irqrestore(&pcpu->target_freq_lock, flags); goto rearm; } pcpu->loc_hispeed_val_time = now; if (cpufreq_frequency_table_target(pcpu->policy, pcpu->freq_table, new_freq, CPUFREQ_RELATION_L, &index)) { spin_unlock_irqrestore(&pcpu->target_freq_lock, flags); goto rearm; } new_freq = pcpu->freq_table[index].frequency; /* (8) 如果之前的频率 > hispeed_freq,或者发生boost 现在需要往低调频,之前的频率需要待满min_sample_time(80ms) */ /* * Do not scale below floor_freq unless we have been at or above the * floor frequency for the minimum sample time since 
last validated. */ max_fvtime = max(pcpu->pol_floor_val_time, pcpu->loc_floor_val_time); if (new_freq < pcpu->floor_freq && pcpu->target_freq >= pcpu->policy->cur) { if (now - max_fvtime < tunables->min_sample_time) { trace_cpufreq_interactive_notyet( data, cpu_load, pcpu->target_freq, pcpu->policy->cur, new_freq); spin_unlock_irqrestore(&pcpu->target_freq_lock, flags); goto rearm; } } /* * Update the timestamp for checking whether speed has been held at * or above the selected frequency for a minimum of min_sample_time, * if not boosted to hispeed_freq. If boosted to hispeed_freq then we * allow the speed to drop as soon as the boostpulse duration expires * (or the indefinite boost is turned off). */ if (!tunables->boosted || new_freq > tunables->hispeed_freq) { pcpu->floor_freq = new_freq; if (pcpu->target_freq >= pcpu->policy->cur || new_freq >= pcpu->policy->cur) pcpu->loc_floor_val_time = now; } /* (9) 如果当前cpu往低调整频率,判断当前policy是否需要更新, 因为多个cpu共享一个policy,取最大期望频率cpu的值作为整个policy的调整值 */ if (pcpu->target_freq == new_freq && pcpu->target_freq <= pcpu->policy->cur) { max_t_freq = 0; for_each_cpu(j, pcpu->policy->cpus) { struct cpufreq_interactive_cpuinfo *pjcpu; pjcpu = &per_cpu(cpuinfo, j); max_t_freq = max(max_t_freq, pjcpu->target_freq); } if (max_t_freq != pcpu->policy->cur) goto pass_t; trace_cpufreq_interactive_already( data, cpu_load, pcpu->target_freq, pcpu->policy->cur, new_freq); spin_unlock_irqrestore(&pcpu->target_freq_lock, flags); goto rearm; }pass_t: trace_cpufreq_interactive_target(data, cpu_load, pcpu->target_freq, pcpu->policy->cur, new_freq); /* (10) 如果policy需要更新唤醒speedchange_task来执行调频动作 */ pcpu->target_freq = new_freq; spin_unlock_irqrestore(&pcpu->target_freq_lock, flags); spin_lock_irqsave(&speedchange_cpumask_lock, flags); cpumask_set_cpu(data, &speedchange_cpumask); spin_unlock_irqrestore(&speedchange_cpumask_lock, flags); wake_up_process(speedchange_task);rearm: if (!timer_pending(&pcpu->cpu_timer)) cpufreq_interactive_timer_resched(pcpu);exit: up_read(&pcpu->enable_sem); return;}|→static unsigned int choose_freq(struct cpufreq_interactive_cpuinfo *pcpu, unsigned int loadadjfreq){ unsigned int freq = pcpu->policy->cur; unsigned int prevfreq, freqmin, freqmax; unsigned int tl; int index; freqmin = 0; freqmax = UINT_MAX; do { prevfreq = freq; /* (6.1) tl = 90,loadadjfreq = (平均频率 * 100) 即 newfreq = (平均频率 * 100)/ 90 相当于cpufreq_frequency_table_target(CPUFREQ_RELATION_L), 相当于newfreq往低档位的计算, ooooo这里带来一个非常严重的问题,如果档位之间差值大于100/90,向上调频将调不上去 */ tl = freq_to_targetload(pcpu->policy->governor_data, freq); /* * Find the lowest frequency where the computed load is less * than or equal to the target load. */ if (cpufreq_frequency_table_target( pcpu->policy, pcpu->freq_table, loadadjfreq / tl, CPUFREQ_RELATION_L, &index)) break; freq = pcpu->freq_table[index].frequency; if (freq > prevfreq) { /* The previous frequency is too low. */ freqmin = prevfreq; if (freq >= freqmax) { /* * Find the highest frequency that is less * than freqmax. */ if (cpufreq_frequency_table_target( pcpu->policy, pcpu->freq_table, freqmax - 1, CPUFREQ_RELATION_H, &index)) break; freq = pcpu->freq_table[index].frequency; if (freq == freqmin) { /* * The first frequency below freqmax * has already been found to be too * low. freqmax is the lowest speed * we found that is fast enough. */ freq = freqmax; break; } } } else if (freq < prevfreq) { /* The previous frequency is high enough. */ freqmax = prevfreq; if (freq <= freqmin) { /* * Find the lowest frequency that is higher * than freqmin. 
*/ if (cpufreq_frequency_table_target( pcpu->policy, pcpu->freq_table, freqmin + 1, CPUFREQ_RELATION_L, &index)) break; freq = pcpu->freq_table[index].frequency; /* * If freqmax is the first frequency above * freqmin then we have already found that * this speed is fast enough. */ if (freq == freqmax) break; } } /* If same frequency chosen as previous then done. */ } while (freq != prevfreq); return freq;}
4.4、cpu hotplug调整
还有一种调节负载的方式是cpu hotplug:
- 1、cpu被hotplug掉的功耗小于cpu进入idle的功耗;如果整个cluster的cpu都offline,cluster也可以poweroff;所以hotplug能够节省功耗;
- 2、但是hotplug是有开销的:hotplug动作在速度慢的时候达到了ms级别,另外进程的迁移也是有开销的;cpu的hotplug必须遵循顺序插拔的规则,如果先拔掉负载重的cpu也是不合理的;
- 3、MTK的技术限制必须使用hotplug:MTK平台只有在剩一个online cpu的情况下才能进入深度idle模式,所以MTK平台必须支持hotplug;而samsung、qualcomm在多核online的情况下可以进入深度idle,所以一般不支持cpu hotplug;
4.4.1、hotplug 底层实现
4.4.1.1、cpu_up()/cpu_down()
kernel对hotplug的支持是很完善的,标准接口cpu_up()/cpu_down()可以进行hotplug。
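下面是一个调用这组标准接口的最小示意(假设运行在使能了CONFIG_HOTPLUG_CPU的内核代码上下文中;并发保护、调用时机等策略问题这里全部省略,仅为草图):

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/printk.h>

/* 示意:把指定cpu拔出或插回 */
static void demo_hotplug_one_cpu(unsigned int cpu)
{
	int ret;

	if (cpu_online(cpu)) {
		ret = cpu_down(cpu);	/* offline:过程中会触发migrate_tasks()迁走runnable进程 */
		if (ret)
			pr_err("cpu_down(%u) failed: %d\n", cpu, ret);
	} else {
		ret = cpu_up(cpu);	/* online:之后由负载均衡算法自然把任务迁移进来 */
		if (ret)
			pr_err("cpu_up(%u) failed: %d\n", cpu, ret);
	}
}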
4.4.1.2、hotplug 进程迁移
在cpu_down()时,需要调用migration_call() -> migrate_tasks()把cpu上所有runnable进程迁移到其他cpu;在cpu_up()时,并不需要在函数中迁移进程,直接等待负载均衡算法的迁移。
static void migrate_tasks(struct rq *dead_rq){ struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; int dest_cpu; /* * Fudge the rq selection such that the below task selection loop * doesn't get stuck on the currently eligible stop task. * * We're currently inside stop_machine() and the rq is either stuck * in the stop_machine_cpu_stop() loop, or we're executing this code, * either way we should never end up calling schedule() until we're * done here. */ rq->stop = NULL; /* * put_prev_task() and pick_next_task() sched * class method both need to have an up-to-date * value of rq->clock[_task] */ update_rq_clock(rq); unthrottle_offline_rt_rqs(rq); for (;;) { /* * There's this thread running, bail when that's the only * remaining thread. */ if (rq->nr_running == 1) break; /* (1) 逐个从rq中获取task = next */ /* * pick_next_task assumes pinned rq->lock. */ lockdep_pin_lock(&rq->lock); next = pick_next_task(rq, &fake_task); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); /* * Rules for changing task_struct::cpus_allowed are holding * both pi_lock and rq->lock, such that holding either * stabilizes the mask. * * Drop rq->lock is not quite as disastrous as it usually is * because !cpu_active at this point, which means load-balance * will not interfere. Also, stop-machine. */ lockdep_unpin_lock(&rq->lock); raw_spin_unlock(&rq->lock); raw_spin_lock(&next->pi_lock); raw_spin_lock(&rq->lock); /* * Since we're inside stop-machine, _nothing_ should have * changed the task, WARN if weird stuff happened, because in * that case the above rq->lock drop is a fail too. */ if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { raw_spin_unlock(&next->pi_lock); continue; } /* (2) 找到最适合next进程迁移的目的cpu */ /* Find suitable destination for @next, with force if needed. */ dest_cpu = select_fallback_rq(dead_rq->cpu, next); /* (3) 实施进程迁移 */ rq = __migrate_task(rq, next, dest_cpu); if (rq != dead_rq) { raw_spin_unlock(&rq->lock); rq = dead_rq; raw_spin_lock(&rq->lock); } raw_spin_unlock(&next->pi_lock); } rq->stop = stop;}|→static int select_fallback_rq(int cpu, struct task_struct *p){ int nid = cpu_to_node(cpu); const struct cpumask *nodemask = NULL; enum { cpuset, possible, fail } state = cpuset; int dest_cpu; /* * If the node that the cpu is on has been offlined, cpu_to_node() * will return -1. There is no cpu on the node, and we should * select the cpu on the other node. */ if (nid != -1) { nodemask = cpumask_of_node(nid); /* Look for allowed, online CPU in same node. */ for_each_cpu(dest_cpu, nodemask) { if (!cpu_online(dest_cpu)) continue; if (!cpu_active(dest_cpu)) continue; if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) return dest_cpu; } } for (;;) { /* (2.1) 最好的情况:在tsk_cpus_allowed(p)中能找到online cpu迁移 */ /* Any allowed, online CPU? */ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { if (!cpu_online(dest_cpu)) continue; if (!cpu_active(dest_cpu)) continue; goto out; } /* No more Mr. Nice Guy. */ switch (state) { /* (2.2) 其次的情况:在cpuset中能找到online cpu迁移 */ case cpuset: if (IS_ENABLED(CONFIG_CPUSETS)) { cpuset_cpus_allowed_fallback(p); state = possible; break; } /* (2.3) 最差的情况:在系统所有cpu中能找到online cpu迁移 */ /* fall-through */ case possible: do_set_cpus_allowed(p, cpu_possible_mask); state = fail; break; case fail: BUG(); break; } }out: if (state != cpuset) { /* * Don't tell them about moving exiting tasks or * kernel threads (both mm NULL), since they never * leave kernel. 
*/ if (p->mm && printk_ratelimit()) { printk_deferred("process %d (%s) no longer affine to cpu%d\n", task_pid_nr(p), p->comm, cpu); } } return dest_cpu;}
4.4.2、MTK hotplug算法
在有了hotplug底层cpu_up()、cpu_down()的实现以后,在此之上还需要有一套算法根据cpu的负载来动态hotplug。MTK这套算法比较齐全,主要分为HICA、hps_algo_main两部分。
4.4.2.1、HICA/PPM
HICA和hps的关系,其实是HICA决定了一种大的mode,而hps在大的mode中实现精细化的调整。
比如对MT6799 HICA支持3种模式:
- 1、LL_ONLY。 // 只开小核
- 2、L_ONLY。 // 只开中核
- 3、ALL。 // LL、L、B共10个核都可以使用
HICA在mt_ppm_hica_update_algo_data()中计算负载,根据负载变化来决定mode:
_hps_task_main() -> mt_ppm_hica_update_algo_data()↓void mt_ppm_hica_update_algo_data(unsigned int cur_loads, unsigned int cur_nr_heavy_task, unsigned int cur_tlp){ struct ppm_power_state_data *state_info = ppm_get_power_state_info(); struct ppm_state_transfer_data *data; enum ppm_power_state cur_state; enum ppm_mode cur_mode; int i, j; FUNC_ENTER(FUNC_LV_HICA); ppm_lock(&hica_policy.lock); ppm_hica_algo_data.ppm_cur_loads = cur_loads; ppm_hica_algo_data.ppm_cur_tlp = cur_tlp; ppm_hica_algo_data.ppm_cur_nr_heavy_task = cur_nr_heavy_task; cur_state = ppm_hica_algo_data.cur_state; cur_mode = ppm_main_info.cur_mode; ppm_dbg(HICA, "cur_loads = %d, cur_tlp = %d, cur_nr_heavy_task = %d, cur_state = %s, cur_mode = %d\n", cur_loads, cur_tlp, cur_nr_heavy_task, ppm_get_power_state_name(cur_state), cur_mode); if (!ppm_main_info.is_enabled || !hica_policy.is_enabled || ppm_main_info.is_in_suspend || cur_state == PPM_POWER_STATE_NONE) goto end;#if defined(CONFIG_MACH_MT6757) || defined(CONFIG_MACH_KIBOPLUS) if (setup_max_cpus == 4) goto end;#endif#ifdef PPM_IC_SEGMENT_CHECK if (ppm_main_info.fix_state_by_segment != PPM_POWER_STATE_NONE) goto end;#endif /* skip HICA if DVFS is not ready (we cannot get current freq...) */ if (!ppm_main_info.client_info[PPM_CLIENT_DVFS].limit_cb) goto end; /* Power state is fixed by user, skip HICA state calculation */ if (fix_power_state != PPM_POWER_STATE_NONE) goto end; /* (1) 从transfer_by_perf到transfer_by_pwr逐个遍历判断当前state是否需要改变 */ for (i = 0; i < 2; i++) { data = (i == 0) ? state_info[cur_state].transfer_by_perf : state_info[cur_state].transfer_by_pwr; /* (2) 如果当前state有几种变化逐个遍历,比如: 当前state为ALL, 可以ALL -> LL_ONLY 也可以ALL -> L_ONLY */ for (j = 0; j < data->size; j++) { if (!data->transition_data[j].transition_rule || !((1 << cur_mode) & data->transition_data[j].mode_mask)) continue; /* (3) 如果state变化,获取新的state返回 */ if (data->transition_data[j].transition_rule( ppm_hica_algo_data, &data->transition_data[j])) { ppm_hica_algo_data.new_state = data->transition_data[j].next_state; ppm_dbg(HICA, "[%s(%d)] Need state transfer: %s --> %s\n", (i == 0) ? "PERF" : "PWR", j, ppm_get_power_state_name(cur_state), ppm_get_power_state_name(ppm_hica_algo_data.new_state) ); goto end; /* (4) 如果state不变化,维持当前state,继续遍历*/ } else { ppm_hica_algo_data.new_state = cur_state;#ifdef PPM_HICA_2P0 ppm_dbg(HICA, "[%s(%d)]hold in %s state, capacity_hold_cnt = %d, bigtsk_hold_cnt = %d, freq_hold_cnt = %d\n", (i == 0) ? "PERF" : "PWR", j, ppm_get_power_state_name(cur_state), data->transition_data[j].capacity_hold_cnt, data->transition_data[j].bigtsk_hold_cnt, data->transition_data[j].freq_hold_cnt );#else#if PPM_HICA_VARIANT_SUPPORT ppm_dbg(HICA, "[%s(%d)]hold in %s state, loading_cnt = %d, freq_cnt = %d, overutil_l_hold_cnt = %d, .overutil_h_hold_cnt = %d\n", (i == 0) ? "PERF" : "PWR", j, ppm_get_power_state_name(cur_state), data->transition_data[j].loading_hold_cnt, data->transition_data[j].freq_hold_cnt, data->transition_data[j].overutil_l_hold_cnt, data->transition_data[j].overutil_h_hold_cnt );#else ppm_dbg(HICA, "[%s(%d)]hold in %s state, loading_cnt = %d, freq_cnt = %d\n", (i == 0) ? "PERF" : "PWR", j, ppm_get_power_state_name(cur_state), data->transition_data[j].loading_hold_cnt, data->transition_data[j].freq_hold_cnt );#endif#endif } } }end: ppm_unlock(&hica_policy.lock); FUNC_EXIT(FUNC_LV_HICA);}
state切换的判断函数和阈值定义在下面的表格中,除了heavy_task和big_task的判断之外,基本是根据util/capacity来计算cpu的占用情况:
struct ppm_power_state_data pwr_state_info_SB[NR_PPM_POWER_STATE] = { [0] = { .name = __stringify(LL_ONLY), .state = PPM_POWER_STATE_LL_ONLY, PWR_STATE_INFO(LL_ONLY, SB) }, [1] = { .name = __stringify(L_ONLY), .state = PPM_POWER_STATE_L_ONLY, PWR_STATE_INFO(L_ONLY, SB) }, [2] = { .name = __stringify(ALL), .state = PPM_POWER_STATE_ALL, PWR_STATE_INFO(ALL, SB) },};static struct ppm_state_transfer state_pwr_transfer_ALL[] = { TRANS_DATA( LL_ONLY, PPM_MODE_MASK_ALL_MODE, ppm_trans_rule_ALL_to_LL_ONLY, PPM_DEFAULT_HOLD_TIME, PPM_CAPACITY_DOWN, PPM_DEFAULT_BIGTSK_TIME, 0, 0, 0 ), TRANS_DATA( L_ONLY, PPM_MODE_MASK_ALL_MODE, ppm_trans_rule_ALL_to_L_ONLY, PPM_DEFAULT_HOLD_TIME, PPM_CAPACITY_DOWN, PPM_DEFAULT_BIGTSK_TIME, 2, 4, 0 ),};STATE_TRANSFER_DATA_PWR(ALL);static struct ppm_state_transfer state_perf_transfer_ALL[] = { TRANS_DATA(NONE, 0, NULL, 0, 0, 0, 0, 0, 0),};STATE_TRANSFER_DATA_PERF(ALL);/* 举例:当前state为ALL 尝试从power的角度从ALL切换到LL_ONLY:ppm_trans_rule_ALL_to_LL_ONLY() 尝试从power的角度从ALL切换到L_ONLY:ppm_trans_rule_ALL_to_L_ONLY() */static bool ppm_trans_rule_ALL_to_LL_ONLY( struct ppm_hica_algo_data data, struct ppm_state_transfer *settings){ /* keep in ALL state if root cluster is fixed at L or B */ if (ppm_main_info.fixed_root_cluster == PPM_CLUSTER_L || ppm_main_info.fixed_root_cluster == PPM_CLUSTER_B) return false; /* (1) 从heavy task负载判断是否需要切换模式 */#if PPM_HEAVY_TASK_INDICATE_SUPPORT { unsigned int heavy_task, i; for_each_ppm_clusters(i) { heavy_task = hps_get_hvytsk(i); if (heavy_task) { ppm_dbg(HICA, "Stay in ALL due to cluster%d heavy task = %d\n", i, heavy_task); trace_ppm_hica( ppm_get_power_state_name(PPM_POWER_STATE_ALL), ppm_get_power_state_name(PPM_POWER_STATE_LL_ONLY), -1, -1, -1, -1, heavy_task, -1, false); settings->capacity_hold_cnt = 0; return false; } } }#endif /* (2) 从big task负载判断是否需要切换模式 */#if PPM_BIG_TASK_INDICATE_SUPPORT { unsigned int big_task_L = hps_get_bigtsk(PPM_CLUSTER_L); unsigned int big_task_B = hps_get_bigtsk(PPM_CLUSTER_B); if (big_task_L || big_task_B) { ppm_dbg(HICA, "Stay in ALL due to L/B big task = %d/%d\n", big_task_L, big_task_B); trace_ppm_hica( ppm_get_power_state_name(PPM_POWER_STATE_ALL), ppm_get_power_state_name(PPM_POWER_STATE_LL_ONLY), -1, -1, big_task_L, big_task_B, -1, -1, false); settings->capacity_hold_cnt = 0; return false; } }#endif /* (3) 从util/capacity负载判断是否需要切换模式 */ { /* check capacity */ unsigned long usage, usage_total = 0, capacity = 0, dummy; unsigned int i; for_each_ppm_clusters(i) { if (sched_get_cluster_util(i, &usage, &dummy)) { ppm_err("Get cluster %d util failed\n", i); return false; } usage_total += usage; if (i == PPM_CLUSTER_LL) capacity = dummy; } ppm_dbg(HICA, "usage_total = %ld, LL capacity = %ld\n", usage_total, capacity); /* (3.1) (util/capacity)超过门限值(settings->capacity_bond) 是否达到次数settings->capacity_hold_time, 如果条件满足进行state切换 */ if (usage_total < capacity * settings->capacity_bond / 100) { settings->capacity_hold_cnt++; if (settings->capacity_hold_cnt >= settings->capacity_hold_time) { trace_ppm_hica( ppm_get_power_state_name(PPM_POWER_STATE_ALL), ppm_get_power_state_name(PPM_POWER_STATE_LL_ONLY), usage_total, capacity, -1, -1, -1, -1, true); return true; } } else settings->capacity_hold_cnt = 0; trace_ppm_hica( ppm_get_power_state_name(PPM_POWER_STATE_ALL), ppm_get_power_state_name(PPM_POWER_STATE_LL_ONLY), usage_total, capacity, -1, -1, -1, -1, false); } return false;}
新的state计算完成后,是通过以下通道配置下去的:
_hps_task_main() -> mt_ppm_main() -> ppm_hica_update_limit_cb() -> ppm_hica_set_default_limit_by_state()↓void ppm_hica_set_default_limit_by_state(enum ppm_power_state state, struct ppm_policy_data *policy){ unsigned int i; struct ppm_power_state_data *state_info = ppm_get_power_state_info(); FUNC_ENTER(FUNC_LV_HICA); for (i = 0; i < policy->req.cluster_num; i++) { if (state >= PPM_POWER_STATE_NONE) { if (state > NR_PPM_POWER_STATE) ppm_err("@%s: Invalid PPM state(%d)\n", __func__, state); policy->req.limit[i].min_cpu_core = get_cluster_min_cpu_core(i); policy->req.limit[i].max_cpu_core = get_cluster_max_cpu_core(i); policy->req.limit[i].min_cpufreq_idx = get_cluster_min_cpufreq_idx(i); policy->req.limit[i].max_cpufreq_idx = get_cluster_max_cpufreq_idx(i);#ifdef PPM_DISABLE_CLUSTER_MIGRATION /* keep at least 1 LL */ if (i == 0) policy->req.limit[i].min_cpu_core = 1;#endif /* (1) HICA根据新的state,配置对应的min_cpu_core/max_cpu_core到本policy当中 */ } else { policy->req.limit[i].min_cpu_core = state_info[state].cluster_limit->state_limit[i].min_cpu_core; policy->req.limit[i].max_cpu_core = state_info[state].cluster_limit->state_limit[i].max_cpu_core; policy->req.limit[i].min_cpufreq_idx = state_info[state].cluster_limit->state_limit[i].min_cpufreq_idx; policy->req.limit[i].max_cpufreq_idx = state_info[state].cluster_limit->state_limit[i].max_cpufreq_idx; } }#ifdef PPM_IC_SEGMENT_CHECK /* ignore HICA min freq setting for L cluster in L_ONLY state */ if (state == PPM_POWER_STATE_L_ONLY && ppm_main_info.fix_state_by_segment == PPM_POWER_STATE_L_ONLY) policy->req.limit[1].min_cpufreq_idx = get_cluster_min_cpufreq_idx(1);#endif FUNC_EXIT(FUNC_LV_HICA);}/*==============================================================*//* Local Variables *//*==============================================================*//* cluster limit for each power state */static const struct ppm_cluster_limit state_limit_LL_ONLY[] = { [0] = LIMIT(15, 0, 1, 4), [1] = LIMIT(15, 0, 0, 0), [2] = LIMIT(15, 0, 0, 0),};STATE_LIMIT(LL_ONLY);static const struct ppm_cluster_limit state_limit_L_ONLY[] = { [0] = LIMIT(15, 0, 0, 0), [1] = LIMIT(8, 0, 1, 4), [2] = LIMIT(15, 0, 0, 0),};STATE_LIMIT(L_ONLY);static const struct ppm_cluster_limit state_limit_ALL[] = { [0] = LIMIT(15, 0, 0, 4), [1] = LIMIT(15, 0, 0, 4), [2] = LIMIT(15, 0, 0, 2),};STATE_LIMIT(ALL);_hps_task_main() -> mt_ppm_main() -> ppm_limit_callback()↓static void ppm_limit_callback(struct ppm_client_req req){ struct ppm_client_req *p = (struct ppm_client_req *)&req; int i; /* (2) 将HICA state对应的policy配置到hps限制中hps_sys.cluster_info[i].ref_base_value/ref_limit_value */ mutex_lock(&hps_ctxt.para_lock); hps_sys.ppm_root_cluster = p->root_cluster; for (i = 0; i < p->cluster_num; i++) { /* * hps_warn("ppm_limit_callback -> cluster%d: has_advise_core = %d, [%d, %d]\n", * i, p->cpu_limit[i].has_advise_core, * p->cpu_limit[i].min_cpu_core, p->cpu_limit[i].max_cpu_core); */#ifdef _TRACE_ trace_ppm_limit_callback_update(i, p->cpu_limit[i].has_advise_core, p->cpu_limit[i].min_cpu_core, p->cpu_limit[i].max_cpu_core);#endif if (!p->cpu_limit[i].has_advise_core) { hps_sys.cluster_info[i].ref_base_value = p->cpu_limit[i].min_cpu_core; hps_sys.cluster_info[i].ref_limit_value = p->cpu_limit[i].max_cpu_core; } else { hps_sys.cluster_info[i].ref_base_value = hps_sys.cluster_info[i].ref_limit_value = p->cpu_limit[i].advise_cpu_core; } } mutex_unlock(&hps_ctxt.para_lock); hps_ctxt.is_interrupt = 1; hps_task_wakeup_nolock();}
4.4.2.2、hps_algo_main
_hps_task_main() -> hps_algo_main()↓void hps_algo_main(void){ unsigned int i, val, base_val, action_print, origin_root, action_break; char str_online[64], str_ref_limit[64], str_ref_base[64], str_criteria_limit[64], str_criteria_base[64], str_target[64], str_hvytsk[64], str_pwrseq[64], str_bigtsk[64]; char *online_ptr = str_online; char *criteria_limit_ptr = str_criteria_limit; char *criteria_base_ptr = str_criteria_base; char *ref_limit_ptr = str_ref_limit; char *ref_base_ptr = str_ref_base; char *hvytsk_ptr = str_hvytsk; char *target_ptr = str_target; char *pwrseq_ptr = str_pwrseq; char *bigtsk_ptr = str_bigtsk; static unsigned int hrtbt_dbg;#ifdef CONFIG_MEIZU_BSP static unsigned long int j;#endif //CONFIG_MEIZU_BSP#ifdef CONFIG_MTK_ICCS_SUPPORT unsigned char real_online_power_state_bitmask = 0; unsigned char real_target_power_state_bitmask = 0; unsigned char iccs_online_power_state_bitmask = 0; unsigned char iccs_target_power_state_bitmask = iccs_get_target_power_state_bitmask(); unsigned char target_cache_shared_state_bitmask = 0;#endif /* Initial value */ base_val = action_print = action_break = hps_sys.total_online_cores = 0; hps_sys.up_load_avg = hps_sys.down_load_avg = hps_sys.tlp_avg = hps_sys.rush_cnt = 0; hps_sys.action_id = origin_root = 0; /* * run algo or not by hps_ctxt.enabled */ if ((u64) ktime_to_ms(ktime_sub(ktime_get(), hps_ctxt.hps_hrt_ktime)) >= HPS_HRT_DBG_MS) action_print = hrtbt_dbg = 1; else hrtbt_dbg = 0; mutex_lock(&hps_ctxt.lock); hps_ctxt.action = ACTION_NONE; atomic_set(&hps_ctxt.is_ondemand, 0); if (!hps_ctxt.enabled) goto HPS_END; if (hps_ctxt.eas_indicator) { /*Set cpu cores by scheduler*/ goto HPS_ALGO_END; } /* * algo - begin */ /*Back up limit and base value for check */ mutex_lock(&hps_ctxt.para_lock); if ((hps_sys.cluster_info[0].base_value == 0) && (hps_sys.cluster_info[1].base_value == 0) && (hps_sys.cluster_info[2].base_value == 0) && (hps_sys.cluster_info[0].limit_value == 0) && (hps_sys.cluster_info[1].limit_value == 0) && (hps_sys.cluster_info[2].limit_value == 0)) { hps_sys.cluster_info[0].base_value = hps_sys.cluster_info[0].ref_base_value = 0; hps_sys.cluster_info[1].base_value = hps_sys.cluster_info[1].ref_base_value = 0; hps_sys.cluster_info[2].base_value = hps_sys.cluster_info[2].ref_base_value = 0; hps_sys.cluster_info[0].limit_value = hps_sys.cluster_info[0].ref_limit_value = 4; hps_sys.cluster_info[1].limit_value = hps_sys.cluster_info[1].ref_limit_value = 4; hps_sys.cluster_info[2].limit_value = hps_sys.cluster_info[2].ref_limit_value = 0; } for (i = 0; i < hps_sys.cluster_num; i++) { hps_sys.cluster_info[i].base_value = hps_sys.cluster_info[i].ref_base_value; hps_sys.cluster_info[i].limit_value = hps_sys.cluster_info[i].ref_limit_value; } for (i = 0; i < hps_sys.cluster_num; i++) { base_val += hps_sys.cluster_info[i].base_value; hps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].online_core_num = 0; hps_sys.cluster_info[i].online_core_num = hps_get_cluster_cpus(hps_sys.cluster_info[i].cluster_id); hps_sys.total_online_cores += hps_sys.cluster_info[i].online_core_num; } mutex_unlock(&hps_ctxt.para_lock); /* Determine root cluster */ origin_root = hps_sys.root_cluster_id; hps_define_root_cluster(&hps_sys);#ifdef CONFIG_MACH_MT6799 if (hps_ctxt.smart_det_enabled) { mutex_lock(&hps_ctxt.para_lock); hps_sys.root_cluster_id = 1;/*Change root to L cluster when smart detection is enabled*/ mutex_unlock(&hps_ctxt.para_lock); }#endif if (origin_root != hps_sys.root_cluster_id) hps_sys.action_id = HPS_SYS_CHANGE_ROOT; /* * update 
history - tlp */ val = hps_ctxt.tlp_history[hps_ctxt.tlp_history_index]; hps_ctxt.tlp_history[hps_ctxt.tlp_history_index] = hps_ctxt.cur_tlp; hps_ctxt.tlp_sum += hps_ctxt.cur_tlp; hps_ctxt.tlp_history_index = (hps_ctxt.tlp_history_index + 1 == hps_ctxt.tlp_times) ? 0 : hps_ctxt.tlp_history_index + 1; ++hps_ctxt.tlp_count; if (hps_ctxt.tlp_count > hps_ctxt.tlp_times) { WARN_ON(hps_ctxt.tlp_sum < val); hps_ctxt.tlp_sum -= val; hps_ctxt.tlp_avg = hps_ctxt.tlp_sum / hps_ctxt.tlp_times; } else { hps_ctxt.tlp_avg = hps_ctxt.tlp_sum / hps_ctxt.tlp_count; } if (hps_ctxt.stats_dump_enabled) hps_ctxt_print_algo_stats_tlp(0); /*Determine eas enabled or not*/ if (!hps_ctxt.eas_enabled) hps_sys.hps_sys_ops[2].enabled = 0; for (i = 0 ; i < hps_sys.cluster_num ; i++) hps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].online_core_num; /* (1) 逐个调用 hps_sys_ops()根据各种算法来判断当前cpu是否需要hotplug */ for (i = 0; i < hps_sys.func_num; i++) { if (hps_sys.hps_sys_ops[i].enabled == 1) { if (hps_sys.hps_sys_ops[i].hps_sys_func_ptr()) { hps_sys.action_id = hps_sys.hps_sys_ops[i].func_id; break; } } }/* if (hps_ctxt.heavy_task_enabled) if (hps_algo_heavytsk_det()) hps_sys.action_id = 0xE1;*/ if (hps_ctxt.big_task_enabled) if (hps_algo_big_task_det()) hps_sys.action_id = 0xE2; if (hps_sys.action_id == 0) goto HPS_END;HPS_ALGO_END:#ifdef CONFIG_MACH_MT6799 if (hps_ctxt.smart_det_enabled) { if (hps_sys.cluster_info[2].bigTsk_value <= 1) { mutex_lock(&hps_ctxt.para_lock); hps_sys.cluster_info[2].target_core_num = 1; mutex_unlock(&hps_ctxt.para_lock); } }#endif /* * algo - end */ /* (2) 对limit进行判断,HICA的值就配置到这里 */ /*Base and limit check */ hps_check_base_limit(&hps_sys); /* Ensure that root cluster must one online cpu at less */ if (hps_sys.cluster_info[hps_sys.root_cluster_id].target_core_num <= 0) hps_sys.cluster_info[hps_sys.root_cluster_id].target_core_num = 1;#ifdef CONFIG_MTK_ICCS_SUPPORT real_online_power_state_bitmask = 0; real_target_power_state_bitmask = 0; for (i = 0; i < hps_sys.cluster_num; i++) { real_online_power_state_bitmask |= ((hps_sys.cluster_info[i].online_core_num > 0) << i); real_target_power_state_bitmask |= ((hps_sys.cluster_info[i].target_core_num > 0) << i); } iccs_online_power_state_bitmask = iccs_target_power_state_bitmask; iccs_target_power_state_bitmask = real_target_power_state_bitmask; iccs_get_target_state(&iccs_target_power_state_bitmask, &target_cache_shared_state_bitmask); /* * pr_err("[%s] iccs_target_power_state_bitmask: 0x%x\n", __func__, iccs_target_power_state_bitmask); */ for (i = 0; i < hps_sys.cluster_num; i++) { hps_sys.cluster_info[i].iccs_state = (((real_online_power_state_bitmask >> i) & 1) << 3) | (((real_target_power_state_bitmask >> i) & 1) << 2) | (((iccs_online_power_state_bitmask >> i) & 1) << 1) | (((iccs_target_power_state_bitmask >> i) & 1) << 0); /* * pr_err("[%s] cluster: 0x%x iccs_state: 0x%x\n", __func__, i, hps_sys.cluster_info[i].iccs_state); */ if (hps_get_iccs_pwr_status(i) == 0x1) iccs_cluster_on_off(i, 1); else if (hps_get_iccs_pwr_status(i) == 0x2) iccs_cluster_on_off(i, 0); }#endif /* (3) 经过各种算法计算后目标值是target_core_num,而当前值是online_core_num; 如果不一致,进行cpu_up()/cpu_down()操作 */#if 1 /*Make sure that priority of power on action is higher than power down. 
*/ for (i = 0; i < hps_sys.cluster_num; i++) { if (hps_sys.cluster_info[i].target_core_num > hps_sys.cluster_info[i].online_core_num) { if (hps_algo_do_cluster_action(i) == 1) { action_print = action_break = 1; break; } action_print = 1; } } if (!action_break) { for (i = 0; i < hps_sys.cluster_num; i++) { if (hps_sys.cluster_info[i].target_core_num < hps_sys.cluster_info[i].online_core_num) { if (hps_algo_do_cluster_action(i) == 1) { action_print = action_break = 1; break; } action_print = 1; } } }#else /*Process root cluster first */ if (hps_sys.cluster_info[hps_sys.root_cluster_id].target_core_num != hps_sys.cluster_info[hps_sys.root_cluster_id].online_core_num) { if (hps_algo_do_cluster_action(hps_sys.root_cluster_id) == 1) action_break = 1; else action_break = 0; action_print = 1; } for (i = 0; i < hps_sys.cluster_num; i++) { if (i == hps_sys.root_cluster_id) continue; if (hps_sys.cluster_info[i].target_core_num != hps_sys.cluster_info[i].online_core_num) { if (hps_algo_do_cluster_action(i) == 1) action_break = 1; else action_break = 0; action_print = 1; } }#endif#ifdef CONFIG_MTK_ICCS_SUPPORT for (i = 0; i < hps_sys.cluster_num; i++) { if (hps_get_cluster_cpus(hps_sys.cluster_info[i].cluster_id) != hps_sys.cluster_info[i].target_core_num) { if (hps_get_cluster_cpus(hps_sys.cluster_info[i].cluster_id) == 0) iccs_target_power_state_bitmask &= ~(1 << i); else if (hps_sys.cluster_info[i].target_core_num == 0) iccs_target_power_state_bitmask |= (1 << i); } } /* * pr_err("[%s] iccs_target_power_state_bitmask: 0x%x\n", __func__, iccs_target_power_state_bitmask); */ iccs_set_target_power_state_bitmask(iccs_target_power_state_bitmask);#endifHPS_END: if (action_print || hrtbt_dbg) { int online, target, ref_limit, ref_base, criteria_limit, criteria_base, hvytsk, pwrseq, bigtsk; mutex_lock(&hps_ctxt.para_lock); online = target = criteria_limit = criteria_base = 0; for (i = 0; i < hps_sys.cluster_num; i++) { if (i == origin_root) online = sprintf(online_ptr, "<%d>", hps_sys.cluster_info[i].online_core_num); else online = sprintf(online_ptr, "(%d)", hps_sys.cluster_info[i].online_core_num); if (i == hps_sys.root_cluster_id) target = sprintf(target_ptr, "<%d>", hps_sys.cluster_info[i].target_core_num); else target = sprintf(target_ptr, "(%d)", hps_sys.cluster_info[i].target_core_num); criteria_limit = sprintf(criteria_limit_ptr, "(%d)", hps_sys.cluster_info[i].limit_value); criteria_base = sprintf(criteria_base_ptr, "(%d)", hps_sys.cluster_info[i].base_value); ref_limit = sprintf(ref_limit_ptr, "(%d)", hps_sys.cluster_info[i].ref_limit_value); ref_base = sprintf(ref_base_ptr, "(%d)", hps_sys.cluster_info[i].ref_base_value); hvytsk = sprintf(hvytsk_ptr, "(%d)", hps_sys.cluster_info[i].hvyTsk_value); bigtsk = sprintf(bigtsk_ptr, "(%d)", hps_sys.cluster_info[i].bigTsk_value); if (i == 0) pwrseq = sprintf(pwrseq_ptr, "(%d->", hps_sys.cluster_info[i].pwr_seq); else if ((i != 0) && (i != (hps_sys.cluster_num - 1))) pwrseq = sprintf(pwrseq_ptr, "%d->", hps_sys.cluster_info[i].pwr_seq); else if (i == (hps_sys.cluster_num - 1)) pwrseq = sprintf(pwrseq_ptr, "%d) ", hps_sys.cluster_info[i].pwr_seq); online_ptr += online; target_ptr += target; criteria_limit_ptr += criteria_limit; criteria_base_ptr += criteria_base; ref_limit_ptr += ref_limit; ref_base_ptr += ref_base; hvytsk_ptr += hvytsk; bigtsk_ptr += bigtsk; pwrseq_ptr += pwrseq; } mutex_unlock(&hps_ctxt.para_lock); if (action_print) { hps_set_funct_ctrl(); if (action_break) hps_warn ("(0x%X)%s action break!! 
(%u)(%u)(%u) %s %s%s-->%s%s (%u)(%u)(%u)(%u) %s\n", ((hps_ctxt.hps_func_control << 12) | hps_sys.action_id), str_online, hps_ctxt.cur_loads, hps_ctxt.cur_tlp, hps_ctxt.cur_iowait, str_hvytsk, str_criteria_limit, str_criteria_base, str_ref_limit, str_ref_base, hps_sys.up_load_avg, hps_sys.down_load_avg, hps_sys.tlp_avg, hps_sys.rush_cnt, str_target); else { char str1[256]; char str2[256]; snprintf(str1, sizeof(str1), "(0x%X)%s action end (%u)(%u)(%u) %s %s[%u][%u](%u) %s %s%s (%u)(%u)(%u)(%u)", ((hps_ctxt.hps_func_control << 12) | hps_sys.action_id), str_online, hps_ctxt.cur_loads, hps_ctxt.cur_tlp, hps_ctxt.cur_iowait, str_hvytsk, str_bigtsk, hps_ctxt.is_screen_off, hps_ctxt.is_idle, hps_ctxt.idle_ratio, str_pwrseq, str_criteria_limit, str_criteria_base, hps_sys.up_load_avg, hps_sys.down_load_avg, hps_sys.tlp_avg, hps_sys.rush_cnt); snprintf(str2, sizeof(str2), "[%u,%u|%u,%u|%u,%u][%u,%u,%u] [%u,%u,%u] [%u,%u,%u] [%u,%u,%u] %s", hps_sys.cluster_info[0].up_threshold, hps_sys.cluster_info[0].down_threshold, hps_sys.cluster_info[1].up_threshold, hps_sys.cluster_info[1].down_threshold, hps_sys.cluster_info[2].up_threshold, hps_sys.cluster_info[2].down_threshold, hps_sys.cluster_info[0].loading, hps_sys.cluster_info[1].loading, hps_sys.cluster_info[2].loading, hps_sys.cluster_info[0].rel_load, hps_sys.cluster_info[1].rel_load, hps_sys.cluster_info[2].rel_load, hps_sys.cluster_info[0].abs_load, hps_sys.cluster_info[1].abs_load, hps_sys.cluster_info[2].abs_load, /* sched-assist hotplug: for debug */ hps_sys.cluster_info[0].sched_load, hps_sys.cluster_info[1].sched_load, hps_sys.cluster_info[2].sched_load, str_target);#ifdef CONFIG_MEIZU_BSP if (printk_timed_ratelimit(&j, 500)) hps_warn("%s%s\n", str1, str2);#else hps_warn("%s%s\n", str1, str2);#endif //CONFIG_MEIZU_BSP#ifdef _TRACE_ trace_hps_update(hps_sys.action_id, str_online, hps_ctxt.cur_loads, hps_ctxt.cur_tlp, hps_ctxt.cur_iowait, str_hvytsk, str_criteria_limit, str_criteria_base, hps_sys.up_load_avg, hps_sys.down_load_avg, hps_sys.tlp_avg, hps_sys.rush_hps_sys.cluster_info[0].up_threshold, hps_sys.cluster_info[0].down_threshold, hps_sys.cluster_info[0].up_threshold, hps_sys.cluster_info[0].down_threshold, hps_sys.cluster_info[2].up_threshold, hps_sys.cluster_info[2].down_threshold, hps_sys.cluster_info[0].loading, hps_sys.cluster_info[1].loading, hps_sys.cluster_info[2].loading, hps_ctxt.up_times, hps_ctxt.down_times, str_target);#endif } hps_ctxt_reset_stas_nolock(); } }#if HPS_HRT_BT_EN if (hrtbt_dbg && (action_print)) { hps_set_funct_ctrl(); hps_warn("(0x%X)%s HRT_BT_DBG (%u)(%u)(%u) %s %s %s %s%s (%u)(%u)(%u)(%u) %s\n", ((hps_ctxt.hps_func_control << 12) | hps_sys.action_id), str_online, hps_ctxt.cur_loads, hps_ctxt.cur_tlp, hps_ctxt.cur_iowait, str_hvytsk, str_bigtsk, str_pwrseq, str_criteria_limit, str_criteria_base, hps_sys.up_load_avg, hps_sys.down_load_avg, hps_sys.tlp_avg, hps_sys.rush_cnt, str_target); hrtbt_dbg = 0; hps_ctxt.hps_hrt_ktime = ktime_get(); }#endif action_print = 0; action_break = 0; mutex_unlock(&hps_ctxt.lock);}
当前hps_algo_main()的算法对应有几种:
static int (*hps_func[]) (void) = {/*hps_algo_perf_indicator, hps_algo_rush_boost, hps_algo_eas, hps_algo_up, hps_algo_down};*/hps_algo_perf_indicator, hps_algo_rush_boost, hps_algo_eas};/* (1) 取perf规定的最小值 */static int hps_algo_perf_indicator(void){ unsigned int i; if (atomic_read(&hps_ctxt.is_ondemand) != 0) { /* for ondemand request */ atomic_set(&hps_ctxt.is_ondemand, 0); mutex_lock(&hps_ctxt.para_lock); for (i = 0; i < hps_sys.cluster_num; i++) hps_sys.cluster_info[i].target_core_num = max(hps_sys.cluster_info[i].base_value, hps_sys.cluster_info[i].online_core_num); mutex_unlock(&hps_ctxt.para_lock); return 1; } return 0;}/* (2) 根据当前load的值是否达到boost门限,来决定是否启动boost */static int hps_algo_rush_boost(void){ int val, base_val; unsigned int idx, total_rel_load; idx = total_rel_load = 0; for (idx = 0 ; idx < hps_sys.cluster_num ; idx++) total_rel_load += hps_sys.cluster_info[idx].rel_load; if (!hps_ctxt.rush_boost_enabled) return 0; base_val = cal_base_cores(); if (total_rel_load > hps_ctxt.rush_boost_threshold * hps_sys.total_online_cores) ++hps_ctxt.rush_count; else hps_ctxt.rush_count = 0; if (hps_ctxt.rush_boost_times == 1) hps_ctxt.tlp_avg = hps_ctxt.cur_tlp; if ((hps_ctxt.rush_count >= hps_ctxt.rush_boost_times) && (hps_sys.total_online_cores * 100 < hps_ctxt.tlp_avg)) { val = hps_ctxt.tlp_avg / 100 + (hps_ctxt.tlp_avg % 100 ? 1 : 0); WARN_ON(!(val > hps_sys.total_online_cores)); if (val > num_possible_cpus()) val = num_possible_cpus(); if (val > base_val) val -= base_val; else val = 0; hps_sys.tlp_avg = hps_ctxt.tlp_avg; hps_sys.rush_cnt = hps_ctxt.rush_count; hps_cal_core_num(&hps_sys, val, base_val); /* [MET] debug for geekbench */ met_tag_oneshot(0, "sched_rush_boost", 1); return 1; } else { /* [MET] debug for geekbench */ met_tag_oneshot(0, "sched_rush_boost", 0); return 0; }}/* (3) 根据负载来计算需要的online cpu */static int hps_algo_eas(void){ int val, ret, i; ret = 0; for (i = 0 ; i < hps_sys.cluster_num ; i++) { hps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].online_core_num; /*if up_threshold > loading > down_threshold ==> No action*/ if ((hps_sys.cluster_info[i].loading < (hps_sys.cluster_info[i].up_threshold*hps_sys.cluster_info[i].online_core_num)) && (hps_sys.cluster_info[i].loading > (hps_sys.cluster_info[i].down_threshold*hps_sys.cluster_info[i].online_core_num))) continue; /*if loading > up_threshod ==> power on cores*/ if ((hps_sys.cluster_info[i].loading > (hps_sys.cluster_info[i].up_threshold*hps_sys.cluster_info[i].online_core_num))) { val = hps_sys.cluster_info[i].loading / hps_sys.cluster_info[i].up_threshold; if (hps_sys.cluster_info[i].loading % hps_sys.cluster_info[i].up_threshold) val++; if (val <= hps_sys.cluster_info[i].limit_value) hps_sys.cluster_info[i].target_core_num = val; else hps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].limit_value; ret = 1; } else if ((hps_sys.cluster_info[i].loading < (hps_sys.cluster_info[i].down_threshold*hps_sys.cluster_info[i].online_core_num))) { /*if loading < down_threshod ==> power off cores*/ if (!hps_sys.cluster_info[i].loading) { hps_sys.cluster_info[i].target_core_num = 0; continue; } val = hps_sys.cluster_info[i].loading / hps_sys.cluster_info[i].down_threshold; if (hps_sys.cluster_info[i].loading % hps_sys.cluster_info[i].down_threshold) val++; if (val >= hps_sys.cluster_info[i].base_value) hps_sys.cluster_info[i].target_core_num = val; else hps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].base_value; ret = 1; } }#if 0 /*Check with big task criteriai*/ for (i = 1 ; i < 
hps_sys.cluster_num ; i++) { if ((!hps_sys.cluster_info[i].bigTsk_value) && (!(hps_sys.cluster_info[i].loading / hps_sys.cluster_info[i].down_threshold))) hps_sys.cluster_info[i].target_core_num = 0; }#endif return ret;}
4.5、NUMA负载均衡
arm架构(本文涉及的平台)没有使用NUMA,暂时不去解析。