Linux schedule 4: Load Balancing
4. Load Balancing
4.1 SMP Load Balancing
4.1.1 Scheduling Domains
4.1.1.1 The Concept of Scheduling Domains
Borrowing the description from the "Linux Scheduling Domains" write-up, the concept of scheduling domains can be explained as follows.
A complex high-end system can be decomposed, from the top down, like this:
- 1. It is a NUMA system: each node accesses different regions of system memory at different speeds.
- 2. It is at the same time an SMP system, made up of multiple physical CPUs (physical packages). These physical CPUs share all of the system's memory, but each has its own independent cache.
- 3. Each physical CPU consists of multiple cores, i.e. multi-core technology, also called chip-level multiprocessing (CMP). The cores are integrated on a single die; each usually has a private L1 cache but may share the L2 cache.
- 4. Each core may in turn implement several hardware threads (virtual CPUs) through technologies such as SMT (e.g. Intel Hyper-Threading). Logically each hardware thread looks like a CPU, but the threads share almost everything, including the L1 cache and even the ALUs and the power budget.
So CPUs are organized in multiple levels, and the closer two CPUs are in this hierarchy, the more resources they share. Migrating a task between CPUs therefore has a cost: from a performance point of view, the more levels the migration crosses, the larger the loss. Migration cost also has to be considered from a power point of view, which is exactly what EAS takes into account.
4.1.1.2 arm64 cpu_topology
On the arm64 architecture the CPU topology is stored in the cpu_topology[] array:
```c
/*
 * cpu topology table
 */
struct cpu_topology cpu_topology[NR_CPUS];

struct cpu_topology {
	int thread_id;
	int core_id;
	int cluster_id;             /* the cluster this cpu belongs to */
	unsigned int partno;
	cpumask_t thread_sibling;
	cpumask_t core_sibling;     /* sibling cpus at the MC level, i.e. in the same cluster */
};
```
cpu_topology[] is populated by parse_dt_topology(), which parses the topology information from the dts:
kernel_init() -> kernel_init_freeable() -> smp_prepare_cpus() -> init_cpu_topology() -> parse_dt_topology()↓static int __init parse_dt_topology(void){ struct device_node *cn, *map; int ret = 0; int cpu; /* (1) 找到dts中cpu topology的根节点"/cpus"" */ cn = of_find_node_by_path("/cpus"); if (!cn) { pr_err("No CPU information found in DT\n"); return 0; } /* * When topology is provided cpu-map is essentially a root * cluster with restricted subnodes. */ /* (2) 找到"cpu-map"节点 */ map = of_get_child_by_name(cn, "cpu-map"); if (!map) goto out; /* (3) 解析"cpu-map"中的cluster */ ret = parse_cluster(map, 0); if (ret != 0) goto out_map; /* * Check that all cores are in the topology; the SMP code will * only mark cores described in the DT as possible. */ for_each_possible_cpu(cpu) if (cpu_topology[cpu].cluster_id == -1) ret = -EINVAL;out_map: of_node_put(map);out: of_node_put(cn); return ret;}|→static int __init parse_cluster(struct device_node *cluster, int depth){ char name[10]; bool leaf = true; bool has_cores = false; struct device_node *c; static int cluster_id __initdata; int core_id = 0; int i, ret; /* * First check for child clusters; we currently ignore any * information about the nesting of clusters and present the * scheduler with a flat list of them. */ i = 0; /* (3.1) 如果有多级cluster,继续递归搜索 */ do { snprintf(name, sizeof(name), "cluster%d", i); c = of_get_child_by_name(cluster, name); if (c) { leaf = false; ret = parse_cluster(c, depth + 1); of_node_put(c); if (ret != 0) return ret; } i++; } while (c); /* Now check for cores */ i = 0; do { /* (3.2) 或者core层次的节点 */ snprintf(name, sizeof(name), "core%d", i); c = of_get_child_by_name(cluster, name); if (c) { has_cores = true; if (depth == 0) { pr_err("%s: cpu-map children should be clusters\n", c->full_name); of_node_put(c); return -EINVAL; } if (leaf) { /* (3.3) 如果是叶子cluster节点,继续遍历core中的cpu节点 */ ret = parse_core(c, cluster_id, core_id++); } else { pr_err("%s: Non-leaf cluster with core %s\n", cluster->full_name, name); ret = -EINVAL; } of_node_put(c); if (ret != 0) return ret; } i++; } while (c); if (leaf && !has_cores) pr_warn("%s: empty cluster\n", cluster->full_name); if (leaf) cluster_id++; return 0;}||→static int __init parse_core(struct device_node *core, int cluster_id, int core_id){ char name[10]; bool leaf = true; int i = 0; int cpu; struct device_node *t; do { /* (3.3.1) 如果存在thread层级,解析thread和cpu层级 */ snprintf(name, sizeof(name), "thread%d", i); t = of_get_child_by_name(core, name); if (t) { leaf = false; cpu = get_cpu_for_node(t); if (cpu >= 0) { cpu_topology[cpu].cluster_id = cluster_id; cpu_topology[cpu].core_id = core_id; cpu_topology[cpu].thread_id = i; } else { pr_err("%s: Can't get CPU for thread\n", t->full_name); of_node_put(t); return -EINVAL; } of_node_put(t); } i++; } while (t); /* (3.3.2) 否则直接解析cpu层级 */ cpu = get_cpu_for_node(core); if (cpu >= 0) { if (!leaf) { pr_err("%s: Core has both threads and CPU\n", core->full_name); return -EINVAL; } /* (3.3.3) 得到了cpu的cluster_id/core_id */ cpu_topology[cpu].cluster_id = cluster_id; cpu_topology[cpu].core_id = core_id; } else if (leaf) { pr_err("%s: Can't get CPU for leaf core\n", core->full_name); return -EINVAL; } return 0;}|||→static int __init get_cpu_for_node(struct device_node *node){ struct device_node *cpu_node; int cpu; cpu_node = of_parse_phandle(node, "cpu", 0); if (!cpu_node) return -1; for_each_possible_cpu(cpu) { if (of_get_cpu_node(cpu, NULL) == cpu_node) { of_node_put(cpu_node); return cpu; } } pr_crit("Unable to find CPU node for %s\n", cpu_node->full_name); 
of_node_put(cpu_node); return -1;}
The same-level sibling relationships, cpu_topology[cpu].core_sibling and cpu_topology[cpu].thread_sibling, are updated in update_siblings_masks():
```c
kernel_init() -> kernel_init_freeable() -> smp_prepare_cpus() -> store_cpu_topology() -> update_siblings_masks()
↓
static void update_siblings_masks(unsigned int cpuid)
{
	struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
	int cpu;

	/* update core and thread sibling masks */
	for_each_possible_cpu(cpu) {
		cpu_topo = &cpu_topology[cpu];

		if (cpuid_topo->cluster_id != cpu_topo->cluster_id)
			continue;

		cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
		if (cpu != cpuid)
			cpumask_set_cpu(cpu, &cpuid_topo->core_sibling);

		if (cpuid_topo->core_id != cpu_topo->core_id)
			continue;

		cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling);
		if (cpu != cpuid)
			cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling);
	}
}
```
Take mt6799 as an example: its topology is "4*A35 + 4*A53 + 2*A73", defined in the dts as follows:
mt6799.dtsi:cpus { #address-cells = <1>; #size-cells = <0>; cpu0: cpu@0 { device_type = "cpu"; compatible = "arm,cortex-a35"; reg = <0x000>; enable-method = "psci"; cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>, <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>; cpu-release-addr = <0x0 0x40000200>; clock-frequency = <1248000000>; }; cpu1: cpu@001 { device_type = "cpu"; compatible = "arm,cortex-a35"; reg = <0x001>; enable-method = "psci"; cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>, <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>; cpu-release-addr = <0x0 0x40000200>; clock-frequency = <1248000000>; }; cpu2: cpu@002 { device_type = "cpu"; compatible = "arm,cortex-a35"; reg = <0x002>; enable-method = "psci"; cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>, <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>; cpu-release-addr = <0x0 0x40000200>; clock-frequency = <1248000000>; }; cpu3: cpu@003 { device_type = "cpu"; compatible = "arm,cortex-a35"; reg = <0x003>; enable-method = "psci"; cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>, <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>; cpu-release-addr = <0x0 0x40000200>; clock-frequency = <1248000000>; }; cpu4: cpu@100 { device_type = "cpu"; compatible = "arm,cortex-a53"; reg = <0x100>; enable-method = "psci"; cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>, <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>; cpu-release-addr = <0x0 0x40000200>; clock-frequency = <1378000000>; }; cpu5: cpu@101 { device_type = "cpu"; compatible = "arm,cortex-a53"; reg = <0x101>; enable-method = "psci"; cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>, <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>; cpu-release-addr = <0x0 0x40000200>; clock-frequency = <1378000000>; }; cpu6: cpu@102 { device_type = "cpu"; compatible = "arm,cortex-a53"; reg = <0x102>; enable-method = "psci"; cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>, <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>; cpu-release-addr = <0x0 0x40000200>; clock-frequency = <1378000000>; }; cpu7: cpu@103 { device_type = "cpu"; compatible = "arm,cortex-a53"; reg = <0x103>; enable-method = "psci"; cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>, <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>; cpu-release-addr = <0x0 0x40000200>; clock-frequency = <1378000000>; }; cpu8: cpu@200 { device_type = "cpu"; compatible = "arm,cortex-a73"; reg = <0x200>; enable-method = "psci"; cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>, <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>; cpu-release-addr = <0x0 0x40000200>; clock-frequency = <1638000000>; }; cpu9: cpu@201 { device_type = "cpu"; compatible = "arm,cortex-a73"; reg = <0x201>; enable-method = "psci"; cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>, <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>; cpu-release-addr = <0x0 0x40000200>; clock-frequency = <1638000000>; }; cpu-map { cluster0 { core0 { cpu = <&cpu0>; }; core1 { cpu = <&cpu1>; }; core2 { cpu = <&cpu2>; }; core3 { cpu = <&cpu3>; }; }; cluster1 { core0 { cpu = <&cpu4>; }; core1 { cpu = <&cpu5>; }; core2 { cpu = <&cpu6>; }; core3 { cpu = <&cpu7>; }; }; cluster2 { core0 { cpu = <&cpu8>; }; core1 { cpu = <&cpu9>; }; }; };
After parse_dt_topology() and update_siblings_masks() have run, the resulting cpu_topology[] values are:
cpu 0 cluster_id = 0, core_id = 0, core_sibling = 0xf
cpu 1 cluster_id = 0, core_id = 1, core_sibling = 0xf
cpu 2 cluster_id = 0, core_id = 2, core_sibling = 0xf
cpu 3 cluster_id = 0, core_id = 3, core_sibling = 0xf
cpu 4 cluster_id = 1, core_id = 0, core_sibling = 0xf0
cpu 5 cluster_id = 1, core_id = 1, core_sibling = 0xf0
cpu 6 cluster_id = 1, core_id = 2, core_sibling = 0xf0
cpu 7 cluster_id = 1, core_id = 3, core_sibling = 0xf0
cpu 8 cluster_id = 2, core_id = 0, core_sibling = 0x300
cpu 9 cluster_id = 2, core_id = 1, core_sibling = 0x300
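These masks can be cross-checked from user space through the standard sysfs CPU-topology files (on arm64, physical_package_id is backed by cpu_topology[cpu].cluster_id). The following is a minimal sketch, not part of the kernel code above; the hard-coded core count and the lack of error handling are simplifications for illustration:

```c
#include <stdio.h>

/* Dump package id and core_siblings for each CPU from sysfs,
 * to compare against the cpu_topology[] values listed above. */
int main(void)
{
	char path[128], buf[64];

	for (int cpu = 0; cpu < 10; cpu++) {	/* mt6799: 10 cores */
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/cpu%d/topology/physical_package_id", cpu);
		f = fopen(path, "r");
		if (!f)
			continue;		/* cpu not present or offline */
		if (fgets(buf, sizeof(buf), f))
			printf("cpu%d package_id=%s", cpu, buf);
		fclose(f);

		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/cpu%d/topology/core_siblings", cpu);
		f = fopen(path, "r");
		if (f) {
			if (fgets(buf, sizeof(buf), f))
				printf("      core_siblings=%s", buf);
			fclose(f);
		}
	}
	return 0;
}
```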
4.1.1.3 Scheduling Domain Initialization
In kernel_init_freeable(), smp_prepare_cpus() is called to set up the CPU topology, smp_init() then brings the CPUs online, and right after that sched_init_smp() is called to initialize the system's scheduling domains.
By default there are three possible topology levels: SMT/MC/DIE. ARM does not currently support hardware multithreading, so only two levels are used: MC/DIE.
```c
/*
 * Topology list, bottom-up.
 */
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};
```
The SDTL (sched_domain_topology_level) table used by arm64 is:
```c
static struct sched_domain_topology_level arm64_topology[] = {
#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) },
	{ NULL, },
};
```
The scheduling-domain initialization code is analyzed in detail below:
kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> init_sched_domains(cpu_active_mask):↓static int init_sched_domains(const struct cpumask *cpu_map){ int err; arch_update_cpu_topology(); /* (1) 当前只有一个schedule domain需要初始化 */ ndoms_cur = 1; doms_cur = alloc_sched_domains(ndoms_cur); if (!doms_cur) doms_cur = &fallback_doms; /* (2) 按照传入的cpu_active_mask,构造sched_domains */ cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); err = build_sched_domains(doms_cur[0], NULL); /* (3) 注册“/proc/sys/kernel/sched_domain/” */ register_sched_domain_sysctl(); return err;}|→static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr){ enum s_alloc alloc_state; struct sched_domain *sd; struct s_data d; struct rq *rq = NULL; int i, ret = -ENOMEM; /* (2.1) 在每个tl层次,给每个cpu分配sd、sg、sgc空间 */ alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) goto error; /* Set up domains for cpus specified by the cpu_map. */ for_each_cpu(i, cpu_map) { struct sched_domain_topology_level *tl; sd = NULL; for_each_sd_topology(tl) { /* (2.2) 初始化sd 构造其不同tl之间的sd的parent、cild关系 按照SDTL传入的tl->mask()函数,给sd->span[]赋值 */ sd = build_sched_domain(tl, cpu_map, attr, sd, i); /* (2.2.1) 将最底层tl的sd赋值给d.sd */ if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) sd->flags |= SD_OVERLAP; if (cpumask_equal(cpu_map, sched_domain_span(sd))) break; } } /* Build the groups for the domains */ for_each_cpu(i, cpu_map) { for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { /* (2.3) 给sd->span_weight赋值 */ sd->span_weight = cpumask_weight(sched_domain_span(sd)); if (sd->flags & SD_OVERLAP) { if (build_overlap_sched_groups(sd, i)) goto error; } else { /* (2.4) 按照span,构造每个tl层次中,sd、sg之间的关系 */ if (build_sched_groups(sd, i)) goto error; } } } /* Calculate CPU capacity for physical packages and nodes */ for (i = nr_cpumask_bits-1; i >= 0; i--) { struct sched_domain_topology_level *tl = sched_domain_topology; if (!cpumask_test_cpu(i, cpu_map)) continue; for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) { /* (2.5) 初始化sg->sge对应的energy表 */ init_sched_energy(i, sd, tl->energy); /* (2.6) 对有人引用的sd、sg、sgc进行标识, 无人引用的sd、sg、sgc在__free_domain_allocs()中会被释放 */ claim_allocations(i, sd); /* (2.7) 初始化每个tl层级的sgc->capacity */ init_sched_groups_capacity(i, sd); } } /* Attach the domains */ rcu_read_lock(); /* (2.8) 将d.rd赋值给rq->sd 将d.rd赋值给rq->rd */ for_each_cpu(i, cpu_map) { rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); cpu_attach_domain(sd, d.rd, i); } rcu_read_unlock(); ret = 0;error: /* (2.9) free掉分配失败/分配成功多余的内存 */ __free_domain_allocs(&d, alloc_state, cpu_map); return ret;}||→static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map){ memset(d, 0, sizeof(*d)); /* (2.1.1) 每个tl层次,给每个cpu都分配sd、sg、sgc, tl->data->sd、l->data->sg、l->data->sgc */ if (__sdt_alloc(cpu_map)) return sa_sd_storage; /* (2.1.2) 分配d->sd指针空间 实际d->sd会指向最底层tl的tl->data->sd */ d->sd = alloc_percpu(struct sched_domain *); if (!d->sd) return sa_sd_storage; /* (2.1.3) 分配d->rd的指针空间和实际空间 rd = root_domain */ d->rd = alloc_rootdomain(); if (!d->rd) return sa_sd; return sa_rootdomain;}||→struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu){ struct sched_domain *sd = sd_init(tl, cpu); if (!sd) return child; /* (2.2.1) 根据tl->mask()初始化sd->sapn[] */ cpumask_and(sched_domain_span(sd), cpu_map, 
tl->mask(cpu)); if (child) { sd->level = child->level + 1; sched_domain_level_max = max(sched_domain_level_max, sd->level); /* (2.2.2) 如果有多层tl,建立起sd之间的parent/child关系, 对arm来说:MC层tl->data->sd是child,DIE层tl->data->sd是parent */ child->parent = sd; sd->child = child; if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) { pr_err("BUG: arch topology borken\n");#ifdef CONFIG_SCHED_DEBUG pr_err(" the %s domain not a subset of the %s domain\n", child->name, sd->name);#endif /* Fixup, ensure @sd has at least @child cpus. */ cpumask_or(sched_domain_span(sd), sched_domain_span(sd), sched_domain_span(child)); } } set_domain_attribute(sd, attr); return sd;}||→static intbuild_sched_groups(struct sched_domain *sd, int cpu){ struct sched_group *first = NULL, *last = NULL; struct sd_data *sdd = sd->private; const struct cpumask *span = sched_domain_span(sd); struct cpumask *covered; int i; /* (2.4.1) 根据sd->span[]建立起sd、sg之间的关系 , 如果sd没有child,每个cpu的sd、sg之间建立链接 如果sd有child,每个cpu的sd和span中第一个cpu的sg建立链接 */ get_group(cpu, sdd, &sd->groups); atomic_inc(&sd->groups->ref); if (cpu != cpumask_first(span)) return 0; lockdep_assert_held(&sched_domains_mutex); covered = sched_domains_tmpmask; cpumask_clear(covered); /* (2.4.2) 挑选有sd链接的sg,给其中的sg->cpumask[]成员赋值 */ for_each_cpu(i, span) { struct sched_group *sg; int group, j; if (cpumask_test_cpu(i, covered)) continue; group = get_group(i, sdd, &sg); cpumask_setall(sched_group_mask(sg)); for_each_cpu(j, span) { if (get_group(j, sdd, NULL) != group) continue; cpumask_set_cpu(j, covered); cpumask_set_cpu(j, sched_group_cpus(sg)); } /* (2.4.3) 挑选有sd链接的sg,将同一层级sg链接成链表, */ if (!first) first = sg; if (last) last->next = sg; last = sg; } last->next = first; return 0;}||→static void init_sched_energy(int cpu, struct sched_domain *sd, sched_domain_energy_f fn){ if (!(fn && fn(cpu))) return; if (cpu != group_balance_cpu(sd->groups)) return; if (sd->child && !sd->child->groups->sge) { pr_err("BUG: EAS setup broken for CPU%d\n", cpu);#ifdef CONFIG_SCHED_DEBUG pr_err(" energy data on %s but not on %s domain\n", sd->name, sd->child->name);#endif return; } check_sched_energy_data(cpu, fn, sched_group_cpus(sd->groups)); /* (2.5.1) 不同层级tl,按照tl->energy()给sg->sge赋值 */ sd->groups->sge = fn(cpu);}||→static void claim_allocations(int cpu, struct sched_domain *sd){ struct sd_data *sdd = sd->private; /* (2.6.1) 对有人使用的tl->data->sd、tl->data->sg、tl->data->sgc置空, 无人使用的空间,将会在__free_domain_allocs()中被释放 */ WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); *per_cpu_ptr(sdd->sd, cpu) = NULL; if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) *per_cpu_ptr(sdd->sg, cpu) = NULL; if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) *per_cpu_ptr(sdd->sgc, cpu) = NULL;}||→static void init_sched_groups_capacity(int cpu, struct sched_domain *sd){ struct sched_group *sg = sd->groups; WARN_ON(!sg); do { /* (2.7.1) 更新sg->group_weight的值 */ sg->group_weight = cpumask_weight(sched_group_cpus(sg)); sg = sg->next; } while (sg != sd->groups); if (cpu != group_balance_cpu(sg)) return; /* (2.7.2) 更新sgc->capacity的值 */ update_group_capacity(sd, cpu); /* (2.7.3) 更新sgc->nr_busy_cpus的值 */ atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);}|||→void update_group_capacity(struct sched_domain *sd, int cpu){ struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; unsigned long capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); interval = clamp(interval, 1UL, max_load_balance_interval); sdg->sgc->next_update = jiffies + interval; if (!child) { /* (2.7.2.1) 
如果sd没有child是最底层tl, 则调用arch_scale_cpu_capacity()获取最大运算能力,并减去rt进程的消耗rq->rt_avg, 得到本sd的sg->sgc->capacity */ update_cpu_capacity(sd, cpu); return; } capacity = 0; if (child->flags & SD_OVERLAP) { /* * SD_OVERLAP domains cannot assume that child groups * span the current group. */ for_each_cpu(cpu, sched_group_cpus(sdg)) { struct sched_group_capacity *sgc; struct rq *rq = cpu_rq(cpu); /* * build_sched_domains() -> init_sched_groups_capacity() * gets here before we've attached the domains to the * runqueues. * * Use capacity_of(), which is set irrespective of domains * in update_cpu_capacity(). * * This avoids capacity from being 0 and * causing divide-by-zero issues on boot. */ if (unlikely(!rq->sd)) { capacity += capacity_of(cpu); continue; } sgc = rq->sd->groups->sgc; capacity += sgc->capacity; } } else { /* * !SD_OVERLAP domains can assume that child groups * span the current group. */ /* (2.7.2.2) 如果sd有child不是最底层tl, 则sgc->capacity等于所有child sg的group->sgc->capacity的和 */ group = child->groups; do { capacity += group->sgc->capacity; group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity;}||||→static void update_cpu_capacity(struct sched_domain *sd, int cpu){ unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); struct sched_group *sdg = sd->groups; struct max_cpu_capacity *mcc; unsigned long max_capacity; int max_cap_cpu; unsigned long flags; /* (2.7.2.1.1) 根据arch_scale_cpu_capacity获取到本cpu最大/orig capacity */ cpu_rq(cpu)->cpu_capacity_orig = capacity; mcc = &cpu_rq(cpu)->rd->max_cpu_capacity; raw_spin_lock_irqsave(&mcc->lock, flags); max_capacity = mcc->val; max_cap_cpu = mcc->cpu; if ((max_capacity > capacity && max_cap_cpu == cpu) || (max_capacity < capacity)) { mcc->val = capacity; mcc->cpu = cpu;#ifdef CONFIG_SCHED_DEBUG raw_spin_unlock_irqrestore(&mcc->lock, flags); /* pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity); */ goto skip_unlock;#endif } raw_spin_unlock_irqrestore(&mcc->lock, flags);skip_unlock: __attribute__ ((unused)); /* (2.7.2.1.2) 减去rt消耗的capacity, rq->rt_avg/(sched_avg_period() + delta)是rt进程占用cpu的比例, 剩下就为cfs可用的capacity */ capacity *= scale_rt_capacity(cpu); capacity >>= SCHED_CAPACITY_SHIFT; if (!capacity) capacity = 1; cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity;}
init_sched_domains() builds the sched_domains at system boot. When CPU hotplug changes which CPUs are online, partition_sched_domains() is called to rebuild the system's sched_domains.
cpu_up() -> _cpu_up() -> __raw_notifier_call_chain() -> cpuset_cpu_active() -> cpuset_update_active_cpus() -> partition_sched_domains() -> build_sched_domains();void __init sched_init_smp(void){ hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);}static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu){ switch (action) { case CPU_ONLINE_FROZEN: case CPU_DOWN_FAILED_FROZEN: /* * num_cpus_frozen tracks how many CPUs are involved in suspend * resume sequence. As long as this is not the last online * operation in the resume sequence, just build a single sched * domain, ignoring cpusets. */ num_cpus_frozen--; if (likely(num_cpus_frozen)) { partition_sched_domains(1, NULL, NULL); break; } /* * This is the last CPU online operation. So fall through and * restore the original sched domains by considering the * cpuset configurations. */ case CPU_ONLINE: cpuset_update_active_cpus(true); break; default: return NOTIFY_DONE; } return NOTIFY_OK;}static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu){ unsigned long flags; long cpu = (long)hcpu; struct dl_bw *dl_b; bool overflow; int cpus; switch (action) { case CPU_DOWN_PREPARE: rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); cpus = dl_bw_cpus(cpu); overflow = __dl_overflow(dl_b, cpus, 0, 0); raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); if (overflow) return notifier_from_errno(-EBUSY); cpuset_update_active_cpus(false); break; case CPU_DOWN_PREPARE_FROZEN: num_cpus_frozen++; partition_sched_domains(1, NULL, NULL); break; default: return NOTIFY_DONE; } return NOTIFY_OK;}
4.1.1.4 Scheduling Domains on mt6799
At boot "maxcpus=8" is passed on the cmdline, so setup_max_cpus = 8 and SMP only brings up 8 cores; the other 2 big cores of mt6799 are started later. Let's look at what the scheduling domains look like when the system has booted with these 8 cores.
During boot, every topology level (tl) allocates sd, sg and sgc memory for every cpu, but once the effective links are built some of the sg/sgc structures are never used. The unused memory is marked in claim_allocations() and then freed by __free_domain_allocs() before build_sched_domains() returns.
kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> init_sched_domains() -> build_sched_domains() -> __visit_domain_allocation_hell() -> __sdt_alloc():[__sdt_alloc][tl MC] cpu0, &sd = 0xffffffc15663c600, &sg = 0xffffffc156062600, &sgc = 0xffffffc156062780 [__sdt_alloc][tl MC] cpu1, &sd = 0xffffffc15608f000, &sg = 0xffffffc156056780, &sgc = 0xffffffc156090000 [__sdt_alloc][tl MC] cpu2, &sd = 0xffffffc15608fc00, &sg = 0xffffffc156090d80, &sgc = 0xffffffc156090180 [__sdt_alloc][tl MC] cpu3, &sd = 0xffffffc15608f300, &sg = 0xffffffc156090c00, &sgc = 0xffffffc156090300 [__sdt_alloc][tl MC] cpu4, &sd = 0xffffffc15608f900, &sg = 0xffffffc156090a80, &sgc = 0xffffffc156090480 [__sdt_alloc][tl MC] cpu5, &sd = 0xffffffc15608f600, &sg = 0xffffffc156090900, &sgc = 0xffffffc156090600 [__sdt_alloc][tl MC] cpu6, &sd = 0xffffffc156091000, &sg = 0xffffffc156090780, &sgc = 0xffffffc156092000 [__sdt_alloc][tl MC] cpu7, &sd = 0xffffffc156091c00, &sg = 0xffffffc156092d80, &sgc = 0xffffffc156092180 [__sdt_alloc][tl DIE] cpu0, &sd = 0xffffffc156091300, &sg = 0xffffffc156092c00, &sgc = 0xffffffc156092300 [__sdt_alloc][tl DIE] cpu1, &sd = 0xffffffc156091900, &sg = 0xffffffc156092a80, &sgc = 0xffffffc156092480 [__sdt_alloc][tl DIE] cpu2, &sd = 0xffffffc156091600, &sg = 0xffffffc156092900, &sgc = 0xffffffc156092600 [__sdt_alloc][tl DIE] cpu3, &sd = 0xffffffc156093000, &sg = 0xffffffc156092780, &sgc = 0xffffffc156094000 [__sdt_alloc][tl DIE] cpu4, &sd = 0xffffffc156093c00, &sg = 0xffffffc156094d80, &sgc = 0xffffffc156094180 [__sdt_alloc][tl DIE] cpu5, &sd = 0xffffffc156093300, &sg = 0xffffffc156094c00, &sgc = 0xffffffc156094300 [__sdt_alloc][tl DIE] cpu6, &sd = 0xffffffc156093900, &sg = 0xffffffc156094a80, &sgc = 0xffffffc156094480 [__sdt_alloc][tl DIE] cpu7, &sd = 0xffffffc156093600, &sg = 0xffffffc156094900, &sgc = 0xffffffc156094600
After the links have been built, the sd/sg relationships at each tl level are:
kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> init_sched_domains() -> build_sched_domains() -> build_sched_groups():[build_sched_domains][tl MC] cpu0, sd->groups=0xffffffc156062600, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf[build_sched_domains][tl MC] cpu0, sg->sgc=0xffffffc156062780, sg->next=0xffffffc156056780, sg->group_weight=0, sg->cpumask[]=0x1[build_sched_domains][tl MC] cpu0, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff[build_sched_domains][tl MC] cpu0, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0 [build_sched_domains][tl MC] cpu0, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|[build_sched_domains][tl MC] cpu1, sd->groups=0xffffffc156056780, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf[build_sched_domains][tl MC] cpu1, sg->sgc=0xffffffc156090000, sg->next=0xffffffc156090d80, sg->group_weight=0, sg->cpumask[]=0x2[build_sched_domains][tl MC] cpu1, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff[build_sched_domains][tl MC] cpu1, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0 [build_sched_domains][tl MC] cpu1, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|[build_sched_domains][tl MC] cpu2, sd->groups=0xffffffc156090d80, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf[build_sched_domains][tl MC] cpu2, sg->sgc=0xffffffc156090180, sg->next=0xffffffc156090c00, sg->group_weight=0, sg->cpumask[]=0x4[build_sched_domains][tl MC] cpu2, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff[build_sched_domains][tl MC] cpu2, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0 [build_sched_domains][tl MC] cpu2, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|[build_sched_domains][tl MC] cpu3, sd->groups=0xffffffc156090c00, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf[build_sched_domains][tl MC] cpu3, sg->sgc=0xffffffc156090300, sg->next=0xffffffc156062600, sg->group_weight=0, sg->cpumask[]=0x8[build_sched_domains][tl MC] cpu3, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff[build_sched_domains][tl MC] cpu3, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0 [build_sched_domains][tl MC] cpu3, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|[build_sched_domains][tl MC] 
cpu4, sd->groups=0xffffffc156090a80, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf0[build_sched_domains][tl MC] cpu4, sg->sgc=0xffffffc156090480, sg->next=0xffffffc156090900, sg->group_weight=0, sg->cpumask[]=0x10[build_sched_domains][tl MC] cpu4, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff[build_sched_domains][tl MC] cpu4, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0 [build_sched_domains][tl MC] cpu4, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|[build_sched_domains][tl MC] cpu5, sd->groups=0xffffffc156090900, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf0[build_sched_domains][tl MC] cpu5, sg->sgc=0xffffffc156090600, sg->next=0xffffffc156090780, sg->group_weight=0, sg->cpumask[]=0x20[build_sched_domains][tl MC] cpu5, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff[build_sched_domains][tl MC] cpu5, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0 [build_sched_domains][tl MC] cpu5, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|[build_sched_domains][tl MC] cpu6, sd->groups=0xffffffc156090780, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf0[build_sched_domains][tl MC] cpu6, sg->sgc=0xffffffc156092000, sg->next=0xffffffc156092d80, sg->group_weight=0, sg->cpumask[]=0x40[build_sched_domains][tl MC] cpu6, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff[build_sched_domains][tl MC] cpu6, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0 [build_sched_domains][tl MC] cpu6, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|[build_sched_domains][tl MC] cpu7, sd->groups=0xffffffc156092d80, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf0[build_sched_domains][tl MC] cpu7, sg->sgc=0xffffffc156092180, sg->next=0xffffffc156090a80, sg->group_weight=0, sg->cpumask[]=0x80[build_sched_domains][tl MC] cpu7, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff[build_sched_domains][tl MC] cpu7, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0 [build_sched_domains][tl MC] cpu7, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|[build_sched_domains][tl DIE] cpu0, sd->groups=0xffffffc156092c00, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff[build_sched_domains][tl DIE] cpu0, 
sg->sgc=0xffffffc156092300, sg->next=0xffffffc156094d80, sg->group_weight=0, sg->cpumask[]=0xf[build_sched_domains][tl DIE] cpu0, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff[build_sched_domains][tl DIE] cpu0, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1 [build_sched_domains][tl DIE] cpu0, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|[build_sched_domains][tl DIE] cpu1, sd->groups=0xffffffc156092c00, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff[build_sched_domains][tl DIE] cpu1, sg->sgc=0x0, sg->next=0xffffffc156092a80, sg->group_weight=0, sg->cpumask[]=0x0[build_sched_domains][tl DIE] cpu1, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0[build_sched_domains][tl DIE] cpu1, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1 [build_sched_domains][tl DIE] cpu1, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|[build_sched_domains][tl DIE] cpu2, sd->groups=0xffffffc156092c00, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff[build_sched_domains][tl DIE] cpu2, sg->sgc=0x0, sg->next=0xffffffc156092900, sg->group_weight=0, sg->cpumask[]=0x0[build_sched_domains][tl DIE] cpu2, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0[build_sched_domains][tl DIE] cpu2, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1 [build_sched_domains][tl DIE] cpu2, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|[build_sched_domains][tl DIE] cpu3, sd->groups=0xffffffc156092c00, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff[build_sched_domains][tl DIE] cpu3, sg->sgc=0x0, sg->next=0xffffffc156092780, sg->group_weight=0, sg->cpumask[]=0x0[build_sched_domains][tl DIE] cpu3, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0[build_sched_domains][tl DIE] cpu3, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1 [build_sched_domains][tl DIE] cpu3, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|[build_sched_domains][tl DIE] cpu4, sd->groups=0xffffffc156094d80, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff[build_sched_domains][tl DIE] cpu4, sg->sgc=0xffffffc156094180, sg->next=0xffffffc156092c00, sg->group_weight=0, sg->cpumask[]=0xf0[build_sched_domains][tl DIE] cpu4, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff[build_sched_domains][tl DIE] cpu4, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, 
sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1 [build_sched_domains][tl DIE] cpu4, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|[build_sched_domains][tl DIE] cpu5, sd->groups=0xffffffc156094d80, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff[build_sched_domains][tl DIE] cpu5, sg->sgc=0x0, sg->next=0xffffffc156094c00, sg->group_weight=0, sg->cpumask[]=0x0[build_sched_domains][tl DIE] cpu5, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0[build_sched_domains][tl DIE] cpu5, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1 [build_sched_domains][tl DIE] cpu5, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|[build_sched_domains][tl DIE] cpu6, sd->groups=0xffffffc156094d80, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff[build_sched_domains][tl DIE] cpu6, sg->sgc=0x0, sg->next=0xffffffc156094a80, sg->group_weight=0, sg->cpumask[]=0x0[build_sched_domains][tl DIE] cpu6, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0[build_sched_domains][tl DIE] cpu6, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1 [build_sched_domains][tl DIE] cpu6, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|[build_sched_domains][tl DIE] cpu7, sd->groups=0xffffffc156094d80, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff[build_sched_domains][tl DIE] cpu7, sg->sgc=0x0, sg->next=0xffffffc156094900, sg->group_weight=0, sg->cpumask[]=0x0[build_sched_domains][tl DIE] cpu7, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0[build_sched_domains][tl DIE] cpu7, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1 [build_sched_domains][tl DIE] cpu7, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|
Expressed graphically, the relationships look as follows (figure omitted here).
The parameters inside each sched_domain are also very important. They are initialized in sd_init(), and the SMP load balancer uses these parameters and flags constantly:
| sd parameter | tl MC level | tl DIE level |
|---|---|---|
| sd->min_interval | 4 | 8 |
| sd->max_interval | 8 | 16 |
| sd->busy_factor | 32 | 32 |
| sd->imbalance_pct | 117 | 125 |
| sd->cache_nice_tries | 1 | 1 |
| sd->busy_idx | 2 | 2 |
| sd->idle_idx | 0 | 1 |
| sd->newidle_idx | 0 | 0 |
| sd->wake_idx | 0 | 0 |
| sd->forkexec_idx | 0 | 0 |
| sd->span_weight | 4 | 8 |
| sd->balance_interval | 4 | 8 |
| sd->level | 0 | 1 |
| sd->flags | 0x832f: SD_LOAD_BALANCE, SD_BALANCE_NEWIDLE, SD_BALANCE_EXEC, SD_BALANCE_FORK, SD_WAKE_AFFINE, SD_SHARE_POWERDOMAIN, SD_SHARE_PKG_RESOURCES, SD_SHARE_CAP_STATES | 0x102f: SD_LOAD_BALANCE, SD_BALANCE_NEWIDLE, SD_BALANCE_EXEC, SD_BALANCE_FORK, SD_WAKE_AFFINE, SD_PREFER_SIBLING |

update_top_cache_domain() additionally caches several commonly used sd pointers; from the debug prints we can see which level of sd each cached pointer actually corresponds to:
| cached sd | definition | value |
|---|---|---|
| sd_busy | per_cpu(sd_busy, cpu) | this cpu's tl DIE level sd |
| sd_llc | per_cpu(sd_llc, cpu) | this cpu's tl MC level sd |
| sd_llc_size | per_cpu(sd_llc_size, cpu) | 4 |
| sd_llc_id | per_cpu(sd_llc_id, cpu) | 0/4 |
| sd_numa | per_cpu(sd_numa, cpu) | 0 |
| sd_asym | per_cpu(sd_asym, cpu) | 0 |
| sd_ea | per_cpu(sd_ea, cpu) | this cpu's tl DIE level sd |
| sd_scs | per_cpu(sd_scs, cpu) | this cpu's tl MC level sd |

```c
static void update_top_cache_domain(int cpu)
{
	struct sched_domain *sd;
	struct sched_domain *busy_sd = NULL, *ea_sd = NULL;
	int id = cpu;
	int size = 1;

	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
	if (sd) {
		id = cpumask_first(sched_domain_span(sd));
		size = cpumask_weight(sched_domain_span(sd));
		busy_sd = sd->parent; /* sd_busy */
	}
	rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
	per_cpu(sd_llc_size, cpu) = size;
	per_cpu(sd_llc_id, cpu) = id;

	sd = lowest_flag_domain(cpu, SD_NUMA);
	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);

	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
	rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);

	for_each_domain(cpu, sd) {
		if (sd->groups->sge)
			ea_sd = sd;
		else
			break;
	}
	rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);

	sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES);
	rcu_assign_pointer(per_cpu(sd_scs, cpu), sd);
}
```
[update_top_cache_domain] cpu0, sd_busy=0xffffffc156091300, sd_llc=0xffffffc15663c600, sd_llc_size=4, sd_llc_id=0, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156091300, sd_scs=0xffffffc15663c600
[update_top_cache_domain] cpu1, sd_busy=0xffffffc156091900, sd_llc=0xffffffc15608f000, sd_llc_size=4, sd_llc_id=0, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156091900, sd_scs=0xffffffc15608f000
[update_top_cache_domain] cpu2, sd_busy=0xffffffc156091600, sd_llc=0xffffffc15608fc00, sd_llc_size=4, sd_llc_id=0, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156091600, sd_scs=0xffffffc15608fc00
[update_top_cache_domain] cpu3, sd_busy=0xffffffc156093000, sd_llc=0xffffffc15608f300, sd_llc_size=4, sd_llc_id=0, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093000, sd_scs=0xffffffc15608f300
[update_top_cache_domain] cpu4, sd_busy=0xffffffc156093c00, sd_llc=0xffffffc15608f900, sd_llc_size=4, sd_llc_id=4, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093c00, sd_scs=0xffffffc15608f900
[update_top_cache_domain] cpu5, sd_busy=0xffffffc156093300, sd_llc=0xffffffc15608f600, sd_llc_size=4, sd_llc_id=4, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093300, sd_scs=0xffffffc15608f600
[update_top_cache_domain] cpu6, sd_busy=0xffffffc156093900, sd_llc=0xffffffc156091000, sd_llc_size=4, sd_llc_id=4, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093900, sd_scs=0xffffffc156091000
[update_top_cache_domain] cpu7, sd_busy=0xffffffc156093600, sd_llc=0xffffffc156091c00, sd_llc_size=4, sd_llc_id=4, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093600, sd_scs=0xffffffc156091c00
The tables mt6799 uses when computing energy and compute capacity are as follows:
kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> init_sched_domains() -> build_sched_domains() -> init_sched_energy()/init_sched_groups_capacity();/* v1 FY */struct upower_tbl_info upower_tbl_infos_FY[NR_UPOWER_BANK] = { INIT_UPOWER_TBL_INFOS(UPOWER_BANK_LL, upower_tbl_ll_1_FY), INIT_UPOWER_TBL_INFOS(UPOWER_BANK_L, upower_tbl_l_1_FY), INIT_UPOWER_TBL_INFOS(UPOWER_BANK_B, upower_tbl_b_1_FY), INIT_UPOWER_TBL_INFOS(UPOWER_BANK_CLS_LL, upower_tbl_cluster_ll_1_FY), INIT_UPOWER_TBL_INFOS(UPOWER_BANK_CLS_L, upower_tbl_cluster_l_1_FY), INIT_UPOWER_TBL_INFOS(UPOWER_BANK_CLS_B, upower_tbl_cluster_b_1_FY), INIT_UPOWER_TBL_INFOS(UPOWER_BANK_CCI, upower_tbl_cci_1_FY),};/* ver1 *//* FY table */struct upower_tbl upower_tbl_ll_1_FY = { .row = { {.cap = 100, .volt = 75000, .dyn_pwr = 9994, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} }, {.cap = 126, .volt = 75000, .dyn_pwr = 12585, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} }, {.cap = 148, .volt = 75000, .dyn_pwr = 14806, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} }, {.cap = 167, .volt = 75000, .dyn_pwr = 16656, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} }, {.cap = 189, .volt = 75000, .dyn_pwr = 18877, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} }, {.cap = 212, .volt = 75000, .dyn_pwr = 21098, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} }, {.cap = 230, .volt = 75700, .dyn_pwr = 23379, .lkg_pwr = {13936, 13936, 13936, 13936, 13936, 13936} }, {.cap = 245, .volt = 78100, .dyn_pwr = 26490, .lkg_pwr = {14811, 14811, 14811, 14811, 14811, 14811} }, {.cap = 263, .volt = 81100, .dyn_pwr = 30729, .lkg_pwr = {15958, 15958, 15958, 15958, 15958, 15958} }, {.cap = 278, .volt = 83500, .dyn_pwr = 34409, .lkg_pwr = {16949, 16949, 16949, 16949, 16949, 16949} }, {.cap = 293, .volt = 86000, .dyn_pwr = 38447, .lkg_pwr = {18036, 18036, 18036, 18036, 18036, 18036} }, {.cap = 304, .volt = 88400, .dyn_pwr = 42166, .lkg_pwr = {19159, 19159, 19159, 19159, 19159, 19159} }, {.cap = 319, .volt = 90800, .dyn_pwr = 46657, .lkg_pwr = {20333, 20333, 20333, 20333, 20333, 20333} }, {.cap = 334, .volt = 93200, .dyn_pwr = 51442, .lkg_pwr = {21605, 21605, 21605, 21605, 21605, 21605} }, {.cap = 345, .volt = 95000, .dyn_pwr = 55230, .lkg_pwr = {22560, 22560, 22560, 22560, 22560, 22560} }, {.cap = 356, .volt = 97400, .dyn_pwr = 59928, .lkg_pwr = {24002, 24002, 24002, 24002, 24002, 24002} }, }, .lkg_idx = DEFAULT_LKG_IDX, .row_num = UPOWER_OPP_NUM, .nr_idle_states = NR_UPOWER_CSTATES, .idle_states = { {{0}, {7321} }, {{0}, {7321} }, {{0}, {7321} }, {{0}, {7321} }, {{0}, {7321} }, {{0}, {7321} }, },};struct upower_tbl upower_tbl_cluster_ll_1_FY = { .row = { {.cap = 100, .volt = 75000, .dyn_pwr = 3656, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} }, {.cap = 126, .volt = 75000, .dyn_pwr = 4604, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} }, {.cap = 148, .volt = 75000, .dyn_pwr = 5417, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} }, {.cap = 167, .volt = 75000, .dyn_pwr = 6094, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} }, {.cap = 189, .volt = 75000, .dyn_pwr = 6906, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} }, {.cap = 212, .volt = 75000, .dyn_pwr = 7719, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} }, {.cap = 230, .volt = 75700, .dyn_pwr = 8553, .lkg_pwr = {22134, 22134, 22134, 22134, 22134, 22134} }, {.cap = 245, .volt = 78100, .dyn_pwr = 9692, .lkg_pwr = {23523, 23523, 23523, 23523, 23523, 23523} }, {.cap = 263, .volt = 81100, .dyn_pwr = 11242, .lkg_pwr = {25344, 
25344, 25344, 25344, 25344, 25344} }, {.cap = 278, .volt = 83500, .dyn_pwr = 12589, .lkg_pwr = {26919, 26919, 26919, 26919, 26919, 26919} }, {.cap = 293, .volt = 86000, .dyn_pwr = 14066, .lkg_pwr = {28646, 28646, 28646, 28646, 28646, 28646} }, {.cap = 304, .volt = 88400, .dyn_pwr = 15427, .lkg_pwr = {30430, 30430, 30430, 30430, 30430, 30430} }, {.cap = 319, .volt = 90800, .dyn_pwr = 17069, .lkg_pwr = {32293, 32293, 32293, 32293, 32293, 32293} }, {.cap = 334, .volt = 93200, .dyn_pwr = 18820, .lkg_pwr = {34314, 34314, 34314, 34314, 34314, 34314} }, {.cap = 345, .volt = 95000, .dyn_pwr = 20206, .lkg_pwr = {35830, 35830, 35830, 35830, 35830, 35830} }, {.cap = 356, .volt = 97400, .dyn_pwr = 21925, .lkg_pwr = {38121, 38121, 38121, 38121, 38121, 38121} }, }, .lkg_idx = DEFAULT_LKG_IDX, .row_num = UPOWER_OPP_NUM, .nr_idle_states = NR_UPOWER_CSTATES, .idle_states = { {{0}, {11628} }, {{0}, {11628} }, {{0}, {11628} }, {{0}, {11628} }, {{0}, {11628} }, {{0}, {11628} }, },};struct upower_tbl upower_tbl_l_1_FY = { .row = { {.cap = 116, .volt = 75000, .dyn_pwr = 16431, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} }, {.cap = 152, .volt = 75000, .dyn_pwr = 21486, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} }, {.cap = 179, .volt = 75000, .dyn_pwr = 25278, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} }, {.cap = 201, .volt = 75000, .dyn_pwr = 28437, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} }, {.cap = 228, .volt = 75000, .dyn_pwr = 32229, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} }, {.cap = 255, .volt = 75000, .dyn_pwr = 36021, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} }, {.cap = 282, .volt = 75700, .dyn_pwr = 40559, .lkg_pwr = {23423, 23423, 23423, 23423, 23423, 23423} }, {.cap = 304, .volt = 78100, .dyn_pwr = 46598, .lkg_pwr = {24968, 24968, 24968, 24968, 24968, 24968} }, {.cap = 331, .volt = 81100, .dyn_pwr = 54680, .lkg_pwr = {26999, 26999, 26999, 26999, 26999, 26999} }, {.cap = 349, .volt = 83500, .dyn_pwr = 61098, .lkg_pwr = {28760, 28760, 28760, 28760, 28760, 28760} }, {.cap = 371, .volt = 86000, .dyn_pwr = 68965, .lkg_pwr = {30698, 30698, 30698, 30698, 30698, 30698} }, {.cap = 393, .volt = 88400, .dyn_pwr = 77258, .lkg_pwr = {32706, 32706, 32706, 32706, 32706, 32706} }, {.cap = 416, .volt = 90800, .dyn_pwr = 86141, .lkg_pwr = {34808, 34808, 34808, 34808, 34808, 34808} }, {.cap = 438, .volt = 93200, .dyn_pwr = 95634, .lkg_pwr = {37097, 37097, 37097, 37097, 37097, 37097} }, {.cap = 452, .volt = 95000, .dyn_pwr = 102406, .lkg_pwr = {38814, 38814, 38814, 38814, 38814, 38814} }, {.cap = 474, .volt = 97400, .dyn_pwr = 112974, .lkg_pwr = {41424, 41424, 41424, 41424, 41424, 41424} }, }, .lkg_idx = DEFAULT_LKG_IDX, .row_num = UPOWER_OPP_NUM, .nr_idle_states = NR_UPOWER_CSTATES, .idle_states = { {{0}, {11926} }, {{0}, {11926} }, {{0}, {11926} }, {{0}, {11926} }, {{0}, {11926} }, {{0}, {11926} }, },};struct upower_tbl upower_tbl_cluster_l_1_FY = { .row = { {.cap = 116, .volt = 75000, .dyn_pwr = 2778, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} }, {.cap = 152, .volt = 75000, .dyn_pwr = 3633, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} }, {.cap = 179, .volt = 75000, .dyn_pwr = 4274, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} }, {.cap = 201, .volt = 75000, .dyn_pwr = 4808, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} }, {.cap = 228, .volt = 75000, .dyn_pwr = 5449, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} }, {.cap = 255, .volt = 75000, .dyn_pwr = 6090, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 
26537} }, {.cap = 282, .volt = 75700, .dyn_pwr = 6857, .lkg_pwr = {27058, 27058, 27058, 27058, 27058, 27058} }, {.cap = 304, .volt = 78100, .dyn_pwr = 7878, .lkg_pwr = {28843, 28843, 28843, 28843, 28843, 28843} }, {.cap = 331, .volt = 81100, .dyn_pwr = 9245, .lkg_pwr = {31188, 31188, 31188, 31188, 31188, 31188} }, {.cap = 349, .volt = 83500, .dyn_pwr = 10330, .lkg_pwr = {33223, 33223, 33223, 33223, 33223, 33223} }, {.cap = 371, .volt = 86000, .dyn_pwr = 11660, .lkg_pwr = {35461, 35461, 35461, 35461, 35461, 35461} }, {.cap = 393, .volt = 88400, .dyn_pwr = 13062, .lkg_pwr = {37781, 37781, 37781, 37781, 37781, 37781} }, {.cap = 416, .volt = 90800, .dyn_pwr = 14564, .lkg_pwr = {40209, 40209, 40209, 40209, 40209, 40209} }, {.cap = 438, .volt = 93200, .dyn_pwr = 16169, .lkg_pwr = {42854, 42854, 42854, 42854, 42854, 42854} }, {.cap = 452, .volt = 95000, .dyn_pwr = 17314, .lkg_pwr = {44837, 44837, 44837, 44837, 44837, 44837} }, {.cap = 474, .volt = 97400, .dyn_pwr = 19101, .lkg_pwr = {47852, 47852, 47852, 47852, 47852, 47852} }, }, .lkg_idx = DEFAULT_LKG_IDX, .row_num = UPOWER_OPP_NUM, .nr_idle_states = NR_UPOWER_CSTATES, .idle_states = { {{0}, {13776} }, {{0}, {13776} }, {{0}, {13776} }, {{0}, {13776} }, {{0}, {13776} }, {{0}, {13776} }, },};struct upower_tbl upower_tbl_b_1_FY = { .row = { {.cap = 211, .volt = 75000, .dyn_pwr = 61732, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} }, {.cap = 268, .volt = 75000, .dyn_pwr = 78352, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} }, {.cap = 317, .volt = 75000, .dyn_pwr = 92598, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} }, {.cap = 358, .volt = 75000, .dyn_pwr = 104469, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} }, {.cap = 406, .volt = 75000, .dyn_pwr = 118715, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} }, {.cap = 447, .volt = 75000, .dyn_pwr = 130587, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} }, {.cap = 504, .volt = 75700, .dyn_pwr = 149968, .lkg_pwr = {72438, 72438, 72438, 72438, 72438, 72438} }, {.cap = 561, .volt = 78100, .dyn_pwr = 177650, .lkg_pwr = {76806, 76806, 76806, 76806, 76806, 76806} }, {.cap = 634, .volt = 81100, .dyn_pwr = 216546, .lkg_pwr = {82521, 82521, 82521, 82521, 82521, 82521} }, {.cap = 691, .volt = 83500, .dyn_pwr = 250153, .lkg_pwr = {87447, 87447, 87447, 87447, 87447, 87447} }, {.cap = 748, .volt = 86000, .dyn_pwr = 287210, .lkg_pwr = {92841, 92841, 92841, 92841, 92841, 92841} }, {.cap = 805, .volt = 88400, .dyn_pwr = 326553, .lkg_pwr = {98397, 98397, 98397, 98397, 98397, 98397} }, {.cap = 861, .volt = 90800, .dyn_pwr = 368886, .lkg_pwr = {104190, 104190, 104190, 104190, 104190, 104190} }, {.cap = 918, .volt = 93200, .dyn_pwr = 414309, .lkg_pwr = {110456, 110456, 110456, 110456, 110456, 110456} }, {.cap = 959, .volt = 95000, .dyn_pwr = 449514, .lkg_pwr = {115156, 115156, 115156, 115156, 115156, 115156} }, {.cap = 1024, .volt = 97400, .dyn_pwr = 504548, .lkg_pwr = {122224, 122224, 122224, 122224, 122224, 122224} }, }, .lkg_idx = DEFAULT_LKG_IDX, .row_num = UPOWER_OPP_NUM, .nr_idle_states = NR_UPOWER_CSTATES, .idle_states = { {{0}, {38992} }, {{0}, {38992} }, {{0}, {38992} }, {{0}, {38992} }, {{0}, {38992} }, {{0}, {38992} }, },};struct upower_tbl upower_tbl_cluster_b_1_FY = { .row = { {.cap = 211, .volt = 75000, .dyn_pwr = 6408, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} }, {.cap = 268, .volt = 75000, .dyn_pwr = 8133, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} }, {.cap = 317, .volt = 75000, .dyn_pwr = 9612, .lkg_pwr = {27561, 27561, 27561, 27561, 
27561, 27561} }, {.cap = 358, .volt = 75000, .dyn_pwr = 10844, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} }, {.cap = 406, .volt = 75000, .dyn_pwr = 12323, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} }, {.cap = 447, .volt = 75000, .dyn_pwr = 13555, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} }, {.cap = 504, .volt = 75700, .dyn_pwr = 15567, .lkg_pwr = {28054, 28054, 28054, 28054, 28054, 28054} }, {.cap = 561, .volt = 78100, .dyn_pwr = 18440, .lkg_pwr = {29746, 29746, 29746, 29746, 29746, 29746} }, {.cap = 634, .volt = 81100, .dyn_pwr = 22478, .lkg_pwr = {31959, 31959, 31959, 31959, 31959, 31959} }, {.cap = 691, .volt = 83500, .dyn_pwr = 25966, .lkg_pwr = {33867, 33867, 33867, 33867, 33867, 33867} }, {.cap = 748, .volt = 86000, .dyn_pwr = 29813, .lkg_pwr = {35956, 35956, 35956, 35956, 35956, 35956} }, {.cap = 805, .volt = 88400, .dyn_pwr = 33897, .lkg_pwr = {38108, 38108, 38108, 38108, 38108, 38108} }, {.cap = 861, .volt = 90800, .dyn_pwr = 38291, .lkg_pwr = {40351, 40351, 40351, 40351, 40351, 40351} }, {.cap = 918, .volt = 93200, .dyn_pwr = 43006, .lkg_pwr = {42778, 42778, 42778, 42778, 42778, 42778} }, {.cap = 959, .volt = 95000, .dyn_pwr = 46661, .lkg_pwr = {44598, 44598, 44598, 44598, 44598, 44598} }, {.cap = 1024, .volt = 97400, .dyn_pwr = 52373, .lkg_pwr = {47335, 47335, 47335, 47335, 47335, 47335} }, }, .lkg_idx = DEFAULT_LKG_IDX, .row_num = UPOWER_OPP_NUM, .nr_idle_states = NR_UPOWER_CSTATES, .idle_states = { {{0}, {15101} }, {{0}, {15101} }, {{0}, {15101} }, {{0}, {15101} }, {{0}, {15101} }, {{0}, {15101} }, },};struct upower_tbl upower_tbl_cci_1_FY = { .row = { {.cap = 0, .volt = 75000, .dyn_pwr = 2708, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} }, {.cap = 0, .volt = 75000, .dyn_pwr = 3611, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} }, {.cap = 0, .volt = 75000, .dyn_pwr = 4288, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} }, {.cap = 0, .volt = 75000, .dyn_pwr = 5191, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} }, {.cap = 0, .volt = 75000, .dyn_pwr = 5868, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} }, {.cap = 0, .volt = 75000, .dyn_pwr = 6771, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} }, {.cap = 0, .volt = 75700, .dyn_pwr = 7588, .lkg_pwr = {16537, 16537, 16537, 16537, 16537, 16537} }, {.cap = 0, .volt = 78100, .dyn_pwr = 8811, .lkg_pwr = {17527, 17527, 17527, 17527, 17527, 17527} }, {.cap = 0, .volt = 81100, .dyn_pwr = 10292, .lkg_pwr = {18822, 18822, 18822, 18822, 18822, 18822} }, {.cap = 0, .volt = 83500, .dyn_pwr = 11750, .lkg_pwr = {19938, 19938, 19938, 19938, 19938, 19938} }, {.cap = 0, .volt = 86000, .dyn_pwr = 13354, .lkg_pwr = {21159, 21159, 21159, 21159, 21159, 21159} }, {.cap = 0, .volt = 88400, .dyn_pwr = 14737, .lkg_pwr = {22417, 22417, 22417, 22417, 22417, 22417} }, {.cap = 0, .volt = 90800, .dyn_pwr = 16540, .lkg_pwr = {23728, 23728, 23728, 23728, 23728, 23728} }, {.cap = 0, .volt = 93200, .dyn_pwr = 18472, .lkg_pwr = {25145, 25145, 25145, 25145, 25145, 25145} }, {.cap = 0, .volt = 95000, .dyn_pwr = 19916, .lkg_pwr = {26208, 26208, 26208, 26208, 26208, 26208} }, {.cap = 0, .volt = 97400, .dyn_pwr = 22077, .lkg_pwr = {27805, 27805, 27805, 27805, 27805, 27805} }, }, .lkg_idx = DEFAULT_LKG_IDX, .row_num = UPOWER_OPP_NUM, .nr_idle_states = NR_UPOWER_CSTATES, .idle_states = { {{0}, {8938} }, {{0}, {8938} }, {{0}, {8938} }, {{0}, {8938} }, {{0}, {8938} }, {{0}, {8938} }, },};
4.1.2 Implementation of SMP Load Balancing
Load balancing depends on many parameters; the most important ones are listed below:
| member | structure | meaning | updated/read by | calculation |
|---|---|---|---|---|
| rq->cpu_capacity_orig | rq | total compute capacity of this cpu | init_sched_groups_capacity()/update_sd_lb_stats() -> update_group_capacity() -> update_cpu_capacity() | capacity = arch_scale_cpu_capacity(sd, cpu) |
| rq->cpu_capacity | rq | cfs compute capacity of this cpu = total capacity - capacity consumed by rt | init_sched_groups_capacity()/update_sd_lb_stats() -> update_group_capacity() -> update_cpu_capacity() | capacity *= scale_rt_capacity(cpu) |
| rq->rd->max_cpu_capacity | rq->rd | largest cpu capacity in the root_domain | init_sched_groups_capacity()/update_sd_lb_stats() -> update_group_capacity() -> update_cpu_capacity() | |
| rq->rd->overutilized | rq->rd | | update_sd_lb_stats() | |
| rq->rd->overload | rq->rd | | update_sd_lb_stats() | |
| rq->rt_avg | rq | average rt load of this cpu | weighted_cpuload() -> cfs_rq_runnable_load_avg() | |
| rq->cfs.runnable_load_avg | rq->cfs (cfs_rq) | runnable average load of this cpu's cfs_rq | __update_load_avg(), cfs_rq_load_avg() | (runnable time * freq * weight) / LOAD_AVG_MAX |
| rq->cfs.avg.load_avg | rq->cfs.avg | runnable average load of this cpu's cfs_rq | __update_load_avg() | (runnable time * freq * weight) / LOAD_AVG_MAX |
| rq->cfs.avg.loadwop_avg | rq->cfs.avg | runnable average load of this cpu's cfs_rq, without weight | __update_load_avg() | (runnable time * freq) / LOAD_AVG_MAX |
| rq->cfs.avg.util_avg | rq->cfs.avg | running load of this cpu's cfs_rq | __update_load_avg(), cpu_util() -> __cpu_util() | (running time * freq * capacity) / LOAD_AVG_MAX |
| cfs_rq->nr_running | cfs_rq | number of runnable se at this cfs_rq level | enqueue_entity()/dequeue_entity() -> account_entity_enqueue() | |
| cfs_rq->h_nr_running | cfs_rq | sum of nr_running of this cfs_rq and all of its child cfs_rq | enqueue_task_fair()/dequeue_task_fair() | |
| rq->nr_running | rq | number of all runnable se on this cpu's rq, including all child cfs_rq | enqueue_task_fair()/dequeue_task_fair() -> add_nr_running() | |
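The load_avg/util_avg signals in the table are PELT-style geometrically decayed sums: each elapsed 1024us period is decayed by a factor y with y^32 = 0.5, and the running sum converges to LOAD_AVG_MAX (about 47742). The snippet below is a simplified floating-point illustration of that accumulation, not the kernel's actual __update_load_avg() (which works in fixed point and handles partial periods):

```c
#include <stdio.h>
#include <math.h>

/* Simplified PELT illustration: decay factor y with y^32 = 0.5,
 * contribution of 1024 per fully-runnable 1024us period. */
int main(void)
{
	const double y = pow(0.5, 1.0 / 32.0);	/* ~0.97857 */
	double sum = 0.0;

	for (int period = 1; period <= 345; period++) {
		sum = sum * y + 1024;		/* task runnable the whole period */
		if (period % 64 == 0)
			printf("after %3d periods: sum = %.0f\n", period, sum);
	}
	/* sum approaches 1024 / (1 - y) ~= 47742 == LOAD_AVG_MAX, so
	 * load_avg = sum * weight / LOAD_AVG_MAX saturates at the weight. */
	return 0;
}
```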
MTK defines three power modes around EAS: EAS mode (energy_aware()), HMP mode (sched_feat(SCHED_HMP)), and hybrid_support() (EAS and HMP coexisting).
In hybrid_support() mode, ordinary load balancing is left to EAS; once cpu_rq(cpu)->rd->overutilized signals that the load is already badly unbalanced, it is handed to HMP.
In scheduler_tick() the kernel periodically checks whether the SMP load-balance interval has expired; when it has, the SCHED_SOFTIRQ softirq is raised:
void scheduler_tick(void)
{
#ifdef CONFIG_SMP
	rq->idle_balance = idle_cpu(cpu);
	trigger_load_balance(rq);
#endif
}

|→

/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 */
void trigger_load_balance(struct rq *rq)
{
	/* Don't need to rebalance while attached to NULL domain */
	if (unlikely(on_null_domain(rq)))
		return;

	if (time_after_eq(jiffies, rq->next_balance))
		raise_softirq(SCHED_SOFTIRQ);

#ifdef CONFIG_NO_HZ_COMMON
	if (nohz_kick_needed(rq))
		nohz_balancer_kick();
#endif
}
The body of the SCHED_SOFTIRQ softirq is run_rebalance_domains():
__init void init_sched_fair_class(void)
{
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
}

/*
 * run_rebalance_domains is triggered when needed from the scheduler tick.
 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
 */
static void run_rebalance_domains(struct softirq_action *h)
{
	struct rq *this_rq = this_rq();
	enum cpu_idle_type idle = this_rq->idle_balance ?
						CPU_IDLE : CPU_NOT_IDLE;
	int this_cpu = smp_processor_id();

	/* bypass load balance of HMP if EAS consideration */
	if ((!energy_aware() && sched_feat(SCHED_HMP)) ||
			(hybrid_support() && cpu_rq(this_cpu)->rd->overutilized))
		hmp_force_up_migration(this_cpu);

	/*
	 * If this cpu has a pending nohz_balance_kick, then do the
	 * balancing on behalf of the other idle cpus whose ticks are
	 * stopped. Do nohz_idle_balance *before* rebalance_domains to
	 * give the idle cpus a chance to load balance. Else we may
	 * load balance only within the local sched_domain hierarchy
	 * and abort nohz_idle_balance altogether if we pull some load.
	 */
	nohz_idle_balance(this_rq, idle);
	rebalance_domains(this_rq, idle);
}
Now let's analyse the core function, rebalance_domains().
One point worth stressing up front: the scheduler tracks three kinds of load (load_avg, loadwop_avg, util_avg). rebalance_domains() mainly works with load_avg, converting it to a capacity-relative value by scaling with (SCHED_CAPACITY_SCALE/capacity).
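As a quick illustration of that conversion, a minimal sketch (norm_avg_load() and all numbers below are mine, not the kernel's): it mirrors the sgs->avg_load = group_load * SCHED_CAPACITY_SCALE / group_capacity step performed in update_sg_lb_stats()/find_busiest_group(), which is what makes a small-capacity cluster look "busier" than a big one at the same absolute load.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

/* Toy version of the capacity-relative load used by the balancer:
 * avg_load = group_load * SCHED_CAPACITY_SCALE / group_capacity */
static unsigned long norm_avg_load(unsigned long group_load,
				   unsigned long group_capacity)
{
	return group_load * SCHED_CAPACITY_SCALE / group_capacity;
}

int main(void)
{
	/* made-up numbers: little cluster capacity 512, big cluster 1024 */
	unsigned long little = norm_avg_load(400, 512);		/* -> 800 */
	unsigned long big    = norm_avg_load(500, 1024);	/* -> 500 */

	/* despite carrying less absolute load, the little cluster is "busier" */
	printf("little avg_load=%lu, big avg_load=%lu\n", little, big);
	return 0;
}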
- 1、Walk this cpu's sched_domains level by level and check whether each sd's balance interval has expired; if it has, run load_balance(). The intervals are listed below (a sketch of the interval computation follows this list);

| tl level | cpu busy? | sd->balance_interval | sd->busy_factor | sd balance interval |
|---|---|---|---|---|
| MC | idle | 4 | 1 | 4ms |
| MC | busy | 4 | 32 | 128ms |
| DIE | idle | 8 | 1 | 8ms |
| DIE | busy | 8 | 32 | 256ms |
| | | | | rq->next_balance = min(of the above) |
- 2、In load_balance(), should_we_balance() decides whether this cpu, at this sd level, is currently suitable to act as dst_cpu. It qualifies only if it is the first idle cpu of its sched_group, or the group's first cpu. The dst_cpu is the destination that pulls tasks from heavily loaded cpus; if this cpu does not qualify, the operation is aborted;
- 3、find_busiest_group() then searches the sched_group list for the most loaded sg; the core computation is in update_sd_lb_stats()/update_sg_lb_stats(). If the local_group containing dst_cpu is more loaded than the busiest sg, or more loaded than the sds average, the operation is aborted. Otherwise the load to migrate is computed as env->imbalance = min((sds->avg - local), (busiest - sds->avg));
- 3.1、Based on the current cpu's idle state, pick the index used when reading the cpu load from rq->cpu_load[]:
- 3.2、Compute each sg's statistics (sgs) and pick the sg with the largest sgs->avg_load as the busiest_group. The key values are computed as follows:
- 3.3、While computing the sg statistics, the key state flags are derived as follows:
The ratio parameters imbalance_pct and capacity_margin take the following values:
- 3.4、Compute env->imbalance, the amount of load the rebalance has to migrate (a small worked example follows the code listing below):
- 4、find_busiest_queue() then looks for the most loaded cpu inside the busiest sg.
- 4.1、It picks the cpu with the largest weighted_cpuload()/capacity_of() ratio, i.e. the highest load relative to its compute capacity:
- 5、Migrate load from the busiest cpu to the local dst cpu, up to env->imbalance worth of load: detach_tasks() -> attach_tasks();
- 6、Handle the cases where, due to task affinity, the busiest cpu cannot hand over enough tasks: LBF_DST_PINNED retries with another cpu of the local sg as dst_cpu; LBF_SOME_PINNED marks that this level could not fully balance and nudges the parent sd to balance; LBF_ALL_PINNED means not a single task could be moved, so the busiest cpu is dropped from the cpu mask and load_balance() is redone;
- 7、If after all these attempts not a single task has been migrated, make one final attempt with active_balance;
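The interval column in the table of step 1 can be reproduced in a few lines; sd_interval() below is my own simplified stand-in for the kernel's get_sd_balance_interval(), which additionally converts the result to jiffies and clamps it:

#include <stdio.h>

/* Simplified model of get_sd_balance_interval():
 * interval = sd->balance_interval, multiplied by sd->busy_factor
 * when the cpu is busy (the kernel then converts ms to jiffies
 * and clamps the result). */
static unsigned long sd_interval(unsigned long balance_interval_ms,
				 unsigned long busy_factor, int cpu_busy)
{
	unsigned long interval = balance_interval_ms;

	if (cpu_busy)
		interval *= busy_factor;

	return interval;
}

int main(void)
{
	printf("MC  idle: %lums\n", sd_interval(4, 32, 0));	/* 4ms   */
	printf("MC  busy: %lums\n", sd_interval(4, 32, 1));	/* 128ms */
	printf("DIE idle: %lums\n", sd_interval(8, 32, 0));	/* 8ms   */
	printf("DIE busy: %lums\n", sd_interval(8, 32, 1));	/* 256ms */
	return 0;
}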
/* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. * * Balancing parameters are set up in init_sched_domains. * Balance的参数是在sched_domains初始化时设置的 */static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle){ int continue_balancing = 1; int cpu = rq->cpu; unsigned long interval; struct sched_domain *sd; /* 默认本cpu rq下一次的balance时间为60s以后 */ /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; int need_serialize, need_decay = 0; u64 max_cost = 0; /* (1) 更新cpu rq中所有cfs_rq的最新负载 */ update_blocked_averages(cpu); rcu_read_lock(); /* (2) 对本cpu每个层次的schedule_domain进行扫描 */ for_each_domain(cpu, sd) { /* (3) 以1HZ的频率对sd->max_newidle_lb_cost进行老化, 老化公式: new = old * (253/256) */ /* * Decay the newidle max times here because this is a regular * visit to all the domains. Decay ~1% per second. */ if (time_after(jiffies, sd->next_decay_max_lb_cost)) { sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256; sd->next_decay_max_lb_cost = jiffies + HZ; need_decay = 1; } max_cost += sd->max_newidle_lb_cost; if (!(sd->flags & SD_LOAD_BALANCE)) continue;#ifndef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT /* nohz CPU need GTS balance to migrate tasks for more than 2 clusters*/ /* Don't consider GTS balance if hybrid support */ if (hybrid_support()) { if (sd->child || (!sd->child && (rcu_dereference(per_cpu(sd_scs, cpu)) == NULL))) continue; }#endif /* (4) 如果continue_balancing = 0,指示停止当前层级的load balance 因为shed_group中其他的cpu正在这个层次做load_balance */ /* * Stop the load balance at this level. There is another * CPU in our sched group which is doing load balancing more * actively. */ if (!continue_balancing) { if (need_decay) continue; break; } /* (5) 计算当前层次schedule_domain的balance间隔时间 */ interval = get_sd_balance_interval(sd, idle != CPU_IDLE); /* (6) 如果需要串行化(SD_SERIALIZE),做balance之前需要持锁 */ need_serialize = sd->flags & SD_SERIALIZE; if (need_serialize) { if (!spin_trylock(&balancing)) goto out; } /* (7) 如果本sd的balance间隔时间已到,进行实际的load_balance() */ if (time_after_eq(jiffies, sd->last_balance + interval)) { if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { /* * The LBF_DST_PINNED logic could have changed * env->dst_cpu, so we can't know our idle * state even if we migrated tasks. Update it. */ idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; } sd->last_balance = jiffies; interval = get_sd_balance_interval(sd, idle != CPU_IDLE); } if (need_serialize) spin_unlock(&balancing);out: /* (8) 如果sd下一次balance时间在,rq的balance时间之前,需要更新rq的balance时间 rq的下一次balance时间:next_balance (默认是60s后) 本sd的下一次balance时间:sd->last_balance + interval rq的下一次balance时间需要选取多个sd中时间最近的一个 */ if (time_after(next_balance, sd->last_balance + interval)) { next_balance = sd->last_balance + interval; update_next_balance = 1; } } if (need_decay) { /* * Ensure the rq-wide value also decays but keep it at a * reasonable floor to avoid funnies with rq->avg_idle. */ rq->max_idle_balance_cost = max((u64)sysctl_sched_migration_cost, max_cost); } rcu_read_unlock(); /* (8.1) 更新rq的balance时间 */ /* * next_balance will be updated only when there is a need. * When the cpu is attached to null domain for ex, it will not be * updated. */ if (likely(update_next_balance)) { rq->next_balance = next_balance;#ifdef CONFIG_NO_HZ_COMMON /* * If this CPU has been elected to perform the nohz idle * balance. Other idle CPUs have already rebalanced with * nohz_idle_balance() and nohz.next_balance has been * updated accordingly. 
This CPU is now running the idle load * balance for itself and we need to update the * nohz.next_balance accordingly. */ if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance)) nohz.next_balance = rq->next_balance;#endif }}|→static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *continue_balancing){ int ld_moved, cur_ld_moved, active_balance = 0; struct sched_domain *sd_parent = sd->parent; struct sched_group *group; struct rq *busiest; unsigned long flags; struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); /* (7.1) 构造Load_balance需要的数据结构: .sd = sd, //本cpu在本tl层次的sd .dst_cpu = this_cpu, // 目的cpu是本cpu .dst_rq = this_rq, // 目的rq是本cpu的rq // load_balance的目的是找出负载最重的cpu,并将一部分负载迁移到本cpu上 */ struct lb_env env = { .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, .dst_grpmask = sched_group_cpus(sd->groups), .idle = idle, .loop_break = sched_nr_migrate_break, .cpus = cpus, .fbq_type = all, .tasks = LIST_HEAD_INIT(env.tasks), }; /* * For NEWLY_IDLE load_balancing, we don't need to consider * other cpus in our group */ if (idle == CPU_NEWLY_IDLE) env.dst_grpmask = NULL; cpumask_copy(cpus, cpu_active_mask); schedstat_inc(sd, lb_count[idle]);redo: /* (7.2) check当前cpu是否适合作为dst_cpu(即light cpu,需要分担其他cpu的负载) */ if (!should_we_balance(&env)) { *continue_balancing = 0; goto out_balanced; } /* (7.3) 找出本层级sched_group链表中,负载最重的(busiest)的sched_group */ group = find_busiest_group(&env); if (!group) { schedstat_inc(sd, lb_nobusyg[idle]); goto out_balanced; } /* (7.4) 找出busiest sched_group中sched_group的rq,即负载最重cpu对应的rq */ busiest = find_busiest_queue(&env, group); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; } BUG_ON(busiest == env.dst_rq); schedstat_add(sd, lb_imbalance[idle], env.imbalance); env.src_cpu = busiest->cpu; env.src_rq = busiest; ld_moved = 0; /* (7.5) 判断busiest cpu rq中的runnable进程数 > 1? 至少有进程可以迁移走 */ if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found * an imbalance but busiest->nr_running <= 1, the group is * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ env.flags |= LBF_ALL_PINNED; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);more_balance: raw_spin_lock_irqsave(&busiest->lock, flags); /* (7.6) 从busiest rq中detach进程, env->imbalance:需要迁移的负载大小 cur_ld_moved:实际迁移的进程数 */ /* * cur_ld_moved - load moved in current iteration * ld_moved - cumulative load moved across iterations */ cur_ld_moved = detach_tasks(&env); /* (7.7) busiest cpu负载减轻后, 在sched_freq中判断cpu频率是否可以调低 */ /* * We want to potentially lower env.src_cpu's OPP. */ if (cur_ld_moved) update_capacity_of(env.src_cpu, SCHE_ONESHOT); /* * We've detached some tasks from busiest_rq. Every * task is masked "TASK_ON_RQ_MIGRATING", so we can safely * unlock busiest->lock, and we are able to be sure * that nobody can manipulate the tasks in parallel. * See task_rq_lock() family for the details. 
*/ raw_spin_unlock(&busiest->lock); /* (7.8) 把迁移过来的任务attack到dest_cpu上 */ if (cur_ld_moved) { attach_tasks(&env); ld_moved += cur_ld_moved; } local_irq_restore(flags); /* (7.9) LBF_NEED_BREAK设置,说明balance还没有完成,循环只是出来休息一下, 继续重新balance */ if (env.flags & LBF_NEED_BREAK) { env.flags &= ~LBF_NEED_BREAK; goto more_balance; } /* (7.10) 设置了LBF_DST_PINNED标志,并且env.imbalance > 0 说明src_cpu上有些进程因为affinity的原因不能迁移到dst_cpu但是能迁移到同sg的new_dst_cpu上 把dst_cpu更改为new_dst_cpu,重新开始balance流程 */ /* * Revisit (affine) tasks on src_cpu that couldn't be moved to * us and move them to an alternate dst_cpu in our sched_group * where they can run. The upper limit on how many times we * iterate on same src_cpu is dependent on number of cpus in our * sched_group. * * This changes load balance semantics a bit on who can move * load to a given_cpu. In addition to the given_cpu itself * (or a ilb_cpu acting on its behalf where given_cpu is * nohz-idle), we now have balance_cpu in a position to move * load to given_cpu. In rare situations, this may cause * conflicts (balance_cpu and given_cpu/ilb_cpu deciding * _independently_ and at _same_ time to move some load to * given_cpu) causing exceess load to be moved to given_cpu. * This however should not happen so much in practice and * moreover subsequent load balance cycles should correct the * excess load moved. */ if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { /* Prevent to re-select dst_cpu via env's cpus */ cpumask_clear_cpu(env.dst_cpu, env.cpus); env.dst_rq = cpu_rq(env.new_dst_cpu); env.dst_cpu = env.new_dst_cpu; env.flags &= ~LBF_DST_PINNED; env.loop = 0; env.loop_break = sched_nr_migrate_break; /* * Go back to "more_balance" rather than "redo" since we * need to continue with same src_cpu. */ goto more_balance; } /* (7.11) 设置了LBF_SOME_PINNED标志,说明有些进程因为affinity迁移失败, 设置当前sd的parent sd的 sgc->imbalance,让parent sd做rebalance的概率增高 */ /* * We failed to reach balance because of affinity. */ if (sd_parent) { int *group_imbalance = &sd_parent->groups->sgc->imbalance; if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) *group_imbalance = 1; } /* (7.12) 如果LBF_ALL_PINNED标志一直被置位, 说明busiest_cpu因为affinity没有一个进程迁移成功,哪怕迁移到dst_cpu同sg的其他cpu也没有一个成功 将busiest cpu从全局cpu mask去掉,重新做整个流程:find_busiest_group -> find_busiest_queue -> detach_tasks -> attach_tasks */ /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); if (!cpumask_empty(cpus)) { env.loop = 0; env.loop_break = sched_nr_migrate_break; goto redo; } goto out_all_pinned; } } /* (7.13) 经过几轮的努力尝试,最终迁移的进程数ld_moved还是0,说明balance失败 */ if (!ld_moved) { schedstat_inc(sd, lb_failed[idle]); /* * Increment the failure counter only on periodic balance. * We do not want newidle balance, which can be very * frequent, pollute the failure counter causing * excessive cache_hot migrations and active balances. */ if (idle != CPU_NEWLY_IDLE) if (env.src_grp_nr_running > 1) sd->nr_balance_failed++; /* (7.14) 最后一次尝试迁移一个进程 */ if (need_active_balance(&env)) { raw_spin_lock_irqsave(&busiest->lock, flags); /* (7.15) 如果当前cpu不在busiest->curr进程的affinity之内,返回失败 */ /* don't kick the active_load_balance_cpu_stop, * if the curr task on busiest cpu can't be * moved to this_cpu */ if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(busiest->curr))) { raw_spin_unlock_irqrestore(&busiest->lock, flags); env.flags |= LBF_ALL_PINNED; goto out_one_pinned; } /* * ->active_balance synchronizes accesses to * ->active_balance_work. 
Once set, it's cleared * only after active load balance is finished. */ if (!busiest->active_balance && !cpu_park(cpu_of(busiest))) { busiest->active_balance = 1; /* load_balance */ busiest->push_cpu = this_cpu; active_balance = 1; } raw_spin_unlock_irqrestore(&busiest->lock, flags); /* (7.16) 迁移busiest->curr进程当前期cpu */ if (active_balance) { if (stop_one_cpu_dispatch(cpu_of(busiest), active_load_balance_cpu_stop, busiest, &busiest->active_balance_work)) { raw_spin_lock_irqsave(&busiest->lock, flags); busiest->active_balance = 0; active_balance = 0; raw_spin_unlock_irqrestore(&busiest->lock, flags); } } /* * We've kicked active balancing, reset the failure * counter. */ sd->nr_balance_failed = sd->cache_nice_tries+1; } } else sd->nr_balance_failed = 0; if (likely(!active_balance)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; } else { /* * If we've begun active balancing, start to back off. This * case may not be covered by the all_pinned logic if there * is only 1 task on the busy runqueue (because we don't call * detach_tasks). */ if (sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; } goto out;out_balanced: /* * We reach balance although we may have faced some affinity * constraints. Clear the imbalance flag if it was set. */ if (sd_parent) { int *group_imbalance = &sd_parent->groups->sgc->imbalance; if (*group_imbalance) *group_imbalance = 0; }out_all_pinned: /* * We reach balance because all tasks are pinned at this level so * we can't migrate them. Let the imbalance flag set so parent level * can try to migrate them. */ schedstat_inc(sd, lb_balanced[idle]); sd->nr_balance_failed = 0;out_one_pinned: /* tune up the balancing interval */ if (((env.flags & LBF_ALL_PINNED) && sd->balance_interval < MAX_PINNED_INTERVAL) || (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; ld_moved = 0;out: return ld_moved;}||→static int should_we_balance(struct lb_env *env){ struct sched_group *sg = env->sd->groups; struct cpumask *sg_cpus, *sg_mask; int cpu, balance_cpu = -1; /* (7.2.1) 如果本cpu为CPU_NEWLY_IDLE,直接符合迁移条件 */ /* * In the newly idle case, we will allow all the cpu's * to do the newly idle load balance. */ if (env->idle == CPU_NEWLY_IDLE) return 1; sg_cpus = sched_group_cpus(sg); sg_mask = sched_group_mask(sg); /* (7.2.2) 本sched_group的第一个idle cpu适合做load_balance */ /* Try to find first idle cpu */ for_each_cpu_and(cpu, sg_cpus, env->cpus) { if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu)) continue; balance_cpu = cpu; break; } /* (7.2.3) 没有idle cpu,则选取本sched_group的第一个cpu做load_balance */ if (balance_cpu == -1) balance_cpu = group_balance_cpu(sg); /* (7.2.4) 不满足上述条件的cpu,不适合来启动load_balance */ /* * First idle cpu or the first cpu(busiest) in this sched group * is eligible for doing load balancing at this and above domains. */ return balance_cpu == env->dst_cpu;}||→static struct sched_group *find_busiest_group(struct lb_env *env){ struct sg_lb_stats *local, *busiest; struct sd_lb_stats sds; int local_cpu = 0, busiest_cpu = 0; struct cpumask *busiest_cpumask; int same_clus = 0; init_sd_lb_stats(&sds); /* (7.3.1) 更新本层级sched_group链表中,每个sched_group的负载, 并选出busiest的一个sched_group */ /* * Compute the various statistics relavent for load balancing at * this level. 
*/ update_sd_lb_stats(env, &sds); local = &sds.local_stat; busiest = &sds.busiest_stat; if (sds.busiest) { busiest_cpumask = sched_group_cpus(sds.busiest); local_cpu = env->dst_cpu; busiest_cpu = group_first_cpu(sds.busiest); same_clus = is_the_same_domain(local_cpu, busiest_cpu); mt_sched_printf(sched_lb, "%s: local_cpu=%d, busiest_cpu=%d, busiest_mask=%lu, same_cluster=%d", __func__, local_cpu, busiest_cpu, busiest_cpumask->bits[0], same_clus); } /* (7.3.2) 如果EAS使能,跨cluster的任务迁移使用EAS来做 */ if (energy_aware() && !env->dst_rq->rd->overutilized && !same_clus) goto out_balanced; /* (7.3.3) */ /* ASYM feature bypasses nice load balance check */ if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && check_asym_packing(env, &sds)) return sds.busiest; /* (7.3.4) busiest sg上没有负载,返回空 */ /* There is no busy sibling group to pull tasks from */ if (!sds.busiest || busiest->sum_nr_running == 0) { if (!sds.busiest) mt_sched_printf(sched_lb, "[%s] %d: fail no busiest ", __func__, env->src_cpu); else mt_sched_printf(sched_lb, "[%s] %d: fail busiest no task ", __func__, env->src_cpu); goto out_balanced; } /* (7.3.5) sg链表里的平均负载 */ sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) / sds.total_capacity; /* (7.3.6) 如果busiest sg低一级别的因为cpu affinity没有balance成功,设置了group_imbalanced标志 强制在当前级别上进行balance */ /* * If the busiest group is imbalanced the below checks don't * work because they assume all things are equal, which typically * isn't true due to cpus_allowed constraints and the like. */ if (busiest->group_type == group_imbalanced) goto force_balance; /* (7.3.7) 如果dest cpu/group很闲,busiest负载很重, 强制开展balance */ /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && busiest->group_no_capacity) goto force_balance; /* (7.3.8) 如果dest_cpu所在sg的负载都大于busiest sg的负载, 放弃balance */ /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. */ if (local->avg_load >= busiest->avg_load) goto out_balanced; /* (7.3.9) 如果dest_cpu所在sg的负载都大于sg链表的平均负载, 放弃balance */ /* * Don't pull any tasks if this group is already above the domain * average load. */ if (local->avg_load >= sds.avg_load) goto out_balanced; /* (7.3.10) 如果dest_cpu为idle,但是dest_cpu所在的sg idle cpu数量小于busiest sg的idle cpu数量 放弃balance */#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT if ((env->idle == CPU_IDLE) || (env->idle == CPU_NEWLY_IDLE)) { int i = (env->idle == CPU_IDLE) ? 1:0;#else if (env->idle == CPU_IDLE) {#endif /* * This cpu is idle. If the busiest group is not overloaded * and there is no imbalance between this and busiest group * wrt idle cpus, it is balanced. The imbalance becomes * significant if the diff is greater than 1 otherwise we * might end up to just move the imbalance on another group */#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT if ((busiest->group_type != group_overloaded) && (local->idle_cpus < (busiest->idle_cpus + i)))#else if ((busiest->group_type != group_overloaded) && (local->idle_cpus <= (busiest->idle_cpus + 1)))#endif goto out_balanced; } else { /* (7.3.11) busiest->avg_load大于local->avg_load的比例没有超过env->sd->imbalance_pct 放弃balance */ /* * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use * imbalance_pct to be conservative. */ if (100 * busiest->avg_load <= env->sd->imbalance_pct * local->avg_load) goto out_balanced; }force_balance: /* Looks like there is an imbalance. 
Compute it */ /* (7.3.12) 计算需要迁移的负载值env->imbalance */ calculate_imbalance(env, &sds);#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT env->imbalance = env->imbalance * SCHED_CAPACITY_SCALE / (sds.busiest->sgc->capacity / cpumask_weight(sched_group_cpus(sds.busiest)));#endif return sds.busiest;out_balanced: env->imbalance = 0; return NULL;}|||→static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds){ struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; bool overload = false, overutilized = false; if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; /* (7.3.1.1) 根据idle情况,选择计算cpu负载时的idx, idx:是CPU层级负载this_rq->cpu_load[i]数组的index值 */ load_idx = get_sd_load_idx(env->sd, env->idle); /* (7.3.1.2) 逐个轮询本层级sched_group链表中的每个sched_group */ do { struct sg_lb_stats *sgs = &tmp_sgs; int local_group; /* (7.3.1.3) 如果sg是当前cpu所在的sg,则本sg称为local_group 使用专门的数据结构来存储local_group的信息: sds->local = sg; // 使用sds->local来存储local_group sgs = &sds->local_stat; // 使用sds->local_stat来存储local_group的统计 */ local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); if (local_group) { sds->local = sg; sgs = &sds->local_stat; /* (7.3.1.4) 更新local_group的capacity,更新的周期为sd->balance_interval 主要目的是动态减去RT进程消耗的capacity */ if (env->idle != CPU_NEWLY_IDLE || time_after_eq(jiffies, sg->sgc->next_update)) update_group_capacity(env->sd, env->dst_cpu); } /* (7.3.1.5) 更新当前sched_group的负载统计 sgs:sg统计数据放到sgs当中 overload:rq中runnable的进程>1,那么肯定有进程在等待 overutilized:cpu的capacity < util,运算能力不足 */ update_sg_lb_stats(env, sg, load_idx, local_group, sgs, &overload, &overutilized); /* (7.3.1.6) local_group不参与busiest sg的计算 */ if (local_group) goto next_group; /* (7.3.1.7) 如果设置了SD_PREFER_SIBLING标志,说明local_group希望其他人迁移任务到它身上, 提高其他sg的迁移优先级 */ /* * In case the child domain prefers tasks go to siblings * first, lower the sg capacity so that we'll try * and move all the excess tasks away. We lower the capacity * of a group only if the local group has the capacity to fit * these excess tasks. The extra check prevents the case where * you always pull from the heaviest group when it is already * under-utilized (possible with a large weight task outweighs * the tasks on the system). */ if (prefer_sibling && sds->local && group_has_capacity(env, &sds->local_stat) && (sgs->sum_nr_running > 1)) { sgs->group_no_capacity = 1; sgs->group_type = group_classify(sg, sgs); } /* (7.3.1.8) 根据计算的sgs统计数据,找出busiest sg */ if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = *sgs; }next_group: /* (7.3.1.9) 更新sds中的负载、capacity统计 */ /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; sg = sg->next; } while (sg != env->sd->groups); if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); env->src_grp_nr_running = sds->busiest_stat.sum_nr_running; /* (7.3.1.10) 根据最后一个sg的overload、overutilized值 来更新dst_cpu rq->rd中的对应值 。 ooooo这里是怎么想的?不是local_group,也不是busiest_group,而是最后一个计算的sg!!! 
*/ if (!env->sd->parent) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload; /* Update over-utilization (tipping point, U >= 0) indicator */ if (env->dst_rq->rd->overutilized != overutilized) env->dst_rq->rd->overutilized = overutilized; } else { if (!env->dst_rq->rd->overutilized && overutilized) env->dst_rq->rd->overutilized = true; }}||||→static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs, bool *overload, bool *overutilized){ unsigned long load; int i; memset(sgs, 0, sizeof(*sgs)); /* (7.3.1.5.1) 遍历sched_group中的每个cpu */ for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); /* (7.3.1.5.2) 获取本cpu的负载rq->cpu_load[load_idx-1] */ /* Bias balancing toward cpus of our domain */ if (local_group) /* 如果是local_group,负载往小的取:min(rq->cpu_load[load_idx-1], weighted_cpuload(cpu)) */ load = target_load(i, load_idx); else /* 如果不是local_group,负载往大的取:max(rq->cpu_load[load_idx-1], weighted_cpuload(cpu)) */ load = source_load(i, load_idx);#ifdef CONFIG_MTK_SCHED_INTEROP /* (7.3.1.5.3) 因为rq->cpu_load[]只包含cfs的负载,mtk尝试加上rt部分的负载 ooooo但是rq->cpu_capacity中已经减去了rt的部分,这里是否还需要?? */ load += mt_rt_load(i);#endif /* (7.3.1.5.4) 累加sgs各项值: sgs->group_load // runnable负载带weight分量(cpu_rq(cpu)->cfs.avg.util_avg),经过rq->cpu_load[]计算 sgs->group_util // running负载(cpu_rq(cpu)->cfs.avg.load_avg/cpu_rq(cpu)->cfs.runnable_load_avg) sgs->sum_nr_running // rq中所有se的总和 sgs->sum_weighted_load // runnable负载带weight分量(cpu_rq(cpu)->cfs.avg.util_avg) sgs->idle_cpus // idle状态的cpu计数 */#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT sgs->group_load += (load * capacity_orig_of(i)) >> SCHED_CAPACITY_SHIFT;#else sgs->group_load += load;#endif sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; /* (7.3.1.5.5) 如果rq中进程数量>1,则就会有进程处于runnable状态, overload = true */ if (rq->nr_running > 1) *overload = true;#ifdef CONFIG_NUMA_BALANCING sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running;#endif sgs->sum_weighted_load += weighted_cpuload(i); if (idle_cpu(i)) sgs->idle_cpus++; /* (7.3.1.5.6) cpu的capacity小于cpu的running状态负载, overutilized = true */ if (cpu_overutilized(i)) *overutilized = true; } /* (7.3.1.5.7) 更新汇总后sgs的统计数据: sgs->group_capacity // sgs所有cpu capacity的累加 sgs->avg_load // 按照group_capacity,等比例放大group_load负载,capacity越小avg_load越大 sgs->load_per_task // sgs的平均每个进程的weight负载 sgs->group_weight // sgs的online cpu个数 sgs->group_no_capacity // sgs的capacity已经不够用,赶不上util sgs->group_type // 严重级别 group_overloaded > group_imbalanced > group_other // group_imbalanced: 下一等级的load_balance因为cpu_affinity的原因没有完成 */ /* Adjust by relative CPU capacity of the group */ sgs->group_capacity = group->sgc->capacity; sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; sgs->group_weight = group->group_weight; sgs->group_no_capacity = group_is_overloaded(env, sgs); sgs->group_type = group_classify(group, sgs);}||||→static bool update_sd_pick_busiest(struct lb_env *env, struct sd_lb_stats *sds, struct sched_group *sg, struct sg_lb_stats *sgs){ struct sg_lb_stats *busiest = &sds->busiest_stat;#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT if (sgs->sum_nr_running == 0) { mt_sched_printf(sched_lb_info, "[%s] sgs->sum_nr_running=%d", __func__, sgs->sum_nr_running); return false; }#endif /* (7.3.1.9.1) 如果新的sgs 
group_type大于旧的busiest sgs, 新的sgs更busy */ if (sgs->group_type > busiest->group_type) return true; /* (7.3.1.9.2) 如果新的sgs group_type小于旧的busiest sgs, 旧的sgs更busy */ if (sgs->group_type < busiest->group_type) return false; /* (7.3.1.9.3) 在group_type相同的情况下,比较sgs->avg_load sgs->avg_load = rq->cpu_load[load_idx-1] * (group_load*SCHED_CAPACITY_SCALE / sgs->group_capacity) */ if (sgs->avg_load <= busiest->avg_load) return false; /* (7.3.1.9.4) 如果SD_ASYM_PACKING标志没有置位, 在group_type相同的情况下,sgs->avg_load值较大的为busiest sg */ /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) return true; /* (7.3.1.9.5) ASYM_PACKING的意思是会把负载移到最低序号的cpu上, 如果sg的frist cpu序号 > dst_cpu,则busiest 对个sg的frist cpu序号 > dst_cpu,选择序号小的sg */ /* * ASYM_PACKING needs to move all the work to the lowest * numbered CPUs in the group, therefore mark all groups * higher than ourself as busy. */ if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) { if (!sds->busiest) return true; if (group_first_cpu(sds->busiest) > group_first_cpu(sg)) return true; } /* (7.3.1.9.6) 设置了ASYM_PACKING,且如果sg的frist cpu序号 <= dst_cpu, 返回false */ return false;}|||→static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds){ unsigned long max_pull, load_above_capacity = ~0UL; struct sg_lb_stats *local, *busiest; /* (7.3.12.1) local sgs和busiest sgs */ local = &sds->local_stat; busiest = &sds->busiest_stat; if (busiest->group_type == group_imbalanced) { /* * In the group_imb case we cannot rely on group-wide averages * to ensure cpu-load equilibrium, look at wider averages. XXX */ busiest->load_per_task = min(busiest->load_per_task, sds->avg_load); } /* (7.3.12.2) */ /* * In the presence of smp nice balancing, certain scenarios can have * max load less than avg load(as we skip the groups at or below * its cpu_capacity, while calculating max_load..) */ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { env->imbalance = 0; return fix_small_imbalance(env, sds); } /* (7.3.12.3) */ /* * If there aren't any idle cpus, avoid creating some. */ if (busiest->group_type == group_overloaded && local->group_type == group_overloaded) { load_above_capacity = busiest->sum_nr_running * SCHED_LOAD_SCALE; if (load_above_capacity > busiest->group_capacity) load_above_capacity -= busiest->group_capacity; else load_above_capacity = ~0UL; } /* (7.3.12.4) env->imbalance的值等于min((sds->avg - local), (busiest - sds->avg)) 在local和sds平均值,busiest和sds平均值,两个差值之间选择最小值 */ /* * We're trying to get all the cpus to the average_load, so we don't * want to push ourselves above the average load, nor do we wish to * reduce the max loaded cpu below the average load. At the same time, * we also don't want to reduce the group load below the group capacity * (so that we can implement power-savings policies etc). Thus we look * for the minimum possible imbalance. 
*/ max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); /* How much load to actually move to equalise the imbalance */ env->imbalance = min( max_pull * busiest->group_capacity, (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; /* * if *imbalance is less than the average load per runnable task * there is no guarantee that any tasks will be moved so we'll have * a think about bumping its value to force at least one task to be * moved */ if (env->imbalance < busiest->load_per_task) return fix_small_imbalance(env, sds);}||→static struct rq *find_busiest_queue(struct lb_env *env, struct sched_group *group){ struct rq *busiest = NULL, *rq; unsigned long busiest_load = 0, busiest_capacity = 1; int i; /* (7.4.1) 逐个遍历sg中的cpu */ for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { unsigned long capacity, wl; enum fbq_type rt; rq = cpu_rq(i); rt = fbq_classify_rq(rq); /* * We classify groups/runqueues into three groups: * - regular: there are !numa tasks * - remote: there are numa tasks that run on the 'wrong' node * - all: there is no distinction * * In order to avoid migrating ideally placed numa tasks, * ignore those when there's better options. * * If we ignore the actual busiest queue to migrate another * task, the next balance pass can still reduce the busiest * queue by moving tasks around inside the node. * * If we cannot move enough load due to this classification * the next pass will adjust the group classification and * allow migration of more tasks. * * Both cases only affect the total convergence complexity. */ if (rt > env->fbq_type) continue; /* (7.4.2) 计算出cpu的capacity和weight_load */ capacity = capacity_of(i); wl = weighted_cpuload(i);#ifdef CONFIG_MTK_SCHED_INTEROP wl += mt_rt_load(i);#endif /* * When comparing with imbalance, use weighted_cpuload() * which is not scaled with the cpu capacity. */ if (rq->nr_running == 1 && wl > env->imbalance && !check_cpu_capacity(rq, env->sd)) continue; /* (7.4.3) 选出相对负载最重的cpu */ /* * For the load comparisons with the other cpu's, consider * the weighted_cpuload() scaled with the cpu capacity, so * that the load can be moved away from the cpu that is * potentially running at a lower capacity. * * Thus we're looking for max(wl_i / capacity_i), crosswise * multiplication to rid ourselves of the division works out * to: wl_i * capacity_j > wl_j * capacity_i; where j is * our previous maximum. */ if (wl * busiest_capacity > busiest_load * capacity) { busiest_load = wl; busiest_capacity = capacity; busiest = rq; } } return busiest;}||→static int detach_tasks(struct lb_env *env){ struct list_head *tasks = &env->src_rq->cfs_tasks; struct task_struct *p; unsigned long load; int detached = 0; lockdep_assert_held(&env->src_rq->lock); if (env->imbalance <= 0) return 0; /* (7.6.1) 遍历busiest rq中的进程 */ while (!list_empty(tasks)) { /* (7.6.2) 如果dest cpu不是idle,不能将busiest cpu迁移到idle状态 */ /* * We don't want to steal all, otherwise we may be treated likewise, * which could at worst lead to a livelock crash. 
*/ if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) break; p = list_first_entry(tasks, struct task_struct, se.group_node); /* (7.6.3) 遍历任务最多不超过sysctl_sched_nr_migrate(32) */ env->loop++; /* We've more or less seen every task there is, call it quits */ if (env->loop > env->loop_max) break; /* (7.6.4) 每sched_nr_migrate_break个任务遍历需要跳出休息一下, 如果没有达到env->loop_max,后面会重来 */ /* take a breather every nr_migrate tasks */ if (env->loop > env->loop_break) { env->loop_break += sched_nr_migrate_break; env->flags |= LBF_NEED_BREAK; break; } /* (7.6.5) 判断任务是否支持迁移? */ if (!can_migrate_task(p, env)) goto next; /* (7.6.6) 获取p进程相对顶层cfs_rq的负载, 根据负载判断进程是否适合迁移 */ load = task_h_load(p); if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) goto next; if ((load / 2) > env->imbalance) goto next; /* (7.6.7) detach 进程 */ detach_task(p, env); list_add(&p->se.group_node, &env->tasks); detached++; env->imbalance -= load;#ifdef CONFIG_PREEMPT /* * NEWIDLE balancing is a source of latency, so preemptible * kernels will stop after the first task is detached to minimize * the critical section. */ if (env->idle == CPU_NEWLY_IDLE) break;#endif /* * We only want to steal up to the prescribed amount of * weighted load. */ if (env->imbalance <= 0) break; continue;next: list_move_tail(&p->se.group_node, tasks); } /* * Right now, this is one of only two places we collect this stat * so we can safely collect detach_one_task() stats here rather * than inside detach_one_task(). */ schedstat_add(env->sd, lb_gained[env->idle], detached); return detached;}|||→staticint can_migrate_task(struct task_struct *p, struct lb_env *env){ int tsk_cache_hot; lockdep_assert_held(&env->src_rq->lock); /* * We do not migrate tasks that are: * 1) throttled_lb_pair, or * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) running (obviously), or * 4) are cache-hot on their current CPU. */ /* (7.6.5.1) 如果达到bandwith限制,返回失败 */ if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; /* (7.6.5.2) 如果p进程的cpu affinity不允许迁移到dst_cpu,进一步处理 */ if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { int cpu; schedstat_inc(p, se.statistics.nr_failed_migrations_affine); /* (7.6.5.3) LBF_SOME_PINNED标志,记录有些进程迁移失败 */ env->flags |= LBF_SOME_PINNED; /* (7.6.5.5) 如果已经有其他的LBF_DST_PINNED动作,直接返回失败 */ /* * Remember if this task can be migrated to any other cpu in * our sched_group. We may want to revisit it if we couldn't * meet load balance goals by pulling other tasks on src_cpu. * * Also avoid computing new_dst_cpu if we have already computed * one in current iteration. */ if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) return 0; /* (7.6.5.4) 如果dst_cpu同一sched_group中的其他cpu符合p的affinity,尝试更改dst_cpu, 设置LBF_DST_PINNED标志 */ /* Prevent to re-select dst_cpu via env's cpus */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { env->flags |= LBF_DST_PINNED; env->new_dst_cpu = cpu; break; } } return 0; } /* (7.6.5.6) 有任何符合affinity条件的p,清除LBF_ALL_PINNED标志 */ /* Record that we found atleast one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; /* (7.6.5.7) 如果p在running状态,返回失败 */ if (task_running(env->src_rq, p)) { schedstat_inc(p, se.statistics.nr_failed_migrations_running); return 0; } /* (7.6.5.8) NUMA 相关的一些判断 */ /* * Aggressive migration if: * 1) destination numa is preferred * 2) task is cache cold, or * 3) too many balance attempts have failed. 
*/ tsk_cache_hot = migrate_degrades_locality(p, env); if (tsk_cache_hot == -1) tsk_cache_hot = task_hot(p, env); if (tsk_cache_hot <= 0 || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (tsk_cache_hot == 1) { schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } return 1; } schedstat_inc(p, se.statistics.nr_failed_migrations_hot); return 0;}|||→static unsigned long task_h_load(struct task_struct *p){ struct cfs_rq *cfs_rq = task_cfs_rq(p); update_cfs_rq_h_load(cfs_rq); /* (7.6.6.1) task_h_load的目的是在task_group使能时,rq中有多个层次的cfs_rq 如果进程p挂载在底层的cfs_rq中,把p的负载转换成顶层cfs_rq的相对负载 */ return div64_ul(p->se.avg.load_avg * cfs_rq->h_load, cfs_rq_load_avg(cfs_rq) + 1);}static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq){ struct rq *rq = rq_of(cfs_rq); struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; u64 now = sched_clock_cpu(cpu_of(rq)); unsigned long load; /* sched: change to jiffies */ now = now * HZ >> 30; if (cfs_rq->last_h_load_update == now) return; /* 从底层cfs_rq到顶层cfs_rq建立起层次关系 */ cfs_rq->h_load_next = NULL; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_load_next = se; if (cfs_rq->last_h_load_update == now) break; } if (!se) { cfs_rq->h_load = cfs_rq_load_avg(cfs_rq); cfs_rq->last_h_load_update = now; } /* 使用建立的关系,从顶层cfs_rq开始计算每个层次cfs_rq的相对顶层负载h_load */ while ((se = cfs_rq->h_load_next) != NULL) { load = cfs_rq->h_load; load = div64_ul(load * se->avg.load_avg, cfs_rq_load_avg(cfs_rq) + 1); cfs_rq = group_cfs_rq(se); cfs_rq->h_load = load; cfs_rq->last_h_load_update = now; }}
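To make step 3.4 concrete, here is a standalone example of the env->imbalance formula used by calculate_imbalance() above (all loads and capacities below are invented; the loads are already normalised to SCHED_CAPACITY_SCALE):

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* invented capacity-relative loads of the local and busiest groups */
	unsigned long local_avg = 400, busiest_avg = 900, sds_avg = 600;
	unsigned long local_cap = 2048, busiest_cap = 2048;
	unsigned long load_above_capacity = ~0UL;	/* effectively unlimited here */

	/* max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity) */
	unsigned long max_pull = min_ul(busiest_avg - sds_avg, load_above_capacity);

	/*
	 * env->imbalance = min(max_pull * busiest_capacity,
	 *                      (sds->avg_load - local->avg_load) * local_capacity)
	 *                  / SCHED_CAPACITY_SCALE
	 */
	unsigned long imbalance = min_ul(max_pull * busiest_cap,
					 (sds_avg - local_avg) * local_cap)
				  / SCHED_CAPACITY_SCALE;

	printf("imbalance = %lu\n", imbalance);	/* min(300, 200) * 2048 / 1024 = 400 */
	return 0;
}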
4.1.2.2、nohz_idle_balance()
Each cpu normally triggers its own load balancing from its tick, in scheduler_tick(). When a cpu enters nohz mode its tick is stopped, so it gets no chance to run rebalance_domains(). To cover this, the kernel provides nohz_idle_balance(): a running cpu checks whether the nohz-idle cpus are due for rebalancing and, if so, kicks one idle cpu to perform load balancing on behalf of all the nohz-idle cpus.
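The bookkeeping that the steps below manipulate lives in a small file-scope global in kernel/sched/fair.c; roughly the following (field list quoted from memory, it may differ slightly between kernel versions):

/*
 * nohz idle-balance bookkeeping: filled in by nohz_balance_enter_idle(),
 * cleared on exit, and consumed by nohz_kick_needed()/nohz_balancer_kick().
 */
static struct {
	cpumask_var_t idle_cpus_mask;	/* cpus currently in nohz idle */
	atomic_t nr_cpus;		/* how many cpus are in the mask */
	unsigned long next_balance;	/* earliest next balance time, in jiffies */
} nohz ____cacheline_aligned;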
nohz_idle_balance() runs right before rebalance_domains(); when the conditions are met it lets one idle cpu do the idle load balancing. The mechanism works as follows:
- 1、When a cpu enters the nohz idle state, it sets the corresponding flags:
tick_nohz_idle_enter() -> set_cpu_sd_state_idle():↓void set_cpu_sd_state_idle(void){ struct sched_domain *sd; int cpu = smp_processor_id(); rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_busy, cpu)); if (!sd || sd->nohz_idle) goto unlock; /* (1.1) 进入nohz idle,设置sd->nohz_idle标志 */ sd->nohz_idle = 1; /* (1.2) 减少sgc->nr_busy_cpus的计数 */ atomic_dec(&sd->groups->sgc->nr_busy_cpus);unlock: rcu_read_unlock();}tick_nohz_idle_enter() -> __tick_nohz_idle_enter() -> tick_nohz_stop_sched_tick() -> nohz_balance_enter_idle():↓void nohz_balance_enter_idle(int cpu){ /* * If this cpu is going down, then nothing needs to be done. */ if (!cpu_active(cpu)) return; if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) return; /* * If we're a completely isolated CPU, we don't play. */ if (on_null_domain(cpu_rq(cpu))) return; /* (2.1) 进入idle状态,设置nohz.idle_cpus_mask中对应的bit */ cpumask_set_cpu(cpu, nohz.idle_cpus_mask); /* (2.2) 进入idle状态,增加nohz.nr_cpus计数 */ atomic_inc(&nohz.nr_cpus); /* (2.3) 设置cpu_rq(cpu)->nohz_flags中的NOHZ_TICK_STOPPED标志 */ set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));}
- 2、trigger_load_balance() then decides whether an idle load balance needs to be kicked right now:
void trigger_load_balance(struct rq *rq){ /* (1) 判断当前是否需要idle load balance */ if (nohz_kick_needed(rq)) /* (2) 选中一个idle cpu去做idle load balance */ nohz_balancer_kick();}|→/* * Current heuristic for kicking the idle load balancer in the presence * of an idle cpu in the system. * - This rq has more than one task. * - This rq has at least one CFS task and the capacity of the CPU is * significantly reduced because of RT tasks or IRQs. * - At parent of LLC scheduler domain level, this cpu's scheduler group has * multiple busy cpu. * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */static inline bool nohz_kick_needed(struct rq *rq){ unsigned long now = jiffies; struct sched_domain *sd; struct sched_group_capacity *sgc; int nr_busy, cpu = rq->cpu; bool kick = false; /* (1.1) 如果当前cpu为idle状态,失败退出 */ if (unlikely(rq->idle_balance)) return false; /* (1.2) 退出nohz状态:set_cpu_sd_state_busy()、nohz_balance_exit_idle(cpu) 是set_cpu_sd_state_idle()、nohz_balance_enter_idle()的反向操作 */ /* * We may be recently in ticked or tickless idle mode. At the first * busy tick after returning from idle, we will update the busy stats. */ set_cpu_sd_state_busy(); nohz_balance_exit_idle(cpu); /* (1.3) 如果进入nohz idle状态的cpu数量为0,失败退出 */ /* * None are in tickless mode and hence no need for NOHZ idle load * balancing. */ if (likely(!atomic_read(&nohz.nr_cpus))) return false; /* (1.4) nohz balance时间未到,失败退出 */ if (time_before(now, nohz.next_balance)) return false;#if !defined(CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT) && defined(CONFIG_HMP) /* for more than two clusters, still need wakup nohz CPUs and force balancing */ /* * Bail out if there are no nohz CPUs in our * HMP domain, since we will move tasks between * domains through wakeup and force balancing * as necessary based upon task load. */ if (sched_feat(SCHED_HMP) && cpumask_first_and(nohz.idle_cpus_mask, &((struct hmp_domain *)hmp_cpu_domain(cpu))->cpus) >= nr_cpu_ids) return false;#endif /* (1.5) 当前cpu的进程>=2,返回成功 */ if (rq->nr_running >= 2 && (!energy_aware() || cpu_overutilized(cpu))) return true; /* (1.6) sd所在sg的nr_busy_cpus>1,返回成功 */ rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_busy, cpu)); if (sd && !energy_aware()) { sgc = sd->groups->sgc; nr_busy = atomic_read(&sgc->nr_busy_cpus); if (nr_busy > 1) { kick = true; goto unlock; } } /* (1.7) 如果所有层次的se个数>=1,且capacity在减少,返回成功 */ sd = rcu_dereference(rq->sd); if (sd) { if ((rq->cfs.h_nr_running >= 1) && check_cpu_capacity(rq, sd)) { kick = true; goto unlock; } } /* (1.8) 如果本sd->span[]中第一个idle cpu < sd_asym,返回成功 */ sd = rcu_dereference(per_cpu(sd_asym, cpu)); if (sd && (cpumask_first_and(nohz.idle_cpus_mask, sched_domain_span(sd)) < cpu)) { kick = true; goto unlock; }unlock: rcu_read_unlock(); return kick;}|→static void nohz_balancer_kick(void){ int ilb_cpu; nohz.next_balance++; /* (2.1) 找到所有idle cpu中的第一个idle cpu */ ilb_cpu = find_new_ilb(); if (ilb_cpu >= nr_cpu_ids) return; /* (2.2) 给ilb_cpu的cpu_rq(cpu)->nohz_flags设置NOHZ_BALANCE_KICK标志位 */ if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) return; /* (2.3) 使用ipi中断来唤醒ilb_cpu执行idle load balance */ /* * Use smp_send_reschedule() instead of resched_cpu(). * This way we generate a sched IPI on the target cpu which * is idle. And the softirq performing nohz idle load balance * will be run before returning from the IPI. 
*/ smp_send_reschedule(ilb_cpu); return;}/* (2.3.1) ilb_cpu倍唤醒后处理IPI_RESCHEDULE, 会触发一个SCHED_SOFTIRQ软中断来启动run_rebalance_domains() */void handle_IPI(int ipinr, struct pt_regs *regs){ unsigned int cpu = smp_processor_id(); struct pt_regs *old_regs = set_irq_regs(regs); if ((unsigned)ipinr < NR_IPI) { trace_ipi_entry_rcuidle(ipi_types[ipinr]); __inc_irq_stat(cpu, ipi_irqs[ipinr]); } switch (ipinr) { case IPI_RESCHEDULE: scheduler_ipi(); break;}↓void scheduler_ipi(void){ /* * Check if someone kicked us for doing the nohz idle load balance. */ if (unlikely(got_nohz_idle_kick())) { this_rq()->idle_balance = 1; raise_softirq_irqoff(SCHED_SOFTIRQ); }}
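The find_new_ilb() helper used in step (2.1) above is trivial; paraphrased from memory, it just returns the first cpu in nohz.idle_cpus_mask that is still idle, or nr_cpu_ids when there is no candidate:

/* Paraphrase of find_new_ilb(): pick the first still-idle nohz cpu */
static inline int find_new_ilb(void)
{
	int ilb = cpumask_first(nohz.idle_cpus_mask);

	if (ilb < nr_cpu_ids && idle_cpu(ilb))
		return ilb;

	return nr_cpu_ids;	/* no candidate; the caller drops the kick */
}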
- 3、Once woken, the chosen ilb_cpu performs rebalance_domains() on behalf of all the other idle cpus:
static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle){ int this_cpu = this_rq->cpu; struct rq *rq; int balance_cpu; /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; /* (1) 判断当前cpu是不是被选中被唤醒的ilb_cpu */ if (idle != CPU_IDLE || !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) goto end; /* (2) 轮询所有进入onhz状态的cpu */ for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { /* (3) 只服务非本cpu,且还是idle状态的cpu ooooo本cpu也是idle状态,不需对本cpu做idle负载均衡? ooooo给其他idle cpu的rq做了负载均衡后,什么时候唤醒其他idle cpu? */ if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) continue; /* (4) 如果本cpu被设置了resched标志,说明有线程被唤醒,退出idle状态 */ /* * If this cpu gets work to do, stop the load balancing * work being done for other cpus. Next load * balancing owner will pick it up. */ if (need_resched()) break; /* (5) 需要做负载均衡的idle进程balance_cpu */ rq = cpu_rq(balance_cpu); /* (6) 如果balance_cpu的rq->next_balance时间已到,替其做rebalance_domains() */ /* * If time for next balance is due, * do the balance. */ if (time_after_eq(jiffies, rq->next_balance)) { raw_spin_lock_irq(&rq->lock); update_rq_clock(rq); /* (7) 更新idle cpu因为idle造成的负载衰减 */ update_idle_cpu_load(rq); raw_spin_unlock_irq(&rq->lock); /* (8) 对balance_cpu做负载均衡 ooooo做完负载均衡,什么时候唤醒balance_cpu?? */ rebalance_domains(rq, CPU_IDLE); } if (time_after(next_balance, rq->next_balance)) { next_balance = rq->next_balance; update_next_balance = 1; } } /* (9) 根据所有进入nohz idle cpu rq的最近的一次到期时间,更新nohz.next_balance */ /* * next_balance will be updated only when there is a need. * When the CPU is attached to null domain for ex, it will not be * updated. */ if (likely(update_next_balance)) nohz.next_balance = next_balance;end: clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));}
4.1.2.3、select_task_rq_fair()
Load balancing is not only done from scheduler_tick(). It also happens when a new task is forked or a sleeping task is woken up: the scheduler picks the most suitable cpu for the task, and the core function for that is select_task_rq_fair():
- 1、The EAS path is tried first; it is used when EAS is enabled and the system is not overutilized:
Again worth stressing: of the three kinds of load that are tracked (load_avg, loadwop_avg, util_avg), EAS mainly uses util_avg, combined with capacity.
- 1.1、EAS walks the clusters and cpus looking for a target_cpu that satisfies task p's affinity and can accommodate its utilization (util), belongs to the smallest-capacity cluster that is still sufficient, and has the most spare capacity within that cluster;
First find the smallest-capacity cluster that can hold task p's util:
Then, inside that cluster, find the cpu that is left with the most spare capacity after adding task p (see the spare-capacity sketch after this list):
prev_cpu (the cpu task p last ran on) acts as src_cpu and the target_cpu chosen above as dst_cpu; the kernel then estimates the energy difference of migrating p from prev_cpu to target_cpu:
- 1.2、Compute the power change, before and after the load moves, for target_cpu and prev_cpu. If power does not increase, return target_cpu; if it would increase, return prev_cpu;
The function that does this, energy_diff(), loops a lot and looks complex; boiled down, it computes the energy consumed, before and after the load change, over the MC-level sched_group list containing target_cpu/prev_cpu plus the DIE-level sched_group that contains them, and takes the difference:
energy_diff() computes this as follows (a simplified energy-model sketch follows this list):
- 2、If EAS is not applicable, fall back to the conventional load-balancing way of selecting the cpu:
- 2.1、find_idlest_group() -> find_idlest_cpu() to find the most suitable target_cpu;
- 2.2、As a last resort, select_idle_sibling() simply tries to find an idle cpu to use as target_cpu;
- 2.3、Once target_cpu is determined, hmp_select_task_rq_fair() is consulted as well to decide whether an HMP migration is needed;
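A toy model of the spare-capacity pick in step 1.1 (the helper name pick_max_spare() and every number are invented; the real helper is select_max_spare_capacity_cpu(), called from energy_aware_wake_cpu() in the listing below, which additionally honours affinity, online state and RT pressure):

#include <stdio.h>

/*
 * Toy model of step 1.1: among the allowed cpus, pick the one whose
 * spare capacity (capacity - (cpu_util + task_util)) is largest.
 */
static int pick_max_spare(const unsigned long *cap, const unsigned long *util,
			  int nr_cpus, unsigned long task_util)
{
	int i, best = -1;
	long best_spare = -1;

	for (i = 0; i < nr_cpus; i++) {
		long spare = (long)cap[i] - (long)(util[i] + task_util);

		if (spare > best_spare) {
			best_spare = spare;
			best = i;
		}
	}
	return best;
}

int main(void)
{
	/* invented capacities/utilisations for a 2 little + 2 big system */
	unsigned long cap[4]  = { 512, 512, 1024, 1024 };
	unsigned long util[4] = { 100, 400,  700,  200 };

	/* a task with util 150 lands on cpu3: spare = 1024 - 350 = 674 */
	printf("target_cpu = %d\n", pick_max_spare(cap, util, 4, 150));
	return 0;
}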
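And a simplified sketch of the energy model behind energy_diff()/sched_group_energy() from step 1.2: each sched_group contributes norm_util * P_busy + (1 - norm_util) * P_idle, where norm_util is the group utilisation normalised to the capacity of its chosen OPP. group_energy() is my own toy helper; the power numbers are loosely taken from the upower tables earlier in this section and the utilisation figures are invented:

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10
#define SCHED_CAPACITY_SCALE	(1UL << SCHED_CAPACITY_SHIFT)

/* E = norm_util * P_busy / 1024 + (1024 - norm_util) * P_idle / 1024 */
static unsigned long group_energy(unsigned long norm_util,
				  unsigned long busy_power,
				  unsigned long idle_power)
{
	unsigned long busy = (norm_util * busy_power) >> SCHED_CAPACITY_SHIFT;
	unsigned long idle = ((SCHED_CAPACITY_SCALE - norm_util) * idle_power)
						>> SCHED_CAPACITY_SHIFT;
	return busy + idle;
}

int main(void)
{
	/*
	 * energy_diff() in spirit: the same task looks like norm_util=800
	 * on a little group but only 400 on a big group (more capacity),
	 * yet the big group burns far more power per unit of utilisation.
	 */
	unsigned long before = group_energy(800, 10844, 300) +	/* task on little */
			       group_energy(  0, 52373, 500);	/* big group idle */
	unsigned long after  = group_energy(  0, 10844, 300) +	/* little idle    */
			       group_energy(400, 52373, 500);	/* task on big    */

	/* diff > 0: moving to the big group costs energy, so EAS keeps prev_cpu */
	printf("before=%lu after=%lu diff=%ld\n",
	       before, after, (long)after - (long)before);
	return 0;
}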
static intselect_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags){ struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); int new_cpu = prev_cpu; /* 默认new_cpu为prev_cpu */ int want_affine = 0; int sync = wake_flags & WF_SYNC; int policy = 0;#ifdef CONFIG_MTK_SCHED_VIP_TASKS /* mtk: If task is VIP task, prefer most efficiency idle cpu */ if (is_vip_task(p)) { int vip_idle_cpu; vip_idle_cpu = find_idle_vip_cpu(p); if (vip_idle_cpu >= 0) return vip_idle_cpu; }#endif /* (1) 优先使用EAS计算target cpu, mtk 对EAS定义了3种模式:EAS模式(energy_aware())、HMP模式(sched_feat(SCHED_HMP))、hybrid_support(EAS、HMP同时共存); hybrid_support()模式下:一般负载均衡交给EAS;如果cpu_rq(cpu)->rd->overutilized负载已经严重不均衡,交给HMP; */ /* * Consider EAS if only EAS enabled, but HMP * if hybrid enabled and system is over-utilized. */ if ((energy_aware() && !hybrid_support()) || (hybrid_support() && !cpu_rq(cpu)->rd->overutilized)) goto CONSIDER_EAS; /* (2) 非EAS情况,fork使用hmp balance */ /* HMP fork balance: * always put non-kernel forking tasks on a big domain */ if (sched_feat(SCHED_HMP) && p->mm && (sd_flag & SD_BALANCE_FORK)) { new_cpu = hmp_fork_balance(p, prev_cpu); /* to recover new_cpu value if something wrong */ if (new_cpu >= nr_cpu_ids) new_cpu = prev_cpu; else {#ifdef CONFIG_MTK_SCHED_TRACERS trace_sched_select_task_rq(p, (LB_FORK | new_cpu), prev_cpu, new_cpu);#endif return new_cpu; } }CONSIDER_EAS: /* (3) 如果唤醒flag中设置了SD_BALANCE_WAKE,优先使用唤醒cpu来运行进程p, 还需判断下面3个条件是否满足: !wake_wide(p) // 当前cpu的唤醒次数没有超标 task_fits_max(p, cpu) // 当前cpu的capacity能容纳进程p的util cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) // 当前cpu在进程在P的affinity中 EAS利用了want_affine这个标志,只要EAS使能,want_affine =1 */ if (sd_flag & SD_BALANCE_WAKE) want_affine = (!wake_wide(p) && task_fits_max(p, cpu) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) || energy_aware(); rcu_read_lock(); /* (4) 从下往上遍历当前cpu的sd,查询在哪个层次的sd进行负载均衡 */ for_each_domain(cpu, tmp) { /* (4.1 如果当前sd不支持负载均SD_LOAD_BALANCE,退出) */ if (!(tmp->flags & SD_LOAD_BALANCE)) break; /* (4.2) 优先找affine_sd,找到直接break; 需要符合以下3个条件: want_affine // (tmp->flags & SD_WAKE_AFFINE) // 当前sd支持SD_WAKE_AFFINE标志 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)) //当前sd->span[]中同时包含cpu、pre_cpu */ /* * If both cpu and prev_cpu are part of this domain, * cpu is a valid SD_WAKE_AFFINE target. */ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { affine_sd = tmp; break; } /* (4.3) 其次找一个符合sd_flag的sd */ if (tmp->flags & sd_flag) sd = tmp; /* (4.4) 如果以上都失败,直接跳出 */ else if (!want_affine) break; } /* (5) 如果affine_sd成功找到 */ if (affine_sd) { sd = NULL; /* Prefer wake_affine over balance flags */ if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) new_cpu = cpu; } /* (6) 没有找到符合sd_flag的sd */ if (!sd) { /* (6.1) EAS使能,且本cpu没有overutilized, 使用EAS负载均衡算法 */ if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) { new_cpu = energy_aware_wake_cpu(p, prev_cpu); policy |= LB_EAS; } /* (6.2) 如果不能使用EAS,且sd_flag中设置SD_BALANCE_WAKE标志 尝试在唤醒的cpu上运行p进程, ooooo前面辛苦计算的affine_sd没有派上用场? */ else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? 
*/ if (true) {#ifdef CONFIG_CGROUP_SCHEDTUNE bool prefer_idle = schedtune_prefer_idle(p) > 0;#else bool prefer_idle = true;#endif int idle_cpu; idle_cpu = find_best_idle_cpu(p, prefer_idle); if (idle_cpu >= 0) { new_cpu = idle_cpu; policy |= LB_IDLEST; } else { new_cpu = select_max_spare_capacity_cpu(p, new_cpu); policy |= LB_SPARE; } } else /* (6.3) 不符合上述条件下的默认处理,尝试找一个idle cpu */ new_cpu = select_idle_sibling(p, new_cpu); } } else while (sd) { /* (7) 找到符合sd_flag的sd */ struct sched_group *group; int weight; policy |= LB_SMP; /* (7.1) */ if (!(sd->flags & sd_flag)) { sd = sd->child; continue; } /* (7.2) */ group = find_idlest_group(sd, p, cpu, sd_flag); if (!group) { sd = sd->child; continue; } /* (7.3) */ new_cpu = find_idlest_cpu(group, p, cpu); if (new_cpu == -1 || new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; continue; } /* (7.4) */ /* Now try balancing at a lower domain level of new_cpu */ cpu = new_cpu; weight = sd->span_weight; sd = NULL; for_each_domain(cpu, tmp) { if (weight <= tmp->span_weight) break; if (tmp->flags & sd_flag) sd = tmp; } /* while loop will break here if sd == NULL */ }#ifdef CONFIG_MTK_SCHED_TRACERS policy |= (new_cpu << LB_SMP_SHIFT);#endif rcu_read_unlock(); /* (8) 在EAS不能运行的情况下,在做一次HMP的select操作: 判断进程p是否符合hmp的迁移条件,如果符合一次迁移到位,避免后续hmp的操作 */ /* Consider hmp if no EAS or over-utiled in hybrid mode. */ if ((!energy_aware() && sched_feat(SCHED_HMP)) || (hybrid_support() && cpu_rq(cpu)->rd->overutilized)) { new_cpu = hmp_select_task_rq_fair(sd_flag, p, prev_cpu, new_cpu);#ifdef CONFIG_MTK_SCHED_TRACERS policy |= (new_cpu << LB_HMP_SHIFT);#endif policy |= LB_HMP; }#ifdef CONFIG_MTK_SCHED_TRACERS trace_sched_select_task_rq(p, policy, prev_cpu, new_cpu);#endif return new_cpu;}|→inline int hmp_fork_balance(struct task_struct *p, int prev_cpu){ int new_cpu = prev_cpu; int cpu = smp_processor_id(); /* (2.1) prev_cpu所在cluster是最快(fastest)的 */ if (hmp_cpu_is_fastest(prev_cpu)) { /* prev_cpu is fastest domain */ struct hmp_domain *hmpdom; __always_unused int lowest_ratio; hmpdom = list_entry( &hmp_cpu_domain(prev_cpu)->hmp_domains, struct hmp_domain, hmp_domains); /* (2.2) 尝试选出负载最小的cpu */ lowest_ratio = hmp_domain_min_load(hmpdom, &new_cpu); if (new_cpu < nr_cpu_ids && cpumask_test_cpu(new_cpu, tsk_cpus_allowed(p))) return new_cpu; new_cpu = cpumask_any_and(&hmp_faster_domain(cpu)->cpus, tsk_cpus_allowed(p)); if (new_cpu < nr_cpu_ids) return new_cpu; } else { /* (2.3) 尝试选出prev_cpu所在cluster中负载最小的cpu */ /* prev_cpu is not fastest domain */ new_cpu = hmp_select_faster_cpu(p, prev_cpu); if (new_cpu < nr_cpu_ids) return new_cpu; } return new_cpu;}|→static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync){ s64 this_load, load; s64 this_eff_load, prev_eff_load; int idx, this_cpu, prev_cpu; struct task_group *tg; unsigned long weight; int balanced; idx = sd->wake_idx; this_cpu = smp_processor_id(); prev_cpu = task_cpu(p); load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); /* (5.1) */ /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load * of the current CPU: */ if (sync) { tg = task_group(current); weight = current->se.avg.load_avg; this_load += effective_load(tg, this_cpu, -weight, -weight); load += effective_load(tg, prev_cpu, 0, -weight); } tg = task_group(p); weight = p->se.avg.load_avg; /* * In low-load situations, where prev_cpu is idle and this_cpu is idle * due to the sync cause above having dropped this_load to 0, we'll * always have 
an imbalance, but there's really nothing you can do * about that, so that's good too. * * Otherwise check if either cpus are near enough in load to allow this * task to be woken on this_cpu. */ this_eff_load = 100; this_eff_load *= capacity_of(prev_cpu); prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; prev_eff_load *= capacity_of(this_cpu); if (this_load > 0) { this_eff_load *= this_load + effective_load(tg, this_cpu, weight, weight); prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); } balanced = this_eff_load <= prev_eff_load; schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); if (!balanced) return 0; schedstat_inc(sd, ttwu_move_affine); schedstat_inc(p, se.statistics.nr_wakeups_affine); return 1;}|→static int energy_aware_wake_cpu(struct task_struct *p, int target){ int target_max_cap = INT_MAX; int target_cpu = task_cpu(p); unsigned long min_util; unsigned long new_util; int i, cpu; bool is_tiny = false; int nrg_diff = 0; int cluster_id = 0; struct cpumask cluster_cpus; int max_cap_cpu = 0; int best_cpu = 0; /* (6.1.1) 遍历cluster和cpu,找出一个capacity最小的cpu能容纳下util(p)为best_cpu */ /* * Find group with sufficient capacity. We only get here if no cpu is * overutilized. We may end up overutilizing a cpu by adding the task, * but that should not be any worse than select_idle_sibling(). * load_balance() should sort it out later as we get above the tipping * point. */ cluster_id = arch_get_nr_clusters(); for (i = 0; i < cluster_id; i++) { arch_get_cluster_cpus(&cluster_cpus, i); max_cap_cpu = cpumask_first(&cluster_cpus); /* Assuming all cpus are the same in group */ for_each_cpu(cpu, &cluster_cpus) { if (!cpu_online(cpu)) continue; if (capacity_of(max_cap_cpu) < target_max_cap && task_fits_max(p, max_cap_cpu)) { best_cpu = cpu; target_max_cap = capacity_of(max_cap_cpu); } break; } } if (task_util(p) < TINY_TASK_THRESHOLD) is_tiny = true; /* Find cpu with sufficient capacity */ min_util = boosted_task_util(p); if (!is_tiny) /* (6.1.2) 根据best_cpu所在的cluster和进程p的affinity, 找出加上util(p)以后,剩余capacity最大的cpu:target_cpu */ target_cpu = select_max_spare_capacity_cpu(p, best_cpu); else /* (6.1.3) 根据cluster和进程p的affinity, 找出加上util(p)以后,当前freq的capacity能满足的第一个cpu:target_cpu */ for_each_cpu_and(i, tsk_cpus_allowed(p), &cluster_cpus) { if (!cpu_online(i)) continue; /* * p's blocked utilization is still accounted for on prev_cpu * so prev_cpu will receive a negative bias due to the double * accounting. However, the blocked utilization may be zero. */ new_util = cpu_util(i) + task_util(p); /* * Ensure minimum capacity to grant the required boost. * The target CPU can be already at a capacity level higher * than the one required to boost the task. 
*/ new_util = max(min_util, new_util);#ifdef CONFIG_MTK_SCHED_INTEROP if (cpu_rq(i)->rt.rt_nr_running && likely(!is_rt_throttle(i))) continue;#endif if (new_util > capacity_orig_of(i)) continue; if (new_util < capacity_curr_of(i)) { target_cpu = i; if (cpu_rq(i)->nr_running) break; } /* cpu has capacity at higher OPP, keep it as fallback */ if (target_cpu == task_cpu(p)) target_cpu = i; } /* (6.1.4) 如果pre_cpu和target_cpu是同一个cluster,直接成功返回 */ /* no need energy calculation if the same domain */ if (is_the_same_domain(task_cpu(p), target_cpu)) return target_cpu; /* no energy comparison if the same cluster */ if (target_cpu != task_cpu(p)) { /* (6.1.5) 构造需要迁移的环境变量 */ struct energy_env eenv = { .util_delta = task_util(p), .src_cpu = task_cpu(p), .dst_cpu = target_cpu, .task = p, }; /* Not enough spare capacity on previous cpu */ if (cpu_overutilized(task_cpu(p))) { trace_energy_aware_wake_cpu(p, task_cpu(p), target_cpu, (int)task_util(p), nrg_diff, true, is_tiny); return target_cpu; } /* (6.1.6) 计算进程p从pre_cpu迁移到target_cpu后的功耗差值nrg_diff, 如果功耗增加,nrg_diff >= 0,返回pre_cpu即task_cpu(p), 如果功耗减少,返回新的target_cpu */ nrg_diff = energy_diff(&eenv); if (nrg_diff >= 0) { trace_energy_aware_wake_cpu(p, task_cpu(p), target_cpu, (int)task_util(p), nrg_diff, false, is_tiny); return task_cpu(p); } } trace_energy_aware_wake_cpu(p, task_cpu(p), target_cpu, (int)task_util(p), nrg_diff, false, is_tiny); return target_cpu;}||→static inline intenergy_diff(struct energy_env *eenv){ unsigned int boost; int nrg_delta; /* Conpute "absolute" energy diff */ __energy_diff(eenv); /* Return energy diff when boost margin is 0 */#ifdef CONFIG_CGROUP_SCHEDTUNE boost = schedtune_task_boost(eenv->task);#else boost = get_sysctl_sched_cfs_boost();#endif if (boost == 0) return eenv->nrg.diff; /* Compute normalized energy diff */ nrg_delta = normalize_energy(eenv->nrg.diff); eenv->nrg.delta = nrg_delta; eenv->payoff = schedtune_accept_deltas( eenv->nrg.delta, eenv->cap.delta, eenv->task); /* * When SchedTune is enabled, the energy_diff() function will return * the computed energy payoff value. Since the energy_diff() return * value is expected to be negative by its callers, this evaluation * function return a negative value each time the evaluation return a * positive payoff, which is the condition for the acceptance of * a scheduling decision */ return -eenv->payoff;}static int __energy_diff(struct energy_env *eenv){ struct sched_domain *sd; struct sched_group *sg; int sd_cpu = -1, energy_before = 0, energy_after = 0; /* (6.1.6.1) 构造迁移前的环境变量 */ struct energy_env eenv_before = { .util_delta = 0, .src_cpu = eenv->src_cpu, .dst_cpu = eenv->dst_cpu, .nrg = { 0, 0, 0, 0}, .cap = { 0, 0, 0 }, };#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT int i;#endif if (eenv->src_cpu == eenv->dst_cpu) return 0;#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT /* To get max opp index of every cluster for power estimation of share buck */ for (i = 0; i < arch_get_nr_clusters(); i++) { /* for energy before */ eenv_before.opp_idx[i] = mtk_cluster_capacity_idx(i, &eenv_before); /* for energy after */ eenv->opp_idx[i] = mtk_cluster_capacity_idx(i, eenv); mt_sched_printf(sched_eas_energy_calc, "cid=%d, before max_opp:%d, after max_opp:%d\n", i, eenv_before.opp_idx[i], eenv->opp_idx[i]); }#endif /* (6.1.6.2) sd来至于cache sd_ea,是cpu对应的顶层sd(tl DIE层) */ sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu; sd = rcu_dereference(per_cpu(sd_ea, sd_cpu)); if (!sd) return 0; /* Error */ mt_sched_printf(sched_eas_energy_calc, "0. 
%s: move task from src=%d to dst=%d util=%d", __func__, eenv->src_cpu, eenv->dst_cpu, eenv->util_delta); sg = sd->groups; /* (6.1.6.3) 遍历sg所在sg链表,找到符合条件的sg, 累加计算eenv_before、eenv相关sg的功耗 */ do { /* (6.1.6.4) 如果当前sg包含src_cpu或者dst_cpu,计算 */ if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) { /* (6.1.6.5) 当前顶层sg为eenv的sg_top */ eenv_before.sg_top = eenv->sg_top = sg; mt_sched_printf(sched_eas_energy_calc, "1. %s: src=%d dst=%d mask=0x%lx (before)", __func__, eenv_before.src_cpu, eenv_before.dst_cpu, sg->cpumask[0]); /* (6.1.6.6) 计算eenv_before负载下sg的power */ if (sched_group_energy(&eenv_before)) return 0; /* Invalid result abort */ energy_before += eenv_before.energy; /* Keep track of SRC cpu (before) capacity */ eenv->cap.before = eenv_before.cap.before; eenv->cap.delta = eenv_before.cap.delta; mt_sched_printf(sched_eas_energy_calc, "2. %s: src=%d dst=%d mask=0x%lx (after)", __func__, eenv->src_cpu, eenv->dst_cpu, sg->cpumask[0]); /* (6.1.6.7) 计算eenv负载下sg的power */ if (sched_group_energy(eenv)) return 0; /* Invalid result abort */ energy_after += eenv->energy; } } while (sg = sg->next, sg != sd->groups); /* (6.1.6.8) 计算energy_after - energy_before */ eenv->nrg.before = energy_before; eenv->nrg.after = energy_after; eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; eenv->payoff = 0; trace_sched_energy_diff(eenv->task, eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, eenv->cap.before, eenv->cap.after, eenv->cap.delta, eenv->nrg.delta, eenv->payoff); mt_sched_printf(sched_eas_energy_calc, "5. %s: nrg.diff=%d cap.delta=%d", __func__, eenv->nrg.diff, eenv->cap.delta); return eenv->nrg.diff;}|||→static int sched_group_energy(struct energy_env *eenv){ struct sched_domain *sd; int cpu, total_energy = 0; struct cpumask visit_cpus; struct sched_group *sg;#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT int only_lv1_sd = 0;#endif WARN_ON(!eenv->sg_top->sge); cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top)); /* (6.1.6.6.1) 根据sg_top顶层sd,找到需要计算的cpu集合visit_cpus,逐个遍历其中每一个cpu ooooo这一套复杂的循环算法计算下来,其实就计算了几个power,以cpu0-cpu3为例: 4个底层sg的power + 1个顶层sg的power */ while (!cpumask_empty(&visit_cpus)) { struct sched_group *sg_shared_cap = NULL; /* (6.1.6.6.2) 选取visit_cpus中的第一个cpu */ cpu = cpumask_first(&visit_cpus); sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); if (!sd) { /* a corner racing with hotplug? sd doesn't exist in this cpu. */ return -EINVAL; } /* * Is the group utilization affected by cpus outside this * sched_group? */ sd = rcu_dereference(per_cpu(sd_scs, cpu));#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT /* Try to handle one CPU in this cluster by hotplug. * In it there is only lv-1 sched_domain exist which having * no share_cap_states. */ if (!sd) { sd = rcu_dereference(per_cpu(sd_ea, cpu)); only_lv1_sd = 1; }#endif if (!sd) { /* * We most probably raced with hotplug; returning a * wrong energy estimation is better than entering an * infinite loop. */ return -EINVAL; } if (sd->parent) sg_shared_cap = sd->parent->groups; /* (6.1.6.6.3) 从底层到顶层逐个遍历cpu所在的sd */ for_each_domain(cpu, sd) { sg = sd->groups; /* (6.1.6.6.4) 如果是顶层sd,只会计算一个sg */ /* Has this sched_domain already been visited? 
*/ if (sd->child && group_first_cpu(sg) != cpu) break; /* (6.1.6.6.5) 逐个遍历该层次sg链表所在sg */ do { unsigned long group_util; int sg_busy_energy, sg_idle_energy; int cap_idx, idle_idx; if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight) eenv->sg_cap = sg_shared_cap; else eenv->sg_cap = sg; /* (6.1.6.6.6) 根据eenv指示的负载变化,找出满足该sg中最大负载cpu的capacity_index */ cap_idx = find_new_capacity(eenv, sg->sge); if (sg->group_weight == 1) { /* Remove capacity of src CPU (before task move) */ if (eenv->util_delta == 0 && cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) { eenv->cap.before = sg->sge->cap_states[cap_idx].cap; eenv->cap.delta -= eenv->cap.before; } /* Add capacity of dst CPU (after task move) */ if (eenv->util_delta != 0 && cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) { eenv->cap.after = sg->sge->cap_states[cap_idx].cap; eenv->cap.delta += eenv->cap.after; } } /* (6.1.6.6.7) 找出sg所有cpu中最小的idle index */ idle_idx = group_idle_state(sg); /* (6.1.6.6.8) 累加sg中所有cpu的相对负载, 最大负载为sg->sge->cap_states[eenv->cap_idx].cap */ group_util = group_norm_util(eenv, sg); /* (6.1.6.6.9) 计算power = busy_power + idle_power */#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT /* * To support power estimation for MTK soc. * Consider share buck for dynamic power and SPARK/MCDI for static power. */ sg_busy_energy = (group_util * sg->sge->busy_power(group_first_cpu(sg), eenv, (sd->child) ? 1 : 0)) >> SCHED_CAPACITY_SHIFT; sg_idle_energy = ((SCHED_LOAD_SCALE - group_util) * sg->sge->idle_power(idle_idx, group_first_cpu(sg), eenv, (sd->child) ? 1 : 0)) >> SCHED_CAPACITY_SHIFT;#else /* Power value had been separated to static + dynamic here */ sg_busy_energy = (group_util * (sg->sge->cap_states[cap_idx].dyn_pwr + sg->sge->cap_states[cap_idx].lkg_pwr[sg->sge->lkg_idx])) >> SCHED_CAPACITY_SHIFT; sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) * sg->sge->idle_states[idle_idx].power) >> SCHED_CAPACITY_SHIFT;#endif total_energy += sg_busy_energy + sg_idle_energy; mt_sched_printf(sched_eas_energy_calc, "busy_energy=%d idle_eneryg=%d (cost=%d)", sg_busy_energy, sg_idle_energy, total_energy); /* (6.1.6.6.10) 如果遍历了底层sd,从visit_cpus中去掉对应的sg cpu */ if (!sd->child) cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT /* * We try to get correct energy estimation while racing with hotplug * and avoid entering a infinite loop. 
*/ if (only_lv1_sd) { eenv->energy = total_energy; return 0; }#endif if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top))) goto next_cpu; } while (sg = sg->next, sg != sd->groups); } /* (6.1.6.6.11) 如果遍历了cpu的底层到顶层sd,从visit_cpus中去掉对应的cpu */next_cpu: cpumask_clear_cpu(cpu, &visit_cpus); continue; } eenv->energy = total_energy; return 0;}|→static struct sched_group *find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag){ struct sched_group *idlest = NULL, *group = sd->groups; struct sched_group *fit_group = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; unsigned long fit_capacity = ULONG_MAX; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; /* (7.2.1) 选择load_idx */ if (sd_flag & SD_BALANCE_WAKE) load_idx = sd->wake_idx; /* (7.2.2) 当前cpu所在sd层次的sg,遍历sg所在的sg链表,选出负载最轻的idlest sg */ do { unsigned long load, avg_load; int local_group; int i; /* (7.2.3) 略过不符合p进程affinity的sg */ /* Skip over this group if it has no CPUs allowed */ if (!cpumask_intersects(sched_group_cpus(group), tsk_cpus_allowed(p))) continue; /* (7.2.4) local_group等于本cpu所在的sg */ local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); /* Tally up the load of all CPUs in the group */ avg_load = 0; /* (7.2.5) 遍历sg中的所有cpu,累加负载 */ for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); else load = target_load(i, load_idx);#ifdef CONFIG_MTK_SCHED_INTEROP load += mt_rt_load(i);#endif avg_load += load; /* (7.2.6) 如果EAS使能,找到能最小满足进程p的capacity sg */ /* * Look for most energy-efficient group that can fit * that can fit the task. */ if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) { fit_capacity = capacity_of(i); fit_group = group; } } /* (7.2.7) 用累计的负载计算相对负载 */ /* Adjust by relative CPU capacity of the group */ avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity; /* (7.2.8) 计算idlest sg */ if (local_group) { this_load = avg_load; } else if (avg_load < min_load) { min_load = avg_load; idlest = group; } } while (group = group->next, group != sd->groups); /* (7.2.9) EAS使能,返回fit_group */ if (energy_aware() && fit_group) return fit_group; if (!idlest || 100*this_load < imbalance*min_load) return NULL; /* (7.2.11) 否则,返回idlest */ return idlest;}|→static intfind_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu){ unsigned long load, min_load = ULONG_MAX; unsigned int min_exit_latency = UINT_MAX; u64 latest_idle_timestamp = 0; int least_loaded_cpu = this_cpu; int shallowest_idle_cpu = -1; int i; /* (7.3.1) 遍历sg中符合p进程affinity的cpu */ /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { /* (7.3.2) 如果cpu的剩余capacity能容纳下p进程的load */ if (task_fits_spare(p, i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); /* (7.3.2.1) 优先选出idle状态,且退出idle开销最小的cpu */ if (idle && idle->exit_latency < min_exit_latency) { /* * We give priority to a CPU whose idle state * has the smallest exit latency irrespective * of any idle timestamp. */ min_exit_latency = idle->exit_latency; latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; } else if (idle_cpu(i) && (!idle || idle->exit_latency == min_exit_latency) && rq->idle_stamp > latest_idle_timestamp) { /* * If equal or no active idle state, then * the most recently idled CPU might have * a warmer cache. 
*/ latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; } else if (shallowest_idle_cpu == -1) { /* * If we haven't found an idle CPU yet * pick a non-idle one that can fit the task as * fallback. */ shallowest_idle_cpu = i; } /* (7.3.3) cpu的剩余capacity容纳不下进程p,选出负载最轻的cpu */ } else if (shallowest_idle_cpu == -1) { load = weighted_cpuload(i);#ifdef CONFIG_MTK_SCHED_INTEROP load += mt_rt_load(i);#endif if (load < min_load || (load == min_load && i == this_cpu)) { min_load = load; least_loaded_cpu = i; } } } return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;}|→static int hmp_select_task_rq_fair(int sd_flag, struct task_struct *p, int prev_cpu, int new_cpu){ struct list_head *pos; struct sched_entity *se = &p->se; struct cpumask fast_cpu_mask, slow_cpu_mask;#ifdef CONFIG_HMP_TRACER int cpu = 0; for_each_online_cpu(cpu) trace_sched_cfs_runnable_load(cpu, cfs_load(cpu), cfs_length(cpu));#endif /* error handling */ if (prev_cpu >= num_possible_cpus()) return new_cpu; /* * Skip all the checks if only one CPU is online. * Otherwise, select the most appropriate CPU from cluster. */ if (num_online_cpus() == 1) goto out; /* (8.1) 找出fastest hmp_domain,只有一个, 找出slow hmp_domain,有多个, 在一个fast_cpu_mask和多个slow_cpu_mask之间,逐个尝试hmp_select_task_migration() p进程是否会满足hmp迁移 */ cpumask_clear(&fast_cpu_mask); cpumask_clear(&slow_cpu_mask); /* order: fast to slow hmp domain */ list_for_each(pos, &hmp_domains) { struct hmp_domain *domain = list_entry(pos, struct hmp_domain, hmp_domains); if (!cpumask_empty(&domain->cpus)) { if (cpumask_empty(&fast_cpu_mask)) { cpumask_copy(&fast_cpu_mask, &domain->possible_cpus); } else { cpumask_copy(&slow_cpu_mask, &domain->possible_cpus); new_cpu = hmp_select_task_migration(sd_flag, p, prev_cpu, new_cpu, &fast_cpu_mask, &slow_cpu_mask); } } }out: /* it happens when num_online_cpus=1 */ if (new_cpu >= nr_cpu_ids) { /* BUG_ON(1); */ new_cpu = prev_cpu; } cfs_nr_pending(new_cpu)++; cfs_pending_load(new_cpu) += se_load(se); return new_cpu;}||→static int hmp_select_task_migration(int sd_flag, struct task_struct *p, int prev_cpu, int new_cpu, struct cpumask *fast_cpu_mask, struct cpumask *slow_cpu_mask){ int step = 0; struct sched_entity *se = &p->se; int B_target = num_possible_cpus(); int L_target = num_possible_cpus(); struct clb_env clbenv; /* (8.1.1) 找出fast_cpu_mask中负载最轻的cpu B_target,且符合p进程的affinity */ B_target = hmp_select_cpu(HMP_SELECT_RQ, p, fast_cpu_mask, prev_cpu, 0); /* (8.1.2) 找出slow_cpu_mask中负载最轻的cpu L_target,且符合p进程的affinity */ L_target = hmp_select_cpu(HMP_SELECT_RQ, p, slow_cpu_mask, prev_cpu, 1); /* * Only one cluster exists or only one cluster is allowed for this task * Case 1: return the runqueue whose load is minimum * Case 2: return original CFS runqueue selection result */ if (B_target >= num_possible_cpus() && L_target >= num_possible_cpus()) goto out; if (B_target >= num_possible_cpus()) goto select_slow; if (L_target >= num_possible_cpus()) goto select_fast; /* * Two clusters exist and both clusters are allowed for this task * Step 1: Move newly created task to the cpu where no tasks are running * Step 2: Migrate heavy-load task to big * Step 3: Migrate light-load task to LITTLE * Step 4: Make sure the task stays in its previous hmp domain */ step = 1; if (task_created(sd_flag) && !task_low_priority(p->prio)) { if (!rq_length(B_target)) goto select_fast; if (!rq_length(L_target)) goto select_slow; } /* (8.1.3) 计算如果L_target和B_target发生hmp迁移,各种负载和thershold的计算 */ memset(&clbenv, 0, sizeof(clbenv)); clbenv.flags |= HMP_SELECT_RQ; 
cpumask_copy(&clbenv.lcpus, slow_cpu_mask); cpumask_copy(&clbenv.bcpus, fast_cpu_mask); clbenv.ltarget = L_target; clbenv.btarget = B_target; sched_update_clbstats(&clbenv); /* (8.1.4) 判断进程p从L_target up到 B_target的可行性 */ step = 2; if (hmp_up_migration(L_target, &B_target, se, &clbenv)) goto select_fast; /* (8.1.5) 判断进程p从B_target down到 L_target的可行性 */ step = 3; if (hmp_down_migration(B_target, &L_target, se, &clbenv)) goto select_slow; /* (8.1.6) 如果prev_cpu是slowest */ step = 4; if (hmp_cpu_is_slowest(prev_cpu)) goto select_slow; goto select_fast; /* (8.1.7) 返回 B_target */select_fast: new_cpu = B_target; cpumask_clear(slow_cpu_mask); goto out; /* (8.1.8) 返回 L_target */select_slow: new_cpu = L_target; cpumask_copy(fast_cpu_mask, slow_cpu_mask); cpumask_clear(slow_cpu_mask); goto out;out:#ifdef CONFIG_HMP_TRACER trace_sched_hmp_load(clbenv.bstats.load_avg, clbenv.lstats.load_avg);#endif return new_cpu;}
4.2、HMP load balancing
Besides the SMP load_balance() balancing, we also want a rule to hold across the SMP clusters: heavy tasks run on big cores and light tasks run on little cores, so the system converges quickly to a reasonable load distribution. This scheme is called HMP load balancing. EAS considers load, performance and power in a unified way, so once EAS is enabled HMP is disabled.
HMP load balancing performs two kinds of operation:
- 1、up: heavy tasks are migrated from little cpus to big cpus, handled by hmp_force_up_migration();
- 2、down: light tasks are migrated from big cpus to little cpus, handled by hmp_force_down_migration();
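Both operations boil down to comparing a task's weight-free load (loadwop_avg, 0..1023) with two dynamic thresholds that are recomputed before every decision (see 4.2.2 below). A minimal userspace sketch of that comparison, using made-up threshold values rather than real adj_threshold() output:

```c
#include <stdio.h>

/*
 * Toy decision helper mirroring the idea of hmp_up_migration() /
 * hmp_down_migration(): compare a task's weight-free load (loadwop_avg,
 * 0..1023) against the dynamic up/down thresholds.  All values here are
 * illustrative, not taken from a real adj_threshold() run.
 */
static const char *hmp_decide(unsigned int task_load,
			      unsigned int up_threshold,
			      unsigned int down_threshold,
			      int on_little)
{
	if (on_little && task_load > up_threshold)
		return "up-migrate to big";
	if (!on_little && task_load <= down_threshold)
		return "down-migrate to LITTLE";
	return "stay where it is";
}

int main(void)
{
	unsigned int up = 700, down = 300;	/* assumed threshold values */

	printf("load=800 on LITTLE -> %s\n", hmp_decide(800, up, down, 1));
	printf("load=200 on big    -> %s\n", hmp_decide(200, up, down, 0));
	printf("load=500 on big    -> %s\n", hmp_decide(500, up, down, 0));
	return 0;
}
```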
4.2.1、hmp domain initialization
During initialization HMP allocates one hmp_domain per cluster and adds them all to the global list hmp_domains. Once the list is built, the hmp_domain closest to the list head hmp_domains belongs to the fastest cluster, and the further an hmp_domain is from the head the slower its cluster. The reason is that the domains are added in cluster-id order and each is inserted at the head of the list, so the fastest cluster's hmp_domain is added last and therefore ends up nearest the head.
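A small standalone sketch of that ordering property (hypothetical toy_domain type, not the kernel's list_head API): because each cluster is inserted at the head of the list, the last one added, i.e. the fastest cluster, ends up nearest the head.

```c
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for struct hmp_domain: one node per cluster. */
struct toy_domain {
	int cluster_id;
	struct toy_domain *next;
};

/* Head insertion, the same effect list_add() has on hmp_domains. */
static struct toy_domain *add_head(struct toy_domain *head, int id)
{
	struct toy_domain *d = malloc(sizeof(*d));

	d->cluster_id = id;
	d->next = head;
	return d;
}

int main(void)
{
	struct toy_domain *head = NULL;
	int id;

	/* Clusters are added in cluster-id order; assume, as on this SoC,
	 * that the larger id is the faster (big) cluster. */
	for (id = 0; id < 2; id++)
		head = add_head(head, id);

	/* Walking from the head therefore visits fastest to slowest: 1, 0. */
	for (struct toy_domain *d = head; d; d = d->next)
		printf("cluster %d\n", d->cluster_id);
	return 0;
}
```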
static int __init hmp_cpu_mask_setup(void){ struct hmp_domain *domain; struct list_head *pos; int dc, cpu; pr_warn("Initializing HMP scheduler:\n"); /* Initialize hmp_domains using platform code */ /* (1) 调用arch相关的hmp_domains初始化函数 */ arch_get_hmp_domains(&hmp_domains); if (list_empty(&hmp_domains)) { pr_warn("HMP domain list is empty!\n"); return 0; } /* Print hmp_domains */ dc = 0; list_for_each(pos, &hmp_domains) { domain = list_entry(pos, struct hmp_domain, hmp_domains); for_each_cpu(cpu, &domain->possible_cpus) { /* (2) 给per_cpu变量hmp_cpu_domain赋值 */ per_cpu(hmp_cpu_domain, cpu) = domain; } dc++; } return 1;}|→void __init arch_get_hmp_domains(struct list_head *hmp_domains_list){ struct hmp_domain *domain; struct cpumask cpu_mask; int id, maxid; cpumask_clear(&cpu_mask); maxid = arch_get_nr_clusters(); /* * Initialize hmp_domains * Must be ordered with respect to compute capacity. * Fastest domain at head of list. */ /* (1.1) 按照cluster id初始化对应的hmp_domain */ for (id = 0; id < maxid; id++) { arch_get_cluster_cpus(&cpu_mask, id); domain = (struct hmp_domain *) kmalloc(sizeof(struct hmp_domain), GFP_KERNEL); cpumask_copy(&domain->possible_cpus, &cpu_mask); cpumask_and(&domain->cpus, cpu_online_mask, &domain->possible_cpus); /* (1.2) 将hmp_domain加入到全局链表hmp_domains_list即hmp_domains中 */ list_add(&domain->hmp_domains, hmp_domains_list); }}
4.2.2、hmp_force_up_migration()
hmp_force_up_migration() goes through the following main steps:
One point worth highlighting: load tracking maintains three kinds of load (load_avg, loadwop_avg, util_avg); the HMP balancing triggered from rebalance_domains mainly uses loadwop_avg.
- 1、Based on the current cpu, pick fast_cpu_mask and slow_cpu_mask;
hmp_force_up_migration() tries to migrate heavy tasks from a slow cpu to a fast cpu; which masks count as slow and fast depends on where the current cpu sits in the hmp_domains list (see the annotated code below);
- 2、Pick the heaviest task on the current cpu as the migration candidate p; the code does not walk every task on the cpu, it only inspects the curr task and up to 5 tasks from the cfs_rq (hmp_get_heaviest_task());
- 3、From fast_cpu_mask, pick the least-loaded cpu as the target cpu;
- 4、Compute the load statistics for the source cpu (curr_cpu) and the destination cpu (target_cpu);
The key quantities and how they are computed:
| Field | Structure | Meaning | Updated in | Computation |
| --- | --- | --- | --- | --- |
| clbenv->bstats.cpu_power | clbenv->bstats | absolute compute capacity of the B (big) cluster | sched_update_clbstats() | arch_scale_cpu_capacity(NULL, clbenv->btarget) |
| clbenv->lstats.cpu_power | clbenv->lstats | absolute compute capacity of the L (LITTLE) cluster | sched_update_clbstats() | arch_scale_cpu_capacity(NULL, clbenv->ltarget) |
| clbenv->lstats.cpu_capacity | clbenv->lstats | relative compute capacity of the L cluster, equal to 1024 | sched_update_clbstats() | SCHED_CAPACITY_SCALE |
| clbenv->bstats.cpu_capacity | clbenv->bstats | relative compute capacity of the B cluster, greater than 1024 | sched_update_clbstats() | SCHED_CAPACITY_SCALE * clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1) |
| clbs->ncpu | clbenv->bstats / clbenv->lstats | number of online cpus in the L/B cluster | collect_cluster_stats() | if (cpu_online(cpu)) clbs->ncpu++; |
| clbs->ntask | clbenv->bstats / clbenv->lstats | total number of runnable CFS tasks (all group levels) on the online cpus of the L/B cluster | collect_cluster_stats() | clbs->ntask += cpu_rq(cpu)->cfs.h_nr_running; |
| clbs->load_avg | clbenv->bstats / clbenv->lstats | average runnable load of the online cpus in the L/B cluster, without weight | collect_cluster_stats() | sum(cpu_rq(cpu)->cfs.avg.loadwop_avg) / clbs->ncpu |
| clbs->scaled_acap | clbenv->bstats / clbenv->lstats | remaining compute capacity of the L/B target cpu | collect_cluster_stats() | hmp_scale_down(clbs->cpu_capacity - cpu_rq(target)->cfs.avg.loadwop_avg) |
| clbs->scaled_atask | clbenv->bstats / clbenv->lstats | remaining task space of the L/B target cpu | collect_cluster_stats() | hmp_scale_down(clbs->cpu_capacity - cpu_rq(target)->cfs.h_nr_running * cpu_rq(target)->cfs.avg.loadwop_avg) |
| clbenv->bstats.threshold | clbenv->bstats | load threshold a task must exceed to up-migrate to the B cluster | adj_threshold() | HMP_MAX_LOAD - HMP_MAX_LOAD * b_nacap * b_natask / ((b_nacap + l_nacap) * (b_natask + l_natask) + 1); b_nacap and b_natask are first scaled by b_cpu_power/l_cpu_power, similar to the cpu_capacity computation |
| clbenv->lstats.threshold | clbenv->lstats | load threshold a task must be below to down-migrate to the L cluster | adj_threshold() | HMP_MAX_LOAD * l_nacap * l_natask / ((b_nacap + l_nacap) * (b_natask + l_natask) + 1); b_nacap and b_natask are first scaled by b_cpu_power/l_cpu_power, similar to the cpu_capacity computation |

- 5、Based on the computed statistics, decide whether task p satisfies the up-migration conditions (se_load(se) > B->threshold, among others);
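To make the two threshold formulas concrete, the following standalone sketch (plain userspace C with illustrative input values) reproduces the adj_threshold() arithmetic: the B cluster's spare capacity and task space are first rescaled into the LITTLE scale by b_cpu_power/l_cpu_power, and the up/down thresholds are then derived from the relative emptiness of the two clusters.

```c
#include <stdio.h>

#define HMP_MAX_LOAD 1023
#define POSITIVE(x) ((int)(x) < 0 ? 0 : (x))

int main(void)
{
	/* Illustrative inputs: cpu_power of each cluster and the scaled
	 * spare capacity / task space from collect_cluster_stats(). */
	int b_cpu_power = 1740, l_cpu_power = 1024;
	int b_scaled_acap = 40, b_scaled_atask = 30;
	int l_scaled_acap = 10, l_scaled_atask = 8;

	/* Rescale the B-cluster values into the LITTLE scale, as
	 * adj_threshold() does. */
	int b_nacap = POSITIVE(b_scaled_acap * b_cpu_power / (l_cpu_power + 1));
	int b_natask = POSITIVE(b_scaled_atask * b_cpu_power / (l_cpu_power + 1));
	int l_nacap = POSITIVE(l_scaled_acap);
	int l_natask = POSITIVE(l_scaled_atask);

	int up = HMP_MAX_LOAD - HMP_MAX_LOAD * b_nacap * b_natask /
		 ((b_nacap + l_nacap) * (b_natask + l_natask) + 1);
	int down = HMP_MAX_LOAD * l_nacap * l_natask /
		   ((b_nacap + l_nacap) * (b_natask + l_natask) + 1);

	/* The emptier the big cluster, the lower the up threshold (easier
	 * to up-migrate); the emptier the LITTLE cluster, the higher the
	 * down threshold (easier to down-migrate). */
	printf("up-threshold=%d down-threshold=%d\n", up, down);
	return 0;
}
```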
List of up-migration conditions (hmp_up_migration()):
条件 含义 计算方法 计算解析 [1] Migration stabilizing 如果target cpu刚做过up迁移,不适合再进行迁移 if (!hmp_up_stable(*target_cpu)) check->result = 0; (((now - hmp_last_up_migration(cpu)) >> 10)static void run_rebalance_domains(struct softirq_action *h){ struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; int this_cpu = smp_processor_id(); /* bypass load balance of HMP if EAS consideration */ /* (1) 在EAS不使能的情况下,尝试进行HMP负载均衡 */ if ((!energy_aware() && sched_feat(SCHED_HMP)) || (hybrid_support() && cpu_rq(this_cpu)->rd->overutilized)) hmp_force_up_migration(this_cpu); /* * If this cpu has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle cpus whose ticks are * stopped. Do nohz_idle_balance *before* rebalance_domains to * give the idle cpus a chance to load balance. Else we may * load balance only within the local sched_domain hierarchy * and abort nohz_idle_balance altogether if we pull some load. */ nohz_idle_balance(this_rq, idle); rebalance_domains(this_rq, idle);}|→static void hmp_force_up_migration(int this_cpu){ int curr_cpu, target_cpu;a struct sched_entity *se; struct rq *target; unsigned long flags; unsigned int force = 0; struct task_struct *p; struct clb_env clbenv;#ifdef CONFIG_SCHED_HMP_PLUS struct sched_entity *orig;#endif if (!spin_trylock(&hmp_force_migration)) return;#ifdef CONFIG_HMP_TRACER for_each_online_cpu(curr_cpu) trace_sched_cfs_runnable_load(curr_cpu, cfs_load(curr_cpu), cfs_length(curr_cpu));#endif /* Migrate heavy task from LITTLE to big */ /* (1.1) 逐个online cpu尝试进行heavy task从little cpu到big cpu的迁移 */ for_each_online_cpu(curr_cpu) { struct hmp_domain *hmp_domain = NULL; struct cpumask fast_cpu_mask, slow_cpu_mask; cpumask_clear(&fast_cpu_mask); cpumask_clear(&slow_cpu_mask); /* (1.2) 如果当前cpu不属于速度最快(fastest)的domain, 则尝试进行up操作 */ if (!hmp_cpu_is_fastest(curr_cpu)) { /* current cpu is slow_cpu_mask*/ /* (1.2.1) 当前cpu所在的hmp_domain为slow_cpu_mask */ hmp_domain = hmp_cpu_domain(curr_cpu); cpumask_copy(&slow_cpu_mask, &hmp_domain->possible_cpus); /* (1.2.2) 最fastest且online的hmp_domain为fast_cpu_mask */ while (&hmp_domain->hmp_domains != hmp_domains.next) { struct list_head *pos = &hmp_domain->hmp_domains; hmp_domain = list_entry(pos->prev, struct hmp_domain, hmp_domains); if (!cpumask_empty(&hmp_domain->cpus)) { cpumask_copy(&fast_cpu_mask, &hmp_domain->possible_cpus); break; } } } else { /* (1.3) 如果当前cpu属于速度最快(fastest)的domain, 则直接进行down操作 */ hmp_force_down_migration(this_cpu); continue; } if (!hmp_domain || hmp_domain == hmp_cpu_domain(curr_cpu)) continue; if (cpumask_empty(&fast_cpu_mask) || cpumask_empty(&slow_cpu_mask)) continue; force = 0; /* (1.4) 取出当前cpu的当前cfs进程 */ target = cpu_rq(curr_cpu); raw_spin_lock_irqsave(&target->lock, flags); se = target->cfs.curr; if (!se) { raw_spin_unlock_irqrestore(&target->lock, flags); continue; } /* Find task entity */ if (!entity_is_task(se)) { struct cfs_rq *cfs_rq; cfs_rq = group_cfs_rq(se); while (cfs_rq) { se = cfs_rq->curr; cfs_rq = group_cfs_rq(se); } }#ifdef CONFIG_SCHED_HMP_PLUS orig = se; /* (1.5) 或者取出当前cpu前5个cfs进程中,负载最重(heaviest)的进程 */ se = hmp_get_heaviest_task(se, -1); if (!se) { raw_spin_unlock_irqrestore(&target->lock, flags); continue; } if (!entity_is_task(se)) p = task_of(orig); else#endif p = task_of(se); /* (1.6) 选择fast_cpu_mask domain中,负载最少的cpu */ target_cpu = hmp_select_cpu(HMP_GB, p, &fast_cpu_mask, -1, 0); if (target_cpu >= num_possible_cpus()) { raw_spin_unlock_irqrestore(&target->lock, flags); continue; } /* Collect cluster information */ /* (1.7) 
up操作的对象已经选择好: 源little cpu:curr_cpu 目的big cpu:target_cpu */ memset(&clbenv, 0, sizeof(clbenv)); clbenv.flags |= HMP_GB; clbenv.ltarget = curr_cpu; clbenv.btarget = target_cpu; cpumask_copy(&clbenv.lcpus, &slow_cpu_mask); cpumask_copy(&clbenv.bcpus, &fast_cpu_mask); /* (1.8) up操作前的数据计算 */ sched_update_clbstats(&clbenv); /* Check migration threshold */ /* (1.9) 根据计算的数据,判断up操作的可行性 */ if (!target->active_balance && hmp_up_migration(curr_cpu, &target_cpu, se, &clbenv) && !cpu_park(cpu_of(target))) { if (p->state != TASK_DEAD) { /* 准备从target rq中迁移进程p到target_cpu, 设置rq正在处理负载balance标志active_balance */ get_task_struct(p); target->active_balance = 1; /* force up */ target->push_cpu = target_cpu; target->migrate_task = p; force = 1; trace_sched_hmp_migrate(p, target->push_cpu, 1); hmp_next_up_delay(&p->se, target->push_cpu); } } raw_spin_unlock_irqrestore(&target->lock, flags); /* (1.10) 判断结果是可以进行up操作, 则调用hmp_force_up_cpu_stop()进行实际的up操作 */ if (force) { if (stop_one_cpu_dispatch(cpu_of(target), hmp_force_up_cpu_stop, target, &target->active_balance_work)) { /* 迁移完成,清除标志 */ put_task_struct(p); /* out of rq->lock */ raw_spin_lock_irqsave(&target->lock, flags); target->active_balance = 0; force = 0; raw_spin_unlock_irqrestore(&target->lock, flags); } } else /* (1.11) 否则,再尝试进行down操作 */ hmp_force_down_migration(this_cpu); }#ifdef CONFIG_HMP_TRACER trace_sched_hmp_load(clbenv.bstats.load_avg, clbenv.lstats.load_avg);#endif spin_unlock(&hmp_force_migration);}||→static const int hmp_max_tasks = 5;static struct sched_entity *hmp_get_heaviest_task( struct sched_entity *se, int target_cpu){ int num_tasks = hmp_max_tasks; struct sched_entity *max_se = se; unsigned long int max_ratio = se->avg.loadwop_avg; const struct cpumask *hmp_target_mask = NULL; struct hmp_domain *hmp; /* (1.5.1) 如果本cpu是fastest cpu,则不用查找直接返回, 因为本函数的目的是找little cpu中的heaviest进程 */ if (hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq))) return max_se; /* (1.5.2) 获取比本cpu fater一级cpu的hmp_domain,作为进程亲和力判断的mask */ hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq)); hmp_target_mask = &hmp->cpus; /* (1.5.3) 传入参数target_cpu = -1, 所以hmp_target_mask使用的是源cpu hmp_domain的hmp->cpus */ if (target_cpu >= 0) { /* idle_balance gets run on a CPU while * it is in the middle of being hotplugged * out. Bail early in that case. */ if (!cpumask_test_cpu(target_cpu, hmp_target_mask)) return NULL; hmp_target_mask = cpumask_of(target_cpu); } /* The currently running task is not on the runqueue */ /* (1.5.4) 从当前cpu的cfs红黑树中,连续5个进程和curr进程比较,选出heaviest进程 比较使用的负载为se->avg.loadwop_avg,不带weight分量 */ se = __pick_first_entity(cfs_rq_of(se)); while (num_tasks && se) { if (entity_is_task(se) && se->avg.loadwop_avg > max_ratio && cpumask_intersects(hmp_target_mask, tsk_cpus_allowed(task_of(se)))) { max_se = se; max_ratio = se->avg.loadwop_avg; } se = __pick_next_entity(se); num_tasks--; } return max_se;}||→static unsigned int hmp_select_cpu(unsigned int caller, struct task_struct *p, struct cpumask *mask, int prev, int up){ int curr = 0; int target = num_possible_cpus(); unsigned long curr_wload = 0; unsigned long target_wload = 0; struct cpumask srcp; /* (1.6.1) 综合fast_cpu_mask、cpu_online_mask、tsk_cpus_allowed(p), 选取first cpu为target */ cpumask_and(&srcp, cpu_online_mask, mask); target = cpumask_any_and(&srcp, tsk_cpus_allowed(p)); if (target >= num_possible_cpus()) goto out; /* * RT class is taken into account because CPU load is multiplied * by the total number of CPU runnable tasks that includes RT tasks. 
*/ /* (1.6.2) 计算target cpu所对应的load, target_wload = (rq->cfs.avg.loadwop_avg + rq->cfs.avg.pending_load) * (rq->nr_running + rq->cfs.avg.nr_pending) 该负载会受RT进程的影响,因为rq->nr_running会统计包括RT进程的数量 */ target_wload = hmp_inc(cfs_load(target)); target_wload += cfs_pending_load(target); target_wload *= rq_length(target); for_each_cpu(curr, mask) { /* Check CPU status and task affinity */ if (!cpu_online(curr) || !cpumask_test_cpu(curr, tsk_cpus_allowed(p))) continue; /* For global load balancing, unstable CPU will be bypassed */ /* (1.6.3) 如果当前是up操作,如果cpu在短时间内进行了down操作,则不适合马上进行up操作 */ if (hmp_caller_is_gb(caller) && !hmp_cpu_stable(curr, up)) continue; curr_wload = hmp_inc(cfs_load(curr)); curr_wload += cfs_pending_load(curr); curr_wload *= rq_length(curr); /* (1.6.4) 选择load最小的作为target cpu */ if (curr_wload < target_wload) { target_wload = curr_wload; target = curr; /* (1.6.5) 在load同样小的情况下,选择prev cpu */ } else if (curr_wload == target_wload && curr == prev) { target = curr; } }out: return target;}||→static void sched_update_clbstats(struct clb_env *clbenv){ /* init cpu power and capacity */ /* (1.8.1) L族和B族的绝对运行能力和相对运算能力, .cpu_power = 绝对运算能力 .cpu_capacity = 相对运算能力 */ clbenv->bstats.cpu_power = (int) arch_scale_cpu_capacity(NULL, clbenv->btarget); clbenv->lstats.cpu_power = (int) arch_scale_cpu_capacity(NULL, clbenv->ltarget); clbenv->lstats.cpu_capacity = SCHED_CAPACITY_SCALE; clbenv->bstats.cpu_capacity = SCHED_CAPACITY_SCALE * clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1); /* (1.8.2) L族和B族的 */ collect_cluster_stats(&clbenv->bstats, &clbenv->bcpus, clbenv->btarget); collect_cluster_stats(&clbenv->lstats, &clbenv->lcpus, clbenv->ltarget); /* (1.8.3) L族和B族的 */ adj_threshold(clbenv);}|||→static void collect_cluster_stats(struct clb_stats *clbs, struct cpumask *cluster_cpus, int target){#define HMP_RESOLUTION_SCALING (4)#define hmp_scale_down(w) ((w) >> HMP_RESOLUTION_SCALING) /* Update cluster informatics */ int cpu; /* (1.8.2.1) 累加本族online cpu的值 */ for_each_cpu(cpu, cluster_cpus) { if (cpu_online(cpu)) { clbs->ncpu++; clbs->ntask += cpu_rq(cpu)->cfs.h_nr_running; clbs->load_avg += cpu_rq(cpu)->cfs.avg.loadwop_avg;#ifdef CONFIG_SCHED_HMP_PRIO_FILTER clbs->nr_normal_prio_task += cfs_nr_normal_prio(cpu); clbs->nr_dequeuing_low_prio += cfs_nr_dequeuing_low_prio(cpu);#endif } } if (!clbs->ncpu || target >= num_possible_cpus() || !cpumask_test_cpu(target, cluster_cpus)) return; /* * Calculate available CPU capacity * Calculate available task space * * Why load ratio should be multiplied by the number of task ? * The task is the entity of scheduling unit so that we should consider * it in scheduler. Only considering task load is not enough. * Thus, multiplying the number of tasks can adjust load ratio to a more * reasonable value. 
*/ /* (1.8.2.2) 计算本族剩余的cpu计算能力 capacity = 相对计算能力(clbs->cpu_capacity) - 本cpu的负载(rq->cfs.avg.loadwop_avg) :clbs->cpu_capacity是B族和L族相对的(L是1024,B大于1024),而负载(rq->cfs.avg.loadwop_avg)是相对自己的B族和L族的最大值都是1024 */ clbs->load_avg /= clbs->ncpu; clbs->acap = clbs->cpu_capacity - cpu_rq(target)->cfs.avg.loadwop_avg; clbs->scaled_acap = hmp_scale_down(clbs->acap); /* (1.8.2.3) 计算本族剩余的task空间 scaled_atask = 相对计算能力(clbs->cpu_capacity) - 本cpu的负载(rq->cfs.avg.loadwop_avg)*本cpu所有的进程数量(rq->cfs.h_nr_running) ooooo这里的计算也不是在同一纬度上的 */ clbs->scaled_atask = cpu_rq(target)->cfs.h_nr_running * cpu_rq(target)->cfs.avg.loadwop_avg; clbs->scaled_atask = clbs->cpu_capacity - clbs->scaled_atask; clbs->scaled_atask = hmp_scale_down(clbs->scaled_atask); mt_sched_printf(sched_log, "[%s] cpu/cluster:%d/%02lx load/len:%lu/%u stats:%d,%d,%d,%d,%d,%d,%d,%d\n", __func__, target, *cpumask_bits(cluster_cpus), cpu_rq(target)->cfs.avg.loadwop_avg, cpu_rq(target)->cfs.h_nr_running, clbs->ncpu, clbs->ntask, clbs->load_avg, clbs->cpu_capacity, clbs->acap, clbs->scaled_acap, clbs->scaled_atask, clbs->threshold);}|||→/* * Task Dynamic Migration Threshold Adjustment. * * If the workload between clusters is not balanced, adjust migration * threshold in an attempt to move task precisely. * * Diff. = Max Threshold - Min Threshold * * Dynamic UP-Threshold = * B_nacap B_natask * Max Threshold - Diff. x ----------------- x ------------------- * B_nacap + L_nacap B_natask + L_natask * * * Dynamic Down-Threshold = * L_nacap L_natask * Min Threshold + Diff. x ----------------- x ------------------- * B_nacap + L_nacap B_natask + L_natask */static void adj_threshold(struct clb_env *clbenv){#define POSITIVE(x) ((int)(x) < 0 ? 0 : (x)) unsigned long b_cap = 0, l_cap = 0; int b_nacap, l_nacap, b_natask, l_natask; b_cap = clbenv->bstats.cpu_power; l_cap = clbenv->lstats.cpu_power; /* (1.8.3.1) 把B族剩余cpu计算能力和task空间,转换成L族的相对值 */ b_nacap = POSITIVE(clbenv->bstats.scaled_acap * clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1)); b_natask = POSITIVE(clbenv->bstats.scaled_atask * clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1)); /* L族的值维持不变 */ l_nacap = POSITIVE(clbenv->lstats.scaled_acap); l_natask = POSITIVE(clbenv->lstats.scaled_atask); /* (1.8.3.2) 计算up的threshold, up-threshold = HMP_MAX_LOAD - HMP_MAX_LOAD*B族剩余 */ clbenv->bstats.threshold = HMP_MAX_LOAD - HMP_MAX_LOAD * b_nacap * b_natask / ((b_nacap + l_nacap) * (b_natask + l_natask) + 1); /* (1.8.3.3) 计算down的threshold, down-threshold = HMP_MAX_LOAD*L族剩余 */ clbenv->lstats.threshold = HMP_MAX_LOAD * l_nacap * l_natask / ((b_nacap + l_nacap) * (b_natask + l_natask) + 1); mt_sched_printf(sched_log, "[%s]\tup/dl:%4d/%4d L(%d:%4lu) b(%d:%4lu)\n", __func__, clbenv->bstats.threshold, clbenv->lstats.threshold, clbenv->ltarget, l_cap, clbenv->btarget, b_cap);}||→/* * Check whether this task should be migrated to big * Briefly summarize the flow as below; * 1) Migration stabilizing * 2) Filter low-priority task * 2.5) Keep all cpu busy * 3) Check CPU capacity * 4) Check dynamic migration threshold */static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se, struct clb_env *clbenv){ struct task_struct *p = task_of(se); struct clb_stats *L, *B; struct mcheck *check; int curr_cpu = cpu;#ifdef CONFIG_HMP_TRACER unsigned int caller = clbenv->flags;#endif L = &clbenv->lstats; B = &clbenv->bstats; check = &clbenv->mcheck; check->status = clbenv->flags; check->status |= HMP_TASK_UP_MIGRATION; check->result = 0; /* * No migration is needed if * 1) There is only one cluster * 2) Task is 
already in big cluster * 3) It violates task affinity */ if (!L->ncpu || !B->ncpu || cpumask_test_cpu(curr_cpu, &clbenv->bcpus) || !cpumask_intersects(&clbenv->bcpus, tsk_cpus_allowed(p))) goto out; /* (1.9.1) 如果目标cpu短时间内已经执行了up操作,则为up unstable状态,退出 */ /* * [1] Migration stabilizing * Let the task load settle before doing another up migration. * It can prevent a bunch of tasks from migrating to a unstable CPU. */ if (!hmp_up_stable(*target_cpu)) goto out; /* (1.9.2) 过滤掉优先级较低的进程,不进行迁移操作。具体有3个条件: (task_low_priority(p->prio) && \ // nice值大于5 (B->ntask >= B->ncpu || 0 != L->nr_normal_prio_task) && \ // B组进程大于cou数 || 正常优先级的进程不为0 (p->se.avg.loadwop_avg < 800)) // 平均负载小于800 */ /* [2] Filter low-priority task */#ifdef CONFIG_SCHED_HMP_PRIO_FILTER if (hmp_low_prio_task_up_rejected(p, B, L)) { check->status |= HMP_LOW_PRIORITY_FILTER; goto trace; }#endif /* (1.9.3) 如果B组的target cpu为idle,不用过多判断,直接准备迁移 */ /* [2.5]if big is idle, just go to big */ if (rq_length(*target_cpu) == 0) { check->status |= HMP_BIG_IDLE; check->status |= HMP_MIGRATION_APPROVED; check->result = 1; goto trace; } /* (1.9.4) 判断B族target cpu的capacity是否足够, (se_load(se) + cfs_load(cpu)) < (B->cpu_capacity - (B->cpu_capacity >> 2)) // target cpu负载 + 要迁移的se负载 是否小于 3/4 B族cpu的capacity */ /* * [3] Check CPU capacity * Forbid up-migration if big CPU can't handle this task */ if (!hmp_task_fast_cpu_afford(B, se, *target_cpu)) { check->status |= HMP_BIG_CAPACITY_INSUFFICIENT; goto trace; } /* (1.9.5) 判断se的负载是否已经大于up-threshold(B->threshold) */ /* * [4] Check dynamic migration threshold * Migrate task from LITTLE to big if load is greater than up-threshold */ if (se_load(se) > B->threshold) { check->status |= HMP_MIGRATION_APPROVED; check->result = 1; }trace:#ifdef CONFIG_HMP_TRACER if (check->result && hmp_caller_is_gb(caller)) hmp_stats.nr_force_up++; trace_sched_hmp_stats(&hmp_stats); trace_sched_dynamic_threshold(task_of(se), B->threshold, check->status, curr_cpu, *target_cpu, se_load(se), B, L); trace_sched_dynamic_threshold_draw(B->threshold, L->threshold);#endifout: return check->result;}||→static int hmp_force_up_cpu_stop(void *data){ /* (1.10.1) 执行进程迁移 */ return hmp_active_task_migration_cpu_stop(data);}|||→static int hmp_active_task_migration_cpu_stop(void *data){ struct rq *busiest_rq = data; struct task_struct *p = NULL; int busiest_cpu = cpu_of(busiest_rq); int target_cpu = busiest_rq->push_cpu; struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; raw_spin_lock_irq(&busiest_rq->lock); p = busiest_rq->migrate_task; /* make sure the requested cpu hasn't gone down in the meantime */ if (unlikely(busiest_cpu != smp_processor_id() || !busiest_rq->active_balance)) { goto out_unlock; } /* Is there any task to move? */ if (busiest_rq->nr_running <= 1) goto out_unlock; /* Are both target and busiest cpu online */ if (!cpu_online(busiest_cpu) || !cpu_online(target_cpu)) goto out_unlock; /* Task has migrated meanwhile, abort forced migration */ if ((!p) || (task_rq(p) != busiest_rq)) goto out_unlock; /* * This condition is "impossible", if it occurs * we need to fix it. Originally reported by * Bjorn Helgaas on a 128-cpu setup. */ WARN_ON(busiest_rq == target_rq); /* (1.10.1.1) 将源、目的rq lock住 */ /* move a task from busiest_rq to target_rq */ double_lock_balance(busiest_rq, target_rq); /* (1.10.1.2) 搜索target cpu所在的某一层次的sd,其sd->span[]即包含源cpu又包含目的cpu */ /* Search for an sd spanning us and the target CPU. 
*/ rcu_read_lock(); for_each_domain(target_cpu, sd) { if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) break; } /* (1.10.1.3) 构造数据,在同一sd下进行迁移 */ if (likely(sd)) { struct lb_env env = { .sd = sd, .dst_cpu = target_cpu, .dst_rq = target_rq, .src_cpu = busiest_rq->cpu, .src_rq = busiest_rq, .idle = CPU_IDLE, }; schedstat_inc(sd, alb_count); /* (1.10.1.4) 任务迁移 */ if (move_specific_task(&env, p)) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); } rcu_read_unlock(); double_unlock_balance(busiest_rq, target_rq);out_unlock: busiest_rq->active_balance = 0; raw_spin_unlock_irq(&busiest_rq->lock); put_task_struct(p); return 0;}||||→static int move_specific_task(struct lb_env *env, struct task_struct *pm){ struct task_struct *p, *n; /* (1.10.1.4.1) 从源rq->cfs_tasks逐个取出任务,直到查到pm */ list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { /* (1.10.1.4.2) task group的throttled判断 */ if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu)) continue; /* (1.10.1.4.3) 判断任务能否被迁移 */ if (!hmp_can_migrate_task(p, env)) continue; /* Check if we found the right task */ if (p != pm) continue; /* (1.10.1.4.4) 迁移 */ move_task(p, env); /* * Right now, this is only the third place move_task() * is called, so we can safely collect move_task() * stats here rather than inside move_task(). */ schedstat_inc(env->sd, lb_gained[env->idle]); return 1; } return 0;}|||||→static void move_task(struct task_struct *p, struct lb_env *env){ deactivate_task(env->src_rq, p, 0); set_task_cpu(p, env->dst_cpu); activate_task(env->dst_rq, p, 0); check_preempt_curr(env->dst_rq, p, 0);}
4.2.3、hmp_force_down_migration()
hmp_force_down_migration() goes through the following main steps:
- 1、Based on the current cpu, pick fast_cpu_mask and slow_cpu_mask;
hmp_force_down_migration() tries to migrate light tasks from a fast cpu to a slow cpu; which masks count as fast and slow depends on where the current cpu sits in the hmp_domains list (see the annotated code below);
- 2、Pick the lightest task on the current cpu as the migration candidate p; the code does not walk every task on the cpu, it only inspects the curr task and up to 5 tasks from the cfs_rq (hmp_get_lightest_task());
- 3、From slow_cpu_mask, pick the least-loaded cpu as the target cpu;
- 4、Compute the load statistics for the source cpu (curr_cpu) and the destination cpu (target_cpu); the quantities are the same as for hmp_force_up_migration(), see the previous section;
- 5、Based on the computed statistics, decide whether task p satisfies the down-migration conditions (L->threshold >= se_load(se), among others); besides the threshold, two capacity checks also apply, see the sketch below;
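Besides the threshold comparison in step 5, hmp_down_migration() applies two capacity checks (conditions [3] in the annotated code below): the big cpu must actually be oversubscribed, and the LITTLE target must have enough spare capacity for the task. A minimal sketch of that arithmetic with illustrative values:

```c
#include <stdio.h>

int main(void)
{
	/* Illustrative values: relative capacity of the big cluster and the
	 * current weight-free CFS load on the big cpu being drained. */
	unsigned int b_cpu_capacity = 1740;
	unsigned int big_cfs_load = 1500;

	/* LITTLE cluster spare capacity and the candidate task's load. */
	int l_acap = 300;
	unsigned int task_load = 250;

	/* [3a] Only force a down-migration if the big cpu is oversubscribed,
	 * i.e. its load is not below 3/4 of its relative capacity. */
	int big_oversubscribed =
		!(big_cfs_load < (b_cpu_capacity - (b_cpu_capacity >> 2)));

	/* [3b] The LITTLE target must have enough spare capacity left. */
	int little_affords = (l_acap > 0 && l_acap >= (int)task_load);

	printf("big oversubscribed: %d, LITTLE affords task: %d -> %s\n",
	       big_oversubscribed, little_affords,
	       (big_oversubscribed && little_affords) ?
			"capacity checks pass" : "keep task on big");
	return 0;
}
```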
List of down-migration conditions (hmp_down_migration()):
条件 含义 计算方法 计算解析 [1] Migration stabilizing 如果target cpu刚做过down迁移,不适合再进行迁移 if (!hmp_down_stable(*target_cpu)) check->result = 0; (((now - hmp_last_down_migration(cpu)) >> 10)static void hmp_force_down_migration(int this_cpu){ int target_cpu; struct sched_entity *se; struct rq *target; unsigned long flags; unsigned int force = 0; struct task_struct *p; struct clb_env clbenv;#ifdef CONFIG_SCHED_HMP_PLUS struct sched_entity *orig; int B_cpu;#endif struct hmp_domain *hmp_domain = NULL; struct cpumask fast_cpu_mask, slow_cpu_mask; cpumask_clear(&fast_cpu_mask); cpumask_clear(&slow_cpu_mask); /* Migrate light task from big to LITTLE */ /* (1) 如果当前cpu不是最慢的cpu(slowest),则尝试down操作 */ if (!hmp_cpu_is_slowest(this_cpu)) { /* (2) 当前cpu所在的hmp_domain为fast_cpu_mask */ hmp_domain = hmp_cpu_domain(this_cpu); cpumask_copy(&fast_cpu_mask, &hmp_domain->possible_cpus); /* (3) 查找相比当前最慢且online的hmp_domain作为slow_cpu_mask */ while (!list_is_last(&hmp_domain->hmp_domains, &hmp_domains)) { struct list_head *pos = &hmp_domain->hmp_domains; hmp_domain = list_entry(pos->next, struct hmp_domain, hmp_domains); if (!cpumask_empty(&hmp_domain->cpus)) { cpumask_copy(&slow_cpu_mask, &hmp_domain->possible_cpus); break; } } } if (!hmp_domain || hmp_domain == hmp_cpu_domain(this_cpu)) return; /* (4) 找不到可操作的fast_cpu_mask、slow_cpu_mask直接返回 */ if (cpumask_empty(&fast_cpu_mask) || cpumask_empty(&slow_cpu_mask)) return; /* (5) 源cpu = this_cpu,源rq = target */ force = 0; target = cpu_rq(this_cpu); raw_spin_lock_irqsave(&target->lock, flags); se = target->cfs.curr; if (!se) { raw_spin_unlock_irqrestore(&target->lock, flags); return; } /* (6) 首先尝试使用curr进程作为down迁移的进程 */ /* Find task entity */ if (!entity_is_task(se)) { struct cfs_rq *cfs_rq; cfs_rq = group_cfs_rq(se); while (cfs_rq) { se = cfs_rq->curr; cfs_rq = group_cfs_rq(se); } }#ifdef CONFIG_SCHED_HMP_PLUS /* (7) 在curr进程开始的5个进程中,挑负载最轻的进程作为down迁移进程 */ orig = se; se = hmp_get_lightest_task(orig, 1); if (!entity_is_task(se)) p = task_of(orig); else#endif p = task_of(se);#ifdef CONFIG_SCHED_HMP_PLUS /* (8) 找出B族中负载最轻的cpu,如果其为idle状态,则放弃down操作 因为load_balance中的idle_balance会重新把任务迁移回idle的big cpu,避免相互的乒乓操作 */ /* Don't offload to little if there is one idle big, let load balance to do it's work */ /* Also, to prevent idle_balance from leading to potential ping-pong */ B_cpu = hmp_select_cpu(HMP_GB, p, &fast_cpu_mask, this_cpu, 0); if (B_cpu < nr_cpu_ids && !rq_length(B_cpu)) { raw_spin_unlock_irqrestore(&target->lock, flags); return; }#endif /* (9) 找出L族中负载最轻的cpu作为target_cpu */ target_cpu = hmp_select_cpu(HMP_GB, p, &slow_cpu_mask, -1, 1); if (target_cpu >= num_possible_cpus()) { raw_spin_unlock_irqrestore(&target->lock, flags); return; } /* (10) 迁移前对B族、L族负载和threshold的计算 */ /* Collect cluster information */ memset(&clbenv, 0, sizeof(clbenv)); clbenv.flags |= HMP_GB; clbenv.btarget = this_cpu; clbenv.ltarget = target_cpu; cpumask_copy(&clbenv.lcpus, &slow_cpu_mask); cpumask_copy(&clbenv.bcpus, &fast_cpu_mask); sched_update_clbstats(&clbenv);#ifdef CONFIG_SCHED_HMP_PLUS if (cpu_rq(this_cpu)->cfs.h_nr_running < 2) { raw_spin_unlock_irqrestore(&target->lock, flags); return; }#endif /* (11) 检查down操作的迁移条件是否成立,hmp_down_migration() */ /* Check migration threshold */ if (!target->active_balance && hmp_down_migration(this_cpu, &target_cpu, se, &clbenv) && !cpu_park(cpu_of(target))) { if (p->state != TASK_DEAD) { get_task_struct(p); target->active_balance = 1; /* force down */ target->push_cpu = target_cpu; target->migrate_task = p; force = 1; trace_sched_hmp_migrate(p, target->push_cpu, 1); 
hmp_next_down_delay(&p->se, target->push_cpu); } } raw_spin_unlock_irqrestore(&target->lock, flags); /* (12) 条件成立进行实际的down迁移操作hmp_force_down_cpu_stop() */ if (force) { if (stop_one_cpu_dispatch(cpu_of(target), hmp_force_down_cpu_stop, target, &target->active_balance_work)) { put_task_struct(p); /* out of rq->lock */ raw_spin_lock_irqsave(&target->lock, flags); target->active_balance = 0; force = 0; raw_spin_unlock_irqrestore(&target->lock, flags); } }}|→static struct sched_entity *hmp_get_lightest_task( struct sched_entity *se, int migrate_down){ int num_tasks = hmp_max_tasks; struct sched_entity *min_se = se; unsigned long int min_ratio = se->avg.loadwop_avg; const struct cpumask *hmp_target_mask = NULL; if (migrate_down) { struct hmp_domain *hmp; /* (7.1) 如果cpu是最慢cpu(slowest)则直接退出, 因为本函数的目的是找出faster cpu中lightest进程 */ if (hmp_cpu_is_slowest(cpu_of(se->cfs_rq->rq))) return min_se; /* (7.2) 将更slow一级的hmp_domain作为进程cpu亲和力的mask */ hmp = hmp_slower_domain(cpu_of(se->cfs_rq->rq)); hmp_target_mask = &hmp->cpus; } /* The currently running task is not on the runqueue */ se = __pick_first_entity(cfs_rq_of(se)); /* (7.3) 从当前cpu的cfs红黑树中,连续5个进程和curr进程比较,选出lightest进程 比较使用的负载为se->avg.loadwop_avg,不带weight分量 */ while (num_tasks && se) { if (entity_is_task(se) && (se->avg.loadwop_avg < min_ratio && hmp_target_mask && cpumask_intersects(hmp_target_mask, tsk_cpus_allowed(task_of(se))))) { min_se = se; min_ratio = se->avg.loadwop_avg; } se = __pick_next_entity(se); num_tasks--; } return min_se;}|→/* * Check whether this task should be migrated to LITTLE * Briefly summarize the flow as below; * 1) Migration stabilizing * 1.5) Keep all cpu busy * 2) Filter low-priority task * 3) Check CPU capacity * 4) Check dynamic migration threshold */static unsigned int hmp_down_migration(int cpu, int *target_cpu, struct sched_entity *se, struct clb_env *clbenv){ struct task_struct *p = task_of(se); struct clb_stats *L, *B; struct mcheck *check; int curr_cpu = cpu; unsigned int caller = clbenv->flags; L = &clbenv->lstats; B = &clbenv->bstats; check = &clbenv->mcheck; check->status = caller; check->status |= HMP_TASK_DOWN_MIGRATION; check->result = 0; /* * No migration is needed if * 1) There is only one cluster * 2) Task is already in LITTLE cluster * 3) It violates task affinity */ if (!L->ncpu || !B->ncpu || cpumask_test_cpu(curr_cpu, &clbenv->lcpus) || !cpumask_intersects(&clbenv->lcpus, tsk_cpus_allowed(p))) goto out; /* (11.1) 目的little cpu target_cpu近期如果有做过down操作,不适合再做down迁移 */ /* * [1] Migration stabilizing * Let the task load settle before doing another down migration. * It can prevent a bunch of tasks from migrating to a unstable CPU. 
*/ if (!hmp_down_stable(*target_cpu)) goto out; /* (11.2) 如果big busy,little idle则不用进行threshold判断 */ /* [1.5]if big is busy and little is idle, just go to little */ if (rq_length(*target_cpu) == 0 && caller == HMP_SELECT_RQ && rq_length(curr_cpu) > 0) { struct rq *curr_rq = cpu_rq(curr_cpu); /* (11.2.1) 如果big cpu,curr进程不是heavy进程,但是p是heavy进程,直接准许down迁移 heavy进程的判断标准为:负载>=650 */ /* if current big core is not heavy task and wake up task is heavy task no go to little */ if (!(!is_heavy_task(curr_rq->curr) && is_heavy_task(p))) { check->status |= HMP_BIG_BUSY_LITTLE_IDLE; check->status |= HMP_MIGRATION_APPROVED; check->result = 1; goto trace; } } /* (11.3) 低优先级进程,如果满足以下条件,准许迁移: (task_low_priority(p->prio) && !B->nr_dequeuing_low_prio && \ // nice值大于5 B->ntask >= B->ncpu && 0 != L->nr_normal_prio_task && \ // B和L都不是特别空闲 (p->se.avg.loadwop_avg < 800)) // L上准备迁移的进程负载小于800 */ /* [2] Filter low-priority task */#ifdef CONFIG_SCHED_HMP_PRIO_FILTER if (hmp_low_prio_task_down_allowed(p, B, L)) { cfs_nr_dequeuing_low_prio(curr_cpu)++; check->status |= HMP_LOW_PRIORITY_FILTER; check->status |= HMP_MIGRATION_APPROVED; check->result = 1; goto trace; }#endif /* * [3] Check CPU capacity * Forbid down-migration if either of the following conditions is true * 1) big cpu is not oversubscribed (if big CPU seems to have spare * cycles, do not force this task to run on LITTLE CPU, but * keep it staying in its previous cluster instead) * 2) LITTLE cpu doesn't have available capacity for this new task */ /* (11.4) 如果big cpu有足够的空闲周期,不需要强制把light任务迁移到little cpu上 cfs_load(cpu) < (B->cpu_capacity - (B->cpu_capacity >> 2)) */ if (!hmp_fast_cpu_oversubscribed(caller, B, se, curr_cpu)) { check->status |= HMP_BIG_NOT_OVERSUBSCRIBED; goto trace; } /* (11.5) 判断L族cpu的capacity是否足够容纳需要迁移的进程, (L->acap > 0 && L->acap >= se_load(se)) */ if (!hmp_task_slow_cpu_afford(L, se)) { check->status |= HMP_LITTLE_CAPACITY_INSUFFICIENT; goto trace; } /* (11.6) 判断se的负载是否已经小于down-threshold(L->threshold) */ /* * [4] Check dynamic migration threshold * Migrate task from big to LITTLE if load ratio is less than * or equal to down-threshold */ if (L->threshold >= se_load(se)) { check->status |= HMP_MIGRATION_APPROVED; check->result = 1; }trace:#ifdef CONFIG_HMP_TRACER if (check->result && hmp_caller_is_gb(caller)) hmp_stats.nr_force_down++; trace_sched_hmp_stats(&hmp_stats); trace_sched_dynamic_threshold(task_of(se), L->threshold, check->status, curr_cpu, *target_cpu, se_load(se), B, L); trace_sched_dynamic_threshold_draw(B->threshold, L->threshold);#endifout: return check->result;}
4.2.4、hmp_select_task_rq_fair()
hmp_select_task_rq_fair() has already been listed above as part of the wake-up path: it walks hmp_domains from fast to slow and calls hmp_select_task_migration() to choose between the fast and slow candidate cpus.
4.3、cpu freq adjustment
The load-balancing techniques discussed so far all work by migrating load, moving it to the most idle or the most power-efficient cpu. The other knob is adjusting the cpu frequency, which changes the cpu's curr_capacity and can therefore be used to meet both performance and power requirements.
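The relation being exploited is roughly capacity_curr = capacity_orig * freq_curr / freq_max, which is what capacity_curr_of() amounts to; a tiny sketch with illustrative OPP values:

```c
#include <stdio.h>

/* Roughly what capacity_curr_of() amounts to: the original (max-frequency)
 * capacity scaled by the current OPP.  The values below are illustrative. */
static unsigned long capacity_at_freq(unsigned long capacity_orig,
				      unsigned long freq_cur_khz,
				      unsigned long freq_max_khz)
{
	return capacity_orig * freq_cur_khz / freq_max_khz;
}

int main(void)
{
	unsigned long cap_orig = 1024;	/* capacity at the highest OPP */
	unsigned long fmax = 2106000;	/* kHz */
	unsigned long freqs[] = { 598000, 1144000, 1703000, 2106000 };

	for (int i = 0; i < 4; i++)
		printf("%7lu kHz -> capacity %lu\n",
		       freqs[i], capacity_at_freq(cap_orig, freqs[i], fmax));
	return 0;
}
```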
cpu frequency adjustment is organised in three layers: cpufreq governor, cpufreq core and cpufreq driver.
- 1、The cpufreq governor implements the scaling policy: it computes the load and dynamically adjusts the frequency as the load changes;
- 2、The cpufreq core wraps the common infrastructure, for example the cpufreq_policy abstraction;
- 3、The cpufreq driver implements the low-level operations, such as initializing the freq_table and programming the target cpu frequency (a sketch of how a request flows through the layers follows this list);
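A hedged sketch of how one request flows through the three layers (hypothetical toy_* names; in the real kernel the governor calls __cpufreq_driver_target(), the core clamps the request to policy->min/max, and the driver programs the hardware):

```c
#include <stdio.h>

/* Hypothetical, simplified model of the three cpufreq layers. */
struct toy_policy {
	unsigned int min, max, cur;	/* kHz */
};

/* "driver" layer: would program the PLL/OPP; here it just records it. */
static void toy_driver_target(struct toy_policy *p, unsigned int freq)
{
	p->cur = freq;
	printf("driver: set %u kHz\n", freq);
}

/* "core" layer: clamp the governor's request to the policy limits. */
static void toy_core_target(struct toy_policy *p, unsigned int freq)
{
	if (freq < p->min)
		freq = p->min;
	if (freq > p->max)
		freq = p->max;
	toy_driver_target(p, freq);
}

int main(void)
{
	struct toy_policy policy = { .min = 598000, .max = 1703000, .cur = 0 };

	/* "governor" layer: picks a target frequency from the measured load. */
	toy_core_target(&policy, 2106000);	/* clamped to policy.max */
	toy_core_target(&policy, 300000);	/* clamped to policy.min */
	return 0;
}
```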
On MTK platforms the cpufreq driver has to accept frequency adjustments not only from the governor but also from PPM (registered via ppm_limit_callback in the probe code below).
4.3.1、cpufreq core & cpufreq driver
The heart of the cpufreq core layer is that each cpu has its own cpufreq_policy, stored in the per_cpu(cpufreq_cpu_data, cpu) variable. In practice there is one cpufreq_policy per cluster: on the current architecture all cpus in a cluster run at the same frequency, so the per_cpu(cpufreq_cpu_data, cpu) pointers of every cpu in a cluster point to the same cpufreq_policy.
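A minimal sketch of that sharing (a plain array standing in for the per_cpu(cpufreq_cpu_data, cpu) slots; toy_policy is a hypothetical stand-in for struct cpufreq_policy): all cpus of one cluster reference the same policy object, so a frequency change made through any of them is visible to the whole cluster.

```c
#include <stdio.h>

#define NR_CPUS 8

/* Hypothetical stand-in for struct cpufreq_policy. */
struct toy_policy {
	unsigned int cpu;	/* managing cpu */
	unsigned int cur;	/* kHz, shared by the whole cluster */
};

/* Stand-in for per_cpu(cpufreq_cpu_data, cpu). */
static struct toy_policy *cpufreq_cpu_data[NR_CPUS];

int main(void)
{
	static struct toy_policy little = { .cpu = 0, .cur = 1144000 };
	static struct toy_policy big    = { .cpu = 4, .cur = 1703000 };

	/* cpus 0-3 share the LITTLE policy, cpus 4-7 share the big policy. */
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		cpufreq_cpu_data[cpu] = (cpu < 4) ? &little : &big;

	/* Changing the cluster frequency through any cpu is seen by all. */
	cpufreq_cpu_data[2]->cur = 1352000;
	printf("cpu0 sees %u kHz, cpu3 sees %u kHz\n",
	       cpufreq_cpu_data[0]->cur, cpufreq_cpu_data[3]->cur);
	return 0;
}
```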
4.3.1.1、cpufreq_policy initialization
struct cpufreq_policy { /* CPUs sharing clock, require sw coordination */ cpumask_var_t cpus; /* Online CPUs only */ cpumask_var_t related_cpus; /* Online + Offline CPUs */ cpumask_var_t real_cpus; /* Related and present */ unsigned int shared_type; /* ACPI: ANY or ALL affected CPUs should set cpufreq */ unsigned int cpu; /* cpu managing this policy, must be online */ struct clk *clk; struct cpufreq_cpuinfo cpuinfo;/* see above */ unsigned int min; /* in kHz */ unsigned int max; /* in kHz */ unsigned int cur; /* in kHz, only needed if cpufreq * governors are used */ unsigned int restore_freq; /* = policy->cur before transition */ unsigned int suspend_freq; /* freq to set during suspend */ unsigned int policy; /* see above */ unsigned int last_policy; /* policy before unplug */ struct cpufreq_governor *governor; /* see below */ void *governor_data; bool governor_enabled; /* governor start/stop flag */ char last_governor[CPUFREQ_NAME_LEN]; /* last governor used */ struct work_struct update; /* if update_policy() needs to be * called, but you're in IRQ context */ struct cpufreq_user_policy user_policy; struct cpufreq_frequency_table *freq_table; struct list_head policy_list; struct kobject kobj; struct completion kobj_unregister; /* * The rules for this semaphore: * - Any routine that wants to read from the policy structure will * do a down_read on this semaphore. * - Any routine that will write to the policy structure and/or may take away * the policy altogether (eg. CPU hotplug), will hold this lock in write * mode before doing so. * * Additional rules: * - Lock should not be held across * __cpufreq_governor(data, CPUFREQ_GOV_POLICY_EXIT); */ struct rw_semaphore rwsem; /* Synchronization for frequency transitions */ bool transition_ongoing; /* Tracks transition status */ spinlock_t transition_lock; wait_queue_head_t transition_wait; struct task_struct *transition_task; /* Task which is doing the transition */ /* cpufreq-stats */ struct cpufreq_stats *stats; /* For cpufreq driver's internal use */ void *driver_data;}
The cpufreq_policy of every online cpu is initialized during system boot, and it is initialized again when a cpu comes back online through hotplug.
- 1、The MTK cpufreq driver probe function _mt_cpufreq_pdrv_probe() registers _mt_cpufreq_driver:
static int _mt_cpufreq_pdrv_probe(struct platform_device *pdev){ /* 注册cpufreq_driver */ cpufreq_register_driver(&_mt_cpufreq_driver); /* 注册ppm的回调 */ mt_ppm_register_client(PPM_CLIENT_DVFS, &ppm_limit_callback);}static struct cpufreq_driver _mt_cpufreq_driver = { .flags = CPUFREQ_ASYNC_NOTIFICATION, .verify = _mt_cpufreq_verify, .target = _mt_cpufreq_target, .init = _mt_cpufreq_init, .exit = _mt_cpufreq_exit, .get = _mt_cpufreq_get, .name = "mt-cpufreq", .attr = _mt_cpufreq_attr,};
- 2、During driver registration, cpufreq_register_driver() initializes the cpufreq_policy of each online cpu:
_mt_cpufreq_pdrv_probe() -> cpufreq_register_driver() -> subsys_interface_register() -> cpufreq_add_dev() -> cpufreq_online()↓static int cpufreq_online(unsigned int cpu){ struct cpufreq_policy *policy; bool new_policy; unsigned long flags; unsigned int j; int ret; pr_debug("%s: bringing CPU%u online\n", __func__, cpu); /* (1) 检查per_cpu(cpufreq_cpu_data, cpu)中的cpufreq_policy, 如果为NULL,重新分配空间 */ /* Check if this CPU already has a policy to manage it */ policy = per_cpu(cpufreq_cpu_data, cpu); if (policy) { WARN_ON(!cpumask_test_cpu(cpu, policy->related_cpus)); if (!policy_is_inactive(policy)) return cpufreq_add_policy_cpu(policy, cpu); /* This is the only online CPU for the policy. Start over. */ new_policy = false; down_write(&policy->rwsem); policy->cpu = cpu; policy->governor = NULL; up_write(&policy->rwsem); } else { new_policy = true; policy = cpufreq_policy_alloc(cpu); if (!policy) return -ENOMEM; } cpumask_copy(policy->cpus, cpumask_of(cpu)); /* (2) 调用cpufreq_driver的初始化函数来初始化cpufreq_policy, 这步比较重要,初始化了以下的数据: */ /* call driver. From then on the cpufreq must be able * to accept all calls to ->verify and ->setpolicy for this CPU */ ret = cpufreq_driver->init(policy); if (ret) { pr_debug("initialization failed\n"); goto out_free_policy; } down_write(&policy->rwsem); /* (3) 如果cpufreq_policy是新分配空间的, 做一些相应的初始化工作 */ if (new_policy) { /* related_cpus should at least include policy->cpus. */ cpumask_copy(policy->related_cpus, policy->cpus); /* Remember CPUs present at the policy creation time. */ cpumask_and(policy->real_cpus, policy->cpus, cpu_present_mask); /* Name and add the kobject */ ret = kobject_add(&policy->kobj, cpufreq_global_kobject, "policy%u", cpumask_first(policy->related_cpus)); if (ret) { pr_err("%s: failed to add policy->kobj: %d\n", __func__, ret); goto out_exit_policy; } } /* * affected cpus must always be the one, which are online. We aren't * managing offline cpus here. */ cpumask_and(policy->cpus, policy->cpus, cpu_online_mask); if (new_policy) { policy->user_policy.min = policy->min; policy->user_policy.max = policy->max; write_lock_irqsave(&cpufreq_driver_lock, flags); /* (3.1) 同一个cluster中所有cpu的per_cpu(cpufreq_cpu_data, j),共享同一个cpufreq_policy */ for_each_cpu(j, policy->related_cpus) per_cpu(cpufreq_cpu_data, j) = policy; write_unlock_irqrestore(&cpufreq_driver_lock, flags); } /* (4) 获取cpufreq_policy的当前频率 */ if (cpufreq_driver->get && !cpufreq_driver->setpolicy) { policy->cur = cpufreq_driver->get(policy->cpu); if (!policy->cur) { pr_err("%s: ->get() failed\n", __func__); goto out_exit_policy; } } /* * Sometimes boot loaders set CPU frequency to a value outside of * frequency table present with cpufreq core. In such cases CPU might be * unstable if it has to run on that frequency for long duration of time * and so its better to set it to a frequency which is specified in * freq-table. This also makes cpufreq stats inconsistent as * cpufreq-stats would fail to register because current frequency of CPU * isn't found in freq-table. * * Because we don't want this change to effect boot process badly, we go * for the next freq which is >= policy->cur ('cur' must be set by now, * otherwise we will end up setting freq to lowest of the table as 'cur' * is initialized to zero). * * We are passing target-freq as "policy->cur - 1" otherwise * __cpufreq_driver_target() would simply fail, as policy->cur will be * equal to target-freq. */ if ((cpufreq_driver->flags & CPUFREQ_NEED_INITIAL_FREQ_CHECK) && has_target()) { /* Are we running at unknown frequency ? 
*/ ret = cpufreq_frequency_table_get_index(policy, policy->cur); if (ret == -EINVAL) { /* Warn user and fix it */ pr_warn("%s: CPU%d: Running at unlisted freq: %u KHz\n", __func__, policy->cpu, policy->cur); ret = __cpufreq_driver_target(policy, policy->cur - 1, CPUFREQ_RELATION_L); /* * Reaching here after boot in a few seconds may not * mean that system will remain stable at "unknown" * frequency for longer duration. Hence, a BUG_ON(). */ BUG_ON(ret); pr_warn("%s: CPU%d: Unlisted initial frequency changed to: %u KHz\n", __func__, policy->cpu, policy->cur); } } blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_START, policy); if (new_policy) { ret = cpufreq_add_dev_interface(policy); if (ret) goto out_exit_policy; blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_CREATE_POLICY, policy); write_lock_irqsave(&cpufreq_driver_lock, flags); list_add(&policy->policy_list, &cpufreq_policy_list); write_unlock_irqrestore(&cpufreq_driver_lock, flags); } /* (5) 调用cpufreq governor的初始化函数,来初始化cpufreq_policy */ ret = cpufreq_init_policy(policy); if (ret) { pr_err("%s: Failed to initialize policy for cpu: %d (%d)\n", __func__, cpu, ret); /* cpufreq_policy_free() will notify based on this */ new_policy = false; goto out_exit_policy; } up_write(&policy->rwsem); kobject_uevent(&policy->kobj, KOBJ_ADD); /* Callback for handling stuff after policy is ready */ if (cpufreq_driver->ready) cpufreq_driver->ready(policy); pr_debug("initialization complete\n"); return 0;out_exit_policy: up_write(&policy->rwsem); if (cpufreq_driver->exit) cpufreq_driver->exit(policy);out_free_policy: cpufreq_policy_free(policy, !new_policy); return ret;}|→static int _mt_cpufreq_init(struct cpufreq_policy *policy){ int ret = -EINVAL; unsigned long flags; FUNC_ENTER(FUNC_LV_MODULE); policy->shared_type = CPUFREQ_SHARED_TYPE_ANY; cpumask_setall(policy->cpus); policy->cpuinfo.transition_latency = 1000; { enum mt_cpu_dvfs_id id = _get_cpu_dvfs_id(policy->cpu); struct mt_cpu_dvfs *p = id_to_cpu_dvfs(id); unsigned int lv = _mt_cpufreq_get_cpu_level(); struct opp_tbl_info *opp_tbl_info; struct opp_tbl_m_info *opp_tbl_m_info; struct opp_tbl_m_info *opp_tbl_m_cci_info; struct mt_cpu_dvfs *p_cci; cpufreq_ver("DVFS: _mt_cpufreq_init: %s(cpu_id = %d)\n", cpu_dvfs_get_name(p), p->cpu_id); opp_tbl_info = &opp_tbls[id][lv]; p->cpu_level = lv; /* (2.1) 给policy->freq_table赋值 给policy->cpus赋值 给policy->related_cpus赋值 */ ret = _mt_cpufreq_setup_freqs_table(policy, opp_tbl_info->opp_tbl, opp_tbl_info->size); /* (2.2) 给policy->cpuinfo.max_freq赋值 给policy->cpuinfo.min_freq赋值 */ policy->cpuinfo.max_freq = cpu_dvfs_get_max_freq(p); policy->cpuinfo.min_freq = cpu_dvfs_get_min_freq(p); opp_tbl_m_info = &opp_tbls_m[id][lv]; p->freq_tbl = opp_tbl_m_info->opp_tbl_m; cpufreq_lock(flags); /* Sync p */ if (_mt_cpufreq_sync_opp_tbl_idx(p) >= 0) if (p->idx_normal_max_opp == -1) p->idx_normal_max_opp = p->idx_opp_tbl; /* (2.3) 给policy->cur赋值 给policy->max赋值 给policy->min赋值 */ policy->cur = cpu_dvfs_get_cur_freq(p); /* use cur phy freq is better */ policy->max = cpu_dvfs_get_freq_by_idx(p, p->idx_opp_ppm_limit); policy->min = cpu_dvfs_get_freq_by_idx(p, p->idx_opp_ppm_base); p->mt_policy = policy; p->armpll_is_available = 1;#ifdef CONFIG_HYBRID_CPU_DVFS if (turbo_flag && cpu_dvfs_is(p, MT_CPU_DVFS_B) && !turbo_is_inited) { unsigned int turbo_f, turbo_v; turbo_f = ((cpu_dvfs_get_max_freq(p) * 104 / 100) / 13) * 13 / 1000; if (picachu_need_higher_volt(MT_PICACHU_DOMAIN2)) turbo_v = MAX_VPROC_VOLT; else turbo_v = MAX_VPROC_VOLT - 2000; /* 
turbo_v = p->opp_tbl[0].cpufreq_volt; */ cpuhvfs_set_turbo_scale(turbo_f * 1000, turbo_v); turbo_is_inited = 1; }#endif /* Sync cci */ if (cci_is_inited == 0) { p_cci = id_to_cpu_dvfs(MT_CPU_DVFS_CCI); /* init cci freq idx */ if (_mt_cpufreq_sync_opp_tbl_idx(p_cci) >= 0) if (p_cci->idx_normal_max_opp == -1) p_cci->idx_normal_max_opp = p_cci->idx_opp_tbl; opp_tbl_m_cci_info = &opp_tbls_m[MT_CPU_DVFS_CCI][lv]; p_cci->freq_tbl = opp_tbl_m_cci_info->opp_tbl_m; p_cci->mt_policy = NULL; p_cci->armpll_is_available = 1; cci_is_inited = 1; }#ifdef CONFIG_HYBRID_CPU_DVFS cpuhvfs_set_cluster_on_off(arch_get_cluster_id(p->cpu_id), 1);#endif cpufreq_unlock(flags); } if (ret) cpufreq_err("failed to setup frequency table\n"); FUNC_EXIT(FUNC_LV_MODULE); return ret;}||→static int _mt_cpufreq_setup_freqs_table(struct cpufreq_policy *policy, struct mt_cpu_freq_info *freqs, int num){ struct mt_cpu_dvfs *p; int ret = 0; FUNC_ENTER(FUNC_LV_LOCAL); p = id_to_cpu_dvfs(_get_cpu_dvfs_id(policy->cpu));#ifdef CONFIG_CPU_FREQ ret = cpufreq_frequency_table_cpuinfo(policy, p->freq_tbl_for_cpufreq); /* (2.1.1) 给policy->freq_table赋值 */ if (!ret) policy->freq_table = p->freq_tbl_for_cpufreq; /* (2.1.2) 根据cpu相同cluster中有哪些cpu 给policy->cpus赋值 给policy->related_cpus赋值 */ cpumask_copy(policy->cpus, topology_core_cpumask(policy->cpu)); cpumask_copy(policy->related_cpus, policy->cpus);#endif FUNC_EXIT(FUNC_LV_LOCAL); return 0;}
- 3、cpufreq_online()初始化完cpufreq_policy后,最后会调用cpufreq_init_policy()继续governor的初始化:
static int cpufreq_init_policy(struct cpufreq_policy *policy){ struct cpufreq_governor *gov = NULL; struct cpufreq_policy new_policy; memcpy(&new_policy, policy, sizeof(*policy)); /* (5.1) 使用last或者default的governor, 给new_policy.governor赋值 */ /* Update governor of new_policy to the governor used before hotplug */ gov = find_governor(policy->last_governor); if (gov) pr_debug("Restoring governor %s for cpu %d\n", policy->governor->name, policy->cpu); else gov = CPUFREQ_DEFAULT_GOVERNOR; new_policy.governor = gov; /* Use the default policy if there is no last_policy. */ if (cpufreq_driver->setpolicy) { if (policy->last_policy) new_policy.policy = policy->last_policy; else cpufreq_parse_governor(gov->name, &new_policy.policy, NULL); } /* (5.2) 启动governor来使用cpufreq_policy */ /* set default policy */ return cpufreq_set_policy(policy, &new_policy);}|→static int cpufreq_set_policy(struct cpufreq_policy *policy, struct cpufreq_policy *new_policy){ struct cpufreq_governor *old_gov; int ret; pr_debug("setting new policy for CPU %u: %u - %u kHz\n", new_policy->cpu, new_policy->min, new_policy->max); memcpy(&new_policy->cpuinfo, &policy->cpuinfo, sizeof(policy->cpuinfo)); /* (5.2.1) 对policy、new_policy的一堆合法性判断 */ /* * This check works well when we store new min/max freq attributes, * because new_policy is a copy of policy with one field updated. */ if (new_policy->min > new_policy->max) return -EINVAL; /* verify the cpu speed can be set within this limit */ ret = cpufreq_driver->verify(new_policy); if (ret) return ret; /* adjust if necessary - all reasons */ blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_ADJUST, new_policy); /* * verify the cpu speed can be set within this limit, which might be * different to the first one */ ret = cpufreq_driver->verify(new_policy); if (ret) return ret; /* notification of the new policy */ blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_NOTIFY, new_policy); scale_freq_capacity(new_policy, NULL); policy->min = new_policy->min; policy->max = new_policy->max; trace_cpu_frequency_limits(policy->max, policy->min, policy->cpu); pr_debug("new min and max freqs are %u - %u kHz\n", policy->min, policy->max); if (cpufreq_driver->setpolicy) { policy->policy = new_policy->policy; pr_debug("setting range\n"); return cpufreq_driver->setpolicy(new_policy); } if (new_policy->governor == policy->governor) goto out; pr_debug("governor switch\n"); /* (5.2.2) 如果旧的governor在工作中, 依次调用 CPUFREQ_GOV_STOP、CPUFREQ_GOV_POLICY_EXIT停止旧的governor */ /* save old, working values */ old_gov = policy->governor; /* end old governor */ if (old_gov) { ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP); if (ret) { /* This can happen due to race with other operations */ pr_debug("%s: Failed to Stop Governor: %s (%d)\n", __func__, old_gov->name, ret); return ret; } up_write(&policy->rwsem); ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT); down_write(&policy->rwsem); if (ret) { pr_err("%s: Failed to Exit Governor: %s (%d)\n", __func__, old_gov->name, ret); return ret; } } /* (5.2.3) 依次调用 CPUFREQ_GOV_POLICY_INIT、CPUFREQ_GOV_START让新的governor开工 */ /* start new governor */ policy->governor = new_policy->governor; ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT); if (!ret) { ret = __cpufreq_governor(policy, CPUFREQ_GOV_START); if (!ret) goto out; up_write(&policy->rwsem); __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT); down_write(&policy->rwsem); } /* new governor failed, so re-start old one */ pr_debug("starting governor %s failed\n", 
policy->governor->name); if (old_gov) { policy->governor = old_gov; if (__cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT)) policy->governor = NULL; else __cpufreq_governor(policy, CPUFREQ_GOV_START); } return ret; out: pr_debug("governor: change or update limits\n"); return __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);}||→static int __cpufreq_governor(struct cpufreq_policy *policy, unsigned int event){ /* __cpufreq_governor()调用的各种命令最后调用的都是governor的具体函数 */ ret = policy->governor->governor(policy, event);}
- 4、以interactive governor为例,说明policy->governor->governor()对CPUFREQ_GOV_POLICY_INIT、CPUFREQ_GOV_START、CPUFREQ_GOV_STOP、CPUFREQ_GOV_POLICY_EXIT这几个命令的实现:
struct cpufreq_governor cpufreq_gov_interactive = { .name = "interactive", .governor = cpufreq_governor_interactive, .max_transition_latency = 10000000, .owner = THIS_MODULE,};↓static int cpufreq_governor_interactive(struct cpufreq_policy *policy, unsigned int event){ int rc; unsigned int j; struct cpufreq_interactive_cpuinfo *pcpu; struct cpufreq_frequency_table *freq_table; struct cpufreq_interactive_tunables *tunables; unsigned long flags; if (have_governor_per_policy()) tunables = policy->governor_data; else tunables = common_tunables; WARN_ON(!tunables && (event != CPUFREQ_GOV_POLICY_INIT)); switch (event) { /* (1) CPUFREQ_GOV_POLICY_INIT命令的实现: 初始化tunables,tunables是interactive governor在计算时使用的各种参数 相关的sysfs注册 */ case CPUFREQ_GOV_POLICY_INIT: if (have_governor_per_policy()) { WARN_ON(tunables); } else if (tunables) { tunables->usage_count++; policy->governor_data = tunables; return 0; } tunables = kzalloc(sizeof(*tunables), GFP_KERNEL); if (!tunables) { pr_err("%s: POLICY_INIT: kzalloc failed\n", __func__); return -ENOMEM; } tunables->usage_count = 1; tunables->above_hispeed_delay = default_above_hispeed_delay; tunables->nabove_hispeed_delay = ARRAY_SIZE(default_above_hispeed_delay); tunables->go_hispeed_load = DEFAULT_GO_HISPEED_LOAD; tunables->target_loads = default_target_loads; tunables->ntarget_loads = ARRAY_SIZE(default_target_loads); tunables->min_sample_time = DEFAULT_MIN_SAMPLE_TIME; tunables->timer_rate = DEFAULT_TIMER_RATE; tunables->boostpulse_duration_val = DEFAULT_MIN_SAMPLE_TIME; tunables->timer_slack_val = DEFAULT_TIMER_SLACK; spin_lock_init(&tunables->target_loads_lock); spin_lock_init(&tunables->above_hispeed_delay_lock); policy->governor_data = tunables; if (!have_governor_per_policy()) { common_tunables = tunables; } rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr()); if (rc) { kfree(tunables); policy->governor_data = NULL; if (!have_governor_per_policy()) { common_tunables = NULL; } return rc; } if (!policy->governor->initialized) { idle_notifier_register(&cpufreq_interactive_idle_nb); cpufreq_register_notifier(&cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); } break; /* (2) CPUFREQ_GOV_POLICY_EXIT命令的实现: remove相关的sysfs */ case CPUFREQ_GOV_POLICY_EXIT: if (!--tunables->usage_count) { if (policy->governor->initialized == 1) { cpufreq_unregister_notifier(&cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); idle_notifier_unregister(&cpufreq_interactive_idle_nb); }#ifdef CONFIG_MEIZU_BSP }#else sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr()); kfree(tunables); common_tunables = NULL; } policy->governor_data = NULL;#endif //CONFIG_MEIZU_BSP break; /* (3) CPUFREQ_GOV_START命令的实现: 因为同一个cluster中的多个cpu是共享一个cpufreq_policy的, 所以使用同一个cpufreq_policy来初始化cluster中多个online cpu的per_cpu(cpuinfo, j)变量: pcpu->target_freq // 当前频率 pcpu->freq_table // 频率表 并且启动cpu上的interactive_timer=pcpu->cpu_timer: cpufreq_interactive_timer_start(tunables, j); */ case CPUFREQ_GOV_START: mutex_lock(&gov_lock); freq_table = cpufreq_frequency_get_table(policy->cpu); if (tunables && !tunables->hispeed_freq) tunables->hispeed_freq = policy->max; for_each_cpu(j, policy->cpus) { pcpu = &per_cpu(cpuinfo, j); pcpu->policy = policy; pcpu->target_freq = policy->cur; pcpu->freq_table = freq_table; pcpu->floor_freq = pcpu->target_freq; pcpu->pol_floor_val_time = ktime_to_us(ktime_get()); pcpu->loc_floor_val_time = pcpu->pol_floor_val_time; pcpu->pol_hispeed_val_time = pcpu->pol_floor_val_time; pcpu->loc_hispeed_val_time = pcpu->pol_floor_val_time; 
down_write(&pcpu->enable_sem); del_timer_sync(&pcpu->cpu_timer); del_timer_sync(&pcpu->cpu_slack_timer); cpufreq_interactive_timer_start(tunables, j); pcpu->governor_enabled = 1; up_write(&pcpu->enable_sem); } mutex_unlock(&gov_lock); break; /* (4) CPUFREQ_GOV_STOP命令的实现: 如果同一个cluster中的多个cpu都已经offline,停掉对应的governor: 停掉cpu上的interactive_timer=pcpu->cpu_timer */ case CPUFREQ_GOV_STOP: mutex_lock(&gov_lock); for_each_cpu(j, policy->cpus) { pcpu = &per_cpu(cpuinfo, j); down_write(&pcpu->enable_sem); pcpu->governor_enabled = 0; del_timer_sync(&pcpu->cpu_timer); del_timer_sync(&pcpu->cpu_slack_timer); up_write(&pcpu->enable_sem); } mutex_unlock(&gov_lock); break; case CPUFREQ_GOV_LIMITS: if (policy->max < policy->cur) __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); else if (policy->min > policy->cur) __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); for_each_cpu(j, policy->cpus) { pcpu = &per_cpu(cpuinfo, j); down_read(&pcpu->enable_sem); if (pcpu->governor_enabled == 0) { up_read(&pcpu->enable_sem); continue; } spin_lock_irqsave(&pcpu->target_freq_lock, flags); if (policy->max < pcpu->target_freq) pcpu->target_freq = policy->max; else if (policy->min > pcpu->target_freq) pcpu->target_freq = policy->min; spin_unlock_irqrestore(&pcpu->target_freq_lock, flags); up_read(&pcpu->enable_sem); } break; }
4.3.1.2、cpufreq的频率配置
cpufreq一个重要的作用就是能把用户需要的cpu频率配置下去,这部分的代码也需要cpufreq core和cpufreq driver的配合。频率调整也叫DVFS(Dynamic Voltage and Frequency Scaling),需要按照对应关系把电压和频率一起配置下去。
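为了更直观地说明“按照对应关系把电压和频率一起配置下去”的含义,下面给出一个脱离内核环境的OPP(Operating Performance Point)表查找小示例。注意这只是示意性的草图:表中的频率/电压数值、结构体和函数名都是假设的,并不是内核或MTK驱动的实际接口:

#include <stdio.h>

/* 假设的频率/电压对应表(OPP表),按频率降序排列,单位:KHz / uV,数值仅为示意 */
struct opp_entry {
	unsigned int freq_khz;
	unsigned int volt_uv;
};

static const struct opp_entry opp_table[] = {
	{ 1989000, 1000000 },
	{ 1846000,  975000 },
	{ 1014000,  850000 },
	{  507000,  750000 },
};

/* 选择"不低于目标频率的最低档位",并把频率和对应电压一起配置下去(这里仅打印示意) */
static void set_target_freq(unsigned int target_khz)
{
	int i, chosen = 0;	/* 找不到满足条件的更低档位时,退回最高档 */

	for (i = 0; i < (int)(sizeof(opp_table) / sizeof(opp_table[0])); i++) {
		if (opp_table[i].freq_khz >= target_khz)
			chosen = i;	/* 表是降序的,继续往下找更接近目标的档位 */
		else
			break;
	}

	/* 一般升频时先升电压再升频率,降频时则相反,这里省略时序细节 */
	printf("set volt = %u uV, freq = %u KHz\n",
	       opp_table[chosen].volt_uv, opp_table[chosen].freq_khz);
}

int main(void)
{
	set_target_freq(1200000);	/* 会选中 1846000 KHz / 975000 uV 这一档 */
	return 0;
}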
具体的代码解析如下:
int __cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation){ unsigned int old_target_freq = target_freq; int retval = -EINVAL; if (cpufreq_disabled()) return -ENODEV; /* (1) target目标频率在policy中的合法性检测 */ /* Make sure that target_freq is within supported range */ if (target_freq > policy->max) target_freq = policy->max; if (target_freq < policy->min) target_freq = policy->min; pr_debug("target for CPU %u: %u kHz, relation %u, requested %u kHz\n", policy->cpu, target_freq, relation, old_target_freq); /* (2) 如果当前频率就是target频率,不用调整直接返回 */ /* * This might look like a redundant call as we are checking it again * after finding index. But it is left intentionally for cases where * exactly same freq is called again and so we can save on few function * calls. */ if (target_freq == policy->cur) return 0; /* Save last value to restore later on errors */ policy->restore_freq = policy->cur; if (cpufreq_driver->target) /* (3) 调用实际的驱动target()函数来调整cpu频率 */ retval = cpufreq_driver->target(policy, target_freq, relation); else if (cpufreq_driver->target_index) { struct cpufreq_frequency_table *freq_table; int index; freq_table = cpufreq_frequency_get_table(policy->cpu); if (unlikely(!freq_table)) { pr_err("%s: Unable to find freq_table\n", __func__); goto out; } retval = cpufreq_frequency_table_target(policy, freq_table, target_freq, relation, &index); if (unlikely(retval)) { pr_err("%s: Unable to find matching freq\n", __func__); goto out; } if (freq_table[index].frequency == policy->cur) { retval = 0; goto out; } retval = __target_index(policy, freq_table, index); }out: return retval;}|→static int _mt_cpufreq_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation){ struct mt_cpu_dvfs *p; int ret; unsigned int new_opp_idx; p = id_to_cpu_dvfs(_get_cpu_dvfs_id(policy->cpu)); if (!p) return -EINVAL; /* (3.1) 驱动根据频率电压表,配置target频率和对应电压 */ ret = cpufreq_frequency_table_target(policy, p->freq_tbl_for_cpufreq, target_freq, relation, &new_opp_idx); if (ret || new_opp_idx >= p->nr_opp_tbl) return -EINVAL; if (dvfs_disable_flag || p->dvfs_disable_by_suspend || p->dvfs_disable_by_procfs) return -EPERM; _mt_cpufreq_dvfs_request_wrapper(p, new_opp_idx, MT_CPU_DVFS_NORMAL, NULL); return 0;}
4.3.2、interactive governor
在所有的cpufreq governor中最有名气的就是interactive governor了,因为几乎所有的Android系统都在使用它。
interactive的思想就是使用cpu的负载来调整cpu频率,核心就是:使用一个20ms的定时器来计算cpu占用率,根据cpu占用率的不同threshold来调整不同档位的频率。
interactive的负载计算方法大致如下(文字说明之后附有一个简化的数值计算示例):
- 1、计算cpu的累加负载。每20ms采样一次,每次采样统计增加的active_time和当前频率的乘积:cputime_speedadj += active_time * cur_freq;
- 2、计算cpu的占用率。当前cpu占用率 = (累加负载*100)/(累加时间*当前频率),即:cpu_load = (loadadjfreq*100)/(delta_time*cur_freq);
- 3、如果cpu_load达到高门限go_hispeed_load(99%)或者发生boost,直接调节频率到hispeed_freq(最高频率);
- 4、其他情况下使用choose_freq()公式计算新频率:new_freq = cur_freq*cpu_load/DEFAULT_TARGET_LOAD(90);再用new_freq = cpufreq_frequency_table_target(new_freq, CPUFREQ_RELATION_L)对齐到频率表中的实际档位;
- 5、如果当前频率已经达到hispeed_freq,还需要往上调整,必须在之前的频率上保持above_hispeed_delay(20ms);如果当前频率已经达到hispeed_freq,还需要往下调整,必须在之前的频率上保持min_sample_time(80ms);
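为了便于理解上面几步的计算,下面给出一个脱离内核环境的简化数值示例:假设每个20ms采样窗口内active_time=15ms、当前频率为1GHz,变量名仿照interactive的命名,数值和流程都只是示意,并非内核代码本身:

#include <stdio.h>

int main(void)
{
	unsigned long long cputime_speedadj = 0;	/* 累加负载:sum(active_time * cur_freq) */
	unsigned long long delta_time = 0;		/* 累加时间:active_time + idle_time,单位us */
	unsigned int cur_freq = 1000000;		/* 当前频率1GHz,单位KHz */
	int i;

	/* 模拟3个20ms的采样窗口,每个窗口内cpu active 15ms */
	for (i = 0; i < 3; i++) {
		unsigned int active_time = 15000;	/* us */

		cputime_speedadj += (unsigned long long)active_time * cur_freq;
		delta_time += 20000;
	}

	/* 累加负载/累加时间 = 平均频率;(平均频率*100)/当前频率 = cpu占用率 */
	unsigned int loadadjfreq = (unsigned int)(cputime_speedadj / delta_time) * 100;
	unsigned int cpu_load = loadadjfreq / cur_freq;		/* = 75,即75% */

	/* choose_freq()的核心:new_freq = 平均频率*100/DEFAULT_TARGET_LOAD(90) */
	unsigned int new_freq = loadadjfreq / 90;		/* = 833333 KHz */

	printf("cpu_load = %u%%, new_freq = %u KHz\n", cpu_load, new_freq);
	return 0;
}

计算出new_freq之后,interactive还会用CPUFREQ_RELATION_L把它对齐到频率表中的某个实际档位。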
interactive governor从原理上看,有以下问题:
- 1、20ms的采样时间过长,负载变化到频率调整的反应时间过长;
- 2、负载累加计算有问题,历史负载没有老化机制,历史负载的权重和当前一样,造成当前的负载变化不真实;
- 3、计算cpu占用率 = 总历史负载/(总时间*当前频率),算法不合理之处在于历史负载对当前影响太大。如果之前是高频率,现在变成低频率,那么cpu_load计算出来的值可能超过100%;如果之前是低频率,现在是高频率,那么cpu_load计算出来的值也会被大大拉低;
- 4、choose_freq()的计算公式有重大漏洞。比如假设cpu频率表={800M, 900M},当前cur_freq=800M、cur_load=100%,那么newfreq = (cur_freq*cur_load)/90 = 889M,再使用CPUFREQ_RELATION_L选择档位,选择到的还是800M,根本不能向高档位前进。这是算法的一个漏洞:如果cpu相邻档位的频率比值大于(100/90),那么正常往上调频是调不上去的,会被CPUFREQ_RELATION_L参数拦下来。所以实际的interactive调频,都是靠go_hispeed_load(99%)直接调到最高频率,再使用choose_freq()来降频。
所以interactive governor会逐渐地被基于调度器负载的cpufreq governor(如schedutil)所取代。
4.3.2.1、interactive governor的初始化
- 1、interactive的一部分初始化在cpufreq_interactive_init()当中:
static int __init cpufreq_interactive_init(void){ unsigned int i; struct cpufreq_interactive_cpuinfo *pcpu; struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; /* (1) 初始化percpu变量per_cpu(cpuinfo, i): 每个cpu创建负载计算定时器pcpu->cpu_timer 其他的锁 */ /* Initalize per-cpu timers */ for_each_possible_cpu(i) { pcpu = &per_cpu(cpuinfo, i); init_timer_deferrable(&pcpu->cpu_timer); pcpu->cpu_timer.function = cpufreq_interactive_timer; pcpu->cpu_timer.data = i; init_timer(&pcpu->cpu_slack_timer); pcpu->cpu_slack_timer.function = cpufreq_interactive_nop_timer; spin_lock_init(&pcpu->load_lock); spin_lock_init(&pcpu->target_freq_lock); init_rwsem(&pcpu->enable_sem); } spin_lock_init(&speedchange_cpumask_lock); mutex_init(&gov_lock); /* (2) 创建频率调整进程speedchange_task, 把耗时的频率调整工作单独放到一个进程中去做 */ speedchange_task = kthread_create(cpufreq_interactive_speedchange_task, NULL, "cfinteractive"); if (IS_ERR(speedchange_task)) return PTR_ERR(speedchange_task); sched_setscheduler_nocheck(speedchange_task, SCHED_FIFO, ¶m); get_task_struct(speedchange_task); /* NB: wake up so the thread does not look hung to the freezer */ wake_up_process(speedchange_task); return cpufreq_register_governor(&cpufreq_gov_interactive);}
- 2、interactive另一部分初始化在cpufreq_governor_interactive()中的CPUFREQ_GOV_POLICY_INIT、CPUFREQ_GOV_START命令,在cpu online时执行:
static int cpufreq_governor_interactive(struct cpufreq_policy *policy, unsigned int event){ switch (event) { /* (1) CPUFREQ_GOV_POLICY_INIT命令初始化interactive governor最核心的参数 */ case CPUFREQ_GOV_POLICY_INIT: if (have_governor_per_policy()) { WARN_ON(tunables); } else if (tunables) { tunables->usage_count++; policy->governor_data = tunables; return 0; } tunables = kzalloc(sizeof(*tunables), GFP_KERNEL); if (!tunables) { pr_err("%s: POLICY_INIT: kzalloc failed\n", __func__); return -ENOMEM; } tunables->usage_count = 1; tunables->above_hispeed_delay = default_above_hispeed_delay; tunables->nabove_hispeed_delay = ARRAY_SIZE(default_above_hispeed_delay); tunables->go_hispeed_load = DEFAULT_GO_HISPEED_LOAD; tunables->target_loads = default_target_loads; tunables->ntarget_loads = ARRAY_SIZE(default_target_loads); tunables->min_sample_time = DEFAULT_MIN_SAMPLE_TIME; tunables->timer_rate = DEFAULT_TIMER_RATE; // interactive负载计算timer默认时间为20ms tunables->boostpulse_duration_val = DEFAULT_MIN_SAMPLE_TIME; tunables->timer_slack_val = DEFAULT_TIMER_SLACK; spin_lock_init(&tunables->target_loads_lock); spin_lock_init(&tunables->above_hispeed_delay_lock); policy->governor_data = tunables; if (!have_governor_per_policy()) { common_tunables = tunables; } rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr()); if (rc) { kfree(tunables); policy->governor_data = NULL; if (!have_governor_per_policy()) { common_tunables = NULL; } return rc; } if (!policy->governor->initialized) { idle_notifier_register(&cpufreq_interactive_idle_nb); cpufreq_register_notifier(&cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); } break; /* (2) CPUFREQ_GOV_START命令启动interactive负载计算的timer */ case CPUFREQ_GOV_START: mutex_lock(&gov_lock); freq_table = cpufreq_frequency_get_table(policy->cpu); if (tunables && !tunables->hispeed_freq) tunables->hispeed_freq = policy->max; for_each_cpu(j, policy->cpus) { pcpu = &per_cpu(cpuinfo, j); pcpu->policy = policy; pcpu->target_freq = policy->cur; pcpu->freq_table = freq_table; pcpu->floor_freq = pcpu->target_freq; pcpu->pol_floor_val_time = ktime_to_us(ktime_get()); pcpu->loc_floor_val_time = pcpu->pol_floor_val_time; pcpu->pol_hispeed_val_time = pcpu->pol_floor_val_time; pcpu->loc_hispeed_val_time = pcpu->pol_floor_val_time; down_write(&pcpu->enable_sem); del_timer_sync(&pcpu->cpu_timer); del_timer_sync(&pcpu->cpu_slack_timer); cpufreq_interactive_timer_start(tunables, j); pcpu->governor_enabled = 1; up_write(&pcpu->enable_sem); } mutex_unlock(&gov_lock); break; }
4.3.2.2、interactive governor的算法
interactive governor的核心算法在20ms周期的timer处理函数cpufreq_interactive_timer()中:
static void cpufreq_interactive_timer(unsigned long data){ u64 now; unsigned int delta_time; u64 cputime_speedadj; int cpu_load; struct cpufreq_interactive_cpuinfo *pcpu = &per_cpu(cpuinfo, data); struct cpufreq_interactive_tunables *tunables = pcpu->policy->governor_data; unsigned int new_freq; unsigned int loadadjfreq; unsigned int index; unsigned long flags; u64 max_fvtime; int j; unsigned int max_t_freq = 0;#ifdef CPUDVFS_POWER_MODE /* default(normal), low power, just make, performance(sports) */ int min_sample_t[4] = { 80, 20, 20, 80 }; int ppb_idx;#endif if (!down_read_trylock(&pcpu->enable_sem)) return; if (!pcpu->governor_enabled) goto exit; spin_lock_irqsave(&pcpu->load_lock, flags); /* (1) 累加cpu上自从cpu_up()以来的负载, pcpu->cputime_speedadj += active_time * pcpu->policy->cur; pcpu->cputime_speedadj = (active_time * pcpu->policy->cur)samp1 + ... +(active_time * pcpu->policy->cur)sampn ; 每个采样周期为20mS,累加:第1个20ms中active_time*cur_cpu_freq + 第2个20ms中active_time*cur_cpu_freq +...+ 第n个20ms中active_time*cur_cpu_freq */ now = update_load(data); /* (2) 自从cpu_up()以来的总的时间 delta_time = active_time + ilde_time */ delta_time = (unsigned int)(now - pcpu->cputime_speedadj_timestamp); cputime_speedadj = pcpu->cputime_speedadj; spin_unlock_irqrestore(&pcpu->load_lock, flags); if (WARN_ON_ONCE(!delta_time)) goto rearm; spin_lock_irqsave(&pcpu->target_freq_lock, flags); /* (3) 总的负载/总时间 = 平均频率 */ do_div(cputime_speedadj, delta_time); /* (4) (平均频率 * 100)/当前频率 = 当前cpu的占用率 */ loadadjfreq = (unsigned int)cputime_speedadj * 100; cpu_load = loadadjfreq / pcpu->policy->cur; tunables->boosted = tunables->boost_val || now < tunables->boostpulse_endtime;#ifdef CPUDVFS_POWER_MODE ppb_idx = mt_cpufreq_get_ppb_state(); { unsigned int idx = mt_cpufreq_ppb_hispeed_freq(data, ppb_idx); tunables->hispeed_freq = pcpu->freq_table[idx].frequency; tunables->min_sample_time = min_sample_t[ppb_idx] * USEC_PER_MSEC; if (hispeed_freq_perf != 0) tunables->hispeed_freq = hispeed_freq_perf; if (min_sample_time_perf != 0) tunables->min_sample_time = min_sample_time_perf; }#endif /* (5) 如果cpu占用率达到go_hispeed_load(99%),或者在boost状态, 频率直接调整到最高频率hispeed_freq */ if (cpu_load >= tunables->go_hispeed_load || tunables->boosted) { if (pcpu->policy->cur < tunables->hispeed_freq) { new_freq = tunables->hispeed_freq; } else { new_freq = choose_freq(pcpu, loadadjfreq); if (new_freq < tunables->hispeed_freq) new_freq = tunables->hispeed_freq; } /* (6) 否则使用choose_freq()根据当前负载来计算对应的频率 */ } else { new_freq = choose_freq(pcpu, loadadjfreq); if (new_freq > tunables->hispeed_freq && pcpu->policy->cur < tunables->hispeed_freq) new_freq = tunables->hispeed_freq; } /* (7) 如果计算出的新频率 > hispeed_freq,不能马上调整, 在hispeed_freq以上的频率上必须待满above_hispeed_delay(20ms),才能继续往上调整频率 */ if (pcpu->policy->cur >= tunables->hispeed_freq && new_freq > pcpu->policy->cur && now - pcpu->pol_hispeed_val_time < freq_to_above_hispeed_delay(tunables, pcpu->policy->cur)) { trace_cpufreq_interactive_notyet( data, cpu_load, pcpu->target_freq, pcpu->policy->cur, new_freq); spin_unlock_irqrestore(&pcpu->target_freq_lock, flags); goto rearm; } pcpu->loc_hispeed_val_time = now; if (cpufreq_frequency_table_target(pcpu->policy, pcpu->freq_table, new_freq, CPUFREQ_RELATION_L, &index)) { spin_unlock_irqrestore(&pcpu->target_freq_lock, flags); goto rearm; } new_freq = pcpu->freq_table[index].frequency; /* (8) 如果之前的频率 > hispeed_freq,或者发生boost 现在需要往低调频,之前的频率需要待满min_sample_time(80ms) */ /* * Do not scale below floor_freq unless we have been at or above the * floor frequency for the minimum sample time since 
last validated. */ max_fvtime = max(pcpu->pol_floor_val_time, pcpu->loc_floor_val_time); if (new_freq < pcpu->floor_freq && pcpu->target_freq >= pcpu->policy->cur) { if (now - max_fvtime < tunables->min_sample_time) { trace_cpufreq_interactive_notyet( data, cpu_load, pcpu->target_freq, pcpu->policy->cur, new_freq); spin_unlock_irqrestore(&pcpu->target_freq_lock, flags); goto rearm; } } /* * Update the timestamp for checking whether speed has been held at * or above the selected frequency for a minimum of min_sample_time, * if not boosted to hispeed_freq. If boosted to hispeed_freq then we * allow the speed to drop as soon as the boostpulse duration expires * (or the indefinite boost is turned off). */ if (!tunables->boosted || new_freq > tunables->hispeed_freq) { pcpu->floor_freq = new_freq; if (pcpu->target_freq >= pcpu->policy->cur || new_freq >= pcpu->policy->cur) pcpu->loc_floor_val_time = now; } /* (9) 如果当前cpu往低调整频率,判断当前policy是否需要更新, 因为多个cpu共享一个policy,取最大期望频率cpu的值作为整个policy的调整值 */ if (pcpu->target_freq == new_freq && pcpu->target_freq <= pcpu->policy->cur) { max_t_freq = 0; for_each_cpu(j, pcpu->policy->cpus) { struct cpufreq_interactive_cpuinfo *pjcpu; pjcpu = &per_cpu(cpuinfo, j); max_t_freq = max(max_t_freq, pjcpu->target_freq); } if (max_t_freq != pcpu->policy->cur) goto pass_t; trace_cpufreq_interactive_already( data, cpu_load, pcpu->target_freq, pcpu->policy->cur, new_freq); spin_unlock_irqrestore(&pcpu->target_freq_lock, flags); goto rearm; }pass_t: trace_cpufreq_interactive_target(data, cpu_load, pcpu->target_freq, pcpu->policy->cur, new_freq); /* (10) 如果policy需要更新唤醒speedchange_task来执行调频动作 */ pcpu->target_freq = new_freq; spin_unlock_irqrestore(&pcpu->target_freq_lock, flags); spin_lock_irqsave(&speedchange_cpumask_lock, flags); cpumask_set_cpu(data, &speedchange_cpumask); spin_unlock_irqrestore(&speedchange_cpumask_lock, flags); wake_up_process(speedchange_task);rearm: if (!timer_pending(&pcpu->cpu_timer)) cpufreq_interactive_timer_resched(pcpu);exit: up_read(&pcpu->enable_sem); return;}|→static unsigned int choose_freq(struct cpufreq_interactive_cpuinfo *pcpu, unsigned int loadadjfreq){ unsigned int freq = pcpu->policy->cur; unsigned int prevfreq, freqmin, freqmax; unsigned int tl; int index; freqmin = 0; freqmax = UINT_MAX; do { prevfreq = freq; /* (6.1) tl = 90,loadadjfreq = (平均频率 * 100) 即 newfreq = (平均频率 * 100)/ 90 相当于cpufreq_frequency_table_target(CPUFREQ_RELATION_L), 相当于newfreq往低档位的计算, ooooo这里带来一个非常严重的问题,如果档位之间差值大于100/90,向上调频将调不上去 */ tl = freq_to_targetload(pcpu->policy->governor_data, freq); /* * Find the lowest frequency where the computed load is less * than or equal to the target load. */ if (cpufreq_frequency_table_target( pcpu->policy, pcpu->freq_table, loadadjfreq / tl, CPUFREQ_RELATION_L, &index)) break; freq = pcpu->freq_table[index].frequency; if (freq > prevfreq) { /* The previous frequency is too low. */ freqmin = prevfreq; if (freq >= freqmax) { /* * Find the highest frequency that is less * than freqmax. */ if (cpufreq_frequency_table_target( pcpu->policy, pcpu->freq_table, freqmax - 1, CPUFREQ_RELATION_H, &index)) break; freq = pcpu->freq_table[index].frequency; if (freq == freqmin) { /* * The first frequency below freqmax * has already been found to be too * low. freqmax is the lowest speed * we found that is fast enough. */ freq = freqmax; break; } } } else if (freq < prevfreq) { /* The previous frequency is high enough. */ freqmax = prevfreq; if (freq <= freqmin) { /* * Find the lowest frequency that is higher * than freqmin. 
*/ if (cpufreq_frequency_table_target( pcpu->policy, pcpu->freq_table, freqmin + 1, CPUFREQ_RELATION_L, &index)) break; freq = pcpu->freq_table[index].frequency; /* * If freqmax is the first frequency above * freqmin then we have already found that * this speed is fast enough. */ if (freq == freqmax) break; } } /* If same frequency chosen as previous then done. */ } while (freq != prevfreq); return freq;}
4.4、cpu hotplug调整
还有一种调节负载的方式是cpu hotplug:
- 1、cpu被hotplug掉的功耗小于cpu进入idle的功耗;如果整个cluster的cpu都offline,cluster也可以poweroff;所以hotplug能够节省功耗;
- 2、但是hotplug是有开销的:hotplug动作在速度慢的时候达到了ms级别,另外进程的迁移也是有开销的;cpu的hotplug必须遵循顺序插拔的规则,如果先拔掉负载重的cpu也是不合理的;
- 3、MTK的技术限制必须使用hotplug:MTK平台只有在剩一个online cpu的情况下才能进入深度idle模式,所以MTK平台必须支持hotplug;而samsung、qualcomm在多核online的情况下可以进入深度idle,所以一般不支持cpu hotplug;
4.4.1、hotplug 底层实现
4.4.1.1、cpu_up()/cpu_down()
kernel对hotplug的支持是很完善的,标准接口cpu_up()/cpu_down()可以进行hotplug。
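下面是一个调用这组标准接口的最小示意(假设运行在使能了CONFIG_HOTPLUG_CPU的内核代码上下文中;并发保护、调用时机等策略问题这里全部省略,仅为草图):

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/printk.h>

/* 示意:把指定cpu拔出或插回 */
static void demo_hotplug_one_cpu(unsigned int cpu)
{
	int ret;

	if (cpu_online(cpu)) {
		ret = cpu_down(cpu);	/* offline:过程中会触发migrate_tasks()迁走runnable进程 */
		if (ret)
			pr_err("cpu_down(%u) failed: %d\n", cpu, ret);
	} else {
		ret = cpu_up(cpu);	/* online:之后由负载均衡算法自然把任务迁移进来 */
		if (ret)
			pr_err("cpu_up(%u) failed: %d\n", cpu, ret);
	}
}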
4.4.1.2、hotplug 进程迁移
在cpu_down()时,需要调用migration_call() -> migrate_tasks()把cpu上所有runnable进程迁移到其他cpu;在cpu_up()时,并不需要在函数中迁移进程,直接等待负载均衡算法的迁移。
static void migrate_tasks(struct rq *dead_rq){ struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; int dest_cpu; /* * Fudge the rq selection such that the below task selection loop * doesn't get stuck on the currently eligible stop task. * * We're currently inside stop_machine() and the rq is either stuck * in the stop_machine_cpu_stop() loop, or we're executing this code, * either way we should never end up calling schedule() until we're * done here. */ rq->stop = NULL; /* * put_prev_task() and pick_next_task() sched * class method both need to have an up-to-date * value of rq->clock[_task] */ update_rq_clock(rq); unthrottle_offline_rt_rqs(rq); for (;;) { /* * There's this thread running, bail when that's the only * remaining thread. */ if (rq->nr_running == 1) break; /* (1) 逐个从rq中获取task = next */ /* * pick_next_task assumes pinned rq->lock. */ lockdep_pin_lock(&rq->lock); next = pick_next_task(rq, &fake_task); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); /* * Rules for changing task_struct::cpus_allowed are holding * both pi_lock and rq->lock, such that holding either * stabilizes the mask. * * Drop rq->lock is not quite as disastrous as it usually is * because !cpu_active at this point, which means load-balance * will not interfere. Also, stop-machine. */ lockdep_unpin_lock(&rq->lock); raw_spin_unlock(&rq->lock); raw_spin_lock(&next->pi_lock); raw_spin_lock(&rq->lock); /* * Since we're inside stop-machine, _nothing_ should have * changed the task, WARN if weird stuff happened, because in * that case the above rq->lock drop is a fail too. */ if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { raw_spin_unlock(&next->pi_lock); continue; } /* (2) 找到最适合next进程迁移的目的cpu */ /* Find suitable destination for @next, with force if needed. */ dest_cpu = select_fallback_rq(dead_rq->cpu, next); /* (3) 实施进程迁移 */ rq = __migrate_task(rq, next, dest_cpu); if (rq != dead_rq) { raw_spin_unlock(&rq->lock); rq = dead_rq; raw_spin_lock(&rq->lock); } raw_spin_unlock(&next->pi_lock); } rq->stop = stop;}|→static int select_fallback_rq(int cpu, struct task_struct *p){ int nid = cpu_to_node(cpu); const struct cpumask *nodemask = NULL; enum { cpuset, possible, fail } state = cpuset; int dest_cpu; /* * If the node that the cpu is on has been offlined, cpu_to_node() * will return -1. There is no cpu on the node, and we should * select the cpu on the other node. */ if (nid != -1) { nodemask = cpumask_of_node(nid); /* Look for allowed, online CPU in same node. */ for_each_cpu(dest_cpu, nodemask) { if (!cpu_online(dest_cpu)) continue; if (!cpu_active(dest_cpu)) continue; if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) return dest_cpu; } } for (;;) { /* (2.1) 最好的情况:在tsk_cpus_allowed(p)中能找到online cpu迁移 */ /* Any allowed, online CPU? */ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { if (!cpu_online(dest_cpu)) continue; if (!cpu_active(dest_cpu)) continue; goto out; } /* No more Mr. Nice Guy. */ switch (state) { /* (2.2) 其次的情况:在cpuset中能找到online cpu迁移 */ case cpuset: if (IS_ENABLED(CONFIG_CPUSETS)) { cpuset_cpus_allowed_fallback(p); state = possible; break; } /* (2.3) 最差的情况:在系统所有cpu中能找到online cpu迁移 */ /* fall-through */ case possible: do_set_cpus_allowed(p, cpu_possible_mask); state = fail; break; case fail: BUG(); break; } }out: if (state != cpuset) { /* * Don't tell them about moving exiting tasks or * kernel threads (both mm NULL), since they never * leave kernel. 
*/ if (p->mm && printk_ratelimit()) { printk_deferred("process %d (%s) no longer affine to cpu%d\n", task_pid_nr(p), p->comm, cpu); } } return dest_cpu;}
4.4.2、MTK hotplug算法
在有了hotplug底层cpu_up()、cpu_down()的实现以后,在此之上还需要有一套算法根据cpu的负载来动态hotplug。MTK这套算法比较齐全,主要分为HICA、hps_algo_main两部分。
4.4.2.1、HICA/PPM
HICA和hps的关系,其实是HICA决定了一种大的mode,而hps在大的mode中实现精细化的调整。
比如对MT6799 HICA支持3种模式:
- 1、LL_ONLY。 // 只开小核
- 2、L_ONLY。 // 只开中核
- 3、ALL。 // LL、L、B共10个核都可以使用
HICA在mt_ppm_hica_update_algo_data()中计算负载,根据负载变化来决定mode:
_hps_task_main() -> mt_ppm_hica_update_algo_data()↓void mt_ppm_hica_update_algo_data(unsigned int cur_loads, unsigned int cur_nr_heavy_task, unsigned int cur_tlp){ struct ppm_power_state_data *state_info = ppm_get_power_state_info(); struct ppm_state_transfer_data *data; enum ppm_power_state cur_state; enum ppm_mode cur_mode; int i, j; FUNC_ENTER(FUNC_LV_HICA); ppm_lock(&hica_policy.lock); ppm_hica_algo_data.ppm_cur_loads = cur_loads; ppm_hica_algo_data.ppm_cur_tlp = cur_tlp; ppm_hica_algo_data.ppm_cur_nr_heavy_task = cur_nr_heavy_task; cur_state = ppm_hica_algo_data.cur_state; cur_mode = ppm_main_info.cur_mode; ppm_dbg(HICA, "cur_loads = %d, cur_tlp = %d, cur_nr_heavy_task = %d, cur_state = %s, cur_mode = %d\n", cur_loads, cur_tlp, cur_nr_heavy_task, ppm_get_power_state_name(cur_state), cur_mode); if (!ppm_main_info.is_enabled || !hica_policy.is_enabled || ppm_main_info.is_in_suspend || cur_state == PPM_POWER_STATE_NONE) goto end;#if defined(CONFIG_MACH_MT6757) || defined(CONFIG_MACH_KIBOPLUS) if (setup_max_cpus == 4) goto end;#endif#ifdef PPM_IC_SEGMENT_CHECK if (ppm_main_info.fix_state_by_segment != PPM_POWER_STATE_NONE) goto end;#endif /* skip HICA if DVFS is not ready (we cannot get current freq...) */ if (!ppm_main_info.client_info[PPM_CLIENT_DVFS].limit_cb) goto end; /* Power state is fixed by user, skip HICA state calculation */ if (fix_power_state != PPM_POWER_STATE_NONE) goto end; /* (1) 从transfer_by_perf到transfer_by_pwr逐个遍历判断当前state是否需要改变 */ for (i = 0; i < 2; i++) { data = (i == 0) ? state_info[cur_state].transfer_by_perf : state_info[cur_state].transfer_by_pwr; /* (2) 如果当前state有几种变化逐个遍历,比如: 当前state为ALL, 可以ALL -> LL_ONLY 也可以ALL -> L_ONLY */ for (j = 0; j < data->size; j++) { if (!data->transition_data[j].transition_rule || !((1 << cur_mode) & data->transition_data[j].mode_mask)) continue; /* (3) 如果state变化,获取新的state返回 */ if (data->transition_data[j].transition_rule( ppm_hica_algo_data, &data->transition_data[j])) { ppm_hica_algo_data.new_state = data->transition_data[j].next_state; ppm_dbg(HICA, "[%s(%d)] Need state transfer: %s --> %s\n", (i == 0) ? "PERF" : "PWR", j, ppm_get_power_state_name(cur_state), ppm_get_power_state_name(ppm_hica_algo_data.new_state) ); goto end; /* (4) 如果state不变化,维持当前state,继续遍历*/ } else { ppm_hica_algo_data.new_state = cur_state;#ifdef PPM_HICA_2P0 ppm_dbg(HICA, "[%s(%d)]hold in %s state, capacity_hold_cnt = %d, bigtsk_hold_cnt = %d, freq_hold_cnt = %d\n", (i == 0) ? "PERF" : "PWR", j, ppm_get_power_state_name(cur_state), data->transition_data[j].capacity_hold_cnt, data->transition_data[j].bigtsk_hold_cnt, data->transition_data[j].freq_hold_cnt );#else#if PPM_HICA_VARIANT_SUPPORT ppm_dbg(HICA, "[%s(%d)]hold in %s state, loading_cnt = %d, freq_cnt = %d, overutil_l_hold_cnt = %d, .overutil_h_hold_cnt = %d\n", (i == 0) ? "PERF" : "PWR", j, ppm_get_power_state_name(cur_state), data->transition_data[j].loading_hold_cnt, data->transition_data[j].freq_hold_cnt, data->transition_data[j].overutil_l_hold_cnt, data->transition_data[j].overutil_h_hold_cnt );#else ppm_dbg(HICA, "[%s(%d)]hold in %s state, loading_cnt = %d, freq_cnt = %d\n", (i == 0) ? "PERF" : "PWR", j, ppm_get_power_state_name(cur_state), data->transition_data[j].loading_hold_cnt, data->transition_data[j].freq_hold_cnt );#endif#endif } } }end: ppm_unlock(&hica_policy.lock); FUNC_EXIT(FUNC_LV_HICA);}
state切换的判断函数和阈值定义在下面的表格中,除了heavy_task和big_task的判断之外,基本是根据util/capacity来计算cpu的占用情况:
struct ppm_power_state_data pwr_state_info_SB[NR_PPM_POWER_STATE] = { [0] = { .name = __stringify(LL_ONLY), .state = PPM_POWER_STATE_LL_ONLY, PWR_STATE_INFO(LL_ONLY, SB) }, [1] = { .name = __stringify(L_ONLY), .state = PPM_POWER_STATE_L_ONLY, PWR_STATE_INFO(L_ONLY, SB) }, [2] = { .name = __stringify(ALL), .state = PPM_POWER_STATE_ALL, PWR_STATE_INFO(ALL, SB) },};static struct ppm_state_transfer state_pwr_transfer_ALL[] = { TRANS_DATA( LL_ONLY, PPM_MODE_MASK_ALL_MODE, ppm_trans_rule_ALL_to_LL_ONLY, PPM_DEFAULT_HOLD_TIME, PPM_CAPACITY_DOWN, PPM_DEFAULT_BIGTSK_TIME, 0, 0, 0 ), TRANS_DATA( L_ONLY, PPM_MODE_MASK_ALL_MODE, ppm_trans_rule_ALL_to_L_ONLY, PPM_DEFAULT_HOLD_TIME, PPM_CAPACITY_DOWN, PPM_DEFAULT_BIGTSK_TIME, 2, 4, 0 ),};STATE_TRANSFER_DATA_PWR(ALL);static struct ppm_state_transfer state_perf_transfer_ALL[] = { TRANS_DATA(NONE, 0, NULL, 0, 0, 0, 0, 0, 0),};STATE_TRANSFER_DATA_PERF(ALL);/* 举例:当前state为ALL 尝试从power的角度从ALL切换到LL_ONLY:ppm_trans_rule_ALL_to_LL_ONLY() 尝试从power的角度从ALL切换到L_ONLY:ppm_trans_rule_ALL_to_L_ONLY() */static bool ppm_trans_rule_ALL_to_LL_ONLY( struct ppm_hica_algo_data data, struct ppm_state_transfer *settings){ /* keep in ALL state if root cluster is fixed at L or B */ if (ppm_main_info.fixed_root_cluster == PPM_CLUSTER_L || ppm_main_info.fixed_root_cluster == PPM_CLUSTER_B) return false; /* (1) 从heavy task负载判断是否需要切换模式 */#if PPM_HEAVY_TASK_INDICATE_SUPPORT { unsigned int heavy_task, i; for_each_ppm_clusters(i) { heavy_task = hps_get_hvytsk(i); if (heavy_task) { ppm_dbg(HICA, "Stay in ALL due to cluster%d heavy task = %d\n", i, heavy_task); trace_ppm_hica( ppm_get_power_state_name(PPM_POWER_STATE_ALL), ppm_get_power_state_name(PPM_POWER_STATE_LL_ONLY), -1, -1, -1, -1, heavy_task, -1, false); settings->capacity_hold_cnt = 0; return false; } } }#endif /* (2) 从big task负载判断是否需要切换模式 */#if PPM_BIG_TASK_INDICATE_SUPPORT { unsigned int big_task_L = hps_get_bigtsk(PPM_CLUSTER_L); unsigned int big_task_B = hps_get_bigtsk(PPM_CLUSTER_B); if (big_task_L || big_task_B) { ppm_dbg(HICA, "Stay in ALL due to L/B big task = %d/%d\n", big_task_L, big_task_B); trace_ppm_hica( ppm_get_power_state_name(PPM_POWER_STATE_ALL), ppm_get_power_state_name(PPM_POWER_STATE_LL_ONLY), -1, -1, big_task_L, big_task_B, -1, -1, false); settings->capacity_hold_cnt = 0; return false; } }#endif /* (3) 从util/capacity负载判断是否需要切换模式 */ { /* check capacity */ unsigned long usage, usage_total = 0, capacity = 0, dummy; unsigned int i; for_each_ppm_clusters(i) { if (sched_get_cluster_util(i, &usage, &dummy)) { ppm_err("Get cluster %d util failed\n", i); return false; } usage_total += usage; if (i == PPM_CLUSTER_LL) capacity = dummy; } ppm_dbg(HICA, "usage_total = %ld, LL capacity = %ld\n", usage_total, capacity); /* (3.1) (util/capacity)超过门限值(settings->capacity_bond) 是否达到次数settings->capacity_hold_time, 如果条件满足进行state切换 */ if (usage_total < capacity * settings->capacity_bond / 100) { settings->capacity_hold_cnt++; if (settings->capacity_hold_cnt >= settings->capacity_hold_time) { trace_ppm_hica( ppm_get_power_state_name(PPM_POWER_STATE_ALL), ppm_get_power_state_name(PPM_POWER_STATE_LL_ONLY), usage_total, capacity, -1, -1, -1, -1, true); return true; } } else settings->capacity_hold_cnt = 0; trace_ppm_hica( ppm_get_power_state_name(PPM_POWER_STATE_ALL), ppm_get_power_state_name(PPM_POWER_STATE_LL_ONLY), usage_total, capacity, -1, -1, -1, -1, false); } return false;}
新的state计算完成后,是通过以下通道配置下去的:
_hps_task_main() -> mt_ppm_main() -> ppm_hica_update_limit_cb() -> ppm_hica_set_default_limit_by_state()↓void ppm_hica_set_default_limit_by_state(enum ppm_power_state state, struct ppm_policy_data *policy){ unsigned int i; struct ppm_power_state_data *state_info = ppm_get_power_state_info(); FUNC_ENTER(FUNC_LV_HICA); for (i = 0; i < policy->req.cluster_num; i++) { if (state >= PPM_POWER_STATE_NONE) { if (state > NR_PPM_POWER_STATE) ppm_err("@%s: Invalid PPM state(%d)\n", __func__, state); policy->req.limit[i].min_cpu_core = get_cluster_min_cpu_core(i); policy->req.limit[i].max_cpu_core = get_cluster_max_cpu_core(i); policy->req.limit[i].min_cpufreq_idx = get_cluster_min_cpufreq_idx(i); policy->req.limit[i].max_cpufreq_idx = get_cluster_max_cpufreq_idx(i);#ifdef PPM_DISABLE_CLUSTER_MIGRATION /* keep at least 1 LL */ if (i == 0) policy->req.limit[i].min_cpu_core = 1;#endif /* (1) HICA根据新的state,配置对应的min_cpu_core/max_cpu_core到本policy当中 */ } else { policy->req.limit[i].min_cpu_core = state_info[state].cluster_limit->state_limit[i].min_cpu_core; policy->req.limit[i].max_cpu_core = state_info[state].cluster_limit->state_limit[i].max_cpu_core; policy->req.limit[i].min_cpufreq_idx = state_info[state].cluster_limit->state_limit[i].min_cpufreq_idx; policy->req.limit[i].max_cpufreq_idx = state_info[state].cluster_limit->state_limit[i].max_cpufreq_idx; } }#ifdef PPM_IC_SEGMENT_CHECK /* ignore HICA min freq setting for L cluster in L_ONLY state */ if (state == PPM_POWER_STATE_L_ONLY && ppm_main_info.fix_state_by_segment == PPM_POWER_STATE_L_ONLY) policy->req.limit[1].min_cpufreq_idx = get_cluster_min_cpufreq_idx(1);#endif FUNC_EXIT(FUNC_LV_HICA);}/*==============================================================*//* Local Variables *//*==============================================================*//* cluster limit for each power state */static const struct ppm_cluster_limit state_limit_LL_ONLY[] = { [0] = LIMIT(15, 0, 1, 4), [1] = LIMIT(15, 0, 0, 0), [2] = LIMIT(15, 0, 0, 0),};STATE_LIMIT(LL_ONLY);static const struct ppm_cluster_limit state_limit_L_ONLY[] = { [0] = LIMIT(15, 0, 0, 0), [1] = LIMIT(8, 0, 1, 4), [2] = LIMIT(15, 0, 0, 0),};STATE_LIMIT(L_ONLY);static const struct ppm_cluster_limit state_limit_ALL[] = { [0] = LIMIT(15, 0, 0, 4), [1] = LIMIT(15, 0, 0, 4), [2] = LIMIT(15, 0, 0, 2),};STATE_LIMIT(ALL);_hps_task_main() -> mt_ppm_main() -> ppm_limit_callback()↓static void ppm_limit_callback(struct ppm_client_req req){ struct ppm_client_req *p = (struct ppm_client_req *)&req; int i; /* (2) 将HICA state对应的policy配置到hps限制中hps_sys.cluster_info[i].ref_base_value/ref_limit_value */ mutex_lock(&hps_ctxt.para_lock); hps_sys.ppm_root_cluster = p->root_cluster; for (i = 0; i < p->cluster_num; i++) { /* * hps_warn("ppm_limit_callback -> cluster%d: has_advise_core = %d, [%d, %d]\n", * i, p->cpu_limit[i].has_advise_core, * p->cpu_limit[i].min_cpu_core, p->cpu_limit[i].max_cpu_core); */#ifdef _TRACE_ trace_ppm_limit_callback_update(i, p->cpu_limit[i].has_advise_core, p->cpu_limit[i].min_cpu_core, p->cpu_limit[i].max_cpu_core);#endif if (!p->cpu_limit[i].has_advise_core) { hps_sys.cluster_info[i].ref_base_value = p->cpu_limit[i].min_cpu_core; hps_sys.cluster_info[i].ref_limit_value = p->cpu_limit[i].max_cpu_core; } else { hps_sys.cluster_info[i].ref_base_value = hps_sys.cluster_info[i].ref_limit_value = p->cpu_limit[i].advise_cpu_core; } } mutex_unlock(&hps_ctxt.para_lock); hps_ctxt.is_interrupt = 1; hps_task_wakeup_nolock();}
4.4.2.2、hps_algo_main
_hps_task_main() -> hps_algo_main()↓void hps_algo_main(void){ unsigned int i, val, base_val, action_print, origin_root, action_break; char str_online[64], str_ref_limit[64], str_ref_base[64], str_criteria_limit[64], str_criteria_base[64], str_target[64], str_hvytsk[64], str_pwrseq[64], str_bigtsk[64]; char *online_ptr = str_online; char *criteria_limit_ptr = str_criteria_limit; char *criteria_base_ptr = str_criteria_base; char *ref_limit_ptr = str_ref_limit; char *ref_base_ptr = str_ref_base; char *hvytsk_ptr = str_hvytsk; char *target_ptr = str_target; char *pwrseq_ptr = str_pwrseq; char *bigtsk_ptr = str_bigtsk; static unsigned int hrtbt_dbg;#ifdef CONFIG_MEIZU_BSP static unsigned long int j;#endif //CONFIG_MEIZU_BSP#ifdef CONFIG_MTK_ICCS_SUPPORT unsigned char real_online_power_state_bitmask = 0; unsigned char real_target_power_state_bitmask = 0; unsigned char iccs_online_power_state_bitmask = 0; unsigned char iccs_target_power_state_bitmask = iccs_get_target_power_state_bitmask(); unsigned char target_cache_shared_state_bitmask = 0;#endif /* Initial value */ base_val = action_print = action_break = hps_sys.total_online_cores = 0; hps_sys.up_load_avg = hps_sys.down_load_avg = hps_sys.tlp_avg = hps_sys.rush_cnt = 0; hps_sys.action_id = origin_root = 0; /* * run algo or not by hps_ctxt.enabled */ if ((u64) ktime_to_ms(ktime_sub(ktime_get(), hps_ctxt.hps_hrt_ktime)) >= HPS_HRT_DBG_MS) action_print = hrtbt_dbg = 1; else hrtbt_dbg = 0; mutex_lock(&hps_ctxt.lock); hps_ctxt.action = ACTION_NONE; atomic_set(&hps_ctxt.is_ondemand, 0); if (!hps_ctxt.enabled) goto HPS_END; if (hps_ctxt.eas_indicator) { /*Set cpu cores by scheduler*/ goto HPS_ALGO_END; } /* * algo - begin */ /*Back up limit and base value for check */ mutex_lock(&hps_ctxt.para_lock); if ((hps_sys.cluster_info[0].base_value == 0) && (hps_sys.cluster_info[1].base_value == 0) && (hps_sys.cluster_info[2].base_value == 0) && (hps_sys.cluster_info[0].limit_value == 0) && (hps_sys.cluster_info[1].limit_value == 0) && (hps_sys.cluster_info[2].limit_value == 0)) { hps_sys.cluster_info[0].base_value = hps_sys.cluster_info[0].ref_base_value = 0; hps_sys.cluster_info[1].base_value = hps_sys.cluster_info[1].ref_base_value = 0; hps_sys.cluster_info[2].base_value = hps_sys.cluster_info[2].ref_base_value = 0; hps_sys.cluster_info[0].limit_value = hps_sys.cluster_info[0].ref_limit_value = 4; hps_sys.cluster_info[1].limit_value = hps_sys.cluster_info[1].ref_limit_value = 4; hps_sys.cluster_info[2].limit_value = hps_sys.cluster_info[2].ref_limit_value = 0; } for (i = 0; i < hps_sys.cluster_num; i++) { hps_sys.cluster_info[i].base_value = hps_sys.cluster_info[i].ref_base_value; hps_sys.cluster_info[i].limit_value = hps_sys.cluster_info[i].ref_limit_value; } for (i = 0; i < hps_sys.cluster_num; i++) { base_val += hps_sys.cluster_info[i].base_value; hps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].online_core_num = 0; hps_sys.cluster_info[i].online_core_num = hps_get_cluster_cpus(hps_sys.cluster_info[i].cluster_id); hps_sys.total_online_cores += hps_sys.cluster_info[i].online_core_num; } mutex_unlock(&hps_ctxt.para_lock); /* Determine root cluster */ origin_root = hps_sys.root_cluster_id; hps_define_root_cluster(&hps_sys);#ifdef CONFIG_MACH_MT6799 if (hps_ctxt.smart_det_enabled) { mutex_lock(&hps_ctxt.para_lock); hps_sys.root_cluster_id = 1;/*Change root to L cluster when smart detection is enabled*/ mutex_unlock(&hps_ctxt.para_lock); }#endif if (origin_root != hps_sys.root_cluster_id) hps_sys.action_id = HPS_SYS_CHANGE_ROOT; /* * update 
history - tlp */ val = hps_ctxt.tlp_history[hps_ctxt.tlp_history_index]; hps_ctxt.tlp_history[hps_ctxt.tlp_history_index] = hps_ctxt.cur_tlp; hps_ctxt.tlp_sum += hps_ctxt.cur_tlp; hps_ctxt.tlp_history_index = (hps_ctxt.tlp_history_index + 1 == hps_ctxt.tlp_times) ? 0 : hps_ctxt.tlp_history_index + 1; ++hps_ctxt.tlp_count; if (hps_ctxt.tlp_count > hps_ctxt.tlp_times) { WARN_ON(hps_ctxt.tlp_sum < val); hps_ctxt.tlp_sum -= val; hps_ctxt.tlp_avg = hps_ctxt.tlp_sum / hps_ctxt.tlp_times; } else { hps_ctxt.tlp_avg = hps_ctxt.tlp_sum / hps_ctxt.tlp_count; } if (hps_ctxt.stats_dump_enabled) hps_ctxt_print_algo_stats_tlp(0); /*Determine eas enabled or not*/ if (!hps_ctxt.eas_enabled) hps_sys.hps_sys_ops[2].enabled = 0; for (i = 0 ; i < hps_sys.cluster_num ; i++) hps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].online_core_num; /* (1) 逐个调用 hps_sys_ops()根据各种算法来判断当前cpu是否需要hotplug */ for (i = 0; i < hps_sys.func_num; i++) { if (hps_sys.hps_sys_ops[i].enabled == 1) { if (hps_sys.hps_sys_ops[i].hps_sys_func_ptr()) { hps_sys.action_id = hps_sys.hps_sys_ops[i].func_id; break; } } }/* if (hps_ctxt.heavy_task_enabled) if (hps_algo_heavytsk_det()) hps_sys.action_id = 0xE1;*/ if (hps_ctxt.big_task_enabled) if (hps_algo_big_task_det()) hps_sys.action_id = 0xE2; if (hps_sys.action_id == 0) goto HPS_END;HPS_ALGO_END:#ifdef CONFIG_MACH_MT6799 if (hps_ctxt.smart_det_enabled) { if (hps_sys.cluster_info[2].bigTsk_value <= 1) { mutex_lock(&hps_ctxt.para_lock); hps_sys.cluster_info[2].target_core_num = 1; mutex_unlock(&hps_ctxt.para_lock); } }#endif /* * algo - end */ /* (2) 对limit进行判断,HICA的值就配置到这里 */ /*Base and limit check */ hps_check_base_limit(&hps_sys); /* Ensure that root cluster must one online cpu at less */ if (hps_sys.cluster_info[hps_sys.root_cluster_id].target_core_num <= 0) hps_sys.cluster_info[hps_sys.root_cluster_id].target_core_num = 1;#ifdef CONFIG_MTK_ICCS_SUPPORT real_online_power_state_bitmask = 0; real_target_power_state_bitmask = 0; for (i = 0; i < hps_sys.cluster_num; i++) { real_online_power_state_bitmask |= ((hps_sys.cluster_info[i].online_core_num > 0) << i); real_target_power_state_bitmask |= ((hps_sys.cluster_info[i].target_core_num > 0) << i); } iccs_online_power_state_bitmask = iccs_target_power_state_bitmask; iccs_target_power_state_bitmask = real_target_power_state_bitmask; iccs_get_target_state(&iccs_target_power_state_bitmask, &target_cache_shared_state_bitmask); /* * pr_err("[%s] iccs_target_power_state_bitmask: 0x%x\n", __func__, iccs_target_power_state_bitmask); */ for (i = 0; i < hps_sys.cluster_num; i++) { hps_sys.cluster_info[i].iccs_state = (((real_online_power_state_bitmask >> i) & 1) << 3) | (((real_target_power_state_bitmask >> i) & 1) << 2) | (((iccs_online_power_state_bitmask >> i) & 1) << 1) | (((iccs_target_power_state_bitmask >> i) & 1) << 0); /* * pr_err("[%s] cluster: 0x%x iccs_state: 0x%x\n", __func__, i, hps_sys.cluster_info[i].iccs_state); */ if (hps_get_iccs_pwr_status(i) == 0x1) iccs_cluster_on_off(i, 1); else if (hps_get_iccs_pwr_status(i) == 0x2) iccs_cluster_on_off(i, 0); }#endif /* (3) 经过各种算法计算后目标值是target_core_num,而当前值是online_core_num; 如果不一致,进行cpu_up()/cpu_down()操作 */#if 1 /*Make sure that priority of power on action is higher than power down. 
*/ for (i = 0; i < hps_sys.cluster_num; i++) { if (hps_sys.cluster_info[i].target_core_num > hps_sys.cluster_info[i].online_core_num) { if (hps_algo_do_cluster_action(i) == 1) { action_print = action_break = 1; break; } action_print = 1; } } if (!action_break) { for (i = 0; i < hps_sys.cluster_num; i++) { if (hps_sys.cluster_info[i].target_core_num < hps_sys.cluster_info[i].online_core_num) { if (hps_algo_do_cluster_action(i) == 1) { action_print = action_break = 1; break; } action_print = 1; } } }#else /*Process root cluster first */ if (hps_sys.cluster_info[hps_sys.root_cluster_id].target_core_num != hps_sys.cluster_info[hps_sys.root_cluster_id].online_core_num) { if (hps_algo_do_cluster_action(hps_sys.root_cluster_id) == 1) action_break = 1; else action_break = 0; action_print = 1; } for (i = 0; i < hps_sys.cluster_num; i++) { if (i == hps_sys.root_cluster_id) continue; if (hps_sys.cluster_info[i].target_core_num != hps_sys.cluster_info[i].online_core_num) { if (hps_algo_do_cluster_action(i) == 1) action_break = 1; else action_break = 0; action_print = 1; } }#endif#ifdef CONFIG_MTK_ICCS_SUPPORT for (i = 0; i < hps_sys.cluster_num; i++) { if (hps_get_cluster_cpus(hps_sys.cluster_info[i].cluster_id) != hps_sys.cluster_info[i].target_core_num) { if (hps_get_cluster_cpus(hps_sys.cluster_info[i].cluster_id) == 0) iccs_target_power_state_bitmask &= ~(1 << i); else if (hps_sys.cluster_info[i].target_core_num == 0) iccs_target_power_state_bitmask |= (1 << i); } } /* * pr_err("[%s] iccs_target_power_state_bitmask: 0x%x\n", __func__, iccs_target_power_state_bitmask); */ iccs_set_target_power_state_bitmask(iccs_target_power_state_bitmask);#endifHPS_END: if (action_print || hrtbt_dbg) { int online, target, ref_limit, ref_base, criteria_limit, criteria_base, hvytsk, pwrseq, bigtsk; mutex_lock(&hps_ctxt.para_lock); online = target = criteria_limit = criteria_base = 0; for (i = 0; i < hps_sys.cluster_num; i++) { if (i == origin_root) online = sprintf(online_ptr, "<%d>", hps_sys.cluster_info[i].online_core_num); else online = sprintf(online_ptr, "(%d)", hps_sys.cluster_info[i].online_core_num); if (i == hps_sys.root_cluster_id) target = sprintf(target_ptr, "<%d>", hps_sys.cluster_info[i].target_core_num); else target = sprintf(target_ptr, "(%d)", hps_sys.cluster_info[i].target_core_num); criteria_limit = sprintf(criteria_limit_ptr, "(%d)", hps_sys.cluster_info[i].limit_value); criteria_base = sprintf(criteria_base_ptr, "(%d)", hps_sys.cluster_info[i].base_value); ref_limit = sprintf(ref_limit_ptr, "(%d)", hps_sys.cluster_info[i].ref_limit_value); ref_base = sprintf(ref_base_ptr, "(%d)", hps_sys.cluster_info[i].ref_base_value); hvytsk = sprintf(hvytsk_ptr, "(%d)", hps_sys.cluster_info[i].hvyTsk_value); bigtsk = sprintf(bigtsk_ptr, "(%d)", hps_sys.cluster_info[i].bigTsk_value); if (i == 0) pwrseq = sprintf(pwrseq_ptr, "(%d->", hps_sys.cluster_info[i].pwr_seq); else if ((i != 0) && (i != (hps_sys.cluster_num - 1))) pwrseq = sprintf(pwrseq_ptr, "%d->", hps_sys.cluster_info[i].pwr_seq); else if (i == (hps_sys.cluster_num - 1)) pwrseq = sprintf(pwrseq_ptr, "%d) ", hps_sys.cluster_info[i].pwr_seq); online_ptr += online; target_ptr += target; criteria_limit_ptr += criteria_limit; criteria_base_ptr += criteria_base; ref_limit_ptr += ref_limit; ref_base_ptr += ref_base; hvytsk_ptr += hvytsk; bigtsk_ptr += bigtsk; pwrseq_ptr += pwrseq; } mutex_unlock(&hps_ctxt.para_lock); if (action_print) { hps_set_funct_ctrl(); if (action_break) hps_warn ("(0x%X)%s action break!! 
(%u)(%u)(%u) %s %s%s-->%s%s (%u)(%u)(%u)(%u) %s\n", ((hps_ctxt.hps_func_control << 12) | hps_sys.action_id), str_online, hps_ctxt.cur_loads, hps_ctxt.cur_tlp, hps_ctxt.cur_iowait, str_hvytsk, str_criteria_limit, str_criteria_base, str_ref_limit, str_ref_base, hps_sys.up_load_avg, hps_sys.down_load_avg, hps_sys.tlp_avg, hps_sys.rush_cnt, str_target); else { char str1[256]; char str2[256]; snprintf(str1, sizeof(str1), "(0x%X)%s action end (%u)(%u)(%u) %s %s[%u][%u](%u) %s %s%s (%u)(%u)(%u)(%u)", ((hps_ctxt.hps_func_control << 12) | hps_sys.action_id), str_online, hps_ctxt.cur_loads, hps_ctxt.cur_tlp, hps_ctxt.cur_iowait, str_hvytsk, str_bigtsk, hps_ctxt.is_screen_off, hps_ctxt.is_idle, hps_ctxt.idle_ratio, str_pwrseq, str_criteria_limit, str_criteria_base, hps_sys.up_load_avg, hps_sys.down_load_avg, hps_sys.tlp_avg, hps_sys.rush_cnt); snprintf(str2, sizeof(str2), "[%u,%u|%u,%u|%u,%u][%u,%u,%u] [%u,%u,%u] [%u,%u,%u] [%u,%u,%u] %s", hps_sys.cluster_info[0].up_threshold, hps_sys.cluster_info[0].down_threshold, hps_sys.cluster_info[1].up_threshold, hps_sys.cluster_info[1].down_threshold, hps_sys.cluster_info[2].up_threshold, hps_sys.cluster_info[2].down_threshold, hps_sys.cluster_info[0].loading, hps_sys.cluster_info[1].loading, hps_sys.cluster_info[2].loading, hps_sys.cluster_info[0].rel_load, hps_sys.cluster_info[1].rel_load, hps_sys.cluster_info[2].rel_load, hps_sys.cluster_info[0].abs_load, hps_sys.cluster_info[1].abs_load, hps_sys.cluster_info[2].abs_load, /* sched-assist hotplug: for debug */ hps_sys.cluster_info[0].sched_load, hps_sys.cluster_info[1].sched_load, hps_sys.cluster_info[2].sched_load, str_target);#ifdef CONFIG_MEIZU_BSP if (printk_timed_ratelimit(&j, 500)) hps_warn("%s%s\n", str1, str2);#else hps_warn("%s%s\n", str1, str2);#endif //CONFIG_MEIZU_BSP#ifdef _TRACE_ trace_hps_update(hps_sys.action_id, str_online, hps_ctxt.cur_loads, hps_ctxt.cur_tlp, hps_ctxt.cur_iowait, str_hvytsk, str_criteria_limit, str_criteria_base, hps_sys.up_load_avg, hps_sys.down_load_avg, hps_sys.tlp_avg, hps_sys.rush_hps_sys.cluster_info[0].up_threshold, hps_sys.cluster_info[0].down_threshold, hps_sys.cluster_info[0].up_threshold, hps_sys.cluster_info[0].down_threshold, hps_sys.cluster_info[2].up_threshold, hps_sys.cluster_info[2].down_threshold, hps_sys.cluster_info[0].loading, hps_sys.cluster_info[1].loading, hps_sys.cluster_info[2].loading, hps_ctxt.up_times, hps_ctxt.down_times, str_target);#endif } hps_ctxt_reset_stas_nolock(); } }#if HPS_HRT_BT_EN if (hrtbt_dbg && (action_print)) { hps_set_funct_ctrl(); hps_warn("(0x%X)%s HRT_BT_DBG (%u)(%u)(%u) %s %s %s %s%s (%u)(%u)(%u)(%u) %s\n", ((hps_ctxt.hps_func_control << 12) | hps_sys.action_id), str_online, hps_ctxt.cur_loads, hps_ctxt.cur_tlp, hps_ctxt.cur_iowait, str_hvytsk, str_bigtsk, str_pwrseq, str_criteria_limit, str_criteria_base, hps_sys.up_load_avg, hps_sys.down_load_avg, hps_sys.tlp_avg, hps_sys.rush_cnt, str_target); hrtbt_dbg = 0; hps_ctxt.hps_hrt_ktime = ktime_get(); }#endif action_print = 0; action_break = 0; mutex_unlock(&hps_ctxt.lock);}
当前hps_algo_main()的算法对应有几种:
static int (*hps_func[]) (void) = {/*hps_algo_perf_indicator, hps_algo_rush_boost, hps_algo_eas, hps_algo_up, hps_algo_down};*/hps_algo_perf_indicator, hps_algo_rush_boost, hps_algo_eas};/* (1) 取perf规定的最小值 */static int hps_algo_perf_indicator(void){ unsigned int i; if (atomic_read(&hps_ctxt.is_ondemand) != 0) { /* for ondemand request */ atomic_set(&hps_ctxt.is_ondemand, 0); mutex_lock(&hps_ctxt.para_lock); for (i = 0; i < hps_sys.cluster_num; i++) hps_sys.cluster_info[i].target_core_num = max(hps_sys.cluster_info[i].base_value, hps_sys.cluster_info[i].online_core_num); mutex_unlock(&hps_ctxt.para_lock); return 1; } return 0;}/* (2) 根据当前load的值是否达到boost门限,来决定是否启动boost */static int hps_algo_rush_boost(void){ int val, base_val; unsigned int idx, total_rel_load; idx = total_rel_load = 0; for (idx = 0 ; idx < hps_sys.cluster_num ; idx++) total_rel_load += hps_sys.cluster_info[idx].rel_load; if (!hps_ctxt.rush_boost_enabled) return 0; base_val = cal_base_cores(); if (total_rel_load > hps_ctxt.rush_boost_threshold * hps_sys.total_online_cores) ++hps_ctxt.rush_count; else hps_ctxt.rush_count = 0; if (hps_ctxt.rush_boost_times == 1) hps_ctxt.tlp_avg = hps_ctxt.cur_tlp; if ((hps_ctxt.rush_count >= hps_ctxt.rush_boost_times) && (hps_sys.total_online_cores * 100 < hps_ctxt.tlp_avg)) { val = hps_ctxt.tlp_avg / 100 + (hps_ctxt.tlp_avg % 100 ? 1 : 0); WARN_ON(!(val > hps_sys.total_online_cores)); if (val > num_possible_cpus()) val = num_possible_cpus(); if (val > base_val) val -= base_val; else val = 0; hps_sys.tlp_avg = hps_ctxt.tlp_avg; hps_sys.rush_cnt = hps_ctxt.rush_count; hps_cal_core_num(&hps_sys, val, base_val); /* [MET] debug for geekbench */ met_tag_oneshot(0, "sched_rush_boost", 1); return 1; } else { /* [MET] debug for geekbench */ met_tag_oneshot(0, "sched_rush_boost", 0); return 0; }}/* (3) 根据负载来计算需要的online cpu */static int hps_algo_eas(void){ int val, ret, i; ret = 0; for (i = 0 ; i < hps_sys.cluster_num ; i++) { hps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].online_core_num; /*if up_threshold > loading > down_threshold ==> No action*/ if ((hps_sys.cluster_info[i].loading < (hps_sys.cluster_info[i].up_threshold*hps_sys.cluster_info[i].online_core_num)) && (hps_sys.cluster_info[i].loading > (hps_sys.cluster_info[i].down_threshold*hps_sys.cluster_info[i].online_core_num))) continue; /*if loading > up_threshod ==> power on cores*/ if ((hps_sys.cluster_info[i].loading > (hps_sys.cluster_info[i].up_threshold*hps_sys.cluster_info[i].online_core_num))) { val = hps_sys.cluster_info[i].loading / hps_sys.cluster_info[i].up_threshold; if (hps_sys.cluster_info[i].loading % hps_sys.cluster_info[i].up_threshold) val++; if (val <= hps_sys.cluster_info[i].limit_value) hps_sys.cluster_info[i].target_core_num = val; else hps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].limit_value; ret = 1; } else if ((hps_sys.cluster_info[i].loading < (hps_sys.cluster_info[i].down_threshold*hps_sys.cluster_info[i].online_core_num))) { /*if loading < down_threshod ==> power off cores*/ if (!hps_sys.cluster_info[i].loading) { hps_sys.cluster_info[i].target_core_num = 0; continue; } val = hps_sys.cluster_info[i].loading / hps_sys.cluster_info[i].down_threshold; if (hps_sys.cluster_info[i].loading % hps_sys.cluster_info[i].down_threshold) val++; if (val >= hps_sys.cluster_info[i].base_value) hps_sys.cluster_info[i].target_core_num = val; else hps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].base_value; ret = 1; } }#if 0 /*Check with big task criteriai*/ for (i = 1 ; i < 
hps_sys.cluster_num ; i++) { if ((!hps_sys.cluster_info[i].bigTsk_value) && (!(hps_sys.cluster_info[i].loading / hps_sys.cluster_info[i].down_threshold))) hps_sys.cluster_info[i].target_core_num = 0; }#endif return ret;}
4.5、NUMA负载均衡
arm架构(本文涉及的平台)没有使用NUMA,暂时不去解析。