CPU动态调频二：interactive governor

来源：互联网发布：mac打字不显示候选框编辑：程序博客网时间：2024/05/22 13:28

Linux提供了多种governor供用户选择，这里以interactive为例，毕竟现在的android手机中都是采用该governor.
基于linux 3.14
以下代码若未指明位置则默认在drivers/cpufreq/cpufreq_interactive.c中.

首先需要定义一个cpufreq_governor类型的结构体用来描述interactive governor.

struct cpufreq_governor cpufreq_gov_interactive = {    .name = "interactive",    .governor = cpufreq_governor_interactive,    .max_transition_latency = 10000000,    .owner = THIS_MODULE,};

看一下cpufreq_governor结构体：

struct cpufreq_governor {    char    name[CPUFREQ_NAME_LEN];    int initialized;    int (*governor) (struct cpufreq_policy *policy,                 unsigned int event);    ssize_t (*show_setspeed)    (struct cpufreq_policy *policy,                     char *buf);    int (*store_setspeed)   (struct cpufreq_policy *policy,                     unsigned int freq);    unsigned int max_transition_latency; /* HW must be able to switch to            next freq faster than this value in nano secs or we            will fallback to performance governor */    struct list_head    governor_list;    struct module       *owner;};

name：governor的名字，这里被赋值为interactive
initialized：初始化标志位
max_transition_latency：注释说的很清楚了，硬件从当前频率切换到下一个频率时所用的时间必须比max_transition_latency规定的时间小，否则governor将切换到performance.该数值以纳秒为单位.
governor_list：所有注册的governor都会被add到这个链表里面。
governor：这个callback用于控制governor的行为，比较重要，是分析governor的一个切入点，后面会详解.
好了，现在我们静态的定义了一个interactive governor，在governor工作之前还要做一些初始化工作

static int __init cpufreq_interactive_init(void){    unsigned int i;    struct cpufreq_interactive_cpuinfo *pcpu;    struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };    /* Initalize per-cpu timers */    for_each_possible_cpu(i) {        pcpu = &per_cpu(cpuinfo, i);        init_timer_deferrable(&pcpu->cpu_timer);        pcpu->cpu_timer.function = cpufreq_interactive_timer;        pcpu->cpu_timer.data = i;        init_timer(&pcpu->cpu_slack_timer);        pcpu->cpu_slack_timer.function = cpufreq_interactive_nop_timer;        spin_lock_init(&pcpu->load_lock);        spin_lock_init(&pcpu->target_freq_lock);        init_rwsem(&pcpu->enable_sem);    }    spin_lock_init(&speedchange_cpumask_lock);    mutex_init(&gov_lock);    speedchange_task =        kthread_create(cpufreq_interactive_speedchange_task, NULL,                   "cfinteractive");    if (IS_ERR(speedchange_task))        return PTR_ERR(speedchange_task);    sched_setscheduler_nocheck(speedchange_task, SCHED_FIFO, &param);    get_task_struct(speedchange_task);    /* NB: wake up so the thread does not look hung to the freezer */    wake_up_process(speedchange_task);    return cpufreq_register_governor(&cpufreq_gov_interactive);}

遍历可能的CPU
get到每个CPU的cpuinfo成员
初始化可延时定时器
设置定时器的function，定时器超时时会调用该函数
设置定时器的data，这里表示CPU ID
初始化slack定时器
设置该定时器的function，定时器超时时会调用该函数
初始化两个spin_lock：load_lock（保护负载统计相关的成员）和target_freq_lock（保护target_freq）
初始化读写信号量enable_sem
创建一个线程cpufreq_interactive_speedchange_task，返回的进程描述符用speedchange_task保存
设置该线程的调度策略和调度参数
该线程的引用计数加1
唤醒speedchange_task
调用cpufreq_register_governor注册interactive governor

drivers/cpufreq/cpufreq.cstatic LIST_HEAD(cpufreq_governor_list);int cpufreq_register_governor(struct cpufreq_governor *governor){    int err;    if (!governor)        return -EINVAL;    if (cpufreq_disabled())        return -ENODEV;    mutex_lock(&cpufreq_governor_mutex);    governor->initialized = 0;    err = -EBUSY;    if (__find_governor(governor->name) == NULL) {        err = 0;        list_add(&governor->governor_list, &cpufreq_governor_list);    }    mutex_unlock(&cpufreq_governor_mutex);    return err;}EXPORT_SYMBOL_GPL(cpufreq_register_governor);

cpufreq_governor_list用来保存已注册的governor
__find_governor会在cpufreq_governor_list中遍历寻找是否有与需要register的governor重名的governor，如果没有则将该governor添加到cpufreq_governor_list中
好的，简单介绍了一下governor的定义，初始化，注册。
现在我们已经拥有了一个interactive governor，CPUFREQ core如果想操作governor进行选频，那么interactive governor必须对外提供一个interface以供调用，这就是cpufreq_governor结构体中的governor callback，下面来以这个interface为切入点分析governor是如何工作的.

The governor->governor callback is called with the current (or to-be-set)cpufreq_policy struct for that CPU, and an unsigned int event. Thefollowing events are currently defined:CPUFREQ_GOV_START:   This governor shall start its duty for the CPU             policy->cpuCPUFREQ_GOV_STOP:    This governor shall end its duty for the CPU             policy->cpuCPUFREQ_GOV_LIMITS:  The limits for CPU policy->cpu have changed to             policy->min and policy->max.

在前面的定义中有

.governor = cpufreq_governor_interactive,

下面来看一下cpufreq_governor_interactive,分段分析：

static int cpufreq_governor_interactive(struct cpufreq_policy *policy,        unsigned int event){    int rc;    unsigned int j;    struct cpufreq_interactive_cpuinfo *pcpu;    struct cpufreq_frequency_table *freq_table;    struct cpufreq_interactive_tunables *tunables;    unsigned long flags;

定义了一堆变量：pcpu描述了cpu相关信息，结构体如下，用到的时候再看。

struct cpufreq_interactive_cpuinfo {    struct timer_list cpu_timer;    struct timer_list cpu_slack_timer;    spinlock_t load_lock; /* protects the next 4 fields */    u64 time_in_idle;    u64 time_in_idle_timestamp;    u64 cputime_speedadj;    u64 cputime_speedadj_timestamp;    struct cpufreq_policy *policy;    struct cpufreq_frequency_table *freq_table;    spinlock_t target_freq_lock; /*protects target freq */    unsigned int target_freq;    unsigned int floor_freq;    unsigned int max_freq;    u64 floor_validate_time;    u64 hispeed_validate_time;    struct rw_semaphore enable_sem;    int governor_enabled;};

freq_tab表示频率表，结构体如下，你会发现这是一个node，每个node代表一个频点，很多node关联在一起就成了一个tab：

struct cpufreq_frequency_table {    unsigned int    driver_data; /* driver specific data, not used by core */    unsigned int    frequency; /* kHz - doesn't need to be in ascending                    * order */};

struct cpufreq_interactive_tunables *tunables;这个结构体很重要，贯穿了整个governor callback，先给出结构体，接下来在函数中边看边分析。

struct cpufreq_interactive_tunables {    int usage_count;    /* Hi speed to bump to from lo speed when load burst (default max) */    unsigned int hispeed_freq;    /* Go to hi speed when CPU load at or above this value. */    #define DEFAULT_GO_HISPEED_LOAD 99    unsigned long go_hispeed_load;    /* Target load. Lower values result in higher CPU speeds. */    spinlock_t target_loads_lock;    unsigned int *target_loads;    int ntarget_loads;    /*     * The minimum amount of time to spend at a frequency before we can ramp     * down.     */    #define DEFAULT_MIN_SAMPLE_TIME (80 * USEC_PER_MSEC)    unsigned long min_sample_time;    /*     * The sample rate of the timer used to increase frequency     */    unsigned long timer_rate;    /*     * Wait this long before raising speed above hispeed, by default a     * single timer interval.     */    spinlock_t above_hispeed_delay_lock;    unsigned int *above_hispeed_delay;    int nabove_hispeed_delay;    /* Non-zero means indefinite speed boost active */    int boost_val;    /* Duration of a boot pulse in usecs */    int boostpulse_duration_val;    /* End time of boost pulse in ktime converted to usecs */    u64 boostpulse_endtime;    bool boosted;    /*     * Max additional time to wait in idle, beyond timer_rate, at speeds     * above minimum before wakeup to reduce speed, or -1 if unnecessary.     */    #define DEFAULT_TIMER_SLACK (4 * DEFAULT_TIMER_RATE)    int timer_slack_val;    bool io_is_busy;};

回到cpufreq_governor_interactive函数，

    if (have_governor_per_policy())        tunables = policy->governor_data;    else         tunables = common_tunables;

have_governor_per_policy判断是否每个policy都有自己的governor，我的项目中policy都采用interactive，所以这里tunables被赋值为common_tunables。
common_tunables被定义为：

/* For cases where we have single governor instance for system */static struct cpufreq_interactive_tunables *common_tunables;

但是没有分配内存和初始化。

继续往下看。

    switch (event) {    case CPUFREQ_GOV_POLICY_INIT:        if (have_governor_per_policy()) {            WARN_ON(tunables);        } else if (tunables) {            tunables->usage_count++;            policy->governor_data = tunables;            return 0;        }        tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);        if (!tunables) {            pr_err("%s: POLICY_INIT: kzalloc failed\n", __func__);            return -ENOMEM;        }        tunables->usage_count = 1;        tunables->above_hispeed_delay = default_above_hispeed_delay;        tunables->nabove_hispeed_delay =            ARRAY_SIZE(default_above_hispeed_delay);        tunables->go_hispeed_load = DEFAULT_GO_HISPEED_LOAD;        tunables->target_loads = default_target_loads;        tunables->ntarget_loads = ARRAY_SIZE(default_target_loads);        tunables->min_sample_time = DEFAULT_MIN_SAMPLE_TIME;        tunables->timer_rate = DEFAULT_TIMER_RATE;        tunables->boostpulse_duration_val = DEFAULT_MIN_SAMPLE_TIME;        tunables->timer_slack_val = DEFAULT_TIMER_SLACK;        spin_lock_init(&tunables->target_loads_lock);        spin_lock_init(&tunables->above_hispeed_delay_lock);        policy->governor_data = tunables;        if (!have_governor_per_policy()) {            common_tunables = tunables;            WARN_ON(cpufreq_get_global_kobject());        }        rc = sysfs_create_group(get_governor_parent_kobj(policy),                get_sysfs_attr());        if (rc) {            kfree(tunables);            policy->governor_data = NULL;            if (!have_governor_per_policy())                common_tunables = NULL;            return rc;        }        if (!policy->governor->initialized) {            idle_notifier_register(&cpufreq_interactive_idle_nb);            cpufreq_register_notifier(&cpufreq_notifier_block,                    CPUFREQ_TRANSITION_NOTIFIER);        }        break;

判断event的类型，并根据event进行不同的操作。
在include/linux/cpufreq.h中定义了几种Governor Events

/* Governor Events */#define CPUFREQ_GOV_START   1#define CPUFREQ_GOV_STOP    2#define CPUFREQ_GOV_LIMITS  3#define CPUFREQ_GOV_POLICY_INIT 4#define CPUFREQ_GOV_POLICY_EXIT 5

OK，一个一个来。

CPUFREQ_GOV_POLICY_INIT

该event表示要init governor policy.
首先判断have_governor_per_policy()，前面分析过了，返回false，并且tunables并没有被分配内存，所以执行下一条语句

        tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);

终于为tunables分配内存了~~
接下来就是对tunables的初始化：

        tunables->usage_count = 1;        tunables->above_hispeed_delay = default_above_hispeed_delay;        tunables->nabove_hispeed_delay =            ARRAY_SIZE(default_above_hispeed_delay);        tunables->go_hispeed_load = DEFAULT_GO_HISPEED_LOAD;        tunables->target_loads = default_target_loads;        tunables->ntarget_loads = ARRAY_SIZE(default_target_loads);        tunables->min_sample_time = DEFAULT_MIN_SAMPLE_TIME;        tunables->timer_rate = DEFAULT_TIMER_RATE;        tunables->boostpulse_duration_val = DEFAULT_MIN_SAMPLE_TIME;        tunables->timer_slack_val = DEFAULT_TIMER_SLACK;        spin_lock_init(&tunables->target_loads_lock);        spin_lock_init(&tunables->above_hispeed_delay_lock);

usage_count表示引用计数，初始化的时候设置为1

above_hispeed_delay，内核文档： Documentation/cpu-freq/governors.txt

above_hispeed_delay: When speed is at or above hispeed_freq, wait forthis long before raising speed in response to continued high load.The format is a single delay value, optionally followed by pairs ofCPU speeds and the delay to use at or above those speeds.  Colons canbe used between the speeds and associated delays for readability.  Forexample:   80000 1300000:200000 1500000:40000uses delay 80000 uS until CPU speed 1.3 GHz, at which speed delay200000 uS is used until speed 1.5 GHz, at which speed (and above)delay 40000 uS is used.  If speeds are specified these must appear inascending order.  Default is 20000 uS.

下面是我自己的理解，有错误请务必指出：当CPU频率大于等于hispeed_freq，并且此时workload仍在不停增加（continued high load），系统将等待一个above_hispeed_delay的时间。above_hispeed_delay一般是这样一种格式，一个单个的延时数值，后面跟上一组由CPU speeds 和 delay组成的数组，由冒号隔开。例如：
80000 1300000:200000 1500000:40000
当频率低于1.3G时，above_hispeed_delay的值取80000，1.3G到1.5G之间取200000，大于等于1.5G取40000.默认取20000us.如果指定了频率，那么这些频率必须按升序排列（即“频率:延时”对中的频率要从小到大书写）。

#define DEFAULT_TIMER_RATE (20 * USEC_PER_MSEC)#define DEFAULT_ABOVE_HISPEED_DELAY DEFAULT_TIMER_RATEstatic unsigned int default_above_hispeed_delay[] = {    DEFAULT_ABOVE_HISPEED_DELAY };

可以看到default_above_hispeed_delay是一个数组，我的环境下只有一个数值，above_hispeed_delay的数值就是ta了。

nabove_hispeed_delay:above_hispeed_delay数组中元素的个数，默认即default_above_hispeed_delay数组的元素个数.

go_hispeed_load: The CPU load at which to ramp to hispeed_freq.Default is 99%.
高频阈值。当CPU负载达到或超过该值时，频率会被拉升到至少hispeed_freq；否则由choose_freq根据target_loads来选频（可能升频也可能降频）。

#define DEFAULT_GO_HISPEED_LOAD 99

调频的时候会用到这个数值，后面会说到.

顺便说一下hispeed_freq: Hi speed to bump to from lo speed when load burst (default max)
当workload达到 go_hispeed_load时，频率将被拉高到这个值，默认的大小由policy来决定。

target_loads:

CPU load values used to adjust speed to influence thecurrent CPU load toward that value.  In general, the lower the targetload, the more often the governor will raise CPU speeds to bring loadbelow the target.  The format is a single target load, optionallyfollowed by pairs of CPU speeds and CPU loads to target at or abovethose speeds.  Colons can be used between the speeds and associatedtarget loads for readability.  For example:   85 1000000:90 1700000:99targets CPU load 85% below speed 1GHz, 90% at or above 1GHz, until1.7GHz and above, at which load 99% is targeted.  If speeds arespecified these must appear in ascending order.  Higher target loadvalues are typically specified for higher speeds, that is, target loadvalues also usually appear in an ascending order. The default istarget load 90% for all speeds.

target_loads使得CPU调整频率来影响当前的CPU workload，促使当前的CPU workload向target_loads靠近.
通常，target_loads的值越小，CPU就会越频繁地拉高频率使当前workload低于target_loads.
例如：频率小于1G时，取85%；1G—-1.7G，取90%；大于1.7G，取99%。默认值取90%.

/* Target load.  Lower values result in higher CPU speeds. */#define DEFAULT_TARGET_LOAD 90static unsigned int default_target_loads[] = {DEFAULT_TARGET_LOAD};

ntarget_loads：target_loads的个数

min_sample_time：min_sample_time: The minimum amount of time to spend at the current
frequency before ramping down. Default is 80000 uS.

#define DEFAULT_MIN_SAMPLE_TIME (80 * USEC_PER_MSEC)
/* include/linux/time.h */
#define USEC_PER_MSEC   1000L

最小采样时间，刚好80000us.

boostpulse_duration_val

boost: If non-zero, immediately boost speed of all CPUs to at leasthispeed_freq until zero is written to this attribute.  If zero, allowCPU speeds to drop below hispeed_freq according to load as usual.Default is zero.boostpulse: On each write, immediately boost speed of all CPUs tohispeed_freq for at least the period of time specified byboostpulse_duration, after which speeds are allowed to drop belowhispeed_freq according to load as usual.boostpulse_duration: Length of time to hold CPU speed at hispeed_freqon a write to boostpulse, before allowing speed to drop according toload as usual.  Default is 80000 uS.

boost即强制提频（把频率拉到至少hispeed_freq，并不是超出硬件标称频率的“超频”），操作方法是

echo 1 > /sys/devices/system/cpu/cpufreq/interactive/boost

此时会立即将所有CPU的频率提高到至少hispeed_freq.写入0时，根据workload降低频率.默认为0.

boostpulse，每次触发boost功能时，立即拉高所有CPU的频率到hispeed_freq并保持在该频率至少boostpulse_duration的时间，在这段时间以后，根据当前的workload，频率才允许被降低。

boostpulse_duration：默认值80000 uS.这里我的值也是80000 uS.

timer_rate和timer_slack_val

timer_rate: Sample rate for reevaluating CPU load when the CPU is notidle.  A deferrable timer is used, such that the CPU will not be wokenfrom idle to service this timer until something else needs to run.(The maximum time to allow deferring this timer when not running atminimum speed is configurable via timer_slack.)  Default is 20000 uS.timer_slack: Maximum additional time to defer handling the governorsampling timer beyond timer_rate when running at speeds above theminimum.  For platforms that consume additional power at idle whenCPUs are running at speeds greater than minimum, this places an upperbound on how long the timer will be deferred prior to re-evaluatingload and dropping speed.  For example, if timer_rate is 20000uS andtimer_slack is 10000uS then timers will be deferred for up to 30msecwhen not at lowest speed.  A value of -1 means defer timersindefinitely at all speeds.  Default is 80000 uS.

当CPU不处于idle状态时，timer_rate作为采样周期来重新计算CPU的workload.
这里使用的是可延时（deferrable）定时器：当CPU处于idle状态时，不会仅仅为了响应该定时器而被唤醒，而是等到有其他事情需要运行时才一并处理.
当CPU运行在高于最低频率时，定时器在timer_rate之外最多还能再延迟的时间用timer_slack表示，默认值80000 uS.

继续向下看

        spin_lock_init(&tunables->target_loads_lock);        spin_lock_init(&tunables->above_hispeed_delay_lock);        policy->governor_data = tunables;        if (!have_governor_per_policy()) {            common_tunables = tunables;            WARN_ON(cpufreq_get_global_kobject());        }        rc = sysfs_create_group(get_governor_parent_kobj(policy),                get_sysfs_attr());        if (rc) {            kfree(tunables);            policy->governor_data = NULL;            if (!have_governor_per_policy())                common_tunables = NULL;            return rc;        }        if (!policy->governor->initialized) {            idle_notifier_register(&cpufreq_interactive_idle_nb);            cpufreq_register_notifier(&cpufreq_notifier_block,                    CPUFREQ_TRANSITION_NOTIFIER);        }        break;

初始化tunables结构体中的两个自旋锁.
将tunables指针赋值给policy->governor_data
将tunables指针赋值给common_tunables，这个全局变量会在一些文件的show和store函数中被调用，没有深入研究.
继续

        rc = sysfs_create_group(get_governor_parent_kobj(policy),                get_sysfs_attr());

看一下get_governor_parent_kobj和get_sysfs_attr

drivers/cpufreq/cpufreq.cstruct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy){    if (have_governor_per_policy())        return &policy->kobj;    else        return cpufreq_global_kobject;}EXPORT_SYMBOL_GPL(get_governor_parent_kobj);struct kobject *cpufreq_global_kobject;EXPORT_SYMBOL(cpufreq_global_kobject);

static struct attribute_group *get_sysfs_attr(void){    if (have_governor_per_policy())        return &interactive_attr_group_gov_pol;    else        return &interactive_attr_group_gov_sys;}static struct attribute_group interactive_attr_group_gov_sys = {    .attrs = interactive_attributes_gov_sys,    .name = "interactive",};/* One Governor instance for entire system */static struct attribute *interactive_attributes_gov_sys[] = {    &target_loads_gov_sys.attr,    &above_hispeed_delay_gov_sys.attr,    &hispeed_freq_gov_sys.attr,    &go_hispeed_load_gov_sys.attr,    &min_sample_time_gov_sys.attr,    &timer_rate_gov_sys.attr,    &timer_slack_gov_sys.attr,    &boost_gov_sys.attr,    &boostpulse_gov_sys.attr,    &boostpulse_duration_gov_sys.attr,    &io_is_busy_gov_sys.attr,    NULL,};

OK，我们把上述代码简化一下得到

        rc = sysfs_create_group(cpufreq_global_kobject,                interactive_attr_group_gov_sys);

在cpufreq_global_kobject所对应的目录cpufreq下创建一个名为interactive的目录，并创建与之关联的属性文件。通过以下方式可以看到这些属性文件

ls /sys/devices/system/cpu/cpufreq/interactive/

最后注册了两个notification，分别是idle相关和频率改变相关.
回顾一下CPUFREQ_GOV_POLICY_INIT都做了什么：
1.定义并初始化了一个cpufreq_interactive_tunables结构体，将该结构体指针赋值给policy->governor_data，在struct cpufreq_policy结构体中，policy->governor_data为void *指针，现在我们知道它的作用是指向tunables，而tunables对应的内存中存放了governor调节频率的参数，这就是policy->governor_data的作用.
2.创建对应的目录和属性文件

CPUFREQ_GOV_POLICY_EXIT

这个event对应的操作比较简单一些，主要是做一些policy和governor的“善后”工作，不必赘述了。

CPUFREQ_GOV_START

CPUFREQ_GOV_START:   This governor shall start its duty for the CPU             policy->cpu

启动一个governor，看代码

    case CPUFREQ_GOV_START:        mutex_lock(&gov_lock);        freq_table = cpufreq_frequency_get_table(policy->cpu);        if (!tunables->hispeed_freq)            tunables->hispeed_freq = policy->max;        for_each_cpu(j, policy->cpus) {            pcpu = &per_cpu(cpuinfo, j);            pcpu->policy = policy;            pcpu->target_freq = policy->cur;            pcpu->freq_table = freq_table;            pcpu->floor_freq = pcpu->target_freq;            pcpu->floor_validate_time =                ktime_to_us(ktime_get());            pcpu->hispeed_validate_time =                pcpu->floor_validate_time;            pcpu->max_freq = policy->max;            down_write(&pcpu->enable_sem);            del_timer_sync(&pcpu->cpu_timer);            del_timer_sync(&pcpu->cpu_slack_timer);            cpufreq_interactive_timer_start(tunables, j);            pcpu->governor_enabled = 1;            up_write(&pcpu->enable_sem);        }        mutex_unlock(&gov_lock);        break;

首先获取freq_tab.
如果没有设置hispeed_freq的值的话，就设置hispeed_freq为policy->max，和之前介绍hispeed_freq时说的一样.
接下来是一个for循环，policy->cpus表示该policy所管理的处于online状态的CPU，for循环遍历这些CPU，在这个循环中：
get到cpu的cpuinfo结构体并把指针赋值给pcpu，一个struct cpufreq_interactive_cpuinfo结构体指针.
然后对pcpu的一些成员进行初始化，本质上还是设置online cpus的cpuinfo结构体成员.
然后调用cpufreq_interactive_timer_start启动相关的定时器
启动定时器以后governor就可以工作了，所以设置pcpu->governor_enabled为1

来看代码

/* The caller shall take enable_sem write semaphore to avoid any timer race. * The cpu_timer and cpu_slack_timer must be deactivated when calling this * function. */static void cpufreq_interactive_timer_start(    struct cpufreq_interactive_tunables *tunables, int cpu){    struct cpufreq_interactive_cpuinfo *pcpu = &per_cpu(cpuinfo, cpu);    unsigned long expires = jiffies +        usecs_to_jiffies(tunables->timer_rate);    unsigned long flags;    pcpu->cpu_timer.expires = expires;    add_timer_on(&pcpu->cpu_timer, cpu);    if (tunables->timer_slack_val >= 0 &&        pcpu->target_freq > pcpu->policy->min) {        expires += usecs_to_jiffies(tunables->timer_slack_val);        pcpu->cpu_slack_timer.expires = expires;        add_timer_on(&pcpu->cpu_slack_timer, cpu);    }    spin_lock_irqsave(&pcpu->load_lock, flags);    pcpu->time_in_idle =        get_cpu_idle_time(cpu, &pcpu->time_in_idle_timestamp,                  tunables->io_is_busy);    pcpu->cputime_speedadj = 0;    pcpu->cputime_speedadj_timestamp = pcpu->time_in_idle_timestamp;    spin_unlock_irqrestore(&pcpu->load_lock, flags);}

注释中有解释：The cpu_timer and cpu_slack_timer must be deactivated when calling this function.
所以在进入cpufreq_interactive_timer_start之前有一些deactive的操作：

del_timer_sync(&pcpu->cpu_timer);del_timer_sync(&pcpu->cpu_slack_timer);

看看cpufreq_interactive_timer_start究竟做了什么
设置定时器的到期时间expire
调用add_timer_on添加定时器，”start a timer on a particular CPU”
在指定的CPU上start一个定时器。每个CPU都有自己的pcpu->cpu_timer，假如我的手机上有4个CPU，那么会有四个cpu_timer分别被添加到各自CPU的定时器队列中
cpu_slack_timer也是同样的操作
然后获取该CPU的idle时间，这个数值在统计更新时间的时候会被用到.

pcpu->time_in_idle = get_cpu_idle_time(cpu, &pcpu->time_in_idle_timestamp,                                            tunables->io_is_busy);

随后调用

    pcpu->cputime_speedadj = 0;    pcpu->cputime_speedadj_timestamp = pcpu->time_in_idle_timestamp;

time_in_idle_timestamp的数值在get_cpu_idle_time函数中被更新，在代码中形参的名字为last_update_time，可以理解为更新time_in_idle的时间戳。网上有人解释为计算机启动到现在的时间，是一样的.

OK，到这里start governor的工作就完成了，主要就是启动了两个定时器，定时器到期的话，会执行相关的操作最终选定要set的频率.
本来到这里我们应该回到cpufreq_governor_interactive中分析event为CPUFREQ_GOV_LIMITS的情况。
但是为了思路的流畅性，我们顺着定时器继续追代码，看定时器如何实现选频.

__init cpufreq_interactive_init函数中

 pcpu->cpu_timer.function = cpufreq_interactive_timer; pcpu->cpu_timer.data = i;

定时器到期时，调用 cpufreq_interactive_timer，
这里data是cpu的索引号，在cpufreq_interactive_init中cpu_timer的data成员被赋值成为CPU的索引号，之后调用cpu_timer.function的时候作为实参.
分段看：

static void cpufreq_interactive_timer(unsigned long data){    u64 now;    unsigned int delta_time;    u64 cputime_speedadj;    int cpu_load;    struct cpufreq_interactive_cpuinfo *pcpu =        &per_cpu(cpuinfo, data);    struct cpufreq_interactive_tunables *tunables =        pcpu->policy->governor_data;    unsigned int new_freq;    unsigned int loadadjfreq;    unsigned int index;    unsigned long flags;    if (!down_read_trylock(&pcpu->enable_sem))        return;    if (!pcpu->governor_enabled)        goto exit;    spin_lock_irqsave(&pcpu->load_lock, flags);    now = update_load(data);    delta_time = (unsigned int)(now - pcpu->cputime_speedadj_timestamp);    cputime_speedadj = pcpu->cputime_speedadj;    spin_unlock_irqrestore(&pcpu->load_lock, flags);    if (WARN_ON_ONCE(!delta_time))        goto rearm;    spin_lock_irqsave(&pcpu->target_freq_lock, flags);    do_div(cputime_speedadj, delta_time);    loadadjfreq = (unsigned int)cputime_speedadj * 100;    cpu_load = loadadjfreq / pcpu->target_freq;    tunables->boosted = tunables->boost_val || now < tunables->boostpulse_endtime;

首先调用update_load，更新工作负载

static u64 update_load(int cpu){    struct cpufreq_interactive_cpuinfo *pcpu = &per_cpu(cpuinfo, cpu);    struct cpufreq_interactive_tunables *tunables =        pcpu->policy->governor_data;    u64 now;    u64 now_idle;    unsigned int delta_idle;    unsigned int delta_time;    u64 active_time;    now_idle = get_cpu_idle_time(cpu, &now, tunables->io_is_busy);    delta_idle = (unsigned int)(now_idle - pcpu->time_in_idle);    delta_time = (unsigned int)(now - pcpu->time_in_idle_timestamp);    if (delta_time <= delta_idle)        active_time = 0;    else        active_time = delta_time - delta_idle;    pcpu->cputime_speedadj += active_time * pcpu->policy->cur;    pcpu->time_in_idle = now_idle;    pcpu->time_in_idle_timestamp = now;    return now;}

now_idle:系统启动以后运行的idle的总时间

the cummulative idle time (since boot) for a given CPU, in microseconds.

pcpu->time_in_idle：上次统计时的idle的总时间
delta_idle：两次统计之间的idle总时间

now：本次的update time，应该是本次统计idle时的时间戳

variable to store update time in.

pcpu->time_in_idle_timestamp，上次统计idle时的时间戳
delta_time：两次统计之间系统运行的总时间

若delta_time <= delta_idle，说明运行期间CPU一直在idle，active_time赋值为0.
否则，active_time = delta_time - delta_idle;计算出两次统计之间CPU处于active的总时间.

然后更新pcpu的一些成员变量的值:
pcpu->cputime_speedadj 这个数值的计算方式是本身加上active_time * pcpu->policy->cur，也就是把“活跃时间 × 当时的频率”不断累加；后面在cpufreq_interactive_timer中用它除以总时间delta_time，得到的是按活跃时间加权的平均频率，再乘以100就是loadadjfreq.
pcpu->time_in_idle = now_idle; 更新系统启动后运行的总idle时间.
pcpu->time_in_idle_timestamp = now;更新统计时的时间戳.
上面这两个数值被更新留作下次update_load使用

回到cpufreq_interactive_timer，update_load返回了最新一次统计idle时的时间戳，赋值给now.

    delta_time = (unsigned int)(now - pcpu->cputime_speedadj_timestamp);

再次计算两次统计之间的运行时间
在update_load中是now - pcpu->time_in_idle_timestamp，但是随后在update_load更新了time_in_idle_timestamp的值，所以now和time_in_idle_timestamp应该相等，不能再这么算.
这里用cputime_speedadj_timestamp，我在函数cpufreq_interactive_timer_resched和cpufreq_interactive_timer_start发现cputime_speedadj_timestamp都被赋值为time_in_idle_timestamp，所以我认为：cputime_speedadj_timestamp是作为time_in_idle_timestamp的一个“备份”，保存上次统计时的time_in_idle_timestamp.

然后取pcpu->cputime_speedadj赋值给局部变量cputime_speedadj，pcpu->cputime_speedadj在update_load中已被计算并更新过了.

接下来的几行代码都是用来计算cpu_load，把这些数值展开看就变得很清晰了

loadadjfreq = (unsigned int)cputime_speedadj * 100;

替换后

cputime_speedadj =  active_time * policy->cur * 100
cputime_speedadj = （delta_time - delta_idle）* policy->cur * 100
cputime_speedadj = [(now -time_in_idle_timestamp) - (now_idle - time_in_idle)] * policy->cur * 100

now -time_in_idle_timestamp是两次统计间的运行时间，用x表示；
now_idle - time_in_idle是两次统计间CPU处于idle的时间，用y表示。
之前分析过，cputime_speedadj_timestamp是time_in_idle_timestamp的备份，且

do_div(cputime_speedadj, delta_time);

所以，可以替换为

cputime_speedadj = cputime_speedadj / delta_time
delta_time = now - pcpu->cputime_speedadj_timestamp
delta_time = now - time_in_idle_timestamp

now -time_in_idle_timestamp是两次统计间的运行时间，用x表示；
now_idle - time_in_idle是两次统计间idle的总时间，用y表示
所以

cputime_speedadj = （x - y）* policy->cur / x * 100
cpu_load = [（x - y） / x ] * [ policy->cur / pcpu->target_freq] * 100
cpu_load = (1 - y / x) * ( policy->cur / pcpu->target_freq ) * 100

(1 - y / x)是统计时间内CPU处于非idle的时间比例（y是idle时间，x是总时间），policy->cur / pcpu->target_freq 表示当前频率占目标频率的比例，至于为什么要乘以100，是因为内核不支持浮点运算，乘以100后cpu_load就是一个以整数表示的百分比.

ok，到这里我们终于发现，影响cpu_load的两个因素
1. idle时间
2. 当前频率/目标频率

有一个疑问：
cpufreq_interactive_timer函数的目的是为了根据当前的workload选频，得到目标频率，然后传给cpufreq driver来设置频率。如果已经有了目标频率，那么直接调driver设置好了，所以这里的pcpu->target_freq不是本次选频得到的target_freq
在cpufreq_interactive_timer的后面代码中，我们看到

pcpu->target_freq = new_freq;

new_freq 是本次选频后得到的新频率，最后赋值给pcpu->target_freq，所以在cpufreq_interactive_timer中，该赋值语句之前的所有pcpu->target_freq都表示是上一次选频的target_freq

所以更正一下，影响cpu_load的两个因素
1. idle时间
2. 当前频率/上一次选频频率

OK，带着这个思路就比较好分析了

if (cpu_load >= tunables->go_hispeed_load || tunables->boosted) {                if (pcpu->target_freq < tunables->hispeed_freq) {                        new_freq = tunables->hispeed_freq;                } else {                        new_freq = choose_freq(pcpu, loadadjfreq);                        if (new_freq < tunables->hispeed_freq)                                new_freq = tunables->hispeed_freq;                }        } else {                new_freq = choose_freq(pcpu, loadadjfreq);                if (new_freq > tunables->hispeed_freq &&                                pcpu->target_freq < tunables->hispeed_freq)                        new_freq = tunables->hispeed_freq;        }

当cpu_load大于等于tunables->go_hispeed_load或者tunables->boosted的值为非0，此时我们需要拉高频率.
如果上一次选频频率比tunables->hispeed_freq小，那么直接设置new_freq为tunables->hispeed_freq;
如果上一次选频频率不小于tunables->hispeed_freq，调用choose_freq函数选频，若选频后仍然达不到tunables->hispeed_freq，那么直接设置new_freq为tunables->hispeed_freq。
可以看到，当cpu_load达到tunables->go_hispeed_load（或处于boost状态）时，new_freq的频率要不小于tunables->hispeed_freq.

当cpu_load小于tunables->go_hispeed_load并且tunables->boosted的值为0，调用choose_freq选频.
若选频后new_freq的值大于tunables->hispeed_freq并且上一次选频频率小于tunables->hispeed_freq，那么直接设置new_freq为tunables->hispeed_freq.

关于choose_freq是如何选频的，我单独写了一篇文章：CPU动态调频三：interactive governor如何选频

继续探究cpufreq_interactive_timer

    if (pcpu->target_freq >= tunables->hispeed_freq &&        new_freq > pcpu->target_freq &&        now - pcpu->hispeed_validate_time <        freq_to_above_hispeed_delay(tunables, pcpu->target_freq)) {        trace_cpufreq_interactive_notyet(            data, cpu_load, pcpu->target_freq,            pcpu->policy->cur, new_freq);        spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);        goto rearm;    }

freq_to_above_hispeed_delay，只是返回了tunables->above_hispeed_delay[i]的数值，我们只设置了一个数值default_above_hispeed_delay.
重点是这个成员的含义，可以回头看一下INIT阶段的解释.
如果满足

pcpu->target_freq >= tunables->hispeed_freq &&        new_freq > pcpu->target_freq

上一次选频频率已经大于等于tunables->hispeed_freq，本次选频频率比上次更大（系统仍然想增加频率）

now - pcpu->hispeed_validate_time <        freq_to_above_hispeed_delay(tunables, pcpu->target_freq))

now是本次采样时间戳，pcpu->hispeed_validate_time是上次hispeed生效的时间戳，如果两次时间间隔比above_hispeed_delay小，那么直接goto rearm，不调节频率.

        pcpu->hispeed_validate_time = now;

更新hispeed_validate_time为now

    if (cpufreq_frequency_table_target(pcpu->policy, pcpu->freq_table,                       new_freq, CPUFREQ_RELATION_L,                       &index)) {        spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);        goto rearm;    }    new_freq = pcpu->freq_table[index].frequency;

取freq table中大于或等于new_freq的最小频率，返回index，再由index得到new freq，前面已经得到new freq了，这里为什么要再来一次，不是很理解.

    /*     * Do not scale below floor_freq unless we have been at or above the     * floor frequency for the minimum sample time since last validated.     */    if (new_freq < pcpu->floor_freq) {        if (now - pcpu->floor_validate_time <                tunables->min_sample_time) {            trace_cpufreq_interactive_notyet(                data, cpu_load, pcpu->target_freq,                pcpu->policy->cur, new_freq);            spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);            goto rearm;        }    }

当new_freq < pcpu->floor_freq，并且当前时间距离上次记录的floor_validate_time不足min_sample_time，此时不降低频率，直接goto rearm.网上有大神说，“在最小抽样周期间隔内，CPU的频率是不会变化的.”更准确地说，是在这段时间内频率不会降到floor_freq以下.

    /*     * Update the timestamp for checking whether speed has been held at     * or above the selected frequency for a minimum of min_sample_time,     * if not boosted to hispeed_freq.  If boosted to hispeed_freq then we     * allow the speed to drop as soon as the boostpulse duration expires     * (or the indefinite boost is turned off).     */    if (!tunables->boosted || new_freq > tunables->hispeed_freq) {        pcpu->floor_freq = new_freq;        pcpu->floor_validate_time = now;    }

做一些更新数据的工作

if (pcpu->target_freq == new_freq &&                        pcpu->target_freq <= pcpu->policy->cur) {                trace_cpufreq_interactive_already(                        data, cpu_load, pcpu->target_freq,                        pcpu->policy->cur, new_freq);                spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);                goto rearm_if_notmax;        }

rearm_if_notmax:        /*         * Already set max speed and don't see a need to change that,         * wait until next idle to re-evaluate, don't need timer.         */        if (pcpu->target_freq == pcpu->policy->max)                goto exit;

如果两次选频频率一样并且上一次选频频率不大于当前频率，那么进入rearm_if_notmax判断是否pcpu->target_freq == pcpu->policy->max，如果相等，那么直接退出，不需要调频，当前频率已经处于max speed

 pcpu->target_freq = new_freq; spin_unlock_irqrestore(&pcpu->target_freq_lock, flags); spin_lock_irqsave(&speedchange_cpumask_lock, flags); cpumask_set_cpu(data, &speedchange_cpumask); spin_unlock_irqrestore(&speedchange_cpumask_lock, flags); wake_up_process(speedchange_task);

将new_freq赋值给target_freq，更新目标频率的数值.
设置需要调节频率的CPUcore的cpumask
唤醒speedchange_task线程，改变CPU频率
speedchange_task被定义在

/* realtime thread handles frequency scaling */static struct task_struct *speedchange_task;

对应的线程是

        speedchange_task =                kthread_create(cpufreq_interactive_speedchange_task, NULL,                                "cfinteractive");

/*
 * Thread function of the "cfinteractive" kthread (speedchange_task).
 *
 * Sleeps until at least one CPU is flagged in speedchange_cpumask, then,
 * for every flagged CPU, raises/lowers the policy frequency to the highest
 * target_freq among the CPUs listed in that CPU's policy->cpus.
 *
 * @data: unused by this function.
 *
 * Returns 0 once kthread_should_stop() reports true after a wakeup.
 */
static int cpufreq_interactive_speedchange_task(void *data)
{
	unsigned int cpu;
	cpumask_t tmp_mask;
	unsigned long flags;
	struct cpufreq_interactive_cpuinfo *pcpu;

	while (1) {
		/*
		 * Mark the task as sleeping *before* inspecting the mask;
		 * this is the usual kernel sleep pattern so that a wakeup
		 * arriving between the check and schedule() is not lost.
		 */
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock_irqsave(&speedchange_cpumask_lock, flags);

		if (cpumask_empty(&speedchange_cpumask)) {
			/* Nothing pending: drop the lock and sleep. */
			spin_unlock_irqrestore(&speedchange_cpumask_lock,
					       flags);
			schedule();

			/* The stop request is only checked after a wakeup. */
			if (kthread_should_stop())
				break;

			/* Re-take the lock so the snapshot below is protected. */
			spin_lock_irqsave(&speedchange_cpumask_lock, flags);
		}

		set_current_state(TASK_RUNNING);
		/*
		 * Snapshot and clear the pending mask while holding the lock,
		 * then process the private copy with the lock released.
		 */
		tmp_mask = speedchange_cpumask;
		cpumask_clear(&speedchange_cpumask);
		spin_unlock_irqrestore(&speedchange_cpumask_lock, flags);

		for_each_cpu(cpu, &tmp_mask) {
			unsigned int j;
			unsigned int max_freq = 0;

			pcpu = &per_cpu(cpuinfo, cpu);
			/* Skip this CPU if enable_sem cannot be read-locked right now. */
			if (!down_read_trylock(&pcpu->enable_sem))
				continue;
			/* Skip CPUs whose governor is not enabled. */
			if (!pcpu->governor_enabled) {
				up_read(&pcpu->enable_sem);
				continue;
			}

			/* Find the largest target_freq among the CPUs in policy->cpus. */
			for_each_cpu(j, pcpu->policy->cpus) {
				struct cpufreq_interactive_cpuinfo *pjcpu =
					&per_cpu(cpuinfo, j);

				if (pjcpu->target_freq > max_freq)
					max_freq = pjcpu->target_freq;
			}

			/* Only call into the driver when the frequency would change. */
			if (max_freq != pcpu->policy->cur)
				__cpufreq_driver_target(pcpu->policy,
							max_freq,
							CPUFREQ_RELATION_H);
			/* Trace this CPU's own target_freq and the policy's current frequency. */
			trace_cpufreq_interactive_setspeed(cpu,
						     pcpu->target_freq,
						     pcpu->policy->cur);

			up_read(&pcpu->enable_sem);
		}
	}

	return 0;
}

这个函数比较简单：在一个while循环中，遍历speedchange_cpumask中被置位的CPU；对其中每个CPU，再遍历与它共享同一policy的所有CPU（pcpu->policy->cpus），取其中最大的target_freq作为max_freq，即我们需要设置的CPU频率.
若max_freq != pcpu->policy->cur,说明当前频率不等于我们需要设置的频率，调用__cpufreq_driver_target完成频率设置.
__cpufreq_driver_target会调用对应的callback完成频率设置，具体和cpufreq driver相关，需要driver工程师根据自己的平台实现.

回顾一下之前的工作，我们分析了interactive governor的创建，初始化
如果CPUFREQ core想要启用interactive governor，就要调用interactive governor提供的interface——.governor
在这个callback中，分析了governor在policy方面的初始化，start一个governor，然后调频的工作就交给了定时器（定时器在start governor的时候被启动）.
在定时器中，计算cpu_load，然后根据cpu_load来选频，然后更新pcpu的一些数据，选频得到的频率交由CPUFREQ driver来设置到硬件中去.

PS：关于定时器的工作原理，我写了一篇文章：

回到cpufreq_gov_interactive.governor这个callback，继续向下分析：

CPUFREQ_GOV_STOP

    case CPUFREQ_GOV_STOP:        mutex_lock(&gov_lock);        for_each_cpu(j, policy->cpus) {            pcpu = &per_cpu(cpuinfo, j);            down_write(&pcpu->enable_sem);            pcpu->governor_enabled = 0;            del_timer_sync(&pcpu->cpu_timer);            del_timer_sync(&pcpu->cpu_slack_timer);            up_write(&pcpu->enable_sem);        }        mutex_unlock(&gov_lock);        break;

遍历policy->cpus中的每个CPU（即该policy所管理的CPU），并在持有pcpu->enable_sem写锁的情况下：
获取cpuinfo
设置pcpu->governor_enabled为0
删除两个定时器

CPUFREQ_GOV_LIMITS

case CPUFREQ_GOV_LIMITS:        if (policy->max < policy->cur)            __cpufreq_driver_target(policy,                    policy->max, CPUFREQ_RELATION_H);        else if (policy->min > policy->cur)            __cpufreq_driver_target(policy,                    policy->min, CPUFREQ_RELATION_L);        for_each_cpu(j, policy->cpus) {            pcpu = &per_cpu(cpuinfo, j);            down_read(&pcpu->enable_sem);            if (pcpu->governor_enabled == 0) {                up_read(&pcpu->enable_sem);                continue;            }            spin_lock_irqsave(&pcpu->target_freq_lock, flags);            if (policy->max < pcpu->target_freq)                pcpu->target_freq = policy->max;            else if (policy->min > pcpu->target_freq)                pcpu->target_freq = policy->min;            spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);            up_read(&pcpu->enable_sem);            /* Reschedule timer only if policy->max is raised.             * Delete the timers, else the timer callback may             * return without re-arm the timer when failed             * acquire the semaphore. This race may cause timer             * stopped unexpectedly.             */            if (policy->max > pcpu->max_freq) {                down_write(&pcpu->enable_sem);                del_timer_sync(&pcpu->cpu_timer);                del_timer_sync(&pcpu->cpu_slack_timer);                cpufreq_interactive_timer_start(tunables, j);                up_write(&pcpu->enable_sem);            }            pcpu->max_freq = policy->max;        }        break;

该event被调用的场景是：change or update limits.
当policy的max或min被改变时，会调用cpufreq_update_policy—>cpufreq_set_policy—>__cpufreq_governor，在__cpufreq_governor中policy->governor->governor调用governor的governor callback
然后进入CPUFREQ_GOV_LIMITS
此时传入cpufreq_governor_interactive的policy指针已经是min或max被改变后的新policy了
对于新policy的处理如下：
改变当前频率，使其符合新policy的范围
遍历policy->cpus中的每个CPU（即该policy所管理的CPU）：
判断pcpu->target_freq的值，确保其在新policy的范围内
如果之前的policy->max（即pcpu->max_freq）小于新的policy->max，那么删除两个定时器（cpu_timer和cpu_slack_timer）
调用cpufreq_interactive_timer_start,重新add定时器
将pcpu->max_freq的值更新为新policy的最大值

1 0