Linux时间管理（五）

来源：互联网发布：软件的数据接口编辑：程序博客网时间：2024/04/27 22:01

1.1 低精度下的hrtimer

在低精度模式下，hrtimer的核心处理函数是 hrtimer_run_queues，每一次 tick中断都要执行一次（在tick的中断处理函数中调用update_process_times）。这个函数的调用流程为：

update_process_times

run_local_timers

hrtimer_run_queues

raise_softirq(TIMER_SOFTIRQ)

其中hrtimer_run_queues是对到期的高精度定时器hrtimer的处理。

/*
 * hrtimer_run_queues - expire hrtimers while in low-resolution mode.
 *
 * Walks every clock base of the current CPU and runs each queued timer
 * whose expiry lies before the base's cached softirq time.  Returns
 * immediately once high-resolution mode is active.
 */
void hrtimer_run_queues(void)
{
	struct timerqueue_node *node;
	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
	struct hrtimer_clock_base *base;
	int index, gettime = 1;

	/* With high-resolution mode enabled this function is a no-op. */
	if (hrtimer_hres_active())
		return;

	for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
		base = &cpu_base->clock_base[index];

		/* Nothing queued on this clock base. */
		if (!timerqueue_getnext(&base->active))
			continue;

		/* Refresh the cached softirq time once, on first need. */
		if (gettime) {
			hrtimer_get_softirq_time(cpu_base);
			gettime = 0;
		}

		raw_spin_lock(&cpu_base->lock);

		while ((node = timerqueue_getnext(&base->active))) {
			struct hrtimer *timer;

			timer = container_of(node, struct hrtimer, node);

			/* The head timer has not expired yet: done with this base. */
			if (base->softirq_time.tv64 <=
					hrtimer_get_expires_tv64(timer))
				break;

			/*
			 * Dequeue the timer, run its callback and update
			 * the timer state.
			 */
			__run_hrtimer(timer, &base->softirq_time);
		}

		raw_spin_unlock(&cpu_base->lock);
	}
}

可以看出：在未配置高精度模式时，hrtimer的到期由函数hrtimer_run_queues检查。hrtimer_run_queues是在run_local_timers中被调用，而run_local_timers又是在系统时钟中断中被调用。从这里可以看出，与传统的使用时间轮算法的定时器一样，hrtimer在未配置高精度模式时采用了在每一个系统时钟中断中轮询的方式来判断hrtimer是否到期，因此，这里的定时精度为时钟中断轮询的时间间隔。在函数hrtimer_run_queues的开始处，会执行一项检查：

if (hrtimer_hres_active())

return;

所以在配置高精度模式后，这里的hrtimer_run_queues函数相当于空函数，会直接返回。

1.2 高精度下的hrtimer

配置了高精度模式之后，hrtimer的到期由clock_event设备产生的硬中断来处理，其中断处理函数为hrtimer_interrupt。注意这里不再采用传统的轮询方式判断定时器是否到期，而是通过对clock_event_device进行编程，使其在最早到期的定时器的超时时间点触发一个中断来执行超时操作。所以，这里的定时精度由clock_event_device的计时精度决定。

由于刚启动时没有特别重要的任务要做，因此默认是进入低精度+周期tick的工作模式，之后会根据硬件的配置（如硬件上是否支持高精度timer）和软件的配置（如是否通过命令行参数或者内核配置使能了高精度timer等特性）进行切换。切换过程的发起函数为run_timer_softirq，该函数被TIMER_SOFTIRQ软中断触发。其具体的流程为

run_timer_softirq

hrtimer_run_pending

tick_check_oneshot_change (在这里可能会切换到NOHZ模式，在后面进行分析)

hrtimer_switch_to_hres

在update_process_times中，除了处理处于低精度模式的hrtimer外，还要唤醒 IRQ0的 softIRQ（TIMER_SOFTIRQ）以便执行timer wheel的代码。由于hrtimer子系统的加入，在IRQ0的softIRQ中，还需要通过hrtimer_run_pending检查是否可以将hrtimer切换到高精度模式：

hrtimer 进行精度切换的处理函数

/*
 * hrtimer_run_pending - check whether high-resolution mode can be entered.
 *
 * Does nothing when high-resolution mode is already active.  Otherwise
 * asks tick_check_oneshot_change() and, on a non-zero answer, performs
 * the switch through hrtimer_switch_to_hres().
 */
void hrtimer_run_pending(void)
{
	/*
	 * Short-circuit evaluation: the tick layer is consulted only
	 * while we are still in low-resolution mode.
	 */
	if (!hrtimer_hres_active() &&
	    tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
		hrtimer_switch_to_hres();
}

每一次触发IRQ0的softIRQ都需要检查一次是否可以将hrtimer切换到高精度，显然是十分低效的，希望将来有更好的方法不用每次都进行检查。

如果可以将hrtimer切换到高精度模式，则调用hrtimer_switch_to_hres函数进行切换。hrtimer切换到高精度模式的核心函数

/*
 * hrtimer_switch_to_hres - move the current CPU to high-resolution mode.
 *
 * Returns 1 when the CPU is (or already was) in high-resolution mode and
 * 0 when tick_init_highres() reports failure.
 */
static int hrtimer_switch_to_hres(void)
{
	unsigned long irq_flags;
	int this_cpu = smp_processor_id();
	struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, this_cpu);
	int failed;

	/* Already switched: nothing to do. */
	if (cpu_base->hres_active)
		return 1;

	local_irq_save(irq_flags);

	failed = tick_init_highres();
	if (!failed) {
		cpu_base->hres_active = 1;
		cpu_base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES;
		cpu_base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES;

		tick_setup_sched_timer();

		/* "Retrigger" the interrupt to get things going */
		retrigger_next_event(NULL);
	}

	local_irq_restore(irq_flags);

	/* The warning is emitted only after interrupts have been restored. */
	if (failed) {
		printk(KERN_WARNING "Could not switch to high resolution "
				    "mode on CPU %d\n", this_cpu);
		return 0;
	}

	return 1;
}

在这个函数中，首先使用tick_init_highres更新与原来的tick device绑定的时钟事件设备的event handler，例如将在低精度模式下的工作函数tick_handle_periodic或者tick_handle_periodic_broadcast换成hrtimer_interrupt（它是hrtimer在高精度模式下的timer中断处理函数），同时将tick device的触发模式变为one-shot，即单次触发模式，这是使用dynamic tick或者hrtimer时tick device的工作模式。tick_init_highres通过调用tick_switch_to_oneshot函数来完成上述工作。

具体的代码如下：

/*
 * tick_switch_to_oneshot - put this CPU's tick device into one-shot mode.
 * @handler: event handler to install on the clock event device
 *
 * The switch is made only when a clock event device is present, it
 * advertises CLOCK_EVT_FEAT_ONESHOT and tick_device_is_functional()
 * accepts it.  Returns 0 on success; otherwise logs the reason and
 * returns -EINVAL.
 */
int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
{
	struct tick_device *tick_dev = &__get_cpu_var(tick_cpu_device);
	struct clock_event_device *evt = tick_dev->evtdev;

	/* All three requirements met: perform the switch. */
	if (evt && (evt->features & CLOCK_EVT_FEAT_ONESHOT) &&
	    tick_device_is_functional(evt)) {
		tick_dev->mode = TICKDEV_MODE_ONESHOT;
		evt->event_handler = handler;
		clockevents_set_mode(evt, CLOCK_EVT_MODE_ONESHOT);
		tick_broadcast_switch_to_oneshot();
		return 0;
	}

	/* At least one requirement failed: report which one. */
	printk(KERN_INFO "Clockevents: "
	       "could not switch to one-shot mode:");
	if (!evt)
		printk(" no tick device\n");
	else if (!tick_device_is_functional(evt))
		printk(" %s is not functional.\n", evt->name);
	else
		printk(" %s does not support one-shot mode.\n",
		       evt->name);

	return -EINVAL;
}

由于dynamic tick可以随时停止和开始，以不规律的速度产生tick，因此支持one-shot模式的时钟事件设备是必须的；对于hrtimer，由于hrtimer采用事件机制驱动timer前进，因此使用one-shot的触发模式也是顺理成章的。不过这样一来，原本tick device每次执行中断时需要完成的周期性任务如更新jiffies / wall time (do_timer)以及更新process的使用时间（update_process_times）等工作在切换到高精度模式之后就没有了，因此在执行完tick_init_highres之后紧接着会调用tick_setup_sched_timer函数来完成这部分设置工作。

下面我们就来看一下，中断处理函数hrtimer_interrupt

/*
 * hrtimer_interrupt - clock event handler used in high-resolution mode.
 * @dev: the clock event device that raised the interrupt
 *
 * Runs every expired timer on each active clock base of this CPU, works
 * out the earliest remaining expiry and reprograms the event device for
 * it.  If programming keeps failing, the interrupt is flagged as hung
 * and the next event is pushed out so the CPU can make progress.
 */
void hrtimer_interrupt(struct clock_event_device *dev)
{
	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
	ktime_t expires_next, now, entry_time, delta;
	int i, retries = 0;

	BUG_ON(!cpu_base->hres_active);
	cpu_base->nr_events++;
	dev->next_event.tv64 = KTIME_MAX;

	/* Remember the time at which the handler was entered. */
	entry_time = now = ktime_get();
retry:
	expires_next.tv64 = KTIME_MAX;

	raw_spin_lock(&cpu_base->lock);
	/*
	 * We set expires_next to KTIME_MAX here with cpu_base->lock
	 * held to prevent that a timer is enqueued in our queue via
	 * the migration code. This does not affect enqueueing of
	 * timers which run their callback and need to be requeued on
	 * this CPU.
	 */
	cpu_base->expires_next.tv64 = KTIME_MAX;

	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
		struct hrtimer_clock_base *base;
		struct timerqueue_node *node;
		ktime_t basenow;

		/*
		 * Bit i of active_bases corresponds to clock base i (not
		 * to a CPU); bases whose bit is clear are skipped,
		 * presumably because they have no queued timers.
		 */
		if (!(cpu_base->active_bases & (1 << i)))
			continue;

		base = cpu_base->clock_base + i;
		basenow = ktime_add(now, base->offset);

		while ((node = timerqueue_getnext(&base->active))) {
			struct hrtimer *timer;

			timer = container_of(node, struct hrtimer, node);

			/*
			 * The head timer's soft expiry is still ahead of
			 * basenow, so nothing more has expired on this
			 * base.  Its hard expiry, with the base offset
			 * subtracted again, is a candidate for the next
			 * event.
			 */
			if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
				ktime_t expires;

				expires = ktime_sub(hrtimer_get_expires(timer),
						    base->offset);
				if (expires.tv64 < expires_next.tv64)
					expires_next = expires;
				break;
			}

			__run_hrtimer(timer, &basenow);
		}
	}

	/*
	 * Store the new expiry value so the migration code can verify
	 * against it.
	 */
	cpu_base->expires_next = expires_next;
	raw_spin_unlock(&cpu_base->lock);

	/* Reprogramming necessary ? */
	if (expires_next.tv64 == KTIME_MAX ||
	    !tick_program_event(expires_next, 0)) {
		cpu_base->hang_detected = 0;
		return;
	}

	/*
	 * Programming the event failed, presumably because expires_next
	 * already lies in the past.  Take a fresh timestamp and run the
	 * expiry loop again; at most three passes are made in total.
	 */
	now = ktime_get();
	cpu_base->nr_retries++;
	if (++retries < 3)
		goto retry;

	/*
	 * Give the system a chance to do something else than looping
	 * here. We stored the entry time, so we know exactly how long
	 * we spent here. We schedule the next event this amount of
	 * time away.
	 */
	cpu_base->nr_hangs++;
	cpu_base->hang_detected = 1;
	delta = ktime_sub(now, entry_time);
	if (delta.tv64 > cpu_base->max_hang_time.tv64)
		cpu_base->max_hang_time = delta;

	/*
	 * Limit it to a sensible value as we enforce a longer
	 * delay. Give the CPU at least 100ms to catch up.
	 */
	if (delta.tv64 > 100 * NSEC_PER_MSEC)
		expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
	else
		expires_next = ktime_add(now, delta);

	/* NOTE(review): the second argument looks like a "force" flag -- confirm. */
	tick_program_event(expires_next, 1);
	printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
		    ktime_to_ns(delta));
}

这个函数的逻辑相对比较复杂，遍历每个时钟基，判断时钟是否到期，如果有到期的时钟就处理；如果hrtimer还没有到期，则计算下次到期的时间并根据需要确认是否编程。

if (expires_next.tv64 == KTIME_MAX ||

!tick_program_event(expires_next, 0)) {

cpu_base->hang_detected = 0;

return;

}

if语句判断的两个条件表示两种情况：第一种是expires_next.tv64 == KTIME_MAX成立，表示到期的定时器已全部处理完毕，且没有尚未到期的定时器需要等待，无需重新编程，直接返回；第二种是expires_next.tv64 == KTIME_MAX不成立，需要调用tick_program_event(expires_next, 0)对时钟事件设备重新编程，编程成功（返回0）后返回。

当上述的两种情况都不成立，即expires_next.tv64 == KTIME_MAX不成立且tick_program_event(expires_next, 0)编程不成功（主要是由于expires_next是过去的时间点）时，则会跳到retry进行重试；三次不成功后，则设置相应的状态并重新编程退出。

下面就看一下hrtimer_interrupt的核心处理函数__run_hrtimer。该函数的主要功能是：移除到期的hrtimer，执行hrtimer的回调函数并更新hrtimer的状态。

/*
 * __run_hrtimer - dequeue an expired timer and invoke its callback.
 * @timer: the expired timer
 * @now:   current time of the timer's clock base, passed to the tracepoint
 *
 * Entered with cpu_base->lock held and interrupts disabled.  The lock is
 * dropped around the callback and re-taken afterwards.  The timer is
 * re-enqueued when the callback returns anything other than
 * HRTIMER_NORESTART.
 */
static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
{
	struct hrtimer_clock_base *base = timer->base;
	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
	enum hrtimer_restart (*fn)(struct hrtimer *);
	int restart;

	WARN_ON(!irqs_disabled());

	debug_deactivate(timer);
	/* Take the timer off the queue and mark it as running its callback. */
	__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
	timer_stats_account_hrtimer(timer);
	fn = timer->function;

	/*
	 * Because we run timers from hardirq context, there is no chance
	 * they get migrated to another cpu, therefore its safe to unlock
	 * the timer base.
	 */
	raw_spin_unlock(&cpu_base->lock);
	trace_hrtimer_expire_entry(timer, now);
	restart = fn(timer);
	trace_hrtimer_expire_exit(timer);
	raw_spin_lock(&cpu_base->lock);

	/*
	 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
	 * we do not reprogramm the event hardware. Happens either in
	 * hrtimer_start_range_ns() or in hrtimer_interrupt()
	 */
	if (restart != HRTIMER_NORESTART) {
		BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
		enqueue_hrtimer(timer, base);
	}

	WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));

	timer->state &= ~HRTIMER_STATE_CALLBACK;
}

在一个支持hrtimer高精度模式并使能了dynamic tick的系统中，在系统的时钟源已经切换到支持高精度的时钟源后的第一次发生IRQ0的软中断时hrtimer就会进行从低精度到高精度的切换，然后再进一步切换到NOHZ模式。

1.3 Hrtimer初始化过程

负责启动系统的CPU的hrtimer的初始化也是在start_kernel中进行的，其具体流程如下：

start_kernel

hrtimers_init

hrtimer_cpu_notify

init_hrtimers_cpu

register_cpu_notifier

open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq)

首先，分析一下hrtimers_init的代码，其主要完成的工作是：初始化当前cpu的hrtimer的相关数据结构，把hrtimers_nb通知块注册到cpu_chain通知链，并根据内核配置CONFIG_HIGH_RES_TIMERS决定是否打开HRTIMER_SOFTIRQ软中断。

/*
 * hrtimers_init - boot-time setup of the hrtimer subsystem.
 *
 * Invokes the CPU notifier callback directly with CPU_UP_PREPARE for the
 * CPU executing this code, registers the hrtimers_nb notifier block and,
 * when CONFIG_HIGH_RES_TIMERS is set, opens HRTIMER_SOFTIRQ with
 * run_hrtimer_softirq as its handler.
 */
void __init hrtimers_init(void)
{
	/* The notifier callback takes the CPU number packed into a pointer. */
	void *this_cpu = (void *)(long)smp_processor_id();

	hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
			   this_cpu);
	register_cpu_notifier(&hrtimers_nb);

#ifdef CONFIG_HIGH_RES_TIMERS
	open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
#endif
}

hrtimer相关的数据结构（cpu base和clockbase）都是在hrtimer_cpu_notify中调用init_hrtimers_cpu来完成的，具体代码如下：

static void __cpuinitinit_hrtimers_cpu(int cpu)

{

//获取per-cup变量hrtimer_base

struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);

int i;

raw_spin_lock_init(&cpu_base->lock);

for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {

cpu_base->clock_base[i].cpu_base = cpu_base;

//初始化红黑树和timerqueue

timerqueue_init_head(&cpu_base->clock_base[i].active);

}

//完成cpu_base的初始化，即设置cpu_base的下次事件的到期时间为KTIME_MAX

//并设置cpu_base中的域hres_active为0，表示现在处于低精度模式。

hrtimer_init_hres(cpu_base);

}