简谈一下时间轮（Time Wheel）

来源：互联网　发布：淘宝装修全套教程　编辑：程序博客网　时间：2024/05/17 02:50

转自: http://bookjovi.iteye.com/blog/1329614

如果一个程序员不知道 Time Wheel，那么那个程序员一定不是个合格的程序员。

无论是对于操作系统，还是对于虚拟机语言或大型中间件，timer都起着重要的作用，同时timer算法的选择也直接影响着性能。

Time Wheel翻译为时间轮，是用于实现定时器timer的经典算法，算法细节就不多说了，这里主要是看看Erlang中和Linux kernel的time wheel实现有哪些不同。

Erlang中的Time Wheel实现文件是time.c，kernel中的实现文件是timer.c，好了，先看看kernel中的实现吧！

Linux kernel中的time wheel这么多年一直没怎么改变，主要特点是以下几点：

1）kernel中的timer是在softirq中执行

2）多CPU同时执行，和process差不多，timer也可在cpu中migrate

3）使用percpu

4）核心数据结构：

view plain
/*
 * Timer wheel state processed by __run_timers() and cascade().
 */
struct tvec_base {  
        spinlock_t lock;                        /* held (spin_lock_irq) while the wheel is manipulated; dropped around each timer callback */
        struct timer_list *running_timer;       /* timer whose callback is being run; reset to NULL when __run_timers() finishes */
        unsigned long timer_jiffies;            /* wheel position; advanced one tick at a time while it has not passed jiffies */
        unsigned long next_timer;               /* NOTE(review): not referenced in the visible code; presumably the expiry of the next pending timer -- confirm */
        struct tvec_root tv1;                   /* first level: slot (timer_jiffies & TVR_MASK) is executed on each tick */
        struct tvec tv2;                        /* higher levels: a slot is cascaded when the level below wraps to index 0 */
        struct tvec tv3;  
        struct tvec tv4;  
        struct tvec tv5;  
} ____cacheline_aligned;  

这里的base属于percpu数据，即每个cpu拥有一个base，这样每个cpu只执行自己base里面的timer。base中有tv1/tv2/tv3/tv4/tv5五个vector，它们共同维护着所有的timer：每次添加timer时，根据timeout的时间把它加入到不同的vector中，tv1存放最近到期的，tv5存放最远到期的。kernel首先在tv1中遍历已到期的timer；tv1遍历完一圈后，就把tv2中对应slot的timer list重新分配到tv1中；tv2中的timer list用完后，再从tv3中取（注意tv3中的timer可能被重新分布到tv1和tv2中），以此类推。实现代码如下：

view plain
/*
 * Slot index for cascade level N: shift base->timer_jiffies right by
 * TVR_BITS + N * TVN_BITS bits and mask the result with TVN_MASK.
 * Expands to an expression that uses a variable named `base`, which must
 * be in scope wherever the macro is used.
 */
#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)  
  
/** 
 * __run_timers - run all expired timers (if any) on this CPU. 
 * @base: the timer vector to be processed. 
 * 
 * This function cascades all vectors and executes all expired timer 
 * vectors. 
 *
 * Advances base->timer_jiffies one tick at a time for as long as
 * time_after_eq(jiffies, base->timer_jiffies) holds. For every tick the
 * matching tv1 slot is moved to a local list and each timer on it is run.
 * base->lock is held throughout, except while a timer callback executes.
 */  
static inline void __run_timers(struct tvec_base *base)  
{  
        struct timer_list *timer;  
  
        spin_lock_irq(&base->lock);  
        while (time_after_eq(jiffies, base->timer_jiffies)) {  
                struct list_head work_list;  
                struct list_head *head = &work_list;  
                /* tv1 slot that belongs to the tick being processed */
                int index = base->timer_jiffies & TVR_MASK;  
  
                /* 
                 * Cascade timers: 
                 *
                 * Only done when the tv1 index is 0. cascade() returns the
                 * index it was given, so each further level is cascaded only
                 * when the previous level's index was 0 as well.
                 */  
                if (!index &&  
                        (!cascade(base, &base->tv2, INDEX(0))) &&  
                                (!cascade(base, &base->tv3, INDEX(1))) &&  
                                        !cascade(base, &base->tv4, INDEX(2)))  
                        cascade(base, &base->tv5, INDEX(3));  
                ++base->timer_jiffies;  
                /* Move the whole slot onto the local work_list, leaving the slot re-initialised. */
                list_replace_init(base->tv1.vec + index, &work_list);  
                while (!list_empty(head)) {  
                        void (*fn)(unsigned long);  
                        unsigned long data;  
  
                        /* Copy the callback and its argument out of the timer before the lock is released. */
                        timer = list_first_entry(head, struct timer_list,entry);  
                        fn = timer->function;  
                        data = timer->data;  
  
                        timer_stats_account_timer(timer);  
  
                        base->running_timer = timer;  
                        /*
                         * NOTE(review): presumably detach_timer() unlinks the timer
                         * from work_list, which is what lets this loop make
                         * progress -- confirm.
                         */
                        detach_timer(timer, 1);  
  
                        /* The callback runs with base->lock released; the lock is re-taken afterwards. */
                        spin_unlock_irq(&base->lock);  
                        call_timer_fn(timer, fn, data);  
                        spin_lock_irq(&base->lock);  
                }  
        }  
        base->running_timer = NULL;  
        spin_unlock_irq(&base->lock);  
}  
  
static int cascade(struct tvec_base *base, struct tvec *tv, int index)  
{  
        /* cascade all the timers from tv up one level */  
        struct timer_list *timer, *tmp;  
        struct list_head tv_list;  
  
        list_replace_init(tv->vec + index, &tv_list);  
  
        /* 
         * We are removing _all_ timers from the list, so we 
         * don't have to detach them individually. 
         */  
        list_for_each_entry_safe(timer, tmp, &tv_list, entry) {  
                BUG_ON(tbase_get_base(timer->base) != base);  
                internal_add_timer(base, timer);  
        }  
  
        return index;  
}  

可以看出，即使你加了很多长时间的timer，kernel的timer性能也不会下降，因为长时间的timer被分布到不同的vector中，因此Linux kernel中的time wheel算法适合大容量的timer应用场景。

（注意kernel中每个base的lock用的是spin lock，而不是mutex，下面会讲到）

下面再来看看Erlang中的timer实现。erlang普遍应用于并发量比较高的场景，erlang的process之间通过message通信，而message的发送和接收显然离不开timer，erlang甚至把timer提升到了语言语法的层次，由此可以看出timer在erlang中的使用是多么广泛。

和Linux kernel的time wheel比较，erlang有以下几点不同：

1）erlang的timer执行过程是在erlang process schedule时发生，而不是像很多中间件timer实现那样用单独的线程，这是有历史原因的（Erlang应兼顾到plain cpu的情形）。

2）erlang的scheduler线程可以有多个，所以timer wheel需要lock的支持

3）没有percpu，由于erlang在user space，所以percpu是个很难的问题，原因是抢占的问题，kernel实现的percpu可以显著提高性能，但也是有代价的，代价就是在很多percpu的处理过程中要关闭抢占，这也就是为什么RT kernel的人比较头疼percpu的原因。而在用户空间，抢占被操作系统强制执行，导致用户空间程序无法使用percpu。

4）Erlang中的time wheel没有像Linux kernel那样把timer根据相对超时时间挂载到tv1/tv2/tv3/tv4/tv5中，但是erlang中wheel的slot数却比较大（kernel中tv2~tv5每级的slot数是16或64，tv1是64或256），可以是8192或65536，这在一定程度上缓解了大量长时间timer对性能带来的影响。如果把每个slot的间隔时间算作1ms，wheel的大小算作8192，那么大约8s就遍历完一圈；如果程序中有大量timer的超时时间大于8s，那么这些timer就会按超时时间对8192取模后挂载在相应的slot下，这就意味着每次遍历时会有很多并未超时的timer被访问到，而这在Linux kernel中则不存在。核心代码如下：

view plain
/*
 * Advance the wheel position tiw_pos by dt slots (modulo TIW_SIZE).
 * Timers in the visited slots whose count is below the computed round
 * count are unlinked and chained onto a local timeout queue; all other
 * timers in those slots have their count reduced.
 *
 * The caller must hold tiw_lock write-locked; this function releases it
 * (on the early return as well as at the end of the visible code).
 */
static ERTS_INLINE void bump_timer_internal(erts_short_time_t dt) /* PRE: tiw_lock is write-locked */  
{  
    Uint keep_pos;  
    Uint count;  
    ErlTimer *p, **prev, *timeout_head, **timeout_tail;  
    Uint dtime = (Uint) dt;  
  
    /* no need to bump the position if there aren't any timeouts */  
    if (tiw_nto == 0) {  
        erts_smp_mtx_unlock(&tiw_lock);  
        return;  
    }  
  
    /* if do_time > TIW_SIZE we want to go around just once */  
    /* count: number of full revolutions covered by dtime, plus one */
    count = (Uint)(dtime / TIW_SIZE) + 1;  
    /* keep_pos: wheel position after the whole step has been applied */
    keep_pos = (tiw_pos + dtime) % TIW_SIZE;  
    /* visit at most TIW_SIZE slots, however large the step is */
    if (dtime > TIW_SIZE) dtime = TIW_SIZE;  
  
    /* timeout queue: singly linked through ->next, appended to via timeout_tail */
    timeout_head = NULL;  
    timeout_tail = &timeout_head;  
    while (dtime > 0) {  
        /* this is to decrease the counters with the right amount */  
        /* when dtime >= TIW_SIZE */  
        /* from the final position onwards the slots are passed one time fewer */
        if (tiw_pos == keep_pos) count--;  
        prev = &tiw[tiw_pos];  
        while ((p = *prev) != NULL) {  
            ASSERT( p != p->next);  
            if (p->count < count) {     /* we have a timeout */  
                /* remove min time */  
                if (tiw_min_ptr == p) {  
                    tiw_min_ptr = NULL;  
                    tiw_min = 0;  
                }  
  
                /* Remove from list */  
                /*
                 * NOTE(review): prev is not advanced in this branch;
                 * presumably remove_timer() unlinks p so that *prev now
                 * refers to the following entry -- confirm.
                 */
                remove_timer(p);  
                *timeout_tail = p;      /* Insert in timeout queue */  
                timeout_tail = &p->next;  
            }  
            else {  
                /* no timeout, just decrease counter */  
                p->count -= count;  
                prev = &p->next;  
            }  
        }  
        tiw_pos = (tiw_pos + 1) % TIW_SIZE;  
        dtime--;  
    }  
    tiw_pos = keep_pos;  
    /* a surviving minimum timer is now dt closer */
    if (tiw_min_ptr)  
        tiw_min -= dt;  
  
    erts_smp_mtx_unlock(&tiw_lock);  
    /*
     * NOTE(review): the excerpt stops here without the function's closing
     * brace, and the timeout queue built above is not consumed in the
     * visible lines; the remainder of the function appears to be omitted.
     */

综上比较，在面对大容量timer的情况下，Linux kernel的time wheel算法会比Erlang的更有效率一些。最后还有一点要注意：Erlang的time wheel使用的lock是mutex（上面说过Linux kernel使用的是spin lock），那么哪种lock更适合time wheel呢？个人觉得spin lock会好些，毕竟临界区的代码应该会执行得很快。当然，如果erlang中的ethread mutex采用的是先spin的机制（mutex基于futex实现，在进入kernel的futex之前先自旋很短的一段时间），那就无所谓了。

0 0