Linux内核CFS调度器

来源：互联网发布：landesk软件下载编辑：程序博客网时间：2024/05/02 05:06

http://blog.csdn.net/zhoudaxia/article/details/7375668

https://www.ibm.com/developerworks/cn/linux/l-completely-fair-scheduler/

http://blog.csdn.net/hs794502825/article/details/10495161

Linux CFS思想简介

目前linux内核中所使用的调度器为CFS调度器[16, 26]，CFS定义了一种新的模型，它给CFS的运行队列中的每一个进程安排一个虚拟时钟，vruntime。如果一个进程得以执行，其vruntime将不断增大。没有得到执行的进程vruntime不变。而调度器总是选择vruntime最小的进程来执行。这就是所谓的“完全公平”。为了区别不同进程的优先级，优先级高的进程vruntime增长的较慢，以至于它可能得到更多的运行机会。

CFS的基本设计思路就是根据各个进程的权重分配运行时间。进程的运行时间计算公式为：

分配给进程的运行时间 = 调度周期 *（进程权重 / 所有进程权重之和），

它的公平就体现咋另外一个量上面，vruntime，它记录着进程已经运行的时间，但是并不是直接记录，而是根据运行时间放大或者缩小一个比例。实际运行时间到vruntime 的换算公式为：

vruntime = 实际运行时间 * 1024 / 进程权重，

这里的1024代表nice值为0的进程权重。所有的进程都以nice为0的权重1024作为基准，计算自己的vruntime。上面两个公式可得出，虽然进程的权重不同，但是它们的 vruntime增长速度应该是一样的，与权重无关。既然所有进程的vruntime增长速度宏观上看应该是同时推进的，那么就可以用vruntime来选择运行的进程，vruntime值较小就说明它以前占用cpu的时间较短，受到了“不公平”对待，因此下一个运行进程就是它。这样既能公平选择进程，又能保证高优先级进程获得较多的运行时间，这就是CFS的主要思想。

Linux的CFS的实现

Linux主要实现了两大类调度算法，CFS(完全公平调度算法）和实时调度算法。宏SCHED_NOMAL和SCHED_BATCH主要用于CFS调度，而SCHED_FIFO和SCHED_RR主要用于实时调度。这几个宏的定义可以在include/linux/sched.h中找到。文件kernel/sched.c包含了内核调度器及相关系统调用的实现。调度的核心函数为sched.c中的schedule()，schedule函数封装了内核调度的框架。细节实现上调用具体的调度算法类中的函数实现，如kernel/sched_fair.c或kernel/sched_rt.c中的实现。
1、时钟tick中断的处理

在CFS中，当产生时钟tick中断时，sched.c中scheduler_tick()函数会被时钟中断（定时器timer的代码）直接调用，我们调用它则是在禁用中断时。注意在fork的代码中，当修改父进程的时间片时，也会导致sched_tick的调用。sched_tick函数首先更新调度信息，然后调整当前进程在红黑树中的位置。调整完成后如果发现当前进程不再是最左边的叶子，就标记need_resched标志，中断返回时就会调用scheduler()完成进程切换，否则当前进程继续占用CPU。注意这与以前的调度器不同，以前是tick中断导致时间片递减，当时间片被用完时才触发优先级调整并重新调度。sched_tick函数的代码如下：

copy
void scheduler_tick(void)  
{  
    int cpu = smp_processor_id();  
    struct rq *rq = cpu_rq(cpu);  
    struct task_struct *curr = rq->curr;  
  
    sched_clock_tick();  
  
    spin_lock(&rq->lock);  
    update_rq_clock(rq);  
    update_cpu_load(rq);  
    curr->sched_class->task_tick(rq, curr, 0);  
    spin_unlock(&rq->lock);  
  
    perf_event_task_tick(curr, cpu);  
  
#ifdef CONFIG_SMP  
    rq->idle_at_tick = idle_cpu(cpu);  
    trigger_load_balance(rq, cpu);  
#endif  
}  

它先获取目前CPU上的运行队列中的当前运行进程，更新runqueue级变量clock，然后通过sched_class中的接口名task_tick，调用CFS的tick处理函数task_tick_fair()，以处理时钟中断。我们看kernel/sched_fair.c中的CFS算法实现。具体的调度类如下：

copy
static const struct sched_class fair_sched_class = {  
    .next           = &idle_sched_class,  
    .enqueue_task       = enqueue_task_fair,  
    .dequeue_task       = dequeue_task_fair,  
    .yield_task     = yield_task_fair,  
  
    .check_preempt_curr = check_preempt_wakeup,  
  
    .pick_next_task     = pick_next_task_fair,  
    .put_prev_task      = put_prev_task_fair,  
  
#ifdef CONFIG_SMP  
    .select_task_rq     = select_task_rq_fair,  
  
    .load_balance       = load_balance_fair,  
    .move_one_task      = move_one_task_fair,  
    .rq_online      = rq_online_fair,  
    .rq_offline     = rq_offline_fair,  
  
    .task_waking        = task_waking_fair,  
#endif  
  
    .set_curr_task          = set_curr_task_fair,  
    .task_tick      = task_tick_fair,  
    .task_fork      = task_fork_fair,  
  
    .prio_changed       = prio_changed_fair,  
    .switched_to        = switched_to_fair,  
  
    .get_rr_interval    = get_rr_interval_fair,  
  
#ifdef CONFIG_FAIR_GROUP_SCHED  
    .task_move_group    = task_move_group_fair,  
#endif  
};  

task_tick_fair函数用于轮询调度类的中一个进程。实现如下：

[cpp] view plaincopy
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)  
{  
    struct cfs_rq *cfs_rq;  
    struct sched_entity *se = &curr->se;  
  
    for_each_sched_entity(se) {  /* 考虑了组调度 */  
        cfs_rq = cfs_rq_of(se);  
        entity_tick(cfs_rq, se, queued);  
    }  
}  

该函数获取各层的调度实体，对每个调度实体获取CFS运行队列，调用entity_tick进程进行处理。kernel/sched_fair.c中的函数entity_tick源代码如下：

copy
static void  
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)  
{  
    /* 
     * Update run-time statistics of the 'current'. 
     */  
    update_curr(cfs_rq);  
  
#ifdef CONFIG_SCHED_HRTICK  
    /* 
     * queued ticks are scheduled to match the slice, so don't bother 
     * validating it and just reschedule. 
     */  
    if (queued) {  
        resched_task(rq_of(cfs_rq)->curr);  
        return;  
    }  
    /* 
     * don't let the period tick interfere with the hrtick preemption 
     */  
    if (!sched_feat(DOUBLE_TICK) &&  
            hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))  
        return;  
#endif  
  
    if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))  
        check_preempt_tick(cfs_rq, curr);  
}  

该函数用kernel/sched_fair.c:update_curr()更新当前进程的运行时统计信息，然后调用kernel/sched_fair.c:check_preempt_tick()，检测是否需要重新调度，用下一个进程来抢占当前进程。update_curr()实现记账功能,由系统定时器周期调用，实现如下：

[cpp] view plaincopy
static inline void  
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,  
          unsigned long delta_exec)  
{  
    unsigned long delta_exec_weighted;  
  
    schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));  
  
    curr->sum_exec_runtime += delta_exec; /* 总运行时间更新 */  
    schedstat_add(cfs_rq, exec_clock, delta_exec); /* 更新cfs_rq的exec_clock */  
    /* 用优先级和delta_exec来计算weighted，以用于更新vruntime */  
    delta_exec_weighted = calc_delta_fair(delta_exec, curr);  
  
    curr->vruntime += delta_exec_weighted; /* 更新当前进程的vruntime */  
    update_min_vruntime(cfs_rq);  
}  
  
static void update_curr(struct cfs_rq *cfs_rq)  
{  
    struct sched_entity *curr = cfs_rq->curr;  
    u64 now = rq_of(cfs_rq)->clock_task;  /* now计时器 */  
    unsigned long delta_exec;  
  
    if (unlikely(!curr))  
        return;  
  
    /* 
     * 获取从最后一次修改负载后当前进程所占用的运行总时间， 
     * 即计算当前进程的执行时间 
     */  
    delta_exec = (unsigned long)(now - curr->exec_start);  
    if (!delta_exec)  /* 如果本次没有执行过，不用重新更新了 */  
        return;  
    /* 根据当前可运行进程总数对运行时间进行加权计算 */  
    __update_curr(cfs_rq, curr, delta_exec);  
    curr->exec_start = now;  /* 将exec_start属性置为now */  
  
    if (entity_is_task(curr)) {  /* 下面为关于组调度的 */  
        struct task_struct *curtask = task_of(curr);  
  
        trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);  
        cpuacct_charge(curtask, delta_exec);  
        account_group_exec_runtime(curtask, delta_exec);  
    }  
}  

这里delta_exec获得从最后一次修改负载后当前进程所占用的运行总时间，即计算当前进程的执行时间。然后调用__update_curr()更新进程的vruntime。更新前需要计算weighted，这由sched_fair.c:calc_delta_fair()实现，如下：

[cpp] view plaincopy
static inline unsigned long  
calc_delta_fair(unsigned long delta, struct sched_entity *se)  
{  
    if (unlikely(se->load.weight != NICE_0_LOAD))  
        delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);  
  
    return delta;  
}  

在calc_delta_fair中，如果进程的优先级为0,那么就是返回delta，如果不为0,就要调用kernel/sched.c中的calc_delta_mine对delta值进行修正，如下：

[cpp] view plaincopy
#if BITS_PER_LONG == 32  
# define WMULT_CONST    (~0UL)  
#else  
# define WMULT_CONST    (1UL << 32)  
#endif  
  
#define WMULT_SHIFT 32  
  
/* 
 * Shift right and round: 
 */  
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))  
  
/* 
 * delta *= weight / lw 
 */  
static unsigned long  
calc_delta_mine(unsigned long delta_exec, unsigned long weight,  
        struct load_weight *lw)  
{  
    u64 tmp;  
  
    if (!lw->inv_weight) {  
        if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))  
            lw->inv_weight = 1;  
        else  
            lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)  
                / (lw->weight+1);  
    }  
  
    tmp = (u64)delta_exec * weight;  
    /* 
     * Check whether we'd overflow the 64-bit multiplication: 
     */  
    if (unlikely(tmp > WMULT_CONST))  
        tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,  
            WMULT_SHIFT/2);  
    else  
        tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);  
  
    return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);  
}  

    CFS允许每个进程运行一段时间、循环轮转、选择运行最少的进程作为下一个运行进程，而不再采用分配给每个进程时间片的做法了，CFS在所有可运行进程总数基础上计算出一个进程应该运行多久，而不是依靠nice值来计算时间片。nice值在CFS中被作为进程获得的处理器运行比的权重，越高的nice值（越低的优先级）进程获得更低的处理器使用权重，这是相对默认nice值进程的进程而言的；相反，更低的nice值（越高的优先级）的进程获得更高的处理器使用权重。
   这里delta的计算有如下关系: delta=delta* NICE_0_LOAD/se->load。se->load值是怎么来的呢？可以跟踪sys_nice()，就可以发现se->load其实就是表示nice对应的load值，nice越低，值越大。据此就可以得到一个结论，在执行相同时间的条件下(delta相同)，高优先的进程计算出来的delta值会比低优先级的进程计算出来的低。应此高优先的进程就会位于rb_tree的左边，在下次调度的时候就会优先调度。
   回到entity_tick，我们看check_preempt_tick()的实现，它用来检测是否需要重新调度下一个进程。如下：

[cpp] view plaincopy
static void  
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)  
{  
    unsigned long ideal_runtime, delta_exec;  
  
    ideal_runtime = sched_slice(cfs_rq, curr);  
    delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;  
    if (delta_exec > ideal_runtime) {  
        resched_task(rq_of(cfs_rq)->curr);  
        /* 
         * The current task ran long enough, ensure it doesn't get 
         * re-elected due to buddy favours. 
         */  
        clear_buddies(cfs_rq, curr);  
        return;  
    }  
  
    /* 
     * Ensure that a task that missed wakeup preemption by a 
     * narrow margin doesn't have to wait for a full slice. 
     * This also mitigates buddy induced latencies under load. 
     */  
    if (!sched_feat(WAKEUP_PREEMPT))  
        return;  
  
    if (delta_exec < sysctl_sched_min_granularity)  
        return;  
  
    if (cfs_rq->nr_running > 1) { /* 用于组调度 */  
        struct sched_entity *se = __pick_next_entity(cfs_rq);  
        s64 delta = curr->vruntime - se->vruntime;  
  
        if (delta > ideal_runtime)  
            resched_task(rq_of(cfs_rq)->curr);  
    }  
}  

    该函数先获取当前进程的理想运行时间，如果当前执行时间超过理想时间，调用kernel/sched.c:resched_task()设置need_resched标志，完成设置的函数为resched_task()--->set_tsk_need_resched(p)，表示需要重新调度进程。
   从上面分析可以看出，通过调用链sched_tick()--->task_tick_fair()--->entity_tick()--->［update_curr()--->__update_curr()--->calc_delta_fair()--->calc_delta_mine()] 和 [check_preempt_tick()--->resched_task()]，最终会更新调度信息，设置need_resched调度标志。当中断返回时，就会调用schedule()进行抢占式调度。
   2、CFS调度操作
   在sched_fair.c中，CFS实现了用红黑树对运行队列进行管理的相关操作。

（1）进程插入enqueue_task_fair：更新调度信息，调用enqueue_entity()--->__enqueue_entity()将调度实体插入到红黑树中。它会在nr_running递增之前被调用。插入时，会找到右边的空间并进行插入，然后缓存最左边的节点。对于组调度，会对组中的所有进程进行操作。如下：

[cpp] view plaincopy
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)  
{  
    struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;  
    struct rb_node *parent = NULL;  
    struct sched_entity *entry;  
    s64 key = entity_key(cfs_rq, se); /* key为被插入进程的vruntime */  
    int leftmost = 1;  
  
    /* 
     * Find the right place in the rbtree: 
     */  
    while (*link) {  
        parent = *link;  
        entry = rb_entry(parent, struct sched_entity, run_node);  
        /* 
         * We dont care about collisions. Nodes with 
         * the same key stay together. 
         */  
        if (key < entity_key(cfs_rq, entry)) {  
            link = &parent->rb_left;  
        } else {  
            link = &parent->rb_right;  
            leftmost = 0;  
        }  
    }  
  
    /* 
     * Maintain a cache of leftmost tree entries (it is frequently 
     * used): 
     */  
    if (leftmost)  
        cfs_rq->rb_leftmost = &se->run_node;  
  
    rb_link_node(&se->run_node, parent, link);  
    rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);  
}  

可见CFS的运行队列布局是放在红黑树里面的，而这颗红黑树的排序方式是按照运行实体的vruntime来的。vruntime的计算方式在上面已经做了分析。在前面“Linux进程管理”的几节介绍中，我们可以看到fork()在创建子进程时最后就会调用enqueue_task_fair()，将新创建的进程插入到红黑树中。
（2）进程选择pick_next_task_fair：CFS调度算法的核心是选择具有最小vruntine的任务。运行队列采用红黑树方式存放，其中节点的键值便是可运行进程的虚拟运行时间。CFS调度器选取待运行的下一个进程，是所有进程中vruntime最小的那个，他对应的便是在树中最左侧的叶子节点。实现选择的函数为 pick_next_task_fair。如下：

[cpp] view plaincopy
static struct task_struct *pick_next_task_fair(struct rq *rq)  
{  
    struct task_struct *p;  
    struct cfs_rq *cfs_rq = &rq->cfs;  
    struct sched_entity *se;  
  
    if (unlikely(!cfs_rq->nr_running))  
        return NULL;  
  
    do {   /* 此循环为了考虑组调度 */  
        se = pick_next_entity(cfs_rq);  
        set_next_entity(cfs_rq, se);  /* 设置为当前运行进程 */  
        cfs_rq = group_cfs_rq(se);  
    } while (cfs_rq);  
  
    p = task_of(se);  
    hrtick_start_fair(rq, p);  
  
    return p;  
}  

该函数调用pick_next_entity()--->__pick_next_entity()完成获取下一个进程的工作，这个函数如下：

[cpp] view plaincopy
static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)  
{  
    struct rb_node *left = cfs_rq->rb_leftmost;  
  
    if (!left)  
        return NULL;  
  
    return rb_entry(left, struct sched_entity, run_node);  
}  

该函数并不会遍历红黑树来找到最左叶子节点（是所有进程中vruntime最小的那个），因为该值已经缓存在rb_leftmost字段中。它通过rb_entry函数返回这个缓存的节点进程。完成实质工作的调用为include/linux/rbtree.h:rb_entry()--->include/linux/kernel.h:container_of()，这是一个宏定义。
（3）进程删除dequeue_task_fair：从红黑树中删除进程，并更新调度信息。它会在nr_running递减之前被调用。完成实质工作的函数为dequeue_entity()--->__dequeue_entity()。如下：

[cpp] view plaincopy
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)  
{  
    if (cfs_rq->rb_leftmost == &se->run_node) {  
        struct rb_node *next_node;  
  
        next_node = rb_next(&se->run_node);  
        cfs_rq->rb_leftmost = next_node;  
    }  
  
    rb_erase(&se->run_node, &cfs_rq->tasks_timeline);  
}  

该函数会删除当前进程，并从红黑树中选出下一个具体最小vruntime值的节点，作为新的最左边节点缓存起来。

学习

红黑树（平衡二叉树）是自平衡二叉树，由 Rudolf Bayer 发明。它是一种非常有用的树表示法，能以时间复杂度进行插入、搜索和删除等操作。您可以在各种应用程序中发现红黑树，包括关联数组的构建。
任务调度是操作系统设计的一个重要方面，从桌面操作系统调度器到实时调度器和嵌入式操作系统调度器。来自 Martin C. Rinard 操作系统讲座的文章提供了关于处理器调度话题的要点总结。
大O符号是用于描述函数渐近行为的数学符号。此维基百科条目包含了一些重要信息以及函数类的有用列表。
您可以在 “Linux 调度器内幕” （developerWorks，2006 年 6 月）中了解 Linux O(1) 调度器的更多信息。
Con Kolivas 从事新的实验性的 Linux 调度器研究有一段时间了。您可以了解关于其调度器的更多信息，包括 Staircase Process Scheduler 和 Rotating Staircase Deadline Scheduler，这些调度器最终证明公平共享调度是可以实现的。
Avinesh Kumar 对 CFS 及其他新增功能进行了介绍。了解其文章 “使用完全公平调度程序（CFS）进行多任务处理” （developerWorks，2008 年 1 月）。
没有戏剧生活将会怎样？要了解幕后故事，请参阅 CFS 的开发，查看此 Linux 内核邮件列表结合，其中包括 Con Kolivas（他引入了 RSDL 补丁、实现了 CFS 的核心思想）和 Ingo Molnar（调度器代码看门人，在最初拒绝该思想后，他稍后推出了自己的调度器版本）。真相总是介于两者之间，但是来自Kerneltrap 的这篇有趣的文章揭示了开源开发的另一面。

0 0