Linux进程调度之CFS算法分析

来源：互联网发布：mp288墨盒清零软件编辑：程序博客网时间：2024/05/02 03:44

最新的Linux内核使用CFS调度算法。

CFS概念：CFS算法允许每个进程运行一段时间、循环轮转、选择运行最少的进程作为下一个运行进程。而不再使用分配时间片的做法。

nice值在CFS中作为进程获得处理器运行比的权重，更低的nice值的进程获得更高的处理器使用权重。

每个进程都按其权重在全部可运行进程中所占比例的“时间片”来运行。CFS为每个进程的时间片设置的最小底线是1ms（默认）。这样即便可运行的进程数趋于无限，每个进程至少运行1ms，确保切换消耗被控制在一定范围内。

任何进程所获得的处理器时间是由它自己和其他所有可运行进程nice值（-20~+19）的相对差值决定的。例如：对于同样的目标延迟20ms，两个进程的nice值分别是0和5，与两个进程的nice值分别是10和15，它们分别获得的处理时间都是15ms和5ms。

调度的实现：

1、时间记账

vruntime变量存放进程的虚拟运行时间，由update_curr()函数负责更新。update_curr()由系统定时器周期性调用；此外，无论进程是进入可运行状态，还是被阻塞而变为不可运行状态，它也都会被调用。因此vruntime可以准确地度量给定进程的运行时间。

2、进程选择

2.1、CFS利用一个简单的法则寻找下一个运行的进程：选择具有最小vruntime值的进程。Linux使用rbtree来维护进程，具有最小vruntime的进程在rbtree的最左边叶子节点。

/*
 * Choose the entity that should run next on @cfs_rq.
 *
 * The starting candidate is whatever __pick_first_entity() returns. The
 * skip, last and next buddy hints may then override it, in that order, each
 * only when wakeup_preempt_entity() against the first entity yields a value
 * below 1. Buddy hints are cleared for the final choice before returning.
 */
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
        struct sched_entity *first = __pick_first_entity(cfs_rq);
        struct sched_entity *chosen = first;

        /*
         * The first entity is the skip buddy: fall back to the entity
         * after it, provided that is not too unfair.
         */
        if (cfs_rq->skip == first) {
                struct sched_entity *successor = __pick_next_entity(first);

                if (successor && wakeup_preempt_entity(successor, first) < 1)
                        chosen = successor;
        }

        /*
         * Last buddy: try to hand the CPU back to a preempted task.
         */
        if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, first) < 1)
                chosen = cfs_rq->last;

        /*
         * Next buddy: someone explicitly wants this one to run, so it
         * overrides the earlier choices when it is not unfair.
         */
        if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, first) < 1)
                chosen = cfs_rq->next;

        clear_buddies(cfs_rq, chosen);

        return chosen;
}

调用__pick_first_entity()找到最左结点：

/*
 * Return the scheduling entity that owns the node cached in
 * cfs_rq->rb_leftmost, or NULL when no node is cached. No tree walk is
 * performed here.
 */
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
        struct rb_node *node = cfs_rq->rb_leftmost;

        return node ? rb_entry(node, struct sched_entity, run_node) : NULL;
}

该函数本身并不会遍历rbtree去寻找最左叶子节点，因为该节点已经缓存在rb_leftmost中。

2.2、现在来看CFS将进程加入rbtree中：这一切发生在进程变为可运行状态（被唤醒），或者通过fork()第一次创建进程时：

/*
 * Account @se as queued on @cfs_rq and, unless it is the entity currently
 * running there, insert it into the rbtree via __enqueue_entity().
 *
 * @flags: ENQUEUE_* bits. ENQUEUE_WAKEUP and ENQUEUE_WAKING decide whether
 *         the entity's vruntime is re-based on min_vruntime, and
 *         ENQUEUE_WAKEUP additionally triggers place_entity() and
 *         enqueue_sleeper().
 *
 * Returns nothing; the return type is spelled out because implicit int is
 * not valid C since C99.
 */
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
        /*
         * Update the normalized vruntime before updating min_vruntime
         * through calling update_curr().
         */
        if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
                se->vruntime += cfs_rq->min_vruntime;

        /*
         * Update run-time statistics of the 'current'.
         */
        update_curr(cfs_rq);
        enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
        account_entity_enqueue(cfs_rq, se);
        update_cfs_shares(cfs_rq);

        if (flags & ENQUEUE_WAKEUP) {
                place_entity(cfs_rq, se, 0);
                enqueue_sleeper(cfs_rq, se);
        }

        update_stats_enqueue(cfs_rq, se);
        check_spread(cfs_rq, se);
        /*
         * cfs_rq->curr is not inserted into the tree here; every other
         * entity gets the real insertion done by __enqueue_entity().
         */
        if (se != cfs_rq->curr)
                __enqueue_entity(cfs_rq, se);
        se->on_rq = 1;

        if (cfs_rq->nr_running == 1) {
                list_add_leaf_cfs_rq(cfs_rq);
                check_enqueue_throttle(cfs_rq);
        }
}

具体的插入rbtree的工作由 __enqueue_entity()实现：

/*
 * Insert @se into the rbtree rooted at cfs_rq->tasks_timeline.
 *
 * The tree is walked from the root using entity_before() to choose the left
 * or right child at each step. If the walk never goes right, @se becomes the
 * new cached leftmost node. The node is then linked at the position found
 * and rb_insert_color() is called to complete the insertion; linking alone
 * does not recolour or rebalance the tree.
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
        struct rb_node *parent = NULL;
        struct sched_entity *entry;
        int leftmost = 1;

        /*
         * Find the right place in the rbtree:
         */
        while (*link) {
                parent = *link;
                entry = rb_entry(parent, struct sched_entity, run_node);
                /*
                 * We dont care about collisions. Nodes with
                 * the same key stay together.
                 */
                if (entity_before(se, entry)) {
                        link = &parent->rb_left;
                } else {
                        link = &parent->rb_right;
                        /*
                         * Having taken a right branch once, the new
                         * entity cannot end up as the leftmost node.
                         */
                        leftmost = 0;
                }
        }

        /*
         * Maintain a cache of leftmost tree entries (it is frequently
         * used):
         */
        if (leftmost)
                cfs_rq->rb_leftmost = &se->run_node;

        rb_link_node(&se->run_node, parent, link);
        /* Recolour/rebalance after linking so the rbtree stays valid. */
        rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}

2.3、CFS从rbtree中删除一个进程：删除动作发生在阻塞(不可运行态)或者进程终结时。

/*
 * Take @se off @cfs_rq: update statistics, drop any buddy hints pointing at
 * it, remove it from the rbtree via __dequeue_entity() unless it is the
 * currently running entity, and update the queue's accounting.
 *
 * @flags: DEQUEUE_* bits. With DEQUEUE_SLEEP set the sleep/block start time
 *         is recorded (CONFIG_SCHEDSTATS only) and the vruntime is left
 *         as-is; otherwise min_vruntime is subtracted from it.
 *
 * Returns nothing; the return type is spelled out because implicit int is
 * not valid C since C99.
 */
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
        /*
         * Update run-time statistics of the 'current'.
         */
        update_curr(cfs_rq);
        dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);

        update_stats_dequeue(cfs_rq, se);
        if (flags & DEQUEUE_SLEEP) {
#ifdef CONFIG_SCHEDSTATS
                if (entity_is_task(se)) {
                        struct task_struct *tsk = task_of(se);

                        if (tsk->state & TASK_INTERRUPTIBLE)
                                se->statistics.sleep_start = rq_of(cfs_rq)->clock;
                        if (tsk->state & TASK_UNINTERRUPTIBLE)
                                se->statistics.block_start = rq_of(cfs_rq)->clock;
                }
#endif
        }

        clear_buddies(cfs_rq, se);

        /* The actual tree removal is done by __dequeue_entity(). */
        if (se != cfs_rq->curr)
                __dequeue_entity(cfs_rq, se);
        se->on_rq = 0;
        account_entity_dequeue(cfs_rq, se);

        /*
         * Normalize the entity after updating the min_vruntime because the
         * update can refer to the ->curr item and we need to reflect this
         * movement in our normalized position.
         */
        if (!(flags & DEQUEUE_SLEEP))
                se->vruntime -= cfs_rq->min_vruntime;

        /* return excess runtime on last dequeue */
        return_cfs_rq_runtime(cfs_rq);

        update_min_vruntime(cfs_rq);
        update_cfs_shares(cfs_rq);
}

__dequeue_entity()的实现：

/*
 * Remove @se's node from the rbtree rooted at cfs_rq->tasks_timeline.
 *
 * When the node being removed is the cached leftmost one, the cache is first
 * moved to whatever rb_next() returns for it, so that cfs_rq->rb_leftmost
 * never points at an erased node.
 */
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        struct rb_node *victim = &se->run_node;

        /* Removing the leftmost node: rb_next() supplies the new leftmost. */
        if (victim == cfs_rq->rb_leftmost)
                cfs_rq->rb_leftmost = rb_next(victim);

        /* rb_erase() performs the actual removal. */
        rb_erase(victim, &cfs_rq->tasks_timeline);
}

3、调度器入口

进程调度的主要入口点是函数schedule()，在较早的内核中定义在文件kernel/sched.c中（自3.3版本起位于kernel/sched/core.c）。它选择哪个进程可以运行，何时将其投入运行。

注：每个CPU都有一个struct rq，用来记录该CPU上进程调度的信息，rq->nr_running保存了该CPU就绪队列上可运行task的数量。

/*
 * Core of the scheduler: pick the next task for this CPU's runqueue and
 * switch to it.
 *
 * With preemption disabled and rq->lock held, the function decides whether
 * the previous task must leave the runqueue, asks pick_next_task() for a
 * successor and, if that differs from the previous task, calls
 * context_switch(). The whole sequence is repeated for as long as
 * need_resched() reports true.
 */
static void __sched __schedule(void)
{
        struct task_struct *prev, *next;
        unsigned long *switch_count;
        struct rq *rq;
        int cpu;

need_resched:
        preempt_disable(); // disable preemption
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
        rcu_note_context_switch(cpu);
        prev = rq->curr;    // the currently running task becomes prev

        schedule_debug(prev);

        if (sched_feat(HRTICK))
                hrtick_clear(rq);

        raw_spin_lock_irq(&rq->lock);

        switch_count = &prev->nivcsw;     // start with prev's nivcsw counter; replaced by nvcsw below when prev->state is non-zero and PREEMPT_ACTIVE is not set
        if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
                if (unlikely(signal_pending_state(prev->state, prev))) {
                        prev->state = TASK_RUNNING;    // signal_pending_state() reported a pending signal: prev is not removed from the runqueue and is set back to TASK_RUNNING
                } else {
                        deactivate_task(rq, prev, DEQUEUE_SLEEP);   // remove prev from the runqueue
                        prev->on_rq = 0;

                        /*
                         * If a worker went to sleep, notify and ask workqueue
                         * whether it wants to wake up a task to maintain
                         * concurrency.
                         */
                        if (prev->flags & PF_WQ_WORKER) {     // prev is a workqueue worker: if wq_worker_sleeping() hands back a task, wake it up on the local CPU
                                struct task_struct *to_wakeup;

                                to_wakeup = wq_worker_sleeping(prev, cpu);
                                if (to_wakeup)
                                        try_to_wake_up_local(to_wakeup);
                        }
                }
                switch_count = &prev->nvcsw;
        }

        pre_schedule(rq, prev);

        if (unlikely(!rq->nr_running))     // no runnable task left on this runqueue: call idle_balance() (per the original note, it tries to move runnable tasks over from other CPUs)
                idle_balance(cpu, rq);

        put_prev_task(rq, prev);    // tell the scheduling class that prev is about to be replaced by another task
        next = pick_next_task(rq);    // select the next task to run
        clear_tsk_need_resched(prev);     // clear the TIF_NEED_RESCHED reschedule flag
        rq->skip_clock_update = 0;

        if (likely(prev != next)) {
                rq->nr_switches++;
                rq->curr = next;
                ++*switch_count;

                context_switch(rq, prev, next); /* unlocks the rq */     // perform the context switch
                /*
                 * The context switch have flipped the stack from under us
                 * and restored the local variables which were saved when
                 * this task called schedule() in the past. prev == current
                 * is still correct, but it can be moved to another cpu/rq.
                 */
                cpu = smp_processor_id();
                rq = cpu_rq(cpu);
        } else
                raw_spin_unlock_irq(&rq->lock);

        post_schedule(rq);

        sched_preempt_enable_no_resched();
        if (need_resched())
                goto need_resched;

}

每一个调度类都实现了pick_next_task()函数，它返回下一个可以运行的进程的指针。

/*
 * Ask the scheduling classes for the next task to run on @rq.
 *
 * When rq->nr_running equals rq->cfs.h_nr_running, the fair class is asked
 * directly. Otherwise, or if the fair class returns NULL, every class
 * visited by for_each_class() is asked in turn and the first non-NULL task
 * wins.
 *
 * Returns the selected task. The return type is spelled out because the
 * body returns a struct task_struct pointer and implicit int is not valid C
 * since C99.
 */
static inline struct task_struct *
pick_next_task(struct rq *rq)
{
        const struct sched_class *class;
        struct task_struct *p;

        /*
         * Optimization: we know that if all tasks are in
         * the fair class we can call that function directly:
         */
        if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
                p = fair_sched_class.pick_next_task(rq);
                if (likely(p))
                        return p;
        }

        for_each_class(class) {
                p = class->pick_next_task(rq);
                if (p)
                        return p;
        }

        BUG(); /* the idle class will always have a runnable task */
}

4、睡眠和唤醒

进程睡眠的原因：等待某一事件（如文件I/O、等待信号量等等）

内核操作：1、进程把自己标记为睡眠。2、从可执行红黑树移出，放入等待队列。然后调用schedule（）选择和执行一个其他进程。

睡眠有两种状态：TASK_INTERRUPTIBLE和TASK_UNINTERRUPTIBLE，区别在于是否响应信号。

唤醒操作由函数wake_up()进行，它会唤醒指定等待队列上的所有进程，调用函数try_to_wake_up()，该函数负责将进程设置为TASK_RUNNING状态，调用enqueue_task()将此进程放入rbtree中。如果被唤醒的进程比正在执行的进程优先级高，还要设置need_resched标志。

0 0