进程管理之CFS调度器

来源：互联网发布：帝国时代非洲王朝mac 编辑：程序博客网时间：2024/05/16 08:35

/* * All the scheduling class methods: */static const struct sched_class fair_sched_class = {.next= &idle_sched_class, //指向下一个调度器.enqueue_task= enqueue_task_fair, .dequeue_task= dequeue_task_fair,.yield_task= yield_task_fair,.yield_to_task= yield_to_task_fair,.check_preempt_curr= check_preempt_wakeup,.pick_next_task= pick_next_task_fair,.put_prev_task= put_prev_task_fair,#ifdef CONFIG_SMP.select_task_rq= select_task_rq_fair,.rq_online= rq_online_fair,.rq_offline= rq_offline_fair,.task_waking= task_waking_fair,#endif.set_curr_task          = set_curr_task_fair,.task_tick= task_tick_fair,.task_fork= task_fork_fair,.prio_changed= prio_changed_fair,.switched_from= switched_from_fair,.switched_to= switched_to_fair,.get_rr_interval= get_rr_interval_fair,#ifdef CONFIG_FAIR_GROUP_SCHED.task_move_group= task_move_group_fair,#endif};

下面对调度器中一些重要的方法进行解释：

首先是enqueue_task_fair():

/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 *
 * @rq:    runqueue the task is being added to
 * @p:     task to enqueue
 * @flags: ENQUEUE_* flags, passed to enqueue_entity() for the first entity
 */
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;

	for_each_sched_entity(se) {
		/* Already queued at this level: nothing more to enqueue. */
		if (se->on_rq)
			break;
		cfs_rq = cfs_rq_of(se);
		enqueue_entity(cfs_rq, se, flags);
		/* Every entity after the first is enqueued as a wakeup. */
		flags = ENQUEUE_WAKEUP;
	}

	/*
	 * Continue from the entity the first loop stopped at and refresh
	 * the load/share accounting of each remaining cfs_rq.
	 */
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);

		update_cfs_load(cfs_rq, 0);
		update_cfs_shares(cfs_rq);
	}

	hrtick_update(rq);
}

将给定进程连入rq队列的函数中，关键的操作就是enqueue_entity():

/*
 * Put one scheduling entity onto its cfs_rq.
 *
 * @cfs_rq: queue the entity is added to
 * @se:     entity to enqueue
 * @flags:  ENQUEUE_* flags; ENQUEUE_WAKEUP selects the sleeper path below
 */
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	/*
	 * Update the normalized vruntime before updating min_vruntime
	 * through calling update_curr().
	 *
	 * The entity's vruntime is relative at this point; adding
	 * cfs_rq->min_vruntime makes it absolute again.
	 */
	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
		se->vruntime += cfs_rq->min_vruntime;

	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);
	update_cfs_load(cfs_rq, 0);		/* group-scheduling accounting */
	account_entity_enqueue(cfs_rq, se);	/* presumably bumps nr_running/load -- confirm in helper */
	update_cfs_shares(cfs_rq);		/* group-scheduling accounting */

	if (flags & ENQUEUE_WAKEUP) {
		/* Pick the vruntime a woken entity should resume with. */
		place_entity(cfs_rq, se, 0);
		enqueue_sleeper(cfs_rq, se);
	}

	update_stats_enqueue(cfs_rq, se);
	check_spread(cfs_rq, se);
	/* The running entity is not kept in the rbtree. */
	if (se != cfs_rq->curr)
		__enqueue_entity(cfs_rq, se);
	se->on_rq = 1;

	if (cfs_rq->nr_running == 1)
		list_add_leaf_cfs_rq(cfs_rq);
}

首先来看一下update_curr函数，它用来更新任务的物理时间和虚拟时间，还有min_vruntime。

static void update_curr(struct cfs_rq *cfs_rq){struct sched_entity *curr = cfs_rq->curr;u64 now = rq_of(cfs_rq)->clock_task; //获取当前时间unsigned long delta_exec;if (unlikely(!curr))return;/* * Get the amount of time the current task was running * since the last time we changed load (this cannot * overflow on 32 bits): */delta_exec = (unsigned long)(now - curr->exec_start); //计算当前时间和上一次更新负荷统计量时的时间差if (!delta_exec) //如果为0，则不做任何事情return;__update_curr(cfs_rq, curr, delta_exec);curr->exec_start = now; //重新设置开始执行的时间if (entity_is_task(curr)) {struct task_struct *curtask = task_of(curr);trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);cpuacct_charge(curtask, delta_exec);account_group_exec_runtime(curtask, delta_exec);}}

这个函数的核心操作当然是__update_curr()：

/* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. */static inline void__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,      unsigned long delta_exec){unsigned long delta_exec_weighted;schedstat_set(curr->statistics.exec_max,      max((u64)delta_exec, curr->statistics.exec_max));curr->sum_exec_runtime += delta_exec;schedstat_add(cfs_rq, exec_clock, delta_exec);delta_exec_weighted = calc_delta_fair(delta_exec, curr); curr->vruntime += delta_exec_weighted;update_min_vruntime(cfs_rq);#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHEDcfs_rq->load_unacc_exec_time += delta_exec;#endif}

这个函数的重点是calc_delta_fair()，其实现比较复杂，但是完成的主要工作是delta_exec_weighted = delta_exec * (NICE_0_LOAD / curr->load.weight)，详细的解释可以参看《深入Linux内核架构》和《独辟蹊径》。
最后，还要更新min_vruntime。

继续看调度器中的方法，与enqueue_task_fair对应的就是dequeue_task_fair():

/*
 * The dequeue_task method is called before nr_running is
 * decreased. We remove the task from the rbtree and
 * update the fair scheduling stats:
 *
 * @rq:    runqueue the task is leaving
 * @p:     task to dequeue
 * @flags: DEQUEUE_* flags, passed on to dequeue_entity()
 */
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq;
	int task_sleep = flags & DEQUEUE_SLEEP;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		dequeue_entity(cfs_rq, se, flags);

		/* Queue is now empty: keep going and dequeue the parent too. */
		if (!cfs_rq->load.weight) {
			flags |= DEQUEUE_SLEEP;
			continue;
		}

		/* Don't dequeue parent if it has other entities besides us */

		/*
		 * Bias pick_next to pick a task from this cfs_rq, as
		 * p is sleeping when it is within its sched_slice.
		 */
		if (task_sleep && parent_entity(se))
			set_next_buddy(parent_entity(se));
		break;
	}

	/* Refresh load/share accounting from where the walk above stopped. */
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);

		update_cfs_load(cfs_rq, 0);
		update_cfs_shares(cfs_rq);
	}

	hrtick_update(rq);
}

看完了enqueue_task_fair的代码，这段代码就应该比较好理解了，这里就只介绍其核心操作：

static voiddequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags){/* * Update run-time statistics of the 'current'. */update_curr(cfs_rq);update_stats_dequeue(cfs_rq, se);if (flags & DEQUEUE_SLEEP) {#ifdef CONFIG_SCHEDSTATSif (entity_is_task(se)) {struct task_struct *tsk = task_of(se);if (tsk->state & TASK_INTERRUPTIBLE)se->statistics.sleep_start = rq_of(cfs_rq)->clock;if (tsk->state & TASK_UNINTERRUPTIBLE)se->statistics.block_start = rq_of(cfs_rq)->clock;}#endif}clear_buddies(cfs_rq, se);if (se != cfs_rq->curr)__dequeue_entity(cfs_rq, se);se->on_rq = 0;update_cfs_load(cfs_rq, 0);account_entity_dequeue(cfs_rq, se);/* * Normalize the entity after updating the min_vruntime because the * update can refer to the ->curr item and we need to reflect this * movement in our normalized position. */if (!(flags & DEQUEUE_SLEEP))se->vruntime -= cfs_rq->min_vruntime;update_min_vruntime(cfs_rq);update_cfs_shares(cfs_rq);}

这段代码中，我们也只关注两个操作：
其一，clear_buddies();
/*
 * Drop @se from the buddy hints of @cfs_rq: for each of the last, next
 * and skip pointers that currently refers to @se, call the matching
 * __clear_buddies_*() helper. Pointers that refer to other entities are
 * left alone.
 */
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->last == se) {
		__clear_buddies_last(se);
	}

	if (cfs_rq->next == se) {
		__clear_buddies_next(se);
	}

	if (cfs_rq->skip == se) {
		__clear_buddies_skip(se);
	}
}
这是在cfs_rq队列的last、next、skip字段指向se时将其清空，因为它们会影响schedule中挑选下一个进程的操作。
其二，__dequeue_entity
/*
 * Erase @se from the rbtree of @cfs_rq. If @se was the cached leftmost
 * node, the cache is first moved to its in-order successor.
 */
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node *node = &se->run_node;

	if (cfs_rq->rb_leftmost == node)
		cfs_rq->rb_leftmost = rb_next(node);

	rb_erase(node, &cfs_rq->tasks_timeline);
}

这段代码的作用很明显，不过是将se从rb树上摘除，并且选取下一个leftmost。

yield_task_fair的作用与下文要讲到的yield_to_task_fair的作用类似，我们挑其中一个讲就可以了。

接着来看调度器中的方法:yield_to_task_fair，curr放弃CPU占用权，并将占用权转让给p进程：

/*
 * Yield the CPU in favour of task @p.
 *
 * @rq:      runqueue of the yielding task
 * @p:       task that should run next
 * @preempt: not used by this implementation
 *
 * Returns false when @p is not queued, true otherwise.
 */
static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
{
	struct sched_entity *se = &p->se;

	if (!se->on_rq)
		return false;

	/* Tell the scheduler that we'd really like pse to run next. */
	set_next_buddy(se);

	/* Yield the current task; see yield_task_fair() for the details. */
	yield_task_fair(rq);

	return true;
}

static void set_next_buddy(struct sched_entity *se){if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))return;for_each_sched_entity(se)cfs_rq_of(se)->next = se;}

/*
 * sched_yield() is very simple
 *
 * The magic of dealing with the ->skip buddy is in pick_next_entity.
 *
 * Clears the current task's buddy marks, brings its runtime statistics up
 * to date (except for SCHED_BATCH tasks) and marks it as the skip buddy.
 */
static void yield_task_fair(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	struct sched_entity *se = &curr->se;
	struct cfs_rq *cfs_rq = task_cfs_rq(curr);

	/*
	 * Are we the only task in the tree?
	 */
	if (unlikely(rq->nr_running == 1)) {
		return;
	}

	clear_buddies(cfs_rq, se);

	if (curr->policy != SCHED_BATCH) {
		update_rq_clock(rq);
		/*
		 * Update run-time statistics of the 'current'.
		 */
		update_curr(cfs_rq);
	}

	set_skip_buddy(se);
}

回到调度器中，我们来看一下check_preempt_wakeup方法，这个方法的作用，在《深入Linux内核架构》一书中有详细的解释。

/* * Preempt the current task with a newly woken task if needed: */static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags){struct task_struct *curr = rq->curr;struct sched_entity *se = &curr->se, *pse = &p->se;struct cfs_rq *cfs_rq = task_cfs_rq(curr);int scale = cfs_rq->nr_running >= sched_nr_latency;int next_buddy_marked = 0;if (unlikely(se == pse))return;if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { //set_next_buddy(pse);next_buddy_marked = 1;}/* * We can come here with TIF_NEED_RESCHED already set from new task * wake up path. */if (test_tsk_need_resched(curr)) //如果当前进程是可以被抢占的，则不用设置，直接返回就行了。return;/* Idle tasks are by definition preempted by non-idle tasks. */if (unlikely(curr->policy == SCHED_IDLE) &&    likely(p->policy != SCHED_IDLE)) //如果当前进程的policy是SCHED_IDLE而p的policy不是SCHED_IDLE，很明显是可以抢占的。goto preempt;/* * Batch and idle tasks do not preempt non-idle tasks (their preemption * is driven by the tick): */if (unlikely(p->policy != SCHED_NORMAL)) //如果p的policy不是NORMAL，同时也不是IDLE，那么一定是RT，则是不能抢占的。return;if (!sched_feat(WAKEUP_PREEMPT))return;update_curr(cfs_rq); //更新实际时间和虚拟时间find_matching_se(&se, &pse); //与组调度相关BUG_ON(!pse);if (wakeup_preempt_entity(se, pse) == 1) { //请参看博客http://blog.csdn.net/sunnybeike/article/details/6918586/*  * Bias pick_next to pick the sched entity that is * triggering this preemption. */if (!next_buddy_marked)set_next_buddy(pse); //将cfs_rq队列中的next字段指向pse，以最快速地调度到pse。goto preempt;}return;preempt:resched_task(curr); //将curr设置为可抢占的。/* * Only set the backward buddy when the current task is still * on the rq. This can happen when a wakeup gets interleaved * with schedule on the ->pre_schedule() or idle_balance() * point, either of which can * drop the rq lock. * * Also, during early boot the idle thread is in the fair class, * for obvious reasons its a bad idea to schedule back to it. 
*/if (unlikely(!se->on_rq || curr == rq->idle))return;if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))set_last_buddy(se); //将cfs_rq中的last字段指向curr的se，则再pse完成后会优先选择curr。}

接着看调度器中的方法，pick_next_task_fair方法在博客http://blog.csdn.net/sunnybeike/article/details/6918586中已经详细的解释了，不再赘述，可以参考博文。

下面一个是put_prev_task_fair，这个函数的作用在《深入Linux内核架构》中有详细的解释：

put_prev_task first announces to the scheduler class that the currently running task is going to be replaced by another one. Note that this is not equivalent to taking the task off the run queue, but provides the opportunity to perform some accounting and bring statistics up to date.

/*
 * Account for a descheduled task:
 *
 * Calls put_prev_entity() for @prev's entity and for every further entity
 * that for_each_sched_entity() visits, each on its own cfs_rq.
 */
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{
	struct sched_entity *se = &prev->se;

	for_each_sched_entity(se) {
		put_prev_entity(cfs_rq_of(se), se);
	}
}

这个函数的最后一个操作是最关键的：

static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev){/* * If still on the runqueue then deactivate_task() * was not called and update_curr() has to be done: */if (prev->on_rq)update_curr(cfs_rq);check_spread(cfs_rq, prev);if (prev->on_rq) {update_stats_wait_start(cfs_rq, prev);/* Put 'current' back into the tree. */__enqueue_entity(cfs_rq, prev); //将previous进程放回到cfs_rq队列中。} cfs_rq->curr = NULL; //将cfs_rq队列中的curr置空。}

我们并不关心SMP相关的操作，因此我们忽略调度器中的如下操作，以后有机会再来关心：

#ifdef CONFIG_SMP.select_task_rq= select_task_rq_fair,.rq_online= rq_online_fair,.rq_offline= rq_offline_fair,.task_waking= task_waking_fair,#endif

接下来的方法是set_curr_task_fair，函数的注释已经给出了详细的解释：它主要在一个进程在groups/classes之间迁移时被调用，用来设置cfs_rq->curr字段。代码比较简单，就不详述了。

/* Account for a task changing its policy or group. * * This routine is mostly called to set cfs_rq->curr field when a task * migrates between groups/classes. */static void set_curr_task_fair(struct rq *rq){struct sched_entity *se = &rq->curr->se;for_each_sched_entity(se)set_next_entity(cfs_rq_of(se), se);}

来看调度器中的下面一个方法task_tick_fair，这个方法主要用来处理周期调度（“强制”调度），由时钟中断调用。

《深入Linux内核架构》对该函数有比较详细的解释，可以参看。

/*
 * scheduler tick hitting a task of our scheduling class:
 *
 * @rq:     runqueue the tick fired on (not used directly here)
 * @curr:   task that was running when the tick fired
 * @queued: forwarded unchanged to entity_tick()
 *
 * Runs entity_tick() for @curr's entity and for every further entity
 * that for_each_sched_entity() visits.
 */
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
	struct sched_entity *se = &curr->se;

	for_each_sched_entity(se) {
		entity_tick(cfs_rq_of(se), se, queued);
	}
}

这个方法主要是由entity_tick()操作完成的，我们重点来看这个操作：

static voidentity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued){/* * Update run-time statistics of the 'current'. */update_curr(cfs_rq); //更新虚拟时钟和实际时钟/* * Update share accounting for long-running entities. */update_entity_shares_tick(cfs_rq);#ifdef CONFIG_SCHED_HRTICK/* * queued ticks are scheduled to match the slice, so don't bother * validating it and just reschedule. */if (queued) {resched_task(rq_of(cfs_rq)->curr);return;}/* * don't let the period tick interfere with the hrtick preemption */if (!sched_feat(DOUBLE_TICK) &&hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))return;#endifif (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) //如果cfs_rq中的进程数量大于1check_preempt_tick(cfs_rq, curr); }

该函数的主要功能是由最后一个函数调用check_preempt_tick()完成的：

/* * Preempt the current task with a newly woken task if needed: */static voidcheck_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr){unsigned long ideal_runtime, delta_exec;ideal_runtime = sched_slice(cfs_rq, curr); //分配给curr的时间delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; //curr运行的实际时间 if (delta_exec > ideal_runtime) { //如果实际运行的时间比分配给它的时间大resched_task(rq_of(cfs_rq)->curr); //将curr设置为可以调度的/* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. */clear_buddies(cfs_rq, curr); //将curr从last、next、skip中清除掉，防止接下来的调度再次调度到curr进程。return;}/* * Ensure that a task that missed wakeup preemption by a * narrow margin doesn't have to wait for a full slice. * This also mitigates buddy induced latencies under load. */if (!sched_feat(WAKEUP_PREEMPT))return;if (delta_exec < sysctl_sched_min_granularity) //如果delta_exec小于粒度，那么就不管。return;if (cfs_rq->nr_running > 1) {struct sched_entity *se = __pick_first_entity(cfs_rq); //挑选一个进程以供调度。s64 delta = curr->vruntime - se->vruntime;if (delta < 0)  //如果curr的vruntime小于vruntime，那么就不用调度了。return;if (delta > ideal_runtime) //如果curr和se的vruntime之差足够大，大于实际分配给curr的时间，说明se是一个相当轻的进程，那么最好先把它执行掉。？？？要好好理解注释中的解释。resched_task(rq_of(cfs_rq)->curr);}}

回到调度器中，看下一个方法task_fork_fair，将一个新建立的进程加入到队列中：

/* * called on fork with the child task as argument from the parent's context *  - child not yet on the tasklist *  - preemption disabled */static void task_fork_fair(struct task_struct *p){struct cfs_rq *cfs_rq = task_cfs_rq(current);struct sched_entity *se = &p->se, *curr = cfs_rq->curr;int this_cpu = smp_processor_id();struct rq *rq = this_rq();unsigned long flags;raw_spin_lock_irqsave(&rq->lock, flags);update_rq_clock(rq);if (unlikely(task_cpu(p) != this_cpu)) {rcu_read_lock();__set_task_cpu(p, this_cpu);  //将p放在其父进程执行的CPU上，因为只有这样，父子进程才能在同一个rq队列中，才能进行操作。rcu_read_unlock();}update_curr(cfs_rq); //更新实际时钟和虚拟时钟。if (curr)se->vruntime = curr->vruntime; //先将子进程的vruntime设置为父进程的vruntimeplace_entity(cfs_rq, se, 1); //调整子进程的vruntime，在这一步中设置的子进程的vruntime总是不小于父进程的vruntime的，如果下文没有变动的话，从是父进程先得到调度。if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { //如果新创建的子进程需要被先执行并且curr的vruntime<se的vruntime/* * Upon rescheduling, sched_class::put_prev_task() will place * 'current' within the tree based on its new key value. */swap(curr->vruntime, se->vruntime); //将父子进程的vruntime交换。resched_task(rq->curr);  //将curr设置为可调度的。}se->vruntime -= cfs_rq->min_vruntime; //因为在place_entity中设置的vruntime多加了cfs_rq->min_vruntime，因此这里要恢复成正确的vruntimeraw_spin_unlock_irqrestore(&rq->lock, flags);}

调度器中的下一个方法是prio_changed_fair，源码中已经有了很清楚的解释，我们就不再赘述了。不过要说明的是，这方法属于“强制”调度的范畴。

/*
 * Priority of the task has changed. Check to see if we preempt
 * the current task.
 *
 * @rq:      runqueue @p belongs to
 * @p:       task whose priority changed
 * @oldprio: priority value before the change
 */
static void
prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
{
	if (!p->se.on_rq)
		return;

	/*
	 * Reschedule if we are currently running on this runqueue and
	 * our priority decreased, or if we are not currently running on
	 * this runqueue and our priority is higher than the current's
	 */
	if (rq->curr == p) {
		/* A numerically larger prio value is a lower priority. */
		if (p->prio > oldprio)
			resched_task(rq->curr);
	} else
		check_preempt_curr(rq, p, 0);
}

接着来看调度器中的方法switched_from_fair，这个方法的作用直接参看源码中给出的注释就行了。不过这个方法是3.04内核新增的，从方法的名字理解，这个函数主要是在从CFS算法转向其它算法的时候使用的，与其相应的是调度器中的下面一个方法，switched_to_fair，应该是从其它算法转向CFS算法时被执行的。

static void switched_from_fair(struct rq *rq, struct task_struct *p){struct sched_entity *se = &p->se;struct cfs_rq *cfs_rq = cfs_rq_of(se);/* * Ensure the task's vruntime is normalized, so that when its * switched back to the fair class the enqueue_entity(.flags=0) will * do the right thing. * * If it was on_rq, then the dequeue_entity(.flags=0) will already * have normalized the vruntime, if it was !on_rq, then only when * the task is sleeping will it still have non-normalized vruntime. */if (!se->on_rq && p->state != TASK_RUNNING) {/* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. */place_entity(cfs_rq, se, 0);se->vruntime -= cfs_rq->min_vruntime;}}

/*
 * We switched to the sched_fair class.
 *
 * @rq: runqueue @p belongs to
 * @p:  task that has just joined the fair class
 */
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
	if (!p->se.on_rq)
		return;

	/*
	 * We were most likely switched from sched_rt, so
	 * kick off the schedule if running, otherwise just see
	 * if we can still preempt the current task.
	 */
	if (rq->curr == p)
		resched_task(rq->curr);
	else
		check_preempt_curr(rq, p, 0);
}

我们来看switched_to_fair的最后一个操作check_preempt_curr：

static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags){const struct sched_class *class;if (p->sched_class == rq->curr->sched_class) { //如果rq和p同属一个调度类，那么直接用调用该调度类进行判断就行了。rq->curr->sched_class->check_preempt_curr(rq, p, flags);} else {//否则for_each_class(class) { if (class == rq->curr->sched_class) //这是个很巧妙的实现。for_each_class扫描的顺序是stop_task、rt_task、normal_task、idle_taskbreak;                      //如果这个条件判断为真，那么说名curr的优先级一定大于p的，那么不能抢占。if (class == p->sched_class) {resched_task(rq->curr);     //将curr设置为可以抢占的break;}}}/* * A queue event has occurred, and we're going to schedule.  In * this case, we can save a useless back to back clock update. */if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))rq->skip_clock_update = 1;}

来看最后一个需要关注的调度器方法get_rr_interval_fair，这个方法的功能还是很明了的，就是获得task的rr(Round Robin)时间片长度，但是这个方法是不经常被用到的，因为在CFS中，基本上已经抛弃了rr算法。

/*
 * Report the round-robin interval of @task.
 *
 * Returns the task's sched_slice() on rq->cfs converted with
 * NS_TO_JIFFIES(), or 0 when rq->cfs carries no load.
 */
static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
{
	struct sched_entity *se = &task->se;

	/*
	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
	 * idle runqueue:
	 */
	if (!rq->cfs.load.weight)
		return 0;

	return NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
}

至此，我们已经将CFS调度器解释完毕！