linux cfs调度器

来源:互联网 发布:网络上贷款需要些什么 编辑:程序博客网 时间:2024/05/01 21:13
CFS(completely fair scheduler)完全公平调度器,对应应用设置的调度策略为SCHED_NORMAL/SCHED_BATCH。
这种调度策略区别于实时调度,进程优先级低于实时调度进程,用nice值来表示进程的重要程度,nice值的范围是-20~19,
转换成优先级为100~139。


应用程序有多种类型,cpu 消耗型,交互型,nice值一般也不同。cfs调度器如何保证完全公平呢?
cfs是将进程所有影响调度的因素都转换为对vruntime的处理,从而近似公平。


说下对CFS “完全公平” 的理解:
①不再区分进程类型,所有进程公平对待
②对I/O消耗型进程,仍然会提供快速响应(对睡眠进程做时间补偿)
③优先级高的进程,获得CPU时间更多(vruntime增长的更慢)


可见CFS的完全公平,并不是说所有进程绝对的平等,占用CPU时间完全相同,而是体现在vruntime数值上,所有进程都用虚拟时间来度量,总是让vruntime最小的进程抢占,这样看起来是完全公平的,但实际上vruntime的更新、增长速度,不同进程是不尽一样的。CFS利用这么一个简单的vruntime机制,实现了以往需要相当复杂算法才能实现的进程调度需求。


vruntime在每个时钟中断、task创建、task加入运行队列、task移出运行队列时都会被更新,
接下来看下vruntime更新的相关代码:


每个tick中断时会调用tick_handle_periodic,tick_periodic里面只有在某个特定cpu上执行时才会更新全局变量jiffies,

其他cpu上都只会更新自己rq上的进程信息。


//只有一个CPU是负责更新jffies的,其他的CPU只会更新当前自己的进程static void tick_periodic(int cpu){if (tick_do_timer_cpu == cpu) {//更新jiffywrite_seqlock(&jiffies_lock);/* Keep track of the next tick event */tick_next_period = ktime_add(tick_next_period, tick_period);do_timer(1);write_sequnlock(&jiffies_lock);}//更新进程时间信息update_process_times(user_mode(get_irq_regs()));profile_tick(CPU_PROFILING);}

update_process_times中会继续调用scheduler_tick,

/* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. */ //更新CPU和当前进行的一些数据,然后根据当前进程的调度类,调用task_tick()函数void scheduler_tick(void){int cpu = smp_processor_id();struct rq *rq = cpu_rq(cpu);struct task_struct *curr = rq->curr; //当前正在执行的进程u32 old_load;struct related_thread_group *grp;    //更新调度时间sched_clock_tick();raw_spin_lock(&rq->lock);old_load = task_load(curr);grp = task_related_thread_group(curr);//task 所在groupset_window_start(rq);update_rq_clock(rq);//更新cpu load信息update_cpu_load_active(rq);//调用cfs 子类task_tick_faircurr->sched_class->task_tick(rq, curr, 0);update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(), 0);raw_spin_unlock(&rq->lock);perf_event_task_tick();#ifdef CONFIG_SMPrq->idle_balance = idle_cpu(cpu);trigger_load_balance(rq, cpu);#endifrq_last_tick_reset(rq);if (update_preferred_cluster(grp, curr, old_load))set_preferred_cluster(grp);if (curr->sched_class == &fair_sched_class)check_for_migration(rq, curr);}static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued){struct cfs_rq *cfs_rq;struct sched_entity *se = &curr->se;//组调度,更新task group中每个实体的时间信息for_each_sched_entity(se) {cfs_rq = cfs_rq_of(se);entity_tick(cfs_rq, se, queued);}if (sched_feat_numa(NUMA))task_tick_numa(rq, curr);update_rq_runnable_avg(rq, 1);}//每个时钟周期,更新vruntimestatic voidentity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued){/* * Update run-time statistics of the 'current'. *///更新rq 时间信息update_curr(cfs_rq);/* * Ensure that runnable average is periodically updated. */update_entity_load_avg(curr, 1);update_cfs_rq_blocked_load(cfs_rq, 1);update_cfs_shares(cfs_rq);#ifdef CONFIG_SCHED_HRTICK/* * queued ticks are scheduled to match the slice, so don't bother * validating it and just reschedule. 
*/if (queued) {resched_task(rq_of(cfs_rq)->curr);return;}/* * don't let the period tick interfere with the hrtick preemption */if (!sched_feat(DOUBLE_TICK) &&hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))return;#endif//运行状态task数量大于1的话,检查是否要发生进程调度抢占if (cfs_rq->nr_running > 1)check_preempt_tick(cfs_rq, curr);}//获取task实际运行的时间,加权后变为vruntimestatic void update_curr(struct cfs_rq *cfs_rq){struct sched_entity *curr = cfs_rq->curr;u64 now = rq_clock_task(rq_of(cfs_rq));unsigned long delta_exec;if (unlikely(!curr))return;/* * Get the amount of time the current task was running * since the last time we changed load (this cannot * overflow on 32 bits): *///当前系统时间减去本次任务开始执行时间, 差值为task 运行的时间delta_exec = (unsigned long)(now - curr->exec_start);if (!delta_exec)return;//task运行的实际时间加权后,加到当前task 的vruntime中__update_curr(cfs_rq, curr, delta_exec);//当前task执行开始时间curr->exec_start = now;if (entity_is_task(curr)) {struct task_struct *curtask = task_of(curr);trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);cpuacct_charge(curtask, delta_exec);account_group_exec_runtime(curtask, delta_exec);}account_cfs_rq_runtime(cfs_rq, delta_exec);}

static inline void__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,      unsigned long delta_exec){unsigned long delta_exec_weighted;schedstat_set(curr->statistics.exec_max,      max((u64)delta_exec, curr->statistics.exec_max));/* 总运行时间更新 */  curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq, exec_clock, delta_exec);/* 用优先级和delta_exec来计算weighted,以用于更新vruntime */ delta_exec_weighted = calc_delta_fair(delta_exec, curr);//加权后的执行时间//vruntime 增长值不是实际运行的时间,而是加权后的值,这样保证运行的公平curr->vruntime += delta_exec_weighted;//cfs_rq 保存所有进程最小的vruntime,下次调度时直接使用update_min_vruntime(cfs_rq);} //若当前进程nice为0,直接返回实际运行时间,其他所有nice值的加权都是以0 nice值为参考增加或减少的。static inline unsigned longcalc_delta_fair(unsigned long delta, struct sched_entity *se){if (unlikely(se->load.weight != NICE_0_LOAD))//nice 不为0,就得计算delta加权值delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);return delta;}/* * delta *= weight / lw */ /* NICE_0_LOAD is 1024 delta*=NICE_0_LOAD/lw  weight=NICE_0_LOAD, nice值越大的,自身的load越小,计算后delta相比实际时间就增长越快,这样下次被调度到的机会降低。 另外也说明为了达到同样的vruntime,nice值高的进程可以分配更少的运行时间 nice值对应的load 参考prio_to_weight。 */static unsigned longcalc_delta_mine(unsigned long delta_exec, unsigned long weight,struct load_weight *lw){u64 tmp;/* * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched * entities since MIN_SHARES = 2. Treat weight as 1 if less than * 2^SCHED_LOAD_RESOLUTION. 
*/if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))tmp = (u64)delta_exec * scale_load_down(weight);//tmp=delta_exec*NICE_0_LOAD;elsetmp = (u64)delta_exec;if (!lw->inv_weight) {unsigned long w = scale_load_down(lw->weight);//w=lw->weight;if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))lw->inv_weight = 1;else if (unlikely(!w))lw->inv_weight = WMULT_CONST;elselw->inv_weight = WMULT_CONST / w; //1^32 /load}/* * Check whether we'd overflow the 64-bit multiplication: */if (unlikely(tmp > WMULT_CONST))tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,WMULT_SHIFT/2);elsetmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);//数据按照1<<32位对齐,tmp等于WMULT_SHIFT的倍数return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);}

/* * Preempt the current task with a newly woken task if needed: */ //更新实体vruntime后,检查是否需要重新调度,当前进程执行时间超过调度器分配的时间,就重新调度static voidcheck_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr){unsigned long ideal_runtime, delta_exec;struct sched_entity *se;s64 delta;//ideal_runtime是理论上的处理器运行时间片,这个时间不固定,根据当前rq中的running task数量计算出来ideal_runtime = sched_slice(cfs_rq, curr);//该进程本轮调度累计运行时间delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; // 假如实际运行超过调度器分配的时间,就标记重新调度标志if (delta_exec > ideal_runtime) {resched_task(rq_of(cfs_rq)->curr);/* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. */clear_buddies(cfs_rq, curr);return;}/* * Ensure that a task that missed wakeup preemption by a * narrow margin doesn't have to wait for a full slice. * This also mitigates buddy induced latencies under load. */ //sysctl_sched_min_granularity 调度最小的间隔,不需要重新调度,直接返回if (delta_exec < sysctl_sched_min_granularity)return;//找到位于rq最左边的节点,这个entity就是vruntime最小的entityse = __pick_first_entity(cfs_rq);delta = curr->vruntime - se->vruntime;//当前任务跟rq中最小的vruntime节点对比if (delta < 0)return;//设置重新调度标记,需要切换到其他entity运行if (delta > ideal_runtime)resched_task(rq_of(cfs_rq)->curr);}

继续看下调度实体出队、入队的操作,对cfs调度器,入队调用的是enqueue_task_fair

/* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and * then put the task into the rbtree: */static voidenqueue_task_fair(struct rq *rq, struct task_struct *p, int flags){struct cfs_rq *cfs_rq;struct sched_entity *se = &p->se;/* 这里是一个迭代,我们知道,进程有可能是处于一个进程组中的,所以当这个处于进程组中的进程加入到该进程组的队列中时,要对此队列向上迭代 */for_each_sched_entity(se) {if (se->on_rq)break;cfs_rq = cfs_rq_of(se);//se 插入rb treeenqueue_entity(cfs_rq, se, flags);/* * end evaluation on encountering a throttled cfs_rq * * note: in the case of encountering a throttled cfs_rq we will * post the final h_nr_running increment below.*/if (cfs_rq_throttled(cfs_rq))break;cfs_rq->h_nr_running++;inc_cfs_rq_hmp_stats(cfs_rq, p, 1);flags = ENQUEUE_WAKEUP;}//只有se不处于队列中或者cfs_rq_throttled(cfs_rq)返回假才会运行这个循环,用来处理task group余下的taskfor_each_sched_entity(se) {cfs_rq = cfs_rq_of(se);cfs_rq->h_nr_running++;inc_cfs_rq_hmp_stats(cfs_rq, p, 1);if (cfs_rq_throttled(cfs_rq))break;update_cfs_shares(cfs_rq);update_entity_load_avg(se, 1);}if (!se) {update_rq_runnable_avg(rq, rq->nr_running);inc_nr_running(rq);inc_rq_hmp_stats(rq, p, 1);}hrtick_update(rq);}static voidenqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags){/* * Update the normalized vruntime before updating min_vruntime * through callig update_curr(). */if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))se->vruntime += cfs_rq->min_vruntime;/* * Update run-time statistics of the 'current'. 
*/update_curr(cfs_rq);enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);account_entity_enqueue(cfs_rq, se);update_cfs_shares(cfs_rq);//当前进程之前处于睡眠状态,刚被唤醒,新建进程flag为0if (flags & ENQUEUE_WAKEUP) {place_entity(cfs_rq, se, 0);enqueue_sleeper(cfs_rq, se);}update_stats_enqueue(cfs_rq, se, !!(flags & ENQUEUE_MIGRATING));check_spread(cfs_rq, se);if (se != cfs_rq->curr)//将se插入到运行队列cfs_rq的红黑树中__enqueue_entity(cfs_rq, se);se->on_rq = 1;if (cfs_rq->nr_running == 1) {list_add_leaf_cfs_rq(cfs_rq);check_enqueue_throttle(cfs_rq);}}

/*对"新"进程处理,initial为0表示新创建进程,为1代表睡眠唤醒的进程子进程在创建时,vruntime初值首先被设置为min_vruntime;然后,如果sched_features中设置了START_DEBIT位,vruntime会在min_vruntime的基础上再增大一些。设置完子进程的vruntime之后,检查sched_child_runs_first参数,如果为1的话,就比较父进程和子进程的vruntime,若是父进程的vruntime更小,就对换父、子进程的vruntime,这样就保证了子进程会在父进程之前运行。*/static voidplace_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial){//新建task以cfs的最小vruntime为初始值u64 vruntime = cfs_rq->min_vruntime;/* * The 'current' period is already promised to the current tasks, * however the extra weight of the new task will slow them down a * little, place the new task so that it fits in the slot that * stays open at the end. */ /*      sched_features是控制调度器特性的开关,每个bit表示调度器的一个特性。      在sched_features.h文件中记录了全部的特性。START_DEBIT是其中之一,      如果打开这个特性,表示给新进程的vruntime初始值要设置得比默认值更大一些,      这样会推迟它的运行时间,以防进程通过不停的fork来获得cpu时间片。  */if (initial && sched_feat(START_DEBIT))//新进程并且设置了START_DEBITvruntime += sched_vslice(cfs_rq, se);// 加上一个调度周期内的"时间片"/* sleeps up to a single latency don't count. *///对于休眠进程,vruntime会比rq中最小vruntime还要小,这样在下次调度切换时最先得到调度,//也就是交互性进程唤醒后最先得到调度if (!initial) {//休眠进程unsigned long thresh = sysctl_sched_latency;//一个调度周期/* * Halve their sleep time's effect, to allow * for a gentler effect of sleepers: */if (sched_feat(GENTLE_FAIR_SLEEPERS))thresh >>= 1;vruntime -= thresh;}/* ensure we never gain time by being placed backwards. *///第三种情况,进程之前处于执行状态se->vruntime = max_vruntime(se->vruntime, vruntime);}//将调度实体加入rb tree中,这样才有机会被调度到static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se){struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;struct rb_node *parent = NULL;struct sched_entity *entry;int leftmost = 1;/* * Find the right place in the rbtree: *///在rb tree中找到合适插入点 while (*link) {parent = *link;entry = rb_entry(parent, struct sched_entity, run_node);/* * We dont care about collisions. Nodes with * the same key stay together. 
*/if (entity_before(se, entry)) {link = &parent->rb_left;} else {link = &parent->rb_right;leftmost = 0; //往右子树走过之后,新插入entity的vruntime不可能最小}}/* * Maintain a cache of leftmost tree entries (it is frequently * used): *///加入一个新的task后,判断是否要更新rb_leftmost,//保存rb tree 最左子树,避免了rb tree的遍历查找if (leftmost)cfs_rq->rb_leftmost = &se->run_node;//插入 rb treerb_link_node(&se->run_node, parent, link);//调整rb tree,达到平衡rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);}

//sched entity 出队操作static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags){struct cfs_rq *cfs_rq;struct sched_entity *se = &p->se;int task_sleep = flags & DEQUEUE_SLEEP;//对于task group,依次出队,group作为一个整体进行处理,出队、入队,以及选择group内的某个taskfor_each_sched_entity(se) {cfs_rq = cfs_rq_of(se);dequeue_entity(cfs_rq, se, flags);/* * end evaluation on encountering a throttled cfs_rq * * note: in the case of encountering a throttled cfs_rq we will * post the final h_nr_running decrement below.*/if (cfs_rq_throttled(cfs_rq))break;cfs_rq->h_nr_running--;dec_cfs_rq_hmp_stats(cfs_rq, p, 1);/* Don't dequeue parent if it has other entities besides us */if (cfs_rq->load.weight) {/* * Bias pick_next to pick a task from this cfs_rq, as * p is sleeping when it is within its sched_slice. */if (task_sleep && parent_entity(se))set_next_buddy(parent_entity(se));/* avoid re-evaluating load for this entity */se = parent_entity(se);break;}flags |= DEQUEUE_SLEEP;}for_each_sched_entity(se) {cfs_rq = cfs_rq_of(se);cfs_rq->h_nr_running--;dec_cfs_rq_hmp_stats(cfs_rq, p, 1);if (cfs_rq_throttled(cfs_rq))break;update_cfs_shares(cfs_rq);update_entity_load_avg(se, 1);}if (!se) {dec_nr_running(rq);update_rq_runnable_avg(rq, 1);dec_rq_hmp_stats(rq, p, 1);}hrtick_update(rq);}

/*当进程从一个CPU的运行队列中出来 (dequeue_entity) 的时候,它的vruntime要减去队列的min_vruntime值;而当进程加入另一个CPU的运行队列 ( enqueue_entiry) 时,它的vruntime要加上该队列的min_vruntime值。这样,进程从一个CPU迁移到另一个CPU之后,vruntime保持相对公平。*/static voiddequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags){/* * Update run-time statistics of the 'current'. *///更新当前task的vruntimeupdate_curr(cfs_rq);dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);update_stats_dequeue(cfs_rq, se, !!(flags & DEQUEUE_MIGRATING));if (flags & DEQUEUE_SLEEP) {#ifdef CONFIG_SCHEDSTATSif (entity_is_task(se)) {struct task_struct *tsk = task_of(se);if (tsk->state & TASK_INTERRUPTIBLE)se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));if (tsk->state & TASK_UNINTERRUPTIBLE)se->statistics.block_start = rq_clock(rq_of(cfs_rq));}#endif}clear_buddies(cfs_rq, se);if (se != cfs_rq->curr)__dequeue_entity(cfs_rq, se);se->on_rq = 0;account_entity_dequeue(cfs_rq, se);/* * Normalize the entity after updating the min_vruntime because the * update can refer to the ->curr item and we need to reflect this * movement in our normalized position. *///更新se vruntime if (!(flags & DEQUEUE_SLEEP))se->vruntime -= cfs_rq->min_vruntime;/* return excess runtime on last dequeue */return_cfs_rq_runtime(cfs_rq);//更新cfs rq的min_vruntimeupdate_min_vruntime(cfs_rq);update_cfs_shares(cfs_rq);}//从rb tree中移除entitystatic void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se){if (cfs_rq->rb_leftmost == &se->run_node) {struct rb_node *next_node;next_node = rb_next(&se->run_node);cfs_rq->rb_leftmost = next_node;}rb_erase(&se->run_node, &cfs_rq->tasks_timeline);}


0 0
原创粉丝点击