enqueue_task和dequeue_task源码分析

来源：互联网发布：excel多个数据求和编辑：程序博客网时间：2024/06/12 00:29

enqueue_task源码分析

本章分析enqueue_task和dequeue_task函数的源码。enqueue_task将进程添加到具体的运行队列中，dequeue_task相反，下面来看。

enqueue_task
kernel/sched/core.c

/*
 * Add task p to run queue rq: refresh the run queue clock, then delegate
 * to the enqueue_task hook of the task's scheduling class.
 */
static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
    update_rq_clock(rq);
    p->sched_class->enqueue_task(rq, p, flags);
}

update_rq_clock设置运行队列rq的当前时间。本章sched_class假设为fair_sched_class，对应cfs调度策略。
根据调度策略的不同，sched_class可以为stop_sched_class、dl_sched_class、rt_sched_class、fair_sched_class和idle_sched_class，分别对应不同的调度策略和进程类型。

enqueue_task->update_rq_clock
kernel/sched/core.c

/*
 * Advance rq->clock to the value returned by sched_clock_cpu() for this
 * run queue's CPU and forward the elapsed delta to update_rq_clock_task().
 */
void update_rq_clock(struct rq *rq)
{
    s64 delta;

    /* No update at all while the RQCF_ACT_SKIP bit is set. */
    if (rq->clock_skip_update & RQCF_ACT_SKIP)
        return;

    delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
    /* A negative delta would move the clock backwards; ignore it. */
    if (delta < 0)
        return;
    rq->clock += delta;
    update_rq_clock_task(rq, delta);
}

如果clock_skip_update标志位设置了RQCF_ACT_SKIP，则不进行clock值的累加。
sched_clock_cpu获得cpu当前时间，其实rq最终的clock值就等于sched_clock_cpu函数获得的时间。
update_rq_clock_task函数内部将delta减去系统软中断和硬中断的时间并累加到clock_task中，表示runqueues中的任务实际被处理器执行的时间。

enqueue_task->enqueue_task_fair
kernel/sched/fair.c

/*
 * CFS enqueue hook: starting from the task's own entity (p->se), walk the
 * entities yielded by for_each_sched_entity() and enqueue each one on its
 * cfs queue until an entity that is already queued is reached.
 */
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
    struct cfs_rq *cfs_rq;
    struct sched_entity *se = &p->se;

    for_each_sched_entity(se) {
        /* Already on a queue: stop walking. */
        if (se->on_rq)
            break;
        cfs_rq = cfs_rq_of(se);
        enqueue_entity(cfs_rq, se, flags);
        cfs_rq->h_nr_running++;
    }
}

首先获得进程对应的调度实体se。
for_each_sched_entity向上级遍历sched_entity，这里假设只有一个，因为考虑到进程组的因素，即一个调度算法可以调度一个进程组，因此产生了sched_entity。
cfs_rq_of函数通过调度实体获得进程的task_struct结构指针，再继而获得其运行队列的指针，最终获得其中的cfs队列指针。

/*
 * Map a scheduling entity to a cfs queue: entity -> owning task ->
 * that task's run queue -> the cfs queue embedded in the run queue.
 */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
    struct task_struct *p = task_of(se);
    struct rq *rq = task_rq(p);

    return &rq->cfs;
}

/*
 * Recover the task_struct that embeds se as its "se" member.
 */
static inline struct task_struct *task_of(struct sched_entity *se)
{
    return container_of(se, struct task_struct, se);
}

enqueue_entity完成主要的插入工作。

enqueue_task->enqueue_task_fair->enqueue_entity
kernel/sched/fair.c

/*
 * Put scheduling entity se on cfs_rq: fix up its vruntime, update the
 * statistics of the running entity, account the new load, and link se
 * into the queue's tree.
 */
static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
    /*
     * Taken when ENQUEUE_WAKEUP is clear or ENQUEUE_WAKING is set:
     * add this queue's min_vruntime to the entity's vruntime.
     */
    if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
        se->vruntime += cfs_rq->min_vruntime;

    /* Bring the running entity's statistics and min_vruntime up to date. */
    update_curr(cfs_rq);
    enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
    /* Add se's weight to the queue totals and bump nr_running. */
    account_entity_enqueue(cfs_rq, se);

    /* Woken entity: place it with initial == 0. */
    if (flags & ENQUEUE_WAKEUP) {
        place_entity(cfs_rq, se, 0);
    }

    /* The entity recorded as cfs_rq->curr is not inserted into the tree. */
    if (se != cfs_rq->curr)
        __enqueue_entity(cfs_rq, se);
    se->on_rq = 1;
}

第一个if语句判断ENQUEUE_WAKEUP和ENQUEUE_WAKING标志位，如果成立，表示该进程可能切换了cpu继续执行，由于每个cpu对应一个cfs运行队列，此时会造成问题。假设cpu1和cpu2，分别对应cfs队列1和cfs队列2，而cfs队列1的min_vruntime要小于cfs队列2的min_vruntime，当进程A从cpu1切换到cpu2去执行的时候，可能就会产生问题，假设进程A在cfs队列1里的虚拟时间vruntime偏小，当切换到cfs队列2里时，就可能占用大部分的cpu运行时间，因此cfs的做法是当有进程dequeue_entity时，将vruntime减去当前cfs队列的min_vruntime，当再一次enqueue_entity时，将vruntime再加上新队列的min_vruntime，这是一种简单处理该问题的方法。

接下来update_curr函数将更新cfs队列中正在运行的调度实体的各个统计值，最重要的是更新整个cfs队列的min_vruntime值。

enqueue_entity_load_avg函数和多核cpu的负载均衡有关，以后碰到了再分析。
account_entity_enqueue函数计算调度实体se对应的队列（包括运行队列rq和cfs队列）的总体权重。

如果flags包含了ENQUEUE_WAKEUP，就要通过place_entity进行时间补偿。假设一个进程进行了休眠，它的vruntime就不会继续增长，而cfs队列中的其他进程的vruntime同时在不断增长，如果此时该进程被唤醒，由于其vruntime相比其他未睡眠的进程较小，就会一直获得cpu的运行时间，容易造成进程饥饿，因此需要通过place_entity进行补偿。

再往下的__enqueue_entity实现插入操作，将se插入到cfs队列中。
然后将on_rq置1，表示该调度实体在cfs运行队列中了。

enqueue_task->enqueue_task_fair->enqueue_entity->update_curr
kernel/sched/fair.c

/*
 * Charge the time that has elapsed since curr->exec_start to the entity
 * currently running on cfs_rq, then refresh the queue's min_vruntime.
 *
 * Returns without touching anything when the queue has no running entity
 * or when the task clock has not advanced since the last update.
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
    struct sched_entity *curr = cfs_rq->curr;
    u64 now = rq_clock_task(rq_of(cfs_rq));
    u64 delta_exec;

    /*
     * cfs_rq->curr may be NULL (update_min_vruntime handles that case
     * explicitly), and callers invoke this function unconditionally.
     */
    if (!curr)
        return;

    delta_exec = now - curr->exec_start;
    /*
     * delta_exec is unsigned: read it as signed so that a zero or
     * wrapped-around difference is not accounted as a huge run time.
     */
    if ((long long)delta_exec <= 0)
        return;

    curr->exec_start = now;

    /* Real run time, and the weight-scaled virtual run time. */
    curr->sum_exec_runtime += delta_exec;
    curr->vruntime += calc_delta_fair(delta_exec, curr);
    update_min_vruntime(cfs_rq);

    /* Only entities that represent a task feed the per-task accounting. */
    if (entity_is_task(curr)) {
        struct task_struct *curtask = task_of(curr);

        cpuacct_charge(curtask, delta_exec);
        account_group_exec_runtime(curtask, delta_exec);
    }
}

首先获得cfs队列中当前正在运行的进程对应的调度实体curr。因为有新的进程要加入到cfs队列中，需要先把curr的各个统计值更新到当前时间。
rq_of通过cfs_rq获得rq。

/*
 * Return the run queue's task clock (rq->clock_task).
 * lockdep_assert_held() checks that the caller holds rq->lock.
 */
static inline u64 rq_clock_task(struct rq *rq)
{
    lockdep_assert_held(&rq->lock);
    return rq->clock_task;
}

rq_clock_task获取的当前时间，在前面update_rq_clock_task函数中设置。
exec_start是上一次更新统计时记录的时间，delta_exec为从exec_start到当前时间now所经过的时间，也即curr在这段时间内运行的时长。
随后exec_start更新为当前时间。
sum_exec_runtime累计了进程一共运行的时间。
vruntime为cfs虚拟运行时间，其计算考虑了进程的权重。
update_min_vruntime更新cfs队列cfs_rq中的min_vruntime，表示当前最小的虚拟时间。

/* An entity is treated as a task when its my_q pointer is NULL. */
#define entity_is_task(se)  (!se->my_q)

cpuacct_charge在cpu计算组里统计cpu运行时间。ca = task_ca(tsk);ca->cpuusage
account_group_exec_runtime统计整个线程组的运行时间。

enqueue_task->enqueue_task_fair->enqueue_entity->update_curr->calc_delta_fair
kernel/sched/fair.c

/*
 * Scale delta by NICE_0_LOAD / se->load.weight.
 * An entity whose weight is exactly NICE_0_LOAD gets delta back unchanged.
 */
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
    if (unlikely(se->load.weight != NICE_0_LOAD))
        delta = __calc_delta(delta, NICE_0_LOAD, &se->load);

    return delta;
}

如果进程的权重等于NICE_0_LOAD，默认值为1024，此时不需要计算，直接返回delta，否则就要通过__calc_delta函数对delta进行调整。
NICE_0_LOAD是一个权重的标准值，linux中进程的权重一共有40个值，如下所示，

static const int prio_to_weight[40] = { /* -20 */     88761,     71755,     56483,     46273,     36291, /* -15 */     29154,     23254,     18705,     14949,     11916, /* -10 */      9548,      7620,      6100,      4904,      3906, /*  -5 */      3121,      2501,      1991,      1586,      1277, /*   0 */      1024,       820,       655,       526,       423, /*   5 */       335,       272,       215,       172,       137, /*  10 */       110,        87,        70,        56,        45, /*  15 */        36,        29,        23,        18,        15,};

NICE_0_LOAD正好对应了最中间的一个权重值。__calc_delta函数可以简化为

新delta=旧delta*NICE_0_LOAD/(se的权重weight)

因此calc_delta_fair函数最终计算的是一个相对的权重值。

enqueue_task->enqueue_task_fair->enqueue_entity->update_curr->calc_delta_fair->__calc_delta
kernel/sched/fair.c

/*
 * Compute approximately delta_exec * weight / lw->weight without a
 * division: multiply by the cached inverse weight and shift the result
 * back down. "shift" starts at WMULT_SHIFT and is reduced by one for
 * every bit dropped from "fact" to keep it within 32 bits.
 */
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
    u64 fact = weight;
    int shift = WMULT_SHIFT;

    /* Make sure lw->inv_weight is populated. */
    __update_inv_weight(lw);

    /* Reduce the incoming weight to 32 bits. */
    if (unlikely(fact >> 32)) {
        while (fact >> 32) {
            fact >>= 1;
            shift--;
        }
    }

    fact = (u64)(u32)fact * lw->inv_weight;

    /* Reduce the product to 32 bits as well. */
    while (fact >> 32) {
        fact >>= 1;
        shift--;
    }

    return mul_u64_u32_shr(delta_exec, fact, shift);
}

/*
 * Fill in lw->inv_weight as WMULT_CONST / lw->weight, unless a non-zero
 * value is already cached there.
 */
static void __update_inv_weight(struct load_weight *lw)
{
    unsigned long w;

    if (likely(lw->inv_weight))
        return;

    w = lw->weight;
    /* The "..." below is an elision in the quoted excerpt. */
    ...
    lw->inv_weight = WMULT_CONST / w;
}

/* Numerator used for the inverse weight, and the matching shift. */
#define WMULT_CONST (~0U)
#define WMULT_SHIFT 32

/*
 * Return (a * mul) >> shift, computed from the low and high 32-bit halves
 * of a separately and then summed.
 */
static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
{
    u32 ah, al;
    u64 ret;

    al = a;
    ah = a >> 32;

    ret = ((u64)al * mul) >> shift;
    /* High half contributes (ah * mul) << 32, shifted right by shift. */
    if (ah)
        ret += ((u64)ah * mul) << (32 - shift);

    return ret;
}

__calc_delta函数最终计算的是delta_exec*weight/lw->weight。
__update_inv_weight函数会计算weight的倒数，也即inv_weight。
注意公式中的weight/lw->weight，因为是整型相除，精度不够，为了保证足够的精度，这里采用了“fixed-point arithmetic”，即对于一个小数而言，使用一个64位整型变量的高32位表示整数部分，用低32位表示小数部分，对应函数中的WMULT_CONST / w，转换之后，将其（也即inv_weight）乘以fact（也即weight），得到weight/lw->weight经过扩大后值，最后通过mul_u64_u32_shr将该值乘以delta_exec，并按照shift缩小最后的结果。
函数中将fact进行平移操作是保证fact的值不会超过32位，并且平移后造成的误差只是两个值相乘后的低位，因此不影响最终的结果。
mul_u64_u32_shr函数将参数a分成高32位ah和低32位al，分别和mul进行相乘再相加。由于shift大于等于0，因此平移操作保证一个32位数和64位数相乘，其结果不超过64位，并且shift用于补偿mul的右移操作。

enqueue_task->enqueue_task_fair->enqueue_entity->account_entity_enqueue
kernel/sched/fair.c

/*
 * Account for se joining cfs_rq: add its weight to the load of both the
 * cfs queue and the owning run queue, link task entities into
 * rq->cfs_tasks, and increment the queue's nr_running.
 */
static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    update_load_add(&cfs_rq->load, se->load.weight);
    update_load_add(&rq_of(cfs_rq)->load, se->load.weight);

    /* Only entities that represent a task go on the cfs_tasks list. */
    if (entity_is_task(se)) {
        struct rq *rq = rq_of(cfs_rq);

        list_add(&se->group_node, &rq->cfs_tasks);
    }
    cfs_rq->nr_running++;
}

account_entity_enqueue函数连续两次调用update_load_add函数更新cfs队列cfs_rq和运行队列rq的总体权重。
如果调度实体se代表的是一个进程，还会将其加入到运行队列rq的cfs_tasks链表中，该链表在做cpu负载均衡时会用到。
最后递增cfs队列的nr_running变量。

enqueue_task->enqueue_task_fair->enqueue_entity->account_entity_enqueue->update_load_add
kernel/sched/fair.c

/*
 * Increase lw->weight by inc and clear the cached inverse weight, which
 * __update_inv_weight() recomputes when it finds it zero.
 */
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
    lw->weight += inc;
    lw->inv_weight = 0;
}

update_load_add函数将load_weight里的权重weight增加inc，并且把inv_weight置为0，在__update_inv_weight函数中，当需要用到权重的倒数时，如果检测到inv_weight为0，就会重新计算该值。

enqueue_task->enqueue_task_fair->enqueue_entity->place_entity
kernel/sched/fair.c

/*
 * Choose the vruntime for se when it is placed on cfs_rq.
 *
 * initial != 0: start from min_vruntime and, when the START_DEBIT feature
 *               is on, add sched_vslice().
 * initial == 0: start from min_vruntime minus a threshold derived from
 *               sysctl_sched_latency.
 *
 * The entity's vruntime is never moved backwards: the larger of its
 * current value and the computed one is kept.
 */
static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
    u64 vruntime = cfs_rq->min_vruntime;

    if (initial && sched_feat(START_DEBIT))
        vruntime += sched_vslice(cfs_rq, se);

    if (!initial) {
        unsigned long thresh = sysctl_sched_latency;

        /* GENTLE_FAIR_SLEEPERS halves the threshold. */
        if (sched_feat(GENTLE_FAIR_SLEEPERS))
            thresh >>= 1;

        vruntime -= thresh;
    }

    se->vruntime = max_vruntime(se->vruntime, vruntime);
}

首先获得当前cfs队列的最小运行时间min_vruntime保存在vruntime中。
传入的参数initial表示是否为新进程，在linux系统调用fork时会设为真，此时，如果sysctl_sched_features的标志位包含了START_DEBIT，就表示新进程需要延迟运行，一般fork一个新进程的时候，如果其vruntime设为0，肯定不合适，因为会不停地占用cpu运行时间，造成运行饥饿，如果设为cfs队列的min_vruntime，则新fork出来的进程也会占用大部分的cpu运行时间，因此如果一个进程一直fork出新进程，这些进程就很可能一直占用着cpu，为了防止这种情况，就可以将START_DEBIT标志位置位，这样新fork出的进程的vruntime就会设置得比min_vruntime稍微大一点，具体大多少就由sched_vslice函数决定。

如果是正常唤醒的进程，此时initial=0，进入第二个if语句，sysctl_sched_latency是当cfs队列中的进程较少时的调度周期，默认为6ms。
sched_feat宏检查sysctl_sched_features标志位中是否包含了GENTLE_FAIR_SLEEPERS，如果设置了，就将thresh减半（右移一位），最后将vruntime减去thresh。这里的thresh即为唤醒进程的奖励，thresh越大，vruntime越小，被唤醒的进程就越有可能抢占到cpu执行，因此GENTLE_FAIR_SLEEPERS标志位就决定了唤醒进程奖励的大小，如果GENTLE_FAIR_SLEEPERS被置位，则奖励减半，因此如果disable该标志位，就会使交互进程的响应更快，反之则响应较慢。

最终设置唤醒进程的虚拟运行时间为se->vruntime和刚刚计算的vruntime的最大值，一种情况是睡眠的进程持续了一段时间，由于cfs运行队列的min_vruntime在不断增大，此时大概率vruntime的值会大于se->vruntime的值，因此使用vruntime的值；另一种情况假设一个进程刚睡眠就被唤醒，此时se->vruntime的值大于vruntime的值，此时使用原来的值，也即se->vruntime，这也是为了防止进程频繁睡眠而不停抢占cpu。

enqueue_task->enqueue_task_fair->enqueue_entity->place_entity->sched_vslice
kernel/sched/fair.c

/*
 * sched_slice() for se, passed through calc_delta_fair() to scale it by
 * NICE_0_LOAD relative to the entity's weight.
 */
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    return calc_delta_fair(sched_slice(cfs_rq, se), se);
}

/*
 * Share of the scheduling period that falls to se:
 * period * se->load.weight / (load of the cfs queue).
 * When se is not queued yet, it is counted as if it were: one extra
 * runnable entity for the period, and its weight added to a local copy
 * of the queue load.
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    /* !se->on_rq contributes 1 when se is not on the queue. */
    u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
    struct load_weight *load;
    struct load_weight lw;

    cfs_rq = cfs_rq_of(se);
    load = &cfs_rq->load;

    if (unlikely(!se->on_rq)) {
        /* Work on a copy so the queue's real load is left untouched. */
        lw = cfs_rq->load;

        update_load_add(&lw, se->load.weight);
        load = &lw;
    }
    slice = __calc_delta(slice, se->load.weight, load);
    return slice;
}

__sched_period函数返回最小的调度周期。cfs_rq_of函数获得调度实体se对应的cfs队列，再获得队列的权重load。
如果调度实体不在队列上，即on_rq为false。此时需要通过update_load_add函数更新cfs队列的整体权重，然后调用__calc_delta函数获得一个周期内新的调度实体应该占用的虚拟时间。这两个函数前面都分析过了。
__calc_delta函数的执行结果可以简写为

slice*se权重/运行队列权重

因此也可以看出，sched_slice最终返回整个cfs调度周期中新的调度实体se按权重比例占的那一份。
最后调用calc_delta_fair函数对slice进行标准化处理，该函数前面也分析过了。

enqueue_task->enqueue_task_fair->enqueue_entity->place_entity->sched_vslice->__sched_period
kernel/sched/fair.c

/*
 * Length of the scheduling period for nr_running entities:
 * sysctl_sched_latency while nr_running does not exceed sched_nr_latency,
 * otherwise nr_running * sysctl_sched_min_granularity.
 */
static u64 __sched_period(unsigned long nr_running)
{
    u64 period = sysctl_sched_latency;
    unsigned long nr_latency = sched_nr_latency;

    if (unlikely(nr_running > nr_latency)) {
        period = sysctl_sched_min_granularity;
        period *= nr_running;
    }

    return period;
}

sched_nr_latency是一个阈值，当cfs队列中的进程数不超过sched_nr_latency时，就返回调度周期sysctl_sched_latency。如果cfs队列中的进程数大于sched_nr_latency，因为当一个cfs队列的进程太多时，每个进程分到的时间片会减少，因此cfs设置当进程数量大于sched_nr_latency时，每个进程获得的最小时间片长度为sysctl_sched_min_granularity。因此当cfs队列中的进程数nr_running大于阈值sched_nr_latency时，其最小调度周期period变为nr_running*sysctl_sched_min_granularity。

enqueue_task->enqueue_task_fair->enqueue_entity->__enqueue_entity
kernel/sched/fair.c

/*
 * Insert se into the red-black tree rooted at cfs_rq->tasks_timeline,
 * ordered by entity_before(), and update the cached leftmost node when
 * se ends up as the new leftmost entry.
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
    struct rb_node *parent = NULL;
    struct sched_entity *entry;
    int leftmost = 1;

    /* Descend from the root until an empty child slot is found. */
    while (*link) {
        parent = *link;
        entry = rb_entry(parent, struct sched_entity, run_node);
        /* Go left when se orders before entry, otherwise right. */
        if (entity_before(se, entry)) {
            link = &parent->rb_left;
        } else {
            link = &parent->rb_right;
            /* One right turn means se cannot be the leftmost node. */
            leftmost = 0;
        }
    }

    if (leftmost)
        cfs_rq->rb_leftmost = &se->run_node;

    rb_link_node(&se->run_node, parent, link);
    rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}

__enqueue_entity函数将调度实体se插入到cfs队列的红黑树中。cfs队列的tasks_timeline是该红黑树的根节点。接下来从根节点开始循环遍历，rb_entry宏根据红黑树上的rb_node指针获得对应的调度实体entry指针，entity_before检查红黑树上节点entry的虚拟运行时间vruntime是否大于即将插入的调度实体se的vruntime，如果大于则返回真，遍历向左子树移动，否则向右子树移动。

/*
 * Non-zero when a's vruntime is smaller than b's. The difference is
 * evaluated as a signed 64-bit value.
 */
static inline int entity_before(struct sched_entity *a,
                struct sched_entity *b)
{
    return (s64)(a->vruntime - b->vruntime) < 0;
}

只要有一次向右子树移动，就将leftmost置为0，表示无法到达红黑树中的最左叶子节点。

退出循环后，如果leftmost为1，表示到达了最左叶子节点，此时要更新cfs队列的rb_leftmost指针，指向即将插入的se调度实体。

接下来通过rb_link_node函数将调度实体插入到红黑树中link对应的节点上。最后的rb_insert_color用于调整红黑树的颜色，本章不关心。

dequeue_task源码分析

dequeue_task和enqueue_task相反，将进程从运行队列中移除，代码类似，有些相同的函数就不分析了，下面来看。

dequeue_task
kernel/sched/core.c

/*
 * Remove task p from run queue rq: refresh the run queue clock, then
 * delegate to the dequeue_task hook of the task's scheduling class.
 */
static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
    update_rq_clock(rq);
    p->sched_class->dequeue_task(rq, p, flags);
}

update_rq_clock函数更新运行队列rq的时钟。下面假设进程的sched_class也为fair_sched_class。

dequeue_task->dequeue_task_fair
kernel/sched/fair.c

/*
 * CFS dequeue hook: starting from the task's own entity (p->se), walk the
 * entities yielded by for_each_sched_entity() and dequeue each one from
 * its cfs queue.
 */
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
    struct cfs_rq *cfs_rq;
    struct sched_entity *se = &p->se;

    for_each_sched_entity(se) {
        cfs_rq = cfs_rq_of(se);
        dequeue_entity(cfs_rq, se, flags);
        cfs_rq->h_nr_running--;
    }
}

首先获得调度实体se对应的cfs队列cfs_rq，然后调用dequeue_entity函数将其从cfs队列cfs_rq中删除。

dequeue_task->dequeue_task_fair->dequeue_entity
kernel/sched/fair.c

/*
 * Take scheduling entity se off cfs_rq: update the running entity's
 * statistics, drop any buddy pointers to se, unlink it from the tree,
 * and refresh the queue's load and min_vruntime.
 */
static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
    update_curr(cfs_rq);

    /* se must not remain referenced as last/next/skip. */
    clear_buddies(cfs_rq, se);

    /* Skipped for cfs_rq->curr, mirroring the check in enqueue_entity. */
    if (se != cfs_rq->curr)
        __dequeue_entity(cfs_rq, se);
    se->on_rq = 0;
    account_entity_dequeue(cfs_rq, se);

    update_min_vruntime(cfs_rq);
}

update_curr计算并更新cfs队列和当前正在运行的调度实体的各个统计值。dequeue_entity_load_avg函数（上面的精简代码中已省略）和多核cpu的负载均衡有关，本章不关心。
clear_buddies函数检测调度实体se是否在cfs队列的last、next、skip指针中，是则清空。
__dequeue_entity函数完成主要的删除工作，将调度实体se从cfs队列cfs_rq的红黑树中删除，然后将on_rq置0。
最后，因为有调度实体se被删除，因此通过account_entity_dequeue函数重新更新整个cfs队列的权重，通过update_min_vruntime函数更新队列的min_vruntime值。

dequeue_task->dequeue_task_fair->dequeue_entity->clear_buddies
kernel/sched/fair.c

/*
 * If se is recorded as the last, next or skip pointer of cfs_rq, call the
 * matching helper to clear that pointer.
 */
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    if (cfs_rq->last == se)
        __clear_buddies_last(se);

    if (cfs_rq->next == se)
        __clear_buddies_next(se);

    if (cfs_rq->skip == se)
        __clear_buddies_skip(se);
}

/*
 * Walk the entities yielded by for_each_sched_entity() and set the "last"
 * pointer of each entity's cfs queue to NULL, stopping at the first queue
 * whose "last" does not point at the entity being visited.
 */
static void __clear_buddies_last(struct sched_entity *se)
{
    for_each_sched_entity(se) {
        struct cfs_rq *cfs_rq = cfs_rq_of(se);

        if (cfs_rq->last != se)
            break;

        cfs_rq->last = NULL;
    }
}

clear_buddies函数检查即将出队的调度实体se是否在cfs队列的last、next或skip指针中，如果是，就调用相应的函数将该指针设置为null。这三个指针的作用放在schedule调度源码中分析。

dequeue_task->dequeue_task_fair->dequeue_entity->__dequeue_entity
kernel/sched/fair.c

/*
 * Unlink se from the red-black tree of cfs_rq. When se is the cached
 * leftmost node, the cache is first advanced to se's successor.
 */
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    if (cfs_rq->rb_leftmost == &se->run_node) {
        struct rb_node *next_node;

        /* rb_next() yields the node following se in tree order. */
        next_node = rb_next(&se->run_node);
        cfs_rq->rb_leftmost = next_node;
    }

    rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}

如果待删除的调度实体se对应cfs队列红黑树中最左子节点，就需要通过rb_next函数重新挑选一个最左子节点。rb_next会根据se对应的节点是否有右子节点判断是向子节点遍历还是向父节点遍历，本章就不过多分析了。重新设置了最左节点后就调用rb_erase函数将调度实体从红黑树中删除。

dequeue_task->dequeue_task_fair->dequeue_entity->account_entity_dequeue
kernel/sched/fair.c

/*
 * Account for se leaving cfs_rq: subtract its weight from the load of
 * both the cfs queue and the owning run queue, unlink task entities from
 * the list they were added to via group_node, and decrement nr_running.
 */
static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    update_load_sub(&cfs_rq->load, se->load.weight);
    update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);

    /* Counterpart of the list_add() in account_entity_enqueue(). */
    if (entity_is_task(se)) {
        list_del_init(&se->group_node);
    }
    cfs_rq->nr_running--;
}

/*
 * Decrease lw->weight by dec and clear the cached inverse weight so that
 * it is recomputed on next use.
 */
static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
    lw->weight -= dec;
    lw->inv_weight = 0;
}

account_entity_dequeue函数通过update_load_sub函数，将cfs队列和rq队列的整体权重减去即将出队的se的权重，并将其倒数inv_weight置0。然后通过list_del_init函数将调度实体se从运行队列rq的cfs_tasks链表中删除。最后将nr_running递减。

dequeue_task->dequeue_task_fair->dequeue_entity->update_min_vruntime
kernel/sched/fair.c

/*
 * Recompute cfs_rq->min_vruntime from the running entity and the leftmost
 * entity in the tree. The stored value only ever grows: the result is the
 * maximum of the old min_vruntime and the new candidate.
 */
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
    /* With neither curr nor a leftmost node, the value stays as it is. */
    u64 vruntime = cfs_rq->min_vruntime;

    if (cfs_rq->curr)
        vruntime = cfs_rq->curr->vruntime;

    if (cfs_rq->rb_leftmost) {
        struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
                           struct sched_entity,
                           run_node);

        /* No running entity: the leftmost node alone decides. */
        if (!cfs_rq->curr)
            vruntime = se->vruntime;
        else
            /* Otherwise take the smaller of curr and leftmost. */
            vruntime = min_vruntime(vruntime, se->vruntime);
    }

    cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
}

因为cfs策略在调度进程时，会选择cfs队列中红黑树的最左节点rb_leftmost进行调度，因此只需要比较该节点调度实体的虚拟运行时间和当前正在运行的调度实体的虚拟运行时间即可（进程在运行时有可能cfs队列的rb_leftmost被更改）。假设存在正在运行的调度实体，最终设置的min_vruntime可以改写为

max_vruntime(cfs_rq->min_vruntime, min_vruntime(cfs_rq->curr->vruntime, se->vruntime));

第一个min_vruntime在正在运行的调度实体的vruntime和最左节点vruntime取最小值，一般情况下前者大，后者小，但是当前者在运行中时，有可能后者插入了一个有较大vruntime的调度实体，此时有可能前者小，后者大。因此需要通过min_vruntime取两者较小值。
max_vruntime就是取最大值，其实就是更新操作。

阅读全文

0 0