linux2.6调度分析

来源：互联网发布：mac口红色号推荐编辑：程序博客网时间：2024/05/02 04:34

/*首先我先说一下操作系统进程调度的重要性。我们都知道，linux的所有进程是并发执行的。进程调度，其实质是把处理器公平，合理，高效的分配给各个任务，以使进程的并发执行达到类似并行的效果。调度是实现多任务并发执行的必要手段。
负载指的是可运行进程的平均数，以及因此而等待CPU的时间。
进程运行时间的长短由它的优先级来确定,进程在最初执行的时候有一个静态优先级,同时在执行的时候还有一个动态优先级,它执行时间的长短由这些来确定.静态优先级是一确定就不变的,动态优先级会改变,它会受到你是交互性或者是消耗性这些性质的改变. */

struct prio_array {
int nr_active; /*本进程组中的进程数 */
unsigned long bitmap[BITMAP_SIZE]; /*位图，用来加速HASH表的访问*/
struct list_head queue[MAX_PRIO]; /*以优先级为索引的HASH表*/
};

/*linux2.6的调度函数里没有使用godness()来选择进程，而是多定义了一个就绪队列。
代码来自linux-2.6.0/kernel/sched.c 199行～215行*/
struct runqueue {
spinlock_t lock; /*runqueue 的自旋锁，当需要对 runqueue 进行操作时，仍然应该锁定，但这个锁定操作只影响一个 CPU 上的就绪队列，
因此,竞争发生的概率要小多了。*/
unsigned long nr_running, nr_switches, expired_timestamp,
nr_uninterruptible;/*nr_running就绪队列中可运行进程数,nr_switch是CPU进程切换的次数,expired_timestamp是最早发生进程
用完时间片的时间，nr_uninterruptible是曾经在runqueue但是现在处于 TASK_UNINTERRUPTIBLE
状态的进程数*/
task_t *curr, *idle; /*curr是本CPU正在运行的进程，idle是指向本 CPU 的 idle 进程（这里）*/
struct mm_struct *prev_mm;/*prev_mm是在进程却换工程中，保存正被替换的进程的地址空间*/
prio_array_t *active, *expired, arrays[2]; /*每个CPU的就绪队列按时间片是否用完分为两部分，active指向时间片没有用完，可调度
的就绪进程；expire指向时间片用完的就绪进程；arrays[2]是实际的优先级进程队列*/
int prev_cpu_load[NR_CPUS]; /*记录进行负载平衡时各个 CPU 上的负载状态*/
#ifdef CONFIG_NUMA
atomic_t *node_nr_running;
int prev_node_load[MAX_NUMNODES];/*这两个属性仅在 NUMA 体系结构下有效，记录各个 NUMA 节点上的就绪进程数和上一次负载平衡操作时的负载情况*/
#endif
task_t *migration_thread; /*指向本 CPU 的迁移进程。每个 CPU 都有一个核心线程用于执行进程迁移操作*/
struct list_head migration_queue; /*需要进行迁移的进程列表*/

atomic_t nr_iowait; /*曾经在runqueue但是现在正在等待I/O操作完成的进程数*/
};

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/*2.6内核对task_struct也作了很大改进*/
/*由于task_struct很大，这里我只说一些简单的*/
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
/*state有以下六种进程状态*/
#define TASK_RUNNING 0
#define TASK_INTERRUPTIBLE 1
#define TASK_UNINTERRUPTIBLE 2
#define TASK_STOPPED 4
#define TASK_ZOMBIE 8
#define TASK_DEAD 16 /*已经退出且不需要父进程回收的进程*/
......
unsigned long long timestamp; /*进程发生调度事件的时间*/

int prio, static_prio; /*prio是优先级，相当于 2.4 中 goodness() 的计算结果，在 0~MAX_PRIO-1 之间
取值（MAX_PRIO 定义为 140），
其中 0~MAX_RT_PRIO-1 （MAX_RT_PRIO 定义为100）属于实时进程范围，
MAX_RT_PRIO~MX_PRIO-1 属于非实时进程。数值越大，表示进程优先级越小。
这样定义实时进程也总是优先于普通进程。
static_prio是静态优先级，进程初始时间片的大小仅决定于进程的静态优先级，
这一点不论是实时进程还是非实时进程都一样，不过实时进程的static_prio 不参与优先级计算。*/
int activated; /*表示进程因什么原因进入就绪态，这一原因会影响到调度优先级的计算。*/

unsigned long sleep_avg;/*进程的平均等待时间（以 nanosecond 为单位），在 0 到 NS_MAX_SLEEP_AVG 之间取值，
初值为 0，相当于进程等待时间与运行时间的差值。
这个值是动态优先级计算的关键因子，sleep_avg 越大，计算出来的进程优先级也越高（数值越小）。*/

unsigned int time_slice, first_time_slice;/*time_slice是进程剩余的时间片，相当于2.4的counter,但是不再直接影响动态优先级
first_time_slive取值0 或 1，表示是否是第一次拥有时间片（刚创建的进程）。
这一变量用来判断进程结束时是否应当将自己的剩余时间片返还给父进程*/

prio_array_t *array; /*记录当前 CPU 的活跃就绪队列*/

......
}
//代码来自2.6.0内核linux-2.6.0/kernel/sched.c
asmlinkage void schedule(void)
{
task_t *prev, *next; /*prev是调度前运行的进程，next是调度后运行的进程*/
runqueue_t *rq;
prio_array_t *array;
struct list_head *queue;
unsigned long long now;
unsigned long run_time;
int idx;

/*
* Test if we are atomic. Since do_exit() needs to call into
* schedule() atomically, we ignore that path for now.
* Otherwise, whine if we are scheduling when we should not be.
*/
if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) { /*判断当前进程状态是不是TASK_DEAD或TASK_ZOMBIE*/
if (unlikely(in_atomic())) {
printk(KERN_ERR "bad: scheduling while atomic!/n");
dump_stack();
}
}

need_resched:
preempt_disable(); /*关闭内核枪占*/
prev = current; /*把当前进程报存在prev内*/
rq = this_rq(); /*CPU相关的就绪队列的地址保存在rq中*/

release_kernel_lock(prev); /*释放内核锁*/
now = sched_clock();/* 调用sched_clock( )，读取TSC，并且将TSC转换成纳秒，得到的timestamp保存在now中，然后Schedule计算prev使用
的时间片。*/
if (likely(now - prev->timestamp <>
run_time = now - prev->timestamp;
else
run_time = NS_MAX_SLEEP_AVG;

/*
* Tasks with interactive credits get charged less run_time
* at high sleep_avg to delay them losing their interactive
* status
*/
if (HIGH_CREDIT(prev))
run_time /= (CURRENT_BONUS(prev) ? : 1);

spin_lock_irq(&rq->lock);/*关闭当前CPU中断，并且获取自旋锁保护runqueue.*/

/*
* if entering off of a kernel preemption go straight
* to picking the next task.
*/
if (unlikely(preempt_count() & PREEMPT_ACTIVE))
goto pick_next_task;

switch (prev->state) { /*根据prev的状态做相应的处理*/
case TASK_INTERRUPTIBLE: /*此状态表明该进程可以被信号中断*/
if (unlikely(signal_pending(prev))) { /*如果该进程有未处理的信号，则变为可运行状态*/
prev->state = TASK_RUNNING;
break;
}
default: /*如果是不可运行的，则从就绪队列中删除它*/
deactivate_task(prev, rq); /*删除prev指向的进程*/
prev->nvcsw++;
break;
case TASK_RUNNING: /*如果是运行态，则继续执行*/
prev->nivcsw++;
}
pick_next_task:
if (unlikely(!rq->nr_running)) {
#ifdef CONFIG_SMP
load_balance(rq, 1, cpu_to_node_mask(smp_processor_id()));/*保持多系统下runqueue平衡。检查当前CPU，保证一个域中runqueue
平衡。*/
if (rq->nr_running) /*如果有可运行的进程*/
goto pick_next_task;
#endif
next = rq->idle;
rq->expired_timestamp = 0;
goto switch_tasks;
}
/*下面的代码是将active指向的队列与expired指向的队列交换*/
array = rq->active;
if (unlikely(!array->nr_active)) {
/*
* Switch the active and expired arrays.
*/
rq->active = rq->expired;
rq->expired = array;
array = rq->active;
rq->expired_timestamp = 0;
}

idx = sched_find_first_bit(array->bitmap);/*找到第一个非空的就绪进程链表*/
queue = array->queue + idx;
next = list_entry(queue->next, task_t, run_list);/*next指向将替换prev的进程描述符*/
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
if (next->activated > 0) {//这里表示是从TASK_INTERRUPTIBLE的状态唤醒
unsigned long long delta = now - next->timestamp; //中间这一段我不明白，这些涉及都是时间方面的计算，这让我
//感觉好像迷路了一样。我感觉这应该是硬件方面的原因。
if (next->activated == 1)
delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;

array = next->array;
dequeue_task(next, array);
recalc_task_prio(next, next->timestamp + delta);
enqueue_task(next, array);
}
next->activated = 0;
switch_tasks:
prefetch(next);
clear_tsk_need_resched(prev);
RCU_qsctr(task_cpu(prev))++;

prev->sleep_avg -= run_time;
if ((long)prev->sleep_avg <= 0){
prev->sleep_avg = 0;
if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
prev->interactive_credit--;
}
prev->timestamp = now;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

if (likely(prev != next)) {/*如果要运行的进程不是当前进程，执行进程切换*/
next->timestamp = now;
rq->nr_switches++;
rq->curr = next;

prepare_arch_switch(rq, next);
prev = context_switch(rq, prev, next);
barrier();

finish_task_switch(prev);
} else /*如果要运行的进程是当前进程，放弃切换*/
spin_unlock_irq(&rq->lock);

reacquire_kernel_lock(current);
preempt_enable_no_resched();
if (test_thread_flag(TIF_NEED_RESCHED))
goto need_resched;
}
/*对于2.6的这一进程调度的分析我仍然是很模糊的，这里我看到它和2.4的调度函数有些不一样，2.6没有goodness()函数，每一个CUP都有自己的就绪队列，活动的和不活动的。当活动的就绪队列为空时，只要将两个就绪队列进行交换，便可继续执行。