Linux中实时进程的调度---Linux内核笔记

来源：互联网　发布：简述网络舆情的特点　编辑：程序博客网　时间：2024/04/27 22:08

Linux的进程分普通进程和实时进程，而实时进程又分SCHED_FIFO与SCHED_RR，它们都比普通进程的优先级高。

对于SCHED_FIFO进程，它就一直运行直到退出，除非它阻塞才会释放CPU, 或被更高优先级的实时进程抢占。

对于SCHED_RR(时间片轮转)进程，只有当它的时间片用完时，内核才会把它放到进程队列的末尾。

我们来看看在2.4内核中对应的调度代码：

/*
 * schedule() - Linux 2.4 scheduler entry point (excerpt).
 *
 * Takes the current task as 'prev', fixes up its run-queue state, then
 * walks the whole run queue and picks the candidate with the largest
 * goodness() value as 'next'. The remainder of the function is not part
 * of this excerpt.
 */
asmlinkage void schedule(void)
{
	struct schedule_data * sched_data;
	struct task_struct *prev, *next, *p;
	struct list_head *tmp;
	int this_cpu, c;

	spin_lock_prefetch(&runqueue_lock);

	BUG_ON(!current->active_mm);
need_resched_back:
	prev = current;		/* prev is the task currently running */
	this_cpu = prev->processor;

	if (unlikely(in_interrupt())) {
		printk("Scheduling in interrupt\n");
		BUG();
	}

	release_kernel_lock(prev, this_cpu);

	/*
	 * 'sched_data' is protected by the fact that we can run
	 * only one process per CPU.
	 */
	sched_data = & aligned_data[this_cpu].schedule_data;

	spin_lock_irq(&runqueue_lock);

	/*
	 * Move an exhausted RR process to be last: if the current task is
	 * SCHED_RR (round-robin) and its counter has reached zero, give it
	 * a new counter and move it to the end of the run queue.
	 */
	if (unlikely(prev->policy == SCHED_RR))
		if (!prev->counter) {
			prev->counter = NICE_TO_TICKS(prev->nice);
			move_last_runqueue(prev);
		}

	switch (prev->state) {
	case TASK_INTERRUPTIBLE:
		if (signal_pending(prev)) {
			/* A signal is pending: keep the task runnable. */
			prev->state = TASK_RUNNING;
			break;
		}
		/* fallthrough: no signal pending, treat like any other non-running state */
	default:
		del_from_runqueue(prev);
		/* fallthrough */
	case TASK_RUNNING:;
	}
	prev->need_resched = 0;

	/*
	 * this is the scheduler proper:
	 */

repeat_schedule:
	/*
	 * Default process to select..
	 */
	next = idle_task(this_cpu);
	c = -1000;
	/*
	 * Walk every task on the run queue and keep the one with the
	 * highest weight as computed by goodness().
	 */
	list_for_each(tmp, &runqueue_head) {
		p = list_entry(tmp, struct task_struct, run_list);
		if (can_schedule(p, this_cpu)) {
			int weight = goodness(p, this_cpu, prev->active_mm);
			if (weight > c)
				c = weight, next = p;
		}
	}

	/* Do we need to re-calculate counters? */
	/*
	 * A best weight of exactly 0 means the selected task has no time
	 * slice left, i.e. every candidate has used up its slice. Recompute
	 * the counter of every task and run the selection again.
	 */
	if (unlikely(!c)) {
		struct task_struct *p;

		spin_unlock_irq(&runqueue_lock);
		read_lock(&tasklist_lock);
		for_each_task(p)
			p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
		read_unlock(&tasklist_lock);
		spin_lock_irq(&runqueue_lock);
		goto repeat_schedule;
	}

	/* ... the remainder of schedule() is omitted in this excerpt ... */
}

再看看goodness函数
/*
 * goodness() - weight used by schedule() to rank a runnable task.
 *
 * @p:        candidate task
 * @this_cpu: CPU the selection is being made for
 * @this_mm:  address space of the task being switched away from
 *
 * Returns -1 for a task that has yielded, 0 for a SCHED_OTHER task whose
 * counter is exhausted, a counter-based value for other SCHED_OTHER tasks,
 * and 1000 + rt_priority for every other policy.
 */
static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
{
	int score;

	/*
	 * A yielding task gets -1: below every other runnable task, yet
	 * above the -1000 that schedule() starts from for the idle thread,
	 * and different from 0 so it cannot trigger a counter recalculation.
	 */
	if (p->policy & SCHED_YIELD)
		return -1;

	/*
	 * Anything that is not SCHED_OTHER is a real-time task. Its weight
	 * is 1000 plus its real-time priority, so a higher rt_priority
	 * always produces a larger weight.
	 */
	if (p->policy != SCHED_OTHER)
		return 1000 + p->rt_priority;

	/*
	 * Ordinary task: start from the number of clock ticks it has left.
	 * With nothing left, no bonus is applied at all.
	 */
	score = p->counter;
	if (score == 0)
		return 0;

#ifdef CONFIG_SMP
	/* Favour a task that last ran on this very CPU. */
	if (p->processor == this_cpu)
		score += PROC_CHANGE_PENALTY;
#endif

	/* Small bonus for sharing the current mm, or for having no mm. */
	if (!p->mm || p->mm == this_mm)
		score++;

	/* A lower nice value yields a larger weight. */
	return score + 20 - p->nice;
}

通过以上代码，我们发现2.4及以前的内核，有几点缺陷:
1) 每次调度时要遍历全部进程，时间复杂度为O(N)
2) 当全部进程时间片用完后，要为它们重新计算时间片
3) 在内核态不可抢占
4) 多个cpu共用一个运行队列，需要频繁的加锁，影响效率

2.6内核重写了进程调度这部分，其时间复杂度为O(1)。
先看看2.6与调度相关的几个数据结构：
/*
 * Per-CPU run queue as shown in this article.
 * NOTE(review): the listing looks abridged - the schedule() code in this
 * file also uses rq->idle, rq->curr, rq->nr_switches and
 * rq->nr_uninterruptible, which are not declared here.
 */
struct rq {
	spinlock_t lock;

	/* Number of runnable tasks on this CPU. */
	unsigned long nr_running;
	unsigned long raw_weighted_load;
	unsigned long expired_timestamp;
	struct mm_struct *prev_mm;

	/*
	 * 'active' is the array tasks are picked from and 'expired' holds
	 * tasks that used up their time slice; schedule() swaps the two
	 * pointers when 'active' runs empty. This pair is what makes the
	 * O(1) selection possible.
	 */
	struct prio_array *active;
	struct prio_array *expired;
	struct prio_array arrays[2];
	int best_expired_prio;
};

// One struct rq run queue, named runqueues, is defined per CPU.
static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;

// A set of per-priority task lists plus a bitmap of the non-empty levels;
// struct rq uses two of these as its active and expired arrays.
struct prio_array {
unsigned int nr_active; // number of tasks currently queued in this array
DECLARE_BITMAP(bitmap, MAX_PRIO+1);// bitmap: a set bit means the list for that priority level holds at least one task
struct list_head queue[MAX_PRIO]; // one task list per priority level (MAX_PRIO levels, 140 according to the article); a task is queued on the list matching its prio
};

从上面结构来看，每个cpu有自己单独的运行队列，而每个运行队列中，把进程分为活动进程队列和
过期进程队列，每次调度时，从活动进程队列的最高优先级链表中选择第一个进程作为next。
我们来看看它是如何选择的。
我们先看prio_array中的queue[MAX_PRIO]，进程按优先级放入这个队列中，queue[0]中的全部进程其优先级为0,
其优先级最高，queue[1]中的全部进程其优先级为1, 优先级的值越小优先运行。0~MAX_RT_PRIO(100)为实时进程的优先级，
MAX_RT_PRIO~MAX_PRIO(140)为普通进程的优先级。bitmap为5个32位整数，它的前140位对应140个优先级，
比如：bitmap的第5位置1，表示优先级为5的进程队列存在进程。
idx = sched_find_first_bit(array->bitmap)就是查找bitmap中第一个为1的位，那么就可以获取当前优先级最高的进程队列。

/*
 * schedule() - Linux 2.6 O(1) scheduler entry point.
 *
 * Takes the current task ('prev') off the CPU and switches to 'next', the
 * first task on the highest-priority non-empty list of this CPU's active
 * priority array. When the active array is empty it is swapped with the
 * expired array; when the run queue has no runnable task at all, the idle
 * task is chosen.
 */
asmlinkage void __sched schedule(void)
{
	struct task_struct *prev, *next;
	struct prio_array *array;
	struct list_head *queue;
	unsigned long long now;
	unsigned long run_time;
	int cpu, idx, new_prio;
	long *switch_count;
	struct rq *rq;

	/*
	 * Test if we are atomic. Since do_exit() needs to call into
	 * schedule() atomically, we ignore that path for now.
	 * Otherwise, whine if we are scheduling when we should not be.
	 */
	if (unlikely(in_atomic() && !current->exit_state)) {
		printk(KERN_ERR "BUG: scheduling while atomic: "
			"%s/0x%08x/%d\n",
			current->comm, preempt_count(), current->pid);
		debug_show_held_locks(current);
		if (irqs_disabled())
			print_irqtrace_events(current);
		dump_stack();
	}
	profile_hit(SCHED_PROFILING, __builtin_return_address(0));

need_resched:
	preempt_disable();
	prev = current;		/* prev is the task currently running */
	release_kernel_lock(prev);
need_resched_nonpreemptible:
	rq = this_rq();		/* run queue of the current CPU */

	/*
	 * The idle thread is not allowed to schedule!
	 * Remove this check after it has been exercised a bit.
	 */
	if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
		printk(KERN_ERR "bad: scheduling from the idle thread!\n");
		dump_stack();
	}

	schedstat_inc(rq, sched_cnt);
	now = sched_clock();
	/* run_time: time since prev->timestamp, clamped to [0, NS_MAX_SLEEP_AVG]. */
	if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
		run_time = now - prev->timestamp;
		if (unlikely((long long)(now - prev->timestamp) < 0))
			run_time = 0;
	} else
		run_time = NS_MAX_SLEEP_AVG;

	/*
	 * Tasks charged proportionately less run_time at high sleep_avg to
	 * delay them losing their interactive status
	 */
	run_time /= (CURRENT_BONUS(prev) ? : 1);

	spin_lock_irq(&rq->lock);

	switch_count = &prev->nivcsw;

	/* prev->state is non-zero (not runnable) and PREEMPT_ACTIVE is clear. */
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
		switch_count = &prev->nvcsw;
		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
				unlikely(signal_pending(prev))))
			prev->state = TASK_RUNNING;
		else {
			if (prev->state == TASK_UNINTERRUPTIBLE)
				rq->nr_uninterruptible++;
			/* prev is going to sleep: take it off the run queue. */
			deactivate_task(prev, rq);
		}
	}

	cpu = smp_processor_id();
	if (unlikely(!rq->nr_running)) {
		idle_balance(cpu, rq);
		if (!rq->nr_running) {
			next = rq->idle;
			rq->expired_timestamp = 0;
			goto switch_tasks;
		}
	}

	/* Active priority array of the current CPU. */
	array = rq->active;
	if (unlikely(!array->nr_active)) {
		/*
		 * No task left in the active array: swap it with the
		 * expired array.
		 */
		schedstat_inc(rq, sched_switch);
		rq->active = rq->expired;
		rq->expired = array;
		array = rq->active;
		rq->expired_timestamp = 0;
		rq->best_expired_prio = MAX_PRIO;
	}

	/* Index of the highest priority that has queued tasks. */
	idx = sched_find_first_bit(array->bitmap);
	/* The task list for that priority ... */
	queue = array->queue + idx;
	/* ... and the first task on that list. */
	next = list_entry(queue->next, struct task_struct, run_list);

	/* next is not a real-time task and its sleep type counts as interactive. */
	if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
		/*
		 * Time elapsed since next->timestamp. It is fed into
		 * recalc_task_prio() below; per the article, a longer sleep
		 * should raise the task's priority so it runs sooner.
		 */
		unsigned long long delta = now - next->timestamp;
		if (unlikely((long long)(now - next->timestamp) < 0))
			delta = 0;

		if (next->sleep_type == SLEEP_INTERACTIVE)
			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;

		array = next->array;
		/* Recompute its priority. */
		new_prio = recalc_task_prio(next, next->timestamp + delta);

		/* Priority changed: re-queue the task on the list for the new priority. */
		if (unlikely(next->prio != new_prio)) {
			dequeue_task(next, array);
			next->prio = new_prio;
			enqueue_task(next, array);
		}
	}
	next->sleep_type = SLEEP_NORMAL;
	/* Start the actual task switch. */
switch_tasks:
	if (next == rq->idle)
		schedstat_inc(rq, sched_goidle);
	prefetch(next);
	prefetch_stack(next);
	clear_tsk_need_resched(prev);
	rcu_qsctr_inc(task_cpu(prev));

	update_cpu_clock(prev, rq, now);

	prev->sleep_avg -= run_time;
	if ((long)prev->sleep_avg <= 0)
		prev->sleep_avg = 0;
	prev->timestamp = prev->last_ran = now;	/* timestamp records the switch-out time */

	sched_info_switch(prev, next);
	if (likely(prev != next)) {
		next->timestamp = next->last_ran = now;	/* records the switch-in time */
		rq->nr_switches++;
		rq->curr = next;
		++*switch_count;

		prepare_task_switch(rq, next);
		prev = context_switch(rq, prev, next);
		barrier();
		/*
		 * this_rq must be evaluated again because prev may have moved
		 * CPUs since it called schedule(), thus the 'rq' on its stack
		 * frame will be invalid.
		 */
		finish_task_switch(this_rq(), prev);
	} else
		spin_unlock_irq(&rq->lock);

	prev = current;
	if (unlikely(reacquire_kernel_lock(prev) < 0))
		goto need_resched_nonpreemptible;
	preempt_enable_no_resched();
	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
		goto need_resched;
}

再看看内核时钟中断发生时做了哪些操作，代码在scheduler_tick()->task_running_tick()

/*
 * task_running_tick() - per-tick time-slice accounting for the running task.
 *
 * @rq: run queue the task belongs to
 * @p:  task that was running during this tick
 *
 * A task that is no longer on the active array is only flagged for
 * rescheduling. Otherwise the time slice is updated under rq->lock, with
 * different handling for real-time and ordinary tasks.
 */
static void task_running_tick(struct rq *rq, struct task_struct *p)
{
	/* Task has expired but was not scheduled yet */
	if (p->array != rq->active) {
		set_tsk_need_resched(p);
		return;
	}

	spin_lock(&rq->lock);

	/*
	 * The task was running during this tick - update the
	 * time slice counter. Note: we do not update a thread's
	 * priority until it either goes to sleep or uses up its
	 * timeslice. This makes it possible for interactive tasks
	 * to use up their timeslices at their highest priority levels.
	 */
	if (rt_task(p)) {
		/*
		 * Only SCHED_RR tasks have their time slice counted here.
		 * When it runs out the task gets a new slice and is moved
		 * to the end of its list. Other real-time tasks
		 * (SCHED_FIFO) are left untouched.
		 */
		if (p->policy == SCHED_RR && --p->time_slice == 0) {
			p->time_slice = task_timeslice(p);
			p->first_time_slice = 0;
			set_tsk_need_resched(p);

			/* Move it to the end of its list. */
			requeue_task(p, rq->active);
		}
	} else if (--p->time_slice == 0) {
		/* An ordinary task has used up its time slice. */
		dequeue_task(p, rq->active);
		set_tsk_need_resched(p);
		p->prio = effective_prio(p);		/* recompute priority */
		p->time_slice = task_timeslice(p);	/* recompute time slice */
		p->first_time_slice = 0;

		if (!rq->expired_timestamp)
			rq->expired_timestamp = jiffies;

		if (TASK_INTERACTIVE(p) && !expired_starving(rq)) {
			/*
			 * Interactive task, and expired_starving() is false:
			 * back onto the active array, at the end of its list.
			 */
			enqueue_task(p, rq->active);
		} else {
			/* Otherwise it goes to the expired array. */
			enqueue_task(p, rq->expired);
			if (p->static_prio < rq->best_expired_prio)
				rq->best_expired_prio = p->static_prio;
		}
	} else if (TASK_INTERACTIVE(p) &&
		   (task_timeslice(p) - p->time_slice) % TIMESLICE_GRANULARITY(p) == 0 &&
		   p->time_slice >= TIMESLICE_GRANULARITY(p) &&
		   p->array == rq->active) {
		/*
		 * Slice not used up yet, but the time consumed so far is a
		 * multiple of TIMESLICE_GRANULARITY: move the task to the
		 * end of its list so other tasks get a chance to run.
		 */
		requeue_task(p, rq->active);
		set_tsk_need_resched(p);
	}

	spin_unlock(&rq->lock);
}

从task_running_tick我们可以看到，每个时钟周期(tick)不同类型的进程处理方式不同：
1) 实时进程SCHED_RR: 时间片用完后，重计算它的时间片，优先级不变，把它移到当前
优先级队列末尾。所以，如果有更高级的FIFO、RR进程，或跟它相同优先级的RR进程，
它都会被抢占(因为它在队列末尾)
2) 实时进程SCHED_FIFO: 完全不使用时间片，不修改其优先级、不修改其在队列中的位置。所以，它只会被
更高级的FIFO、RR进程抢占，跟它相同优先级的进程没有机会得到执行，除非它退出。
3)普通进程：A) 时间片用完，从活动队列中删除，重计算它的时间片，如果它是交互进程，
把它插入活动队列，给它继续运行的机会，奖励交互进程以达到更好的交互响应时间；
如果不是交互进程，插入过期队列中。
B)时间片未用完，如果是交互进程且它的剩余时间片太长(这个还没搞清楚怎么计算的)，把它移动到
当前优先级队列的末尾，也就是说它要被更高优先级或跟它相同优先级的进程抢占。

遗留问题：
1) 进程阻塞(Sleep或IO操作)之后，其在active队列还是在expired队列，或者两个队列都不放？

答：进程阻塞之后，会把它的状态设为TASK_INTERRUPTIBLE，同时设置task的need_resched

标志，在系统调用结束时，检查need_resched标志后会调用schedule()做进程切换，schedule()检查

到当前进程的状态为TASK_INTERRUPTIBLE后，把它从runqueue中删除，所以进程既不在active队列中也不在

expired队列中。当阻塞结束时，会调用activate_task()把进程重放入active队列中。
2) 进程阻塞被唤醒之后，它要被放入active队列中，它是放在它的优先级队列头还是尾？

答：会被放入active队列，进程所在优先级队列末尾，那么对于FIFO进程，一旦它阻塞再恢复

运行之后，可能被同等优先级的FIFO进程抢占。看代码

/*
 * __activate_task() - put a task onto a run queue.
 *
 * @p:  task to queue
 * @rq: run queue to queue it on
 *
 * A task for which batch_task() is true goes onto the expired array; every
 * other task goes onto the active array. The run queue's running count is
 * then updated through inc_nr_running().
 */
static void __activate_task(struct task_struct *p, struct rq *rq)
{
	struct prio_array *dest = batch_task(p) ? rq->expired : rq->active;

	enqueue_task(p, dest);
	inc_nr_running(p, rq);
}

/*
 * enqueue_task() - append a task to the list for its priority in an array.
 *
 * @p:     task to queue
 * @array: priority array to queue it on
 *
 * The task is added at the tail of array->queue[p->prio], the matching
 * bitmap bit is set, the array's task count is incremented and the task's
 * back-pointer to the array is updated.
 */
static void enqueue_task(struct task_struct *p, struct prio_array *array)
{
	sched_info_queued(p);

	/* Tail insertion: the task ends up behind everything already queued at this priority. */
	list_add_tail(&p->run_list, &array->queue[p->prio]);
	__set_bit(p->prio, array->bitmap);

	p->array = array;
	array->nr_active++;
}
3) 如何判断一个进程是否是交互进程?

答：通过task->timestamp来计算，task->timestamp在以下三种情况下会被更新：

1. 进程换入时 2. 进程换出 3.睡眠时，所以在schedule()中，

now - prev->timestamp表示当前进程运行时间, prev为当前正在运行的进程，prev->timestamp正好是它上次换入的时间

now-next->timestamp表示next的睡眠时间，next为准备运行的进程，next->timestamp正好是next上次换出的时间