Linux中实时进程的调度---Linux内核笔记

来源:互联网 发布:简述网络舆情的特点 编辑:程序博客网 时间:2024/04/27 22:08

Linux的进程分普通进程和实时进程,而实时进程又分SCHED_FIFO与SCHED_RR,它们都比普通进程的优先级高。

对于SCHED_FIFO进程,它会一直运行直到退出,只有在它阻塞或被更高优先级的实时进程抢占时才会释放CPU。

对于SCHED_RR(时间片轮转)进程,当它的时间片用完时,内核会把它放到进程队列的末尾。

我们来看看在2.4内核中对应的调度代码:

asmlinkage void schedule(void)
{
 struct schedule_data * sched_data;
 struct task_struct *prev, *next, *p;
 struct list_head *tmp;
 int this_cpu, c;


 spin_lock_prefetch(&runqueue_lock);

 BUG_ON(!current->active_mm);
need_resched_back:
 prev = current;            //prev指向当前进程
 this_cpu = prev->processor;

 if (unlikely(in_interrupt())) {
  printk("Scheduling in interrupt/n");
  BUG();
 }

 release_kernel_lock(prev, this_cpu);

 /*
  * 'sched_data' is protected by the fact that we can run
  * only one process per CPU.
  */
 sched_data = & aligned_data[this_cpu].schedule_data;

 spin_lock_irq(&runqueue_lock);

 /* move an exhausted RR process to be last.. */
 if (unlikely(prev->policy == SCHED_RR))
              //如果当前进程是SCHED_RR(时间片轮转)实时进程,查看它的时间片是否用完,
              //如果用完把它移到队列末尾
  if (!prev->counter) {
   prev->counter = NICE_TO_TICKS(prev->nice);
   move_last_runqueue(prev);
  }

 switch (prev->state) {
  case TASK_INTERRUPTIBLE:
   if (signal_pending(prev)) {
    prev->state = TASK_RUNNING;
    break;
   }
  default:
   del_from_runqueue(prev);
  case TASK_RUNNING:;
 }
 prev->need_resched = 0;

 /*
  * this is the scheduler proper:
  */

repeat_schedule:
 /*
  * Default process to select..
  */
 next = idle_task(this_cpu);
 c = -1000;
              //遍历全部进程,找出最需要运行的进程,由函数goodness计算它们的权值
 list_for_each(tmp, &runqueue_head) {
  p = list_entry(tmp, struct task_struct, run_list);
  if (can_schedule(p, this_cpu)) {
   int weight = goodness(p, this_cpu, prev->active_mm);
   if (weight > c)
    c = weight, next = p;
  }
 }

 /* Do we need to re-calculate counters? */
  //如果选中的进程没有时间片了,说明全部进程的时间片都用完了,这时重新计算全部
 //进程的时间片,然后跳回去重调度
 if (unlikely(!c)) {
  struct task_struct *p;

  spin_unlock_irq(&runqueue_lock);
  read_lock(&tasklist_lock);
  for_each_task(p)
       p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
  read_unlock(&tasklist_lock);
  spin_lock_irq(&runqueue_lock);
  goto repeat_schedule;
 }

              ...........................
}


再看看goodness函数
/*
 * goodness() - weight of task p as a candidate to run on this_cpu.
 *
 * Returns:
 *   -1                       if SCHED_YIELD is set in p->policy;
 *   1000 + p->rt_priority    for any policy other than SCHED_OTHER
 *                            (the real-time policies SCHED_FIFO / SCHED_RR);
 *   0                        for a SCHED_OTHER task whose counter is 0;
 *   otherwise the counter plus the bonuses computed below.
 */
static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
{
 int weight;

 /*
  * select the current process after every other
  * runnable process, but before the idle thread.
  * Also, dont trigger a counter recalculation.
  */
 if (p->policy & SCHED_YIELD)
  return -1;

 /*
  * Realtime process, select the first one on the
  * runqueue (taking priorities within processes
  * into account).  The fixed 1000 offset keeps the weight far above
  * the weights computed for SCHED_OTHER tasks below, and a higher
  * rt_priority yields a higher weight.
  */
 if (p->policy != SCHED_OTHER)
  return 1000 + p->rt_priority;

 /*
  * Non-RT process.  First approximation: the number of clock ticks
  * the task has left.  An exhausted time slice ends the calculation
  * with a weight of 0.
  */
 weight = p->counter;
 if (weight == 0)
  return 0;

#ifdef CONFIG_SMP
 /* Give a largish advantage to the same processor...   */
 /* (this is equivalent to penalizing other processors) */
 if (this_cpu == p->processor)
  weight += PROC_CHANGE_PENALTY;
#endif

 /*
  * .. and a slight advantage to the current MM: a task that shares
  * this_mm (per the article, a thread of the same process) or has no
  * mm at all gets +1.
  */
 if (!p->mm || p->mm == this_mm)
  weight++;

 weight += 20 - p->nice;
 return weight;
}

通过以上代码,我们发现2.4及以前的内核,有几点缺陷:
1) 每次调度时要遍历全部进程,时间复杂度为O(N)
2) 当全部进程时间片用完后,要为它们重新计算时间片
3) 在内核态不可抢占
4) 多个cpu共用一个运行队列,需要频繁的加锁,影响效率

2.6内核重写了进程调度这部分,其时间复杂度为O(1)。
先看看2.6与调度相关的几个数据结构:
/*
 * Per-CPU run queue (abbreviated by the article: schedule() below also
 * accesses members such as idle, curr and nr_switches that are not listed
 * here).
 */
struct rq {
 spinlock_t lock;

 /* number of runnable tasks on this processor */
 unsigned long nr_running;
 unsigned long raw_weighted_load;
 unsigned long expired_timestamp;
 struct mm_struct *prev_mm;

 /*
  * active:  array of tasks that are still eligible to run
  * expired: array of tasks that have used up their time slice
  * schedule() swaps the two pointers when 'active' becomes empty;
  * the article identifies this pair as the key to O(1) scheduling.
  */
 struct prio_array *active;
 struct prio_array *expired;
 struct prio_array arrays[2];

 int best_expired_prio;
};

// One run queue ("runqueues") is defined per CPU.
static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;

/*
 * One priority array: a list of tasks per priority level plus a bitmap
 * recording which levels currently have tasks queued.
 */
struct prio_array {
 unsigned int nr_active;   // number of tasks currently queued in this array
 DECLARE_BITMAP(bitmap, MAX_PRIO+1);// bitmap: each bit tells whether the task list of that priority level has any task
 struct list_head queue[MAX_PRIO]; // task lists, MAX_PRIO levels in total (140 per the article); a task is queued on the list of its priority
};

从上面结构来看,每个cpu有自己单独的运行队列,而每个运行队列中,把进程分为活动进程队列和
过期进程队列,每次调度时,从活动进程队列的最高优先级链表中选择第一个进程作为next。
我们来看看它是如何选择的。
我们先看prio_array中的queue[MAX_PRIO], 进程按优先级放入这个队列中,queue[0]中的全部进程其优先级为0,
其优先级最高,queue[1]中的全部进程其优先级为1, 优先级的值越小越优先运行。0~MAX_RT_PRIO-1(即0~99)为实时进程的优先级,
MAX_RT_PRIO~MAX_PRIO-1(即100~139)为普通进程的优先级。bitmap为5个32位整数,它的前140位对应140个优先级,
比如:bitmap的第5位置1,表示优先级为5的进程队列存在进程。
idx = sched_find_first_bit(array->bitmap)就是查找bitmap中第一个为1的位,那么就可以获取当前优先级最高的进程队列。

/*
 * schedule() - 2.6 (O(1) scheduler) entry point as quoted by the article.
 *
 * Removes 'prev' from the run queue if it is blocking, swaps the active and
 * expired priority arrays when the active one is empty, picks the first
 * task of the highest-priority non-empty list as 'next' and switches to it.
 */
asmlinkage void __sched schedule(void)
{
 struct task_struct *prev, *next;
 struct prio_array *array;
 struct list_head *queue;
 unsigned long long now;
 unsigned long run_time;
 int cpu, idx, new_prio;
 long *switch_count;
 struct rq *rq;

 /*
  * Test if we are atomic.  Since do_exit() needs to call into
  * schedule() atomically, we ignore that path for now.
  * Otherwise, whine if we are scheduling when we should not be.
  */
 if (unlikely(in_atomic() && !current->exit_state)) {
  printk(KERN_ERR "BUG: scheduling while atomic: "
   "%s/0x%08x/%d\n",
   current->comm, preempt_count(), current->pid);
  debug_show_held_locks(current);
  if (irqs_disabled())
   print_irqtrace_events(current);
  dump_stack();
 }
 profile_hit(SCHED_PROFILING, __builtin_return_address(0));

need_resched:
 preempt_disable();
 prev = current;  // prev points at the current task
 release_kernel_lock(prev);
need_resched_nonpreemptible:
 rq = this_rq();      // run queue of the current processor

 /*
  * The idle thread is not allowed to schedule!
  * Remove this check after it has been exercised a bit.
  */
 if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
  printk(KERN_ERR "bad: scheduling from the idle thread!\n");
  dump_stack();
 }

 schedstat_inc(rq, sched_cnt);
 now = sched_clock();
 if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
  run_time = now - prev->timestamp;
  if (unlikely((long long)(now - prev->timestamp) < 0))
   run_time = 0;
 } else
  run_time = NS_MAX_SLEEP_AVG;

 /*
  * Tasks charged proportionately less run_time at high sleep_avg to
  * delay them losing their interactive status
  */
 run_time /= (CURRENT_BONUS(prev) ? : 1);

 spin_lock_irq(&rq->lock);

 switch_count = &prev->nivcsw;

// prev is not runnable (state is non-zero) and PREEMPT_ACTIVE is not set
 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
  switch_count = &prev->nvcsw;
  if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
    unlikely(signal_pending(prev))))
   prev->state = TASK_RUNNING;
  else {
   if (prev->state == TASK_UNINTERRUPTIBLE)
    rq->nr_uninterruptible++;
   deactivate_task(prev, rq); // prev is blocking (per the article: INTERRUPTIBLE or UNINTERRUPTIBLE), so remove it from the run queue
  }
 }

 cpu = smp_processor_id();
 if (unlikely(!rq->nr_running)) {
  idle_balance(cpu, rq);
  if (!rq->nr_running) {
   next = rq->idle;
   rq->expired_timestamp = 0;
   goto switch_tasks;
  }
 }

              // the active priority array of the current CPU
 array = rq->active;
 if (unlikely(!array->nr_active)) {
                            // no task left in the active array: swap it with the expired array
  schedstat_inc(rq, sched_switch);
  rq->active = rq->expired;
  rq->expired = array;
  array = rq->active;
  rq->expired_timestamp = 0;
  rq->best_expired_prio = MAX_PRIO;
 }

              // index of the highest priority that has tasks in the active array
 idx = sched_find_first_bit(array->bitmap);
              // the task list of that priority
 queue = array->queue + idx;
              // the first task on that list
 next = list_entry(queue->next, struct task_struct, run_list);

              // next is not a real-time task and its sleep_type marks it as an interactive sleeper
 if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
                            // time spent waiting; per the article, the longer it waited the more its priority is raised so that it runs sooner
  unsigned long long delta = now - next->timestamp;
  if (unlikely((long long)(now - next->timestamp) < 0))
   delta = 0;

  if (next->sleep_type == SLEEP_INTERACTIVE)
   delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;

  array = next->array;
                            // recompute its priority
  new_prio = recalc_task_prio(next, next->timestamp + delta);

                            // re-insert it into the list that matches the new priority
  if (unlikely(next->prio != new_prio)) {
   dequeue_task(next, array);
   next->prio = new_prio;
   enqueue_task(next, array);
  }
 }
 next->sleep_type = SLEEP_NORMAL;
 /* the task switch starts here */
switch_tasks:
 if (next == rq->idle)
  schedstat_inc(rq, sched_goidle);
 prefetch(next);
 prefetch_stack(next);
 clear_tsk_need_resched(prev);
 rcu_qsctr_inc(task_cpu(prev));

 update_cpu_clock(prev, rq, now);

 prev->sleep_avg -= run_time;
 if ((long)prev->sleep_avg <= 0)
  prev->sleep_avg = 0;
 prev->timestamp = prev->last_ran = now;  // timestamp records the switch-out time

 sched_info_switch(prev, next);
 if (likely(prev != next)) {
  next->timestamp = next->last_ran = now; // records the switch-in time
  rq->nr_switches++;
  rq->curr = next;
  ++*switch_count;

  prepare_task_switch(rq, next);
  prev = context_switch(rq, prev, next);
  barrier();
  /*
   * this_rq must be evaluated again because prev may have moved
   * CPUs since it called schedule(), thus the 'rq' on its stack
   * frame will be invalid.
   */
  finish_task_switch(this_rq(), prev);
 } else
  spin_unlock_irq(&rq->lock);

 prev = current;
 if (unlikely(reacquire_kernel_lock(prev) < 0))
  goto need_resched_nonpreemptible;
 preempt_enable_no_resched();
 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
  goto need_resched;
}

 

再看看内核时钟中断发生时做了哪些操作,代码在scheduler_tick()->task_running_tick()

/*
 * task_running_tick() - per-tick time-slice accounting for task p on run
 * queue rq (reached via scheduler_tick() according to the article).
 *
 * Decrements p->time_slice and, depending on the kind of task, requeues it
 * at the tail of its list, moves it to the expired array, or leaves it in
 * place.  rq->lock is held for everything after the initial array check.
 */
static void task_running_tick(struct rq *rq, struct task_struct *p)
{
 if (p->array != rq->active) {
  /* Task has expired but was not scheduled yet */
  set_tsk_need_resched(p);
  return;
 }
 spin_lock(&rq->lock);
 /*
  * The task was running during this tick - update the
  * time slice counter. Note: we do not update a thread's
  * priority until it either goes to sleep or uses up its
  * timeslice. This makes it possible for interactive tasks
  * to use up their timeslices at their highest priority levels.
  */
 if (rt_task(p)) {
  // SCHED_RR task: decrement its time slice and refill it once it
  // reaches 0.  Any other real-time task (SCHED_FIFO per the article)
  // is left untouched here: it uses no time slice and keeps running.
  if ((p->policy == SCHED_RR) && !--p->time_slice) {
   p->time_slice = task_timeslice(p); // refill the time slice
   p->first_time_slice = 0;
   set_tsk_need_resched(p);

   // move it to the tail of its list in the active array
   requeue_task(p, rq->active);
  }
  goto out_unlock;
 }

 // ordinary (non-real-time) task that has just used up its time slice
 if (!--p->time_slice) {
  dequeue_task(p, rq->active); // remove it from the active array
  set_tsk_need_resched(p);
  p->prio = effective_prio(p); // recompute its priority
  p->time_slice = task_timeslice(p); // recompute its time slice
  p->first_time_slice = 0;

  if (!rq->expired_timestamp)
   rq->expired_timestamp = jiffies;

  // not interactive, or expired_starving(rq) is true: move it to the expired array
  if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
   enqueue_task(p, rq->expired);
   if (p->static_prio < rq->best_expired_prio)
    rq->best_expired_prio = p->static_prio;
  } else
   enqueue_task(p, rq->active); // interactive task: back to the tail of its priority list in the active array
 } else {
  if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
   p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
   (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
   (p->array == rq->active)) {

   requeue_task(p, rq->active);// time slice not used up but long (multiple of TIMESLICE_GRANULARITY consumed, at least one granule left): move to the tail so other tasks get a chance to run
   set_tsk_need_resched(p);
  }
 }
out_unlock:
 spin_unlock(&rq->lock);
}

 从task_running_tick我们可以看到,每个时钟周期(tick)不同类型的进程处理方式不同:
1) 实时进程SCHED_RR: 时间片用完后, 重计算它的时间片,优先级不变,把它移到当前
优先级队列末尾。所以,如果有更高级的FIFO、RR进程,或跟它相同优先级的RR进程,
它都会被抢占(因为它在队列末尾)
2) 实时进程SCHED_FIFO: 完全不使用时间片,不修改其优先级、不修改其在队列中的位置。所以,它只会被
更高级的FIFO、RR进程抢占,跟它相同优先级的进程没有机会得到执行,除非它退出或阻塞。
3)普通进程:A) 时间片用完,从活动队列中删除,重计算它的优先级和时间片,如果它是交互进程(且过期队列中的进程没有饥饿),
把它重新插入活动队列,给它继续运行的机会,奖励交互进程以达到更好的交互响应时间,
如果不是交互进程,插入过期队列中。
B)时间片未用完,如果是交互进程且它的剩余时间片太长(这个还没搞清楚怎么计算的),把它移动到
当前优先级队列的末尾,也就是说它要被更高优先级或跟它相同优先级的进程抢占。


遗留问题:
1) 进程阻塞(Sleep或IO操作)之后,其在active队列还是在expired队列,或者两个队列都不放?

答:进程阻塞之后,会把它的状态设为TASK_INTERRUPTIBLE,同时设置task的need_resched

标志,在系统调用结束时,检查need_resched标志后会调用schedule()做进程切换,schedule()检查

到当前进程的状态为TASK_INTERRUPTIBLE后,把它从runqueue中删除,所以进程既不在active队列中也不在

expired队列中。当阻塞结束时,会调用activate_task()把进程重放入active队列中。
2) 进程阻塞被唤醒之后,它要被放入active队列中,它是放在它的优先级队列头还是尾?

答:会被放入active队列,进程所在优先级队列末尾,那么对于FIFO进程,一旦它阻塞再恢复

运行之后,可能被同等优先级的FIFO进程抢占。看代码

/*
 * __activate_task() - put task p onto run queue rq.
 *
 * A task for which batch_task() is true is queued on the expired array;
 * every other task is queued on the active array.  The run queue's
 * running count is then updated through inc_nr_running().
 */
static void __activate_task(struct task_struct *p, struct rq *rq)
{
 struct prio_array *target;

 target = batch_task(p) ? rq->expired : rq->active;

 enqueue_task(p, target);
 inc_nr_running(p, rq);
}

/*
 * enqueue_task() - append task p to the list for its priority in 'array'.
 *
 * The task goes to the tail of array->queue[p->prio], the bitmap bit for
 * that priority is set, the array's task count is bumped and p->array is
 * pointed at the array it now lives in.
 */
static void enqueue_task(struct task_struct *p, struct prio_array *array)
{
 sched_info_queued(p);

 /* tail insertion: p goes behind the tasks already queued at this priority */
 list_add_tail(&p->run_list, &array->queue[p->prio]);
 __set_bit(p->prio, array->bitmap);

 array->nr_active += 1;
 p->array = array;
}
3) 如何判断一个进程是否是交互进程?

答:通过task->timestamp来计算,task->timestamp在三种情况会更新它:

1. 进程换入时 2. 进程换出 3.睡眠时,所以在schedule()中,

now - prev->timestamp表示当前进程运行时间, prev为当前正在运行的进程,prev->timestamp正好是它上次换入的时间

now-next->timestamp表示next的睡眠时间,next为准备运行的进程,next->timestamp正好是next上次换出的时间

 

原创粉丝点击