初始化调度程序

来源：互联网　发布：婚纱电子相册制作软件　编辑：程序博客网　时间：2024/04/28 04:31

5.9 初始化调度程序

回到start_kernel函数中，mm_init()执行后，绝大多数内存管理的初始化都已完毕，后面的代码可以开开心心地使用Linux复杂、庞大而又高效的内存管理器了。来看下一个函数，超级重点的进程调度初始化函数sched_init()。不过自从Linux 2.6.23（2007年10月）起，内核引入了一种所谓的完全公平调度程序（Completely Fair Scheduler，CFS），试图按照对 CPU 时间的“最大需求（gravest need）”运行任务；这有助于确保每个进程可以获得对 CPU 的公平共享。

虽然CFS理论上非常高效，但它的实际性能并不如预期，因为如果某个任务休眠时间 “非常短”，那么 CFS 不会将该任务视为休眠任务 —— 短暂休眠的进程可能会获得一些额外时间，但是决不会超过它的未休眠时间。因此，很多应用软件并没有享受到实际的好处。比如我们的Android就没有使用CFS，仍然使用传统的O(1)调度程序。

因此，我们避开容易引起争议的CFS算法，仍然分析O(1)调度的初始化程序。这里选用一个版本稍低一点的内核——Linux 2.6.18，来看看它的kernel/sched.c：

/*
 * sched_init - boot-time setup of the run queues (listing quoted from
 * Linux 2.6.18 kernel/sched.c; the leading numbers are source line numbers).
 *
 * For every possible CPU it initialises the run-queue lock, zeroes the
 * counters, points active/expired at the two embedded prio_array slots and
 * empties every priority list. Afterwards it sets the load weight of
 * init_task and hands the current task to init_idle() for the current CPU.
 */
6728void __init sched_init(void)

6729{

6730 int i, j, k;

6731

6732 for_each_possible_cpu(i) {

6733 struct prio_array *array;

6734 struct rq *rq;

6735

6736 rq = cpu_rq(i); /* run queue of CPU i */

6737 spin_lock_init(&rq->lock);

6738 lockdep_set_class(&rq->lock, &rq->rq_lock_key);

6739 rq->nr_running = 0;

6740 rq->active = rq->arrays; /* active set starts as arrays[0] */

6741 rq->expired = rq->arrays + 1; /* expired set starts as arrays[1] */

6742 rq->best_expired_prio = MAX_PRIO;

6743

6744#ifdef CONFIG_SMP

6745 rq->sd = NULL;

6746 for (j = 1; j < 3; j++) /* writes cpu_load[1] and cpu_load[2] only; index 0 is not touched here */

6747 rq->cpu_load[j] = 0;

6748 rq->active_balance = 0;

6749 rq->push_cpu = 0;

6750 rq->migration_thread = NULL;

6751 INIT_LIST_HEAD(&rq->migration_queue);

6752#endif

6753 atomic_set(&rq->nr_iowait, 0);

6754

6755 for (j = 0; j < 2; j++) { /* both prio_array slots of this run queue */

6756 array = rq->arrays + j;

6757 for (k = 0; k < MAX_PRIO; k++) { /* one list head and one bitmap bit per priority */

6758 INIT_LIST_HEAD(array->queue + k);

6759 __clear_bit(k, array->bitmap);

6760 }

6761 // delimiter for bitsearch

6762 __set_bit(MAX_PRIO, array->bitmap); /* bit index MAX_PRIO has no matching queue[] entry */

6763 }

6764 }

6765

6766 set_load_weight(&init_task);

6767

6768#ifdef CONFIG_RT_MUTEXES

6769 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);

6770#endif

6771

6772 /*

6773 * The boot idle thread does lazy MMU switching as well:

6774 */

6775 atomic_inc(&init_mm.mm_count);

6776 enter_lazy_tlb(&init_mm, current);

6777

6778 /*

6779 * Make us the idle thread. Technically, schedule() should not be

6780 * called from this thread, however somewhere below it might be,

6781 * but because we are the idle thread, we just pick up running again

6782 * when this runqueue becomes "idle".

6783 */

6784 init_idle(current, smp_processor_id());

6785}

6733行，prio_array结构，用于计算进程优先级的数据结构，来自同一个文件：

/*
 * prio_array - one set of runnable tasks, bucketed by priority.
 * struct rq embeds two of these (its arrays[2] member).
 */
struct prio_array {

unsigned int nr_active; /* number of tasks held in this set */

DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */

struct list_head queue[MAX_PRIO]; /* one list head per priority level */

};

/* Priority-range constants; MAX_PRIO evaluates to 100 + 40 = 140. */
#define MAX_USER_RT_PRIO 100

#define MAX_RT_PRIO MAX_USER_RT_PRIO

#define MAX_PRIO (MAX_RT_PRIO + 40) /* number of entries in prio_array.queue[] */

每个prio_array数据结构都表示一个可运行进程的集合，并包括140个双向链表头（每个链表对应一个可能的进程优先级）、一个优先级位图和一个集合中所包含的进程数量的计数器。

6734行，rq结构，著名的运行队列（run queue），操作系统课程的核心概念：

/*
 * rq - the run queue; one instance exists per CPU (see the
 * DEFINE_PER_CPU(struct rq, runqueues) definition quoted in this document).
 * It embeds two prio_array sets and keeps pointers naming which one is
 * currently "active" and which is "expired".
 */
struct rq {

spinlock_t lock; /* taken by init_idle() around the curr/idle update */

unsigned long nr_running; /* zeroed by sched_init() */

unsigned long raw_weighted_load;

#ifdef CONFIG_SMP

unsigned long cpu_load[3];

#endif

unsigned long long nr_switches;

unsigned long nr_uninterruptible;

unsigned long expired_timestamp;

unsigned long long timestamp_last_tick;

struct task_struct *curr, *idle; /* both set to the idle task by init_idle() */

struct mm_struct *prev_mm;

struct prio_array *active, *expired, arrays[2]; /* active/expired point into arrays[] */

int best_expired_prio; /* initialised to MAX_PRIO by sched_init() */

atomic_t nr_iowait; /* set to 0 by sched_init() */

#ifdef CONFIG_SMP

struct sched_domain *sd;

/* For active balancing */

int active_balance;

int push_cpu;

int cpu; /* cpu of this runqueue */

struct task_struct *migration_thread;

struct list_head migration_queue;

#endif

#ifdef CONFIG_SCHEDSTATS

……

#endif

struct lock_class_key rq_lock_key; /* lockdep class key passed to lockdep_set_class() for lock */

};

rq数据结构中最重要的字段是与可运行进程的链表相关的字段。系统中的每个可运行进程属于且只属于一个运行队列。只要可运行进程保持在同一个运行队列中，它就只可能在拥有该运行队列的CPU上执行。运行队列的arrays[2]字段是一个包含两个prio_array结构的数组。每个prio_array结构都表示一个可运行进程的集合，并包括140个双向链表头（每个链表对应一个可能的进程优先级）、一个优先级位图和该集合所包含的进程数量计数器。

那么，这个rq是如何初始化的呢？在kernel/sched.c的269行有这么一行代码：

/* Defines the per-CPU variable `runqueues`, of type struct rq. */
static DEFINE_PER_CPU(struct rq, runqueues);

DEFINE_PER_CPU还记得吧：

11 #define DEFINE_PER_CPU(type, name) \

12 __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name

也就是说，初始化的时候，把类型为rq的每CPU变量per_cpu__runqueues存放进.data.percpu段中。于是6736行代码得到这个变量，并且把它的地址赋给内部变量rq。然后从6737行开始初始化这个rq。

最后，sched_init函数调用init_idle，来自同一个文件，传递给它的参数是当前的进程，也就是0号进程的task_struct结构和当前CPU的id：

/*
 * init_idle - install a task as the idle task of a CPU's run queue.
 * @idle: task to set up (sched_init() passes `current`)
 * @cpu:  CPU whose run queue receives the task
 *
 * Fills in the scheduling-related fields of @idle, then, under the
 * run-queue lock, records it as both the current and the idle task of that
 * run queue. The preempt count is written after the lock is released.
 */
void __devinit init_idle(struct task_struct *idle, int cpu)

{

struct rq *rq = cpu_rq(cpu);

unsigned long flags;

idle->timestamp = sched_clock();

idle->sleep_avg = 0;

idle->array = NULL; /* no prio_array assigned */

idle->prio = idle->normal_prio = MAX_PRIO; /* MAX_PRIO is one past the last prio_array.queue[] index */

idle->state = TASK_RUNNING;

idle->cpus_allowed = cpumask_of_cpu(cpu); /* presumably pins the task to this one CPU; confirm against cpumask_of_cpu() */

set_task_cpu(idle, cpu);

spin_lock_irqsave(&rq->lock, flags);

rq->curr = rq->idle = idle;

#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)

idle->oncpu = 1;

#endif

spin_unlock_irqrestore(&rq->lock, flags);

/* Set the preempt count _outside_ the spinlocks! */

#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)

task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); /* comparison yields 1 or 0 */

#else

task_thread_info(idle)->preempt_count = 0;

#endif

}

init_idle函数把0号进程的task_struct结构中与调度相关的字段赋上值，具体的细节比较简单，我们就不一一说明了。init_idle函数结束后，整个内核的最核心部分——调度程序——的初始化就算完成了。如果您对O(1)调度器有更多的兴趣，请访问博客“进程调度的数据结构和优先级”： http://blog.csdn.net/yunsongice/archive/2010/04/25/5526256.aspx

回到start_kernel，596行preempt_disable()，禁止内核抢占。不过我们的.config已经没有设置CONFIG_PREEMPT了，所以这个函数是个空函数。597~601行，再一次检查一下是否已经屏蔽了所有中断：local_irq_disable函数，我们早已见过了。

继续走，602行调用rcu_init()初始化rcu机制。对这个读-拷贝-更新（RCU）机制感兴趣的同学可以参考博客“RCU机制”

http://blog.csdn.net/yunsongice/archive/2010/05/19/5607680.aspx

start_kernel的603行，调用radix_tree_init()初始化页高速缓存需要使用的基树：

void __init radix_tree_init(void)

{

radix_tree_node_cachep = kmem_cache_create("radix_tree_node",

sizeof(struct radix_tree_node), 0,

SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,

radix_tree_node_ctor);

radix_tree_init_maxindex();

hotcpu_notifier(radix_tree_callback, 0);