linux 多核启动

来源：互联网发布：南方全站仪pts数据编辑：程序博客网时间：2024/04/30 13:07

转至http://blog.chinaunix.net/uid-27411029-id-3480919.html

Linux kernel启动的过程概览
init/main.c:start_kernel()
    |
   \|/
init/main.c:rest_init
{
……
kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND)
……
cpu_idle()
}
    |
   \|/
init/main.c:kernel_init//从上面代码可以看出，kernel_init是一个内核线程
    |
   \|/
init/main.c:init_post //会在最后调用启动脚本
{
……
823         /*
824          * We try each of these until one succeeds.
825          *
826          * The Bourne shell can be used instead of init if we are
827          * trying to recover a really broken machine.
828          */
829         if (execute_command) {
830                 run_init_process(execute_command);
831                 printk(KERN_WARNING "Failed to execute %s. Attempting "
832                                         "defaults...\n", execute_command);
833         }
834         run_init_process("/sbin/init");
835         run_init_process("/etc/init");
836         run_init_process("/bin/init");
837         run_init_process("/bin/sh");
838
839         panic("No init found. Try passing init= option to kernel.");
……
}

我们再来看看内核启动多核的详细过程。

init/main.c:start_kernel()
    |
   \|/
init/main.c:rest_init
{
……
kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND)
……
}
    |
   \|/
kernel_init
    |
   \|/
/* called by boot processor to activate the rest */
init/main.c: smp_init()
{
……
for_each_present_cpu(cpu) {
          if (num_online_cpus() >= setup_max_cpus)
               break;
          if ( !cpu_online(cpu))
               cpu_up(cpu);
}
/* Any cleanup work */
printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
smp_cpu_done(setup_max_cpus);
……
}
--------------------------------------------------------------
cpu_up = native_cpu_up是一个回调函数。
注册地方是在：arch/x86/kernel/smp.c

struct smp_ops smp_ops = {
   ……
.cpu_up = native_cpu_up,
   ……
}
--------------------------------------------------------------
    |
   \|/
arch/x86/kernel/smpboot.c:native_cpu_up(unsigned int cpu)
    |
   \|/
arch/x86/kernel/smpboot.c: do_boot_cpu(int apicid, int cpu)
    |
   \|/
wakeup_secondary_cpu_via_init(apicid, start_ip)

在启动多核的过程中有两个bitmap很重要，一个是cpu_callin_mask，另一个是cpu_callout_mask。
cpu_callin_mask代表某个cpu是否已经启动，它的某个bit由与之对应的cpu在启动后置位，用来告诉BSP该cpu已经启动。
cpu_callout_mask在do_boot_cpu中由BSP在发送完唤醒IPI之后置位，表示允许对应的AP继续初始化；只有在启动失败（boot_error非0）时才会被重新清零。

我们下面来详细看看do_boot_cpu(int apicid, int cpu)与wakeup_secondary_cpu_via_init(apicid, start_ip)

C代码

/*
 * Boot one secondary CPU (AP) on behalf of the boot processor.
 *
 * Prepares an idle task and stack for the target CPU, points the startup
 * path at start_secondary(), copies the trampoline, kicks the CPU, and then
 * polls cpu_callin_mask for up to 5 seconds waiting for the CPU to report in.
 *
 * @apicid: APIC ID handed to the wakeup routine.
 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
 * @cpu: CPU number used to index per-CPU data and the CPU masks.
 *
 * Returns zero if CPU booted OK, else error code from
 * ->wakeup_secondary_cpu (or from wakeup_secondary_cpu_via_init()),
 * PTR_ERR() of the idle task if it could not be forked, or 1 if the CPU
 * never set its bit in cpu_callin_mask within the timeout.
 */
static int __cpuinit do_boot_cpu(int apicid, int cpu)
{
unsigned long boot_error = 0;
unsigned long start_ip;
int timeout;
/* On-stack work item + completion used to fork the idle task if needed. */
struct create_idle c_idle = {
.cpu = cpu,
.done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
};
INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle);
alternatives_smp_switch(1);
/* Reuse an idle task already recorded for this CPU, if there is one. */
c_idle.idle = get_idle_for_cpu(cpu);
/*
* We can't use kernel_thread since we must avoid to
* reschedule the child.
*/
if (c_idle.idle) {
/* Reset the saved stack pointer to one pt_regs below the top of the stack. */
c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *)
(THREAD_SIZE + task_stack_page(c_idle.idle))) - 1);
init_idle(c_idle.idle, cpu);
goto do_rest;
}
/*
* No idle task yet: run the fork work function directly when keventd is
* not up or we already are keventd, otherwise queue it and wait for it.
*/
if (!keventd_up() || current_is_keventd())
c_idle.work.func(&c_idle.work);
else {
schedule_work(&c_idle.work);
wait_for_completion(&c_idle.done);
}
if (IS_ERR(c_idle.idle)) {
printk("failed fork for CPU %d\n", cpu);
destroy_work_on_stack(&c_idle.work);
return PTR_ERR(c_idle.idle);
}
set_idle_for_cpu(cpu, c_idle.idle);
do_rest:
/* The idle task becomes the target CPU's current task. */
per_cpu(current_task, cpu) = c_idle.idle;
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
#else
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
initial_gs = per_cpu_offset(cpu);
per_cpu(kernel_stack, cpu) =
(unsigned long)task_stack_page(c_idle.idle) -
KERNEL_STACK_OFFSET + THREAD_SIZE;
#endif
/* Hand the AP its GDT, its C entry point and its initial stack pointer. */
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
initial_code = (unsigned long)start_secondary;
stack_start.sp = (void *) c_idle.idle->thread.sp;
/* start_ip had better be page-aligned! */
start_ip = setup_trampoline();
/* So we see what's up */
announce_cpu(cpu, apicid);
/*
* This grunge runs the startup process for
* the targeted processor.
*/
atomic_set(&init_deasserted, 0);
if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
pr_debug("Setting warm reset code and vector.\n");
smpboot_setup_warm_reset_vector(start_ip);
/*
* Be paranoid about clearing APIC errors.
*/
if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
}
}
/*
* Kick the secondary CPU. Use the method in the APIC driver
* if it's defined - or use an INIT boot APIC message otherwise:
*/
if (apic->wakeup_secondary_cpu)
boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
else
boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
if (!boot_error) {
/*
* allow APs to start initializing.
*/
pr_debug("Before Callout %d.\n", cpu);
cpumask_set_cpu(cpu, cpu_callout_mask);
pr_debug("After Callout %d.\n", cpu);
/*
* Wait 5s total for a response
* (50000 iterations x 100us delay).
*/
for (timeout = 0; timeout < 50000; timeout++) {
if (cpumask_test_cpu(cpu, cpu_callin_mask))
break; /* It has booted */
udelay(100);
}
if (cpumask_test_cpu(cpu, cpu_callin_mask))
pr_debug("CPU%d: has booted.\n", cpu);
else {
boot_error = 1;
/* 0xA5 in the first trampoline byte is treated as "trampoline ran". */
if (*((volatile unsigned char *)trampoline_base)
== 0xA5)
/* trampoline started but...? */
pr_err("CPU%d: Stuck ??\n", cpu);
else
/* trampoline code not run */
pr_err("CPU%d: Not responding.\n", cpu);
if (apic->inquire_remote_apic)
apic->inquire_remote_apic(apicid);
}
}
if (boot_error) {
/* Try to put things back the way they were before ... */
numa_remove_cpu(cpu); /* was set by numa_add_cpu */
/* was set by do_boot_cpu() */
cpumask_clear_cpu(cpu, cpu_callout_mask);
/* was set by cpu_init() */
cpumask_clear_cpu(cpu, cpu_initialized_mask);
set_cpu_present(cpu, false);
per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
}
/* mark "stuck" area as not stuck */
*((volatile unsigned long *)trampoline_base) = 0;
if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
/*
* Cleanup possible dangling ends...
*/
smpboot_restore_warm_reset_vector();
}
destroy_work_on_stack(&c_idle.work);
return boot_error;
}

C代码

/*
 * Install the real-mode -> protected-mode bootstrap code.
 *
 * Copies TRAMPOLINE_SIZE bytes of trampoline_data into the page at
 * trampoline_base and returns that page's physical address. Alignment of
 * the page is the caller's responsibility.
 */
unsigned long __trampinit setup_trampoline(void)
{
	unsigned long tramp_phys;

	memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
	tramp_phys = virt_to_phys(trampoline_base);

	return tramp_phys;
}

可以从上面代码中看出do_boot_cpu会为编号为apicid的AP设定好它将要使用的stack以及它将要执行的代码（trampoline的起始地址start_ip，即后面wakeup_secondary_cpu_via_init的参数start_eip），在完成这些后，通过发送IPI序列来启动AP，
并会将cpu_callout_mask中代表相应AP的位置位，允许AP继续初始化；只有启动失败时才会把该位重新清零。

C代码

/*
 * Wake a secondary CPU with the INIT / INIT-deassert / STARTUP IPI sequence.
 *
 * @phys_apicid: APIC ID the IPIs are addressed to.
 * @start_eip:   address of the startup code; its page number
 *               (start_eip >> 12) is placed in the STARTUP IPI.
 *
 * Returns send_status | accept_status: zero when every IPI was sent and
 * accepted, non-zero otherwise.
 */
static int __cpuinit
wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
{
unsigned long send_status, accept_status = 0;
int maxlvt, num_starts, j;
maxlvt = lapic_get_maxlvt();
/*
* Be paranoid about clearing APIC errors.
*/
if (APIC_INTEGRATED(apic_version[phys_apicid])) {
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
}
pr_debug("Asserting INIT.\n");
/*
* Turn INIT on target chip
*/
/*
* Send IPI
*/
apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
phys_apicid);
pr_debug("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();
/* Hold INIT asserted for 10ms before de-asserting it. */
mdelay(10);
pr_debug("Deasserting INIT.\n");
/* Target chip */
/* Send IPI */
apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
pr_debug("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();
/* Barrier before publishing init_deasserted (cleared by do_boot_cpu()). */
mb();
atomic_set(&init_deasserted, 1);
/*
* Should we send STARTUP IPIs ?
*
* Determine this based on the APIC version.
* If we don't have an integrated APIC, don't send the STARTUP IPIs.
*/
if (APIC_INTEGRATED(apic_version[phys_apicid]))
num_starts = 2;
else
num_starts = 0;
/*
* Paravirt / VMI wants a startup IPI hook here to set up the
* target processor state.
*/
startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
(unsigned long)stack_start.sp);
/*
* Run STARTUP IPI loop.
*/
pr_debug("#startup loops: %d.\n", num_starts);
for (j = 1; j <= num_starts; j++) {
pr_debug("Sending STARTUP #%d.\n", j);
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
pr_debug("After apic_write.\n");
/*
* STARTUP IPI
*/
/* Target chip */
/* Boot on the stack */
/* Kick the second */
apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),
phys_apicid);
/*
* Give the other CPU some time to accept the IPI.
*/
udelay(300);
pr_debug("Startup point 1.\n");
pr_debug("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();
/*
* Give the other CPU some time to accept the IPI.
*/
udelay(200);
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
/* Only the ESR bits selected by the 0xEF mask count as an accept error. */
accept_status = (apic_read(APIC_ESR) & 0xEF);
/* Stop retrying as soon as either the send or the accept failed. */
if (send_status || accept_status)
break;
}
pr_debug("After Startup.\n");
if (send_status)
printk(KERN_ERR "APIC never delivered???\n");
if (accept_status)
printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
return (send_status | accept_status);
}

一段wakeup_secondary_cpu_via_init执行的log

C代码

656 CPU17: has booted.
657 WP output: cpu :18
658 ------native_cpu_up cpu:18, apicid:18----------
659 ------------in 3 do_boot_cpu------- #18
660 Asserting INIT.
661 Waiting for send to finish...
662 Deasserting INIT.
663 Waiting for send to finish...
664 #startup loops: 2.
665 Sending STARTUP #1.
666 After apic_write.
667 Startup point 1.
668 Waiting for send to finish...
669 Sending STARTUP #2.
670 After apic_write.
671 Startup point 1.
672 Waiting for send to finish...
673 in the cpu_init())
674 After Startup.
675 Before Callout 18.
676 After Callout 18.
677 cpu is: 12
678 in the enable_x2apic()
679 ------in x2apic_phys_get_apic_id-----
680 CPU#18 (phys ID: 18) waiting for CALLOUT
681 CALLIN, before setup_local_APIC().
682 ------3------
683 Stack at about ffff88021f953f44
684 ------in x2apic_phys_get_apic_id-----
685 CPU18: has booted.

wakeup_secondary_cpu_via_init是与硬件相关的代码，它的主要作用是通过发送INIT-INIT-Startup IPI序列来将AP从halted的状态唤醒并让它开始执行代码start_eip所指向的代码。
Startup IPI会有一个域来指定需要执行代码的地址：apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), phys_apicid);
如果想彻底搞清楚这段代码，请去看Intel文档。

start_secondary是AP会执行的代码，这段代码通过smp_callin设置cpu_callin_mask中对应的位，以此告诉BSP它已经启动。start_secondary最后进入idle循环。

C代码

/*
 * Activate a secondary processor.
 *
 * C entry point of a freshly woken AP (do_boot_cpu() stores its address in
 * initial_code). Initialises the CPU, checks in via smp_callin(), marks the
 * CPU online under the IPI-call and vector locks, enables interrupts and
 * finally enters cpu_idle(). The @unused argument is not referenced.
 */
notrace static void __cpuinit start_secondary(void *unused)
{
/*
* Don't put *anything* before cpu_init(), SMP booting is too
* fragile that we want to limit the things done here to the
* most necessary things.
*/
vmi_bringup();
cpu_init();
preempt_disable();
smp_callin();
/* otherwise gcc will move up smp_processor_id before the cpu_init */
barrier();
/*
* Check TSC synchronization with the BP:
*/
check_tsc_sync_target();
/* With the IO-APIC NMI watchdog, IRQ 0 is masked while NMI is routed through LVT0. */
if (nmi_watchdog == NMI_IO_APIC) {
disable_8259A_irq(0);
enable_NMI_through_LVT0();
enable_8259A_irq(0);
}
#ifdef CONFIG_X86_32
/* Spin until low_mappings is cleared, then flush the whole TLB. */
while (low_mappings)
cpu_relax();
__flush_tlb_all();
#endif
/* This must be done before setting cpu_online_mask */
set_cpu_sibling_map(raw_smp_processor_id());
wmb();
/*
* We need to hold call_lock, so there is no inconsistency
* between the time smp_call_function() determines number of
* IPI recipients, and the time when the determination is made
* for which cpus receive the IPI. Holding this
* lock helps us to not include this cpu in a currently in progress
* smp_call_function().
*
* We need to hold vector_lock so there the set of online cpus
* does not change while we are assigning vectors to cpus. Holding
* this lock ensures we don't half assign or remove an irq from a cpu.
*/
ipi_call_lock();
lock_vector_lock();
__setup_vector_irq(smp_processor_id());
set_cpu_online(smp_processor_id(), true);
unlock_vector_lock();
ipi_call_unlock();
per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
/* enable local interrupts */
local_irq_enable();
x86_cpuinit.setup_percpu_clockev();
wmb();
/* Enter the idle loop; the call is the last statement of this function. */
cpu_idle();
}

[置顶] ARM多核处理器启动过程分析

转至：http://blog.csdn.net/qianlong4526888/article/details/27695173

说明：

该流程图按照代码执行时间顺序划分为4部分：

1. Bootloader在图片上半部，最先启动；

2. Kernel在图片下半部，由bootloader引导启动；

3．CPU0执行流程在图片左半部，bootloader代码会进行判断，先行启动CPU0；

4. Secondary CPUs在图片右半部，由CPU0唤醒

具体启动流程如下：

1. 在bootloader启动时，会判断执行代码的是否为CPU0，如果不是，则执行wfe等待CPU0发出sev指令唤醒。如果是CPU0，则继续进行初始化工作。

mrs x4,mpidr_el1

tst x4,#15 //test whether the current cpu is CPU0, i.e. (mpidr_el1 & 15) == 0

b.eq 2f

* Secondary CPUs

1: wfe

ldr x4, mbox

cbz x4, 1b //if x4==0 (i.e. the value stored at mbox is 0) go back to wfe and keep waiting; otherwise fall through

br x4 // branch to the given address

2:…… //UART initialisation (38400 8N1)

以上mbox的地址在Makefile中写定，是0x8000fff8，该地址处初始状态内容为全0。上面代码判断，若mbox地址处内容为0，则回到wfe继续等待；如果不为0则直接跳转到该地址所包含内容处执行。

2. 在dts中，对cpu-release-addr进行赋值，将其地址设为0x8000fff8。即只要往该地址写入相应的值，例如地址A，并且发送sev指令，就能将次级CPU唤醒，并跳转到A地址处执行。

cpu-release-addr = <0x0 0x8000fff8>;

3. 内核中smp_prepare_cpus 函数对0x8000fff8地址处内容进行了赋值，其值为函数secondary_holding_pen 的地址：

release_addr = __va(cpu_release_addr[cpu]);

release_addr[0] = (void*)__pa(secondary_holding_pen);//write function address to mbox

以上代码执行完后发送sev指令，唤醒其他次级CPU执行secondary_holding_pen函数：

* Send an event to wake up the secondaries.

sev();

4. secondary cpu 执行secondary_holding_pen()函数时都会去判断当前CPU的ID，并与secondary_holding_pen_release变量做比对，如果相等，则执行进一步初始化，否则执行WFE等待；

secondary_holding_pen_release变量的修改过程由CPU0调用smp_init()函数进行。该函数首先为相应CPU绑定一个idle线程，然后修改secondary_holding_pen_release的值（其值即CPU0欲唤醒的CPU的ID），最后发送sev指令，唤醒相应CPU执行idle线程。

secondary_holding_pen()函数代码如下：

/*
 * This provides a "holding pen" for platforms to hold all secondary
 * cores are held until we're ready for them to initialise.
 */

// Holding pen for secondary CPUs: each CPU loops here, sleeping in wfe,
// until the word that x3 points to equals its own CPU number, and then
// branches to secondary_startup.
ENTRY(secondary_holding_pen)

bl el2_setup // Drop to EL1

mrs x0, mpidr_el1

and x0, x0, #15 // CPU number

adr x1, 1b // runtime address of the data at local label 1 (defined before this excerpt, not shown)

ldp x2, x3, [x1] // load the two 64-bit words stored at label 1

sub x1, x1, x2 // x1 = runtime address of 1: minus its first word; NOTE(review): presumably the load offset - confirm against the data at 1:

add x3, x3, x1 // adjust the second word by that offset; per the article it refers to secondary_holding_pen_release - confirm

pen: ldr x4, [x3] // read the current release value

cmp x4,x0 // does it name this CPU?

b.eq secondary_startup // yes: leave the pen

wfe // no: sleep until an event arrives

b pen // then check again

ENDPROC(secondary_holding_pen)

附录：

内核中启动secondary cpus函数调用过程大致如下：

start_kernel → rest_init → kernel_init → kernel_init_freeable → smp_init() kernel/smp.c line 649, 由CPU0激活剩余的处理器

cpu_up → _cpu_up() → __cpu_up() → boot_secondary() → write_pen_release，该函数中有一句：secondary_holding_pen_release = val; 然后发送sev指令，激活剩余处理器。

linux SMP多核启动分析

startup_32:

cld //清除方向标志（DF = 0），串操作时地址递增，与STD相反

cli //禁止中断

movl $(KERNEL_DS),%eax

mov %ax,%ds

mov %ax,%es

mov %ax,%fs

mov %ax,%gs

#ifdef __SMP__

orw %bx,%bx # What state are we in BX=1 for SMP

# 0 for boot

jz 2f # Initial boot

//根据bx值指示是主cpu(bx=0)还是次cpu(bx=1)

//然后会有不同的执行路径

* We are trampolining an SMP processor

*//这里是其他次cpu执行路径

mov %ax,%ss

xorl %eax,%eax # Back to 0

mov %cx,%ax # SP low 16 bits

movl %eax,%esp

pushl 0 # Clear NT

popfl

ljmp $(KERNEL_CS), $0x100000 # Into C and sanity

2://这里是主cpu的执行路径

#endif

lss SYMBOL_NAME(stack_start),%esp

xorl %eax,%eax

1: incl %eax # check that A20 really IS enabled

movl %eax,0x000000 # loop forever if it isn't

cmpl %eax,0x100000

je 1b

* Initialize eflags. Some BIOS's leave bits like NT set. This would

* confuse the debugger if this code is traced.

* XXX - best to initialize before switching to protected mode.

pushl $0

popfl

* Clear BSS

xorl %eax,%eax

movl $ SYMBOL_NAME(_edata),%edi

movl $ SYMBOL_NAME(_end),%ecx

subl %edi,%ecx

cld

rep

stosb

* Do the decompression, and jump to the new kernel..

subl $16,%esp # place for structure on the stack

pushl %esp # address of structure as first arg

call SYMBOL_NAME(decompress_kernel)

orl %eax,%eax

jnz 3f

xorl %ebx,%ebx

ljmp $(KERNEL_CS), $0x100000

这个其实就是跳到start_kernel函数。

asmlinkage void start_kernel(void)

{

char * command_line;

* This little check will move.

#ifdef __SMP__

static int first_cpu=1;

//这个不是函数局部变量，是函数静态变量，主cpu执行这个函数时复位为1，其他cpu为0，因为主cpu总是第一个执行这个函数的。

if(!first_cpu)

start_secondary();

//对于

first_cpu=0;

#endif

* Interrupts are still disabled. Do necessary setups, then

* enable them

setup_arch(&command_line, &memory_start, &memory_end);

memory_start = paging_init(memory_start,memory_end);

trap_init();

init_IRQ();

sched_init();

time_init();

parse_options(command_line);

#ifdef CONFIG_MODULES

init_modules();

#endif

#ifdef CONFIG_PROFILE

if (!prof_shift)

#ifdef CONFIG_PROFILE_SHIFT

prof_shift = CONFIG_PROFILE_SHIFT;

#else

prof_shift = 2;

#endif

if (prof_shift) {

prof_buffer = (unsigned int *) memory_start;

/* only text is profiled */

prof_len = (unsigned long) &_etext - (unsigned long) &_stext;

prof_len >>= prof_shift;

memory_start += prof_len * sizeof(unsigned int);

}

memory_start = console_init(memory_start,memory_end);

#ifdef CONFIG_PCI

memory_start = pci_init(memory_start,memory_end);

#endif

memory_start = kmalloc_init(memory_start,memory_end);

sti();

calibrate_delay();

memory_start = inode_init(memory_start,memory_end);

memory_start = file_table_init(memory_start,memory_end);

memory_start = name_cache_init(memory_start,memory_end);

#ifdef CONFIG_BLK_DEV_INITRD

if (initrd_start && initrd_start < memory_start) {

printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - "

"disabling it.\n",initrd_start,memory_start);

initrd_start = 0;

}

#endif

mem_init(memory_start,memory_end);

buffer_init();

sock_init();

#if defined(CONFIG_SYSVIPC) || defined(CONFIG_KERNELD)

ipc_init();

#endif

dquot_init();

arch_syms_export();

sti();

check_bugs();

printk(linux_banner);

#ifdef __SMP__

smp_init();

#endif

sysctl_init();

* We count on the initial thread going ok

* Like idlers init is an unlocked kernel thread, which will

* make syscalls (and thus be locked).

kernel_thread(init, NULL, 0);

* task[0] is meant to be used as an "idle" task: it may not sleep, but

* it might do some general things like count free pages or it could be

* used to implement a reasonable LRU algorithm for the paging routines:

* anything that can be useful, but shouldn't take time from the real

* processes.

* Right now task[0] just does a infinite idle loop.

cpu_idle(NULL);

}

/*
 * Entry point for every CPU other than the first one (called from
 * start_kernel() when first_cpu is already 0): set up traps and IRQs,
 * check in with the boot CPU, then run the idle loop.
 */
asmlinkage void start_secondary(void)

{

trap_init();

init_IRQ();

// set up IRQ handling for this CPU (presumably per-CPU state; init_IRQ() is not shown here)

smp_callin();

// checks in and then waits: smp_callin() spins until the boot CPU sets smp_commenced

cpu_idle(NULL);

// this CPU now runs the idle loop (cpu_idle() loops forever calling idle())

}

void smp_callin(void)

{

extern void calibrate_delay(void);

int cpuid=GET_APIC_ID(apic_read(APIC_ID));

unsigned long l;

* Activate our APIC

SMP_PRINTK(("CALLIN %d\n",smp_processor_id()));

l=apic_read(APIC_SPIV);

l|=(1<<8); /* Enable */

apic_write(APIC_SPIV,l);

sti();

* Get our bogomips.

calibrate_delay();

* Save our processor parameters

smp_store_cpu_info(cpuid);

* Allow the master to continue.

set_bit(cpuid, (unsigned long *)&cpu_callin_map[0]);

* Until we are ready for SMP scheduling

load_ldt(0);

/* printk("Testing faulting...\n");

*(long *)0=1; OOPS... */

local_flush_tlb();

while(!smp_commenced);

//这个可以看成是自旋锁，等待主cpu发smp_commenced信号即开始信号。

if (cpu_number_map[cpuid] == -1)

while(1);

local_flush_tlb();

SMP_PRINTK(("Commenced..\n"));

load_TR(cpu_number_map[cpuid]);

/* while(1);*/

}

/*
 * Idle loop: call idle() over and over and never return.
 * The argument is not used.
 */
int cpu_idle(void *unused)
{
	while (1) {
		idle();
	}
}

主cpu给各次cpu发开始信号是在init函数中调用smp_begin函数：

/*
 * Give the secondary CPUs the "go" signal: mark the SMP threads as ready
 * and call smp_commence().
 *
 * NOTE(review): smp_callin() on each secondary busy-waits on the
 * smp_commenced flag, so smp_commence() presumably just sets that flag.
 * The original annotation here claimed an IPI is sent, which does not
 * match that busy-wait - confirm against smp_commence().
 */
static void smp_begin(void)
{
	smp_threads_ready=1;
	smp_commence();
}

每个cpu有一个current指针。

刚开始的时候由主cpu赋值为init_task;

在主cpu调用 sched_init赋值。

/*
 * Scheduler initialisation: point every CPU's current_set[] slot at
 * init_task and register the timer, task-queue and immediate bottom halves.
 */
void sched_init(void)
{
	/*
	 *	We have to do a little magic to get the first
	 *	process right in SMP mode.
	 */
	/* Expected to be the boot CPU (presumably 0): sched_init() is called from start_kernel(). */
	int cpu=smp_processor_id();
#ifndef __SMP__
	current_set[cpu]=&init_task;
#else
	/* Record init_task as running on the calling CPU ... */
	init_task.processor=cpu;
	/* ... and make init_task the current task of every CPU slot for now. */
	for(cpu = 0; cpu < NR_CPUS; cpu++)
		current_set[cpu] = &init_task;
#endif
	init_bh(TIMER_BH, timer_bh);
	init_bh(TQUEUE_BH, tqueue_bh);
	init_bh(IMMEDIATE_BH, immediate_bh);
}

同时这些还会在 smp_init丰富。

static void smp_init(void)

{

int i, j;

smp_boot_cpus();

* Create the slave init tasks as sharing pid 0.

* This should only happen if we have virtual CPU numbers

* higher than 0.

for (i=1; i<smp_num_cpus; i++)

{

struct task_struct *n, *p;

j = cpu_logical_map[i];

* We use kernel_thread for the idlers which are

* unlocked tasks running in kernel space.

kernel_thread(cpu_idle, NULL, CLONE_PID);

//这个其实就是创建线程然后这个线程体现在task[i]上了，因为创建的时候的task_struct就是从task［i]取的。

* Don't assume linear processor numbering

current_set[j]=task[i];

current_set[j]->processor=j;

cli();

n = task[i]->next_run;

p = task[i]->prev_run;

nr_running--;

n->prev_run = p;

p->next_run = n;

task[i]->next_run = task[i]->prev_run = task[i];

sti();

}

上面执行完后就给每个cpu加了一个idle任务。

然后kernel_thread(init, NULL, 0)创建的init任务。

//每个cpu在时间中断时都可能调用这个共同的函数。

/*
 * Pick the best runnable task for the calling CPU and switch to it.
 *
 * Shared by all CPUs. Runs pending bottom halves, fixes up the state of
 * the outgoing task, walks the run queue starting after init_task to find
 * the task with the highest goodness(), falls back to this CPU's idle task
 * when nothing better is found, and context-switches if the choice differs
 * from the current task. Calling it with intr_count set only prints a
 * diagnostic.
 */
asmlinkage void schedule(void)
{
	int c;
	struct task_struct * p;
	struct task_struct * prev, * next;
	unsigned long timeout = 0;
	int this_cpu=smp_processor_id();	/* id of the CPU running this call */

/* check alarm, wake up any interruptible tasks that have got a signal */

	if (intr_count)
		goto scheduling_in_interrupt;

	if (bh_active & bh_mask) {
		intr_count = 1;
		do_bottom_half();
		intr_count = 0;
	}

	run_task_queue(&tq_scheduler);

	need_resched = 0;
	prev = current;
	cli();
	/* move an exhausted RR process to be last.. */
	if (!prev->counter && prev->policy == SCHED_RR) {
		prev->counter = prev->priority;
		move_last_runqueue(prev);
	}
	switch (prev->state) {
		case TASK_INTERRUPTIBLE:
			if (prev->signal & ~prev->blocked)
				goto makerunnable;
			timeout = prev->timeout;
			if (timeout && (timeout <= jiffies)) {
				prev->timeout = 0;
				timeout = 0;
		makerunnable:
				prev->state = TASK_RUNNING;
				break;
			}
			/* fall through: still sleeping, take it off the run queue */
		default:
			del_from_runqueue(prev);
			/* fall through */
		case TASK_RUNNING:
			break;
	}
	/* First run-queue entry after init_task; the list is circular. */
	p = init_task.next_run;
	sti();

#ifdef __SMP__
	/*
	 *	This is safe as we do not permit re-entry of schedule()
	 */
	prev->processor = NO_PROC_ID;
#define idle_task (task[cpu_number_map[this_cpu]])
#else
#define idle_task (&init_task)
#endif

/*
 * Note! there may appear new tasks on the run-queue during this, as
 * interrupts are enabled. However, they will be put on front of the
 * list, so our list starting at "p" is essentially fixed.
 */
/* this is the scheduler proper: */
	c = -1000;
	next = idle_task;
	/*
	 * p started at init_task.next_run, so arriving back at init_task
	 * means every entry on the run queue has been examined. Keep the
	 * task with the highest goodness().
	 */
	while (p != &init_task) {
		int weight = goodness(p, prev, this_cpu);
		if (weight > c)
			c = weight, next = p;
		p = p->next_run;
	}

	/* if all runnable processes have "counter == 0", re-calculate counters */
	if (!c) {
		for_each_task(p)
			p->counter = (p->counter >> 1) + p->priority;
	}
#ifdef __SMP__
	/*
	 *	Allocate process to CPU
	 */
	next->processor = this_cpu;	/* the chosen task now belongs to this CPU */
	next->last_processor = this_cpu;
#endif
#ifdef __SMP_PROF__
	/* mark processor running an idle thread */
	if (0==next->pid)
		set_bit(this_cpu,&smp_idle_map);
	else
		clear_bit(this_cpu,&smp_idle_map);
#endif
	if (prev != next) {
		struct timer_list timer;

		kstat.context_swtch++;
		/* Arm an on-stack timer so a sleeper with a timeout gets woken. */
		if (timeout) {
			init_timer(&timer);
			timer.expires = timeout;
			timer.data = (unsigned long) prev;
			timer.function = process_timeout;
			add_timer(&timer);
		}
		get_mmu_context(next);
		switch_to(prev,next);
		if (timeout)
			del_timer(&timer);
	}
	return;

scheduling_in_interrupt:
	printk("Aiee: scheduling in interrupt %p\n",
		__builtin_return_address(0));
}

上面需要注意的是current变量，在单核中肯定就是一个变量，在多核中肯定是各个cpu有自己的current：

其定义如下：

/* The calling CPU's current task: each CPU indexes current_set[] by its own processor id. */
#define current (0+current_set[smp_processor_id()])

在smp中current是current_set数组中的一个元素，是指具体一个cpu的当前进程。

从上面可以看出一个cpu是从全局task找一个task来运行，每个cpu有一个idle_task，这个task的编号是固定的。

所有的task可以通过init_task来找到,因为创建新进程（内核线程）的时候，会将新建的挂到链表上。

而init_task是静态挂在这上面的。

附上task_struct:

struct task_struct {

/* these are hardcoded - don't touch */

volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */

long counter;

long priority;

unsigned long signal;

unsigned long blocked; /* bitmap of masked signals */

unsigned long flags; /* per process flags, defined below */

int errno;

long debugreg[8]; /* Hardware debugging registers */

struct exec_domain *exec_domain;

/* various fields */

struct linux_binfmt *binfmt;

struct task_struct *next_task, *prev_task;

struct task_struct *next_run, *prev_run;

unsigned long saved_kernel_stack;

unsigned long kernel_stack_page;

int exit_code, exit_signal;

/* ??? */

unsigned long personality;

int dumpable:1;

int did_exec:1;

/* shouldn't this be pid_t? */

int pid;

int pgrp;

int tty_old_pgrp;

int session;

/* boolean value for session group leader */

int leader;

int groups[NGROUPS];

* pointers to (original) parent process, youngest child, younger sibling,

* older sibling, respectively. (p->father can be replaced with

* p->p_pptr->pid)

struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;

struct wait_queue *wait_chldexit; /* for wait4() */

unsigned short uid,euid,suid,fsuid;

unsigned short gid,egid,sgid,fsgid;

unsigned long timeout, policy, rt_priority;

unsigned long it_real_value, it_prof_value, it_virt_value;

unsigned long it_real_incr, it_prof_incr, it_virt_incr;

struct timer_list real_timer;

long utime, stime, cutime, cstime, start_time;

/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */

unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;

int swappable:1;

unsigned long swap_address;

unsigned long old_maj_flt; /* old value of maj_flt */

unsigned long dec_flt; /* page fault count of the last time */

unsigned long swap_cnt; /* number of pages to swap on next pass */

/* limits */

struct rlimit rlim[RLIM_NLIMITS];

unsigned short used_math;

char comm[16];

/* file system info */

int link_count;

struct tty_struct *tty; /* NULL if no tty */

/* ipc stuff */

struct sem_undo *semundo;

struct sem_queue *semsleeping;

/* ldt for this task - used by Wine. If NULL, default_ldt is used */

struct desc_struct *ldt;

/* tss for this task */

struct thread_struct tss;

/* filesystem information */

struct fs_struct *fs;

/* open file information */

struct files_struct *files;

/* memory management info */

struct mm_struct *mm;

/* signal handlers */

struct signal_struct *sig;

#ifdef __SMP__

int processor;

int last_processor;

int lock_depth; /* Lock depth. We can context switch in and out of holding a syscall kernel lock... */

#endif

};

故这个p = init_task.next_run;

p可以获取到所有在就绪状态的task;

0 0