Linux kernel 3.10内核源码分析--进程上下文切换

来源：互联网发布：慧聪发布商机软件编辑：程序博客网时间：2024/05/17 00:15

一、疑问
进程调度时，当被选中的next进程不是current进程时，需要进行上下文切换。
进行上下文切换时，有一些问题不太容易理解，比如：
1、进程上下文切换必然发生在内核态吗？
2、上下文切换后原来的进程(prev)如果恢复执行，从什么地方开始执行？
3、上下文切换后，如何切换到新进程执行？新进程从什么地方开始执行？
5、上下文切换时，堆栈如何切换，如何保证不混乱？
6、A进程执行时被打断调度B进程运行，B进程正常执行过程中被打断调度C进程运行，C进程运行中又被打断调度D进程运行，以此类推，看似一个无限嵌套，如何恢复到A进程运行，不会一层层返回吧？会不会有问题？
7、上下文切换后，如何恢复到新进程的用户态程序继续执行？
上述问题(可能还有其它疑问~)在理解了进程上下文切换的细节后，就都能回答了。

二、原理
进程上下文切换涉及到几个关键的地方，也正是上述疑问所在的地方：
1、进程调度必然经过schedule函数，显然必然发生在内核态，那上下文切换也必然发生于内核态了。进程调度通常的时机有：
    1）中断/异常/系统调用返回
    2）其它，如wakeup()或手工调用schedule
在没有开启内核抢占的环境中(通常如此)，仅当被替换进程(prev)被打断前位于用户态时，才会被动地发生调度(上下文切换)；进程在内核态主动调用schedule(如阻塞等待)的情况不受此限制。
呵呵，看似跟“进程调度必然发生在内核态”的说法是矛盾的，其实不然，这里的意思是，在prev进程被打断之前，其位于用户态，当其被打断之后(最常见的如时钟中断)，当然就进入内核态了，然后在内核态完成进程调度和上下文切换。
2、当进程被打断(比如中断)时，当前的上下文信息(包括eip、CS和其它寄存器信息)会保存在当前的内核栈(或中断栈)中，当中断返回时，如果没有发生调度(不满足调度条件)，会恢复之前的上下文信息，即恢复到被打断之前的状态继续执行。(在entry_xx.S的汇编代码中实现)。
3、当进程被打断并产生调度时，最终会进入switch_to宏进行上下文切换，被替换的进程(prev)当前的IP指针会被替换为“标号1(__switch_to函数后的一行代码)”，并被保存在task_struct.thread.ip中，同时会将被选中将执行的进程(next)的ip、堆栈指针以及相关的上下文加载到当前环境中，实现新进程的调度执行。
而当原来的prev进程重新被调度执行时，由于之前保存的IP指针为“标号1”，所以会从“标号1”开始执行，具体见后面的代码分析。
4、新进程(next)的执行分两种情况：
    1）经过调度后
经过调度后，会经历switch_to的流程，那么在进程被调度出去时，会保存switch_to宏中的“标号1”到task_struct.thread.ip中，当该进程被重新调度时，过程如3中描述一样，也会从switch_to宏中的“标号1”处开始执行。
    2）fork创建之后未经过调度
此时，该进程未经历switch_to的流程，由于在fork时，会将新进程的thread.ip设置成ret_from_fork(内核线程则设置成ret_from_kernel_thread，参见copy_thread函数)，所以此时该进程会从ret_from_fork处(在entry_xx.S的汇编代码中)开始执行。
5、堆栈的具体切换见另一篇文章：kernel 3.10内核源码分析--内核栈及堆栈切换
6、上下文切换后，由于原来的上下文完全被新上下文替换，所以新进程开始执行后，就已经没有原进程的遗留信息了，此时新进程用的是自己的地址空间、堆栈、和其它上下文，原进程被调度出去后，就跟现在的上下文脱离关系了。所以，不存在嵌套的说法，没有问题。
7、如之前所说，进程被中断时，其EIP和CS会自动保存在当前进程的内核栈(或中断栈)中，当新进程被调度执行时，其内核栈(或中断栈)中同样保存之前被调度出去时压入的EIP和CS，此时硬件会自动从内核栈中弹出EIP和CS，并将堆栈切换到用户栈，并恢复到用户态执行。

三、代码分析
进行上下文切换，主要由switch_to宏实现，代码分析如下：

点击(此处)折叠或打开

/*
 * switch_to - switch the CPU from task prev to task next (32-bit x86).
 *
 * Invoked on the schedule() path. The outgoing task is suspended inside this
 * macro; when it is scheduled again it resumes at local label 1, i.e. right
 * after the jump to __switch_to().
 *
 * prev: the task being switched out
 * next: the task being switched in
 * last: output; receives whatever __switch_to() leaves in EAX, i.e. the task
 *       that was running immediately before this task was resumed
 *
 * Every line of the macro body, comment lines included, must end in a
 * backslash; a comment line without one terminates the #define early.
 */
#define switch_to(prev, next, last) \
do { \
	/* \
	 * Context-switching clobbers all registers, so we clobber \
	 * them explicitly, via unused output variables. \
	 * (EAX and EBP is not listed because EBP is saved/restored \
	 * explicitly for wchan access and EAX is the return value of \
	 * __switch_to()) \
	 */ \
	unsigned long ebx, ecx, edx, esi, edi; \
	\
	asm volatile( \
		/* save flags: push EFLAGS onto the current (prev) stack */ \
		"pushfl\n\t" \
		/* save EBP on the same stack */ \
		"pushl %%ebp\n\t" \
		/* save ESP: record the current stack pointer in prev->thread.sp */ \
		"movl %%esp,%[prev_sp]\n\t" \
		/* restore ESP: load next->thread.sp; from here on we run on next's stack */ \
		"movl %[next_sp],%%esp\n\t" \
		/* \
		 * save EIP: store the address of label 1 in prev->thread.ip so \
		 * that prev restarts at label 1 when it is scheduled again. \
		 */ \
		"movl $1f,%[prev_ip]\n\t" \
		/* \
		 * restore EIP: push next->thread.ip onto next's stack. Because \
		 * __switch_to is entered with jmp (not call), no return address \
		 * is pushed by the jump itself; the ret at the end of \
		 * __switch_to pops this value and continues there. For a task \
		 * that was switched out through this macro the value is the \
		 * address of label 1. \
		 */ \
		"pushl %[next_ip]\n\t" \
		__switch_canary \
		/* regparm call: arguments are already in EAX and EDX */ \
		"jmp __switch_to\n" \
		/* a previously switched-out task resumes execution here */ \
		"1:\t" \
		/* restore EBP saved by the pushl above */ \
		"popl %%ebp\n\t" \
		/* restore flags saved by the pushfl above */ \
		"popfl\n" \
		\
		/* output parameters */ \
		: [prev_sp] "=m" (prev->thread.sp), \
		  [prev_ip] "=m" (prev->thread.ip), \
		  "=a" (last), \
		\
		  /* clobbered output registers: */ \
		  "=b" (ebx), "=c" (ecx), "=d" (edx), \
		  "=S" (esi), "=D" (edi) \
		\
		  __switch_canary_oparam \
		\
		/* input parameters: */ \
		: [next_sp] "m" (next->thread.sp), \
		  [next_ip] "m" (next->thread.ip), \
		\
		  /* \
		   * regparm parameters for __switch_to(): \
		   * prev is passed in EAX ("a"), next in EDX ("d"). \
		   */ \
		  [prev] "a" (prev), \
		  [next] "d" (next) \
		\
		  __switch_canary_iparam \
		\
		: /* reloaded segment registers */ \
		"memory"); \
} while (0)

__switch_to函数实现如下：

点击(此处)折叠或打开

/*
 * __switch_to - second half of the context switch, entered by jmp from the
 * switch_to macro after the stack pointer has already been switched.
 *
 * prev_p: task being switched out (passed in EAX by switch_to)
 * next_p: task being switched in (passed in EDX by switch_to)
 *
 * Returns prev_p; switch_to stores that value in its "last" output.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
/* Per-task architecture state of the outgoing and incoming tasks. */
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
/* The CPU we are running on. */
int cpu = smp_processor_id();
/* The tss_struct that belongs to this CPU. */
struct tss_struct *tss = &per_cpu(init_tss, cpu);
fpu_switch_t fpu;
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
fpu = switch_fpu_prepare(prev_p, next_p, cpu);
/*
 * Reload esp0.
 */
/*
 * The TSS here is per CPU rather than per task (it is looked up with
 * per_cpu() above), so it has to be refreshed on every context switch.
 * What matters is the kernel stack pointer (sp0): the CPU reads it from
 * the TSS when next later enters the kernel from user mode, so it must
 * describe next's kernel stack for that stack switch to work.
 */
/* NOTE(review): load_sp0() presumably copies next->sp0 into tss; its body is not shown here. */
load_sp0(tss,next);
/*
 * Save away %gs. No need to save %fs, as it was saved on the
 * stack on entry. No need to save %es and %ds, as those are
 * always kernel segments while inside the kernel. Doing this
 * before setting the new TLS descriptors avoids the situation
 * where we temporarily have non-reloadable segments in %fs
 * and %gs. This could be an issue if the NMI handler ever
 * used %fs or %gs (it does not today), or if the kernel is
 * running inside of a hypervisor layer.
 */
lazy_save_gs(prev->gs);
/*
 * Load the per-thread Thread-Local Storage descriptor.
 */
/*
 * Installs next's TLS segment descriptors for this CPU.
 */
load_TLS(next, cpu);
/*
 * Restore IOPL if needed. In normal use, the flags restore
 * in the switch assembly will handle this. But if the kernel
 * is running virtualized at a non-zero CPL, the popf will
 * not restore flags, so it must be done in a separate step.
 */
if (get_kernel_rpl()&& unlikely(prev->iopl!= next->iopl))
set_iopl_mask(next->iopl);
/*
 * Now maybe handle debug registers and/or IO bitmaps
 */
if (unlikely(task_thread_info(prev_p)->flags& _TIF_WORK_CTXSW_PREV ||
task_thread_info(next_p)->flags& _TIF_WORK_CTXSW_NEXT))
__switch_to_xtra(prev_p, next_p, tss);
/*
 * Leave lazy mode, flushing any hypercalls made here.
 * This must be done before restoring TLS segments so
 * the GDT and LDT are properly updated, and must be
 * done before math_state_restore, so the TS bit is up
 * to date.
 */
/* Architecture hook; relevant when running paravirtualized. */
arch_end_context_switch(next_p);
/*
 * Restore %gs if needed (which is common)
 */
if (prev->gs| next->gs)
lazy_load_gs(next->gs);
switch_fpu_finish(next_p, fpu);
/* Point the per-CPU current_task variable at the incoming task. */
this_cpu_write(current_task, next_p);
/*
 * Where does this return go? switch_to pushed next's saved IP onto the
 * (already switched) stack and entered this function with jmp rather
 * than call, so the return pops that value as the return address.
 * For a task that was switched out through switch_to this is label 1
 * in the macro, right after the jmp. A freshly forked task never saved
 * an IP that way: copy_thread set its thread.ip to ret_from_fork
 * (ret_from_kernel_thread for kernel threads), so its first return
 * lands there instead of in switch_to.
 */
return prev_p;
}

第一次开始执行的进程的thread.ip设置点：

点击(此处)折叠或打开

do_fork->copy_process->copy_thread
/*
 * copy_thread - set up the architecture-specific thread state of a new task.
 *
 * clone_flags: clone flags; only CLONE_SETTLS is examined here
 * sp:          for a kernel thread, stored in bx (marked "function" below);
 *              otherwise, if non-zero, the child's stack pointer
 * arg:         for a kernel thread, stored in bp; unused otherwise
 * p:           the new task being set up
 *
 * Returns 0 on success, -ENOMEM if the I/O bitmap cannot be duplicated, or
 * the error returned by do_set_thread_area().
 */
int copy_thread(unsigned long clone_flags, unsigned long sp,
unsigned long arg, struct task_struct *p)
{
struct pt_regs *childregs = task_pt_regs(p);
struct task_struct *tsk;
int err;
/* The child's saved stack pointer starts at its pt_regs; sp0 is the address just past them. */
p->thread.sp= (unsigned long) childregs;
p->thread.sp0= (unsigned long)(childregs+1);
/* Kernel threads are handled separately: their register frame is built from scratch. */
if (unlikely(p->flags& PF_KTHREAD)){
/* kernel thread */
memset(childregs, 0, sizeof(struct pt_regs));
/* First switch to this task will "return" to ret_from_kernel_thread. */
p->thread.ip= (unsigned long) ret_from_kernel_thread;
task_user_gs(p)= __KERNEL_STACK_CANARY;
childregs->ds= __USER_DS;
childregs->es= __USER_DS;
childregs->fs= __KERNEL_PERCPU;
childregs->bx= sp; /* function */
childregs->bp= arg;
childregs->orig_ax= -1;
childregs->cs= __KERNEL_CS | get_kernel_rpl();
childregs->flags= X86_EFLAGS_IF | X86_EFLAGS_BIT1;
p->fpu_counter= 0;
p->thread.io_bitmap_ptr= NULL;
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
return 0;
}
/* Start the child from a copy of the current (parent) task's register frame. */
*childregs =*current_pt_regs();
/* NOTE(review): presumably this is the 0 the child sees as its fork/clone return value. */
childregs->ax= 0;
if (sp)
childregs->sp= sp;
/*
 * The child's saved IP is ret_from_fork. Every task created by fork
 * goes through it: on the first context switch to the child,
 * __switch_to returns into ret_from_fork (entry_32.S). This differs
 * from an ordinary switch, where the IP is the one saved by switch_to
 * during the task's previous switch-out.
 */
p->thread.ip= (unsigned long) ret_from_fork;
task_user_gs(p)= get_user_gs(current_pt_regs());
p->fpu_counter= 0;
p->thread.io_bitmap_ptr= NULL;
tsk = current;
/* This value is overwritten before it is read; the failure path below returns -ENOMEM literally. */
err = -ENOMEM;
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
/* If the parent has an I/O bitmap, give the child its own copy. */
if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))){
p->thread.io_bitmap_ptr= kmemdup(tsk->thread.io_bitmap_ptr,
IO_BITMAP_BYTES, GFP_KERNEL);
if (!p->thread.io_bitmap_ptr){
p->thread.io_bitmap_max= 0;
return -ENOMEM;
}
set_tsk_thread_flag(p, TIF_IO_BITMAP);
}
err = 0;
/*
 * Set a new TLS for the child thread?
 */
if (clone_flags& CLONE_SETTLS)
err = do_set_thread_area(p,-1,
(struct user_desc __user *)childregs->si, 0);
/* On failure, release the bitmap copy made above. */
if (err&& p->thread.io_bitmap_ptr){
kfree(p->thread.io_bitmap_ptr);
p->thread.io_bitmap_max= 0;
}
return err;
}

原文地址： http://blog.chinaunix.net/uid-14528823-id-4740294.html

0 0