Linux Process Scheduling: Process Switching


In the Linux scheduler, schedule() both picks the next process to run and switches to it. Inside schedule(), the switch itself is carried out mainly by two functions:

sched_info_switch(prev, next) updates the accounting variables of the outgoing and incoming tasks and of their run queue. It is implemented mainly by calling __sched_info_switch():

/*
 * Called when tasks are switched involuntarily due, typically, to expiring
 * their time slice.  (This may also be called when switching to or from
 * the idle task.)  We are only called when prev != next.
 */
static inline void
__sched_info_switch(struct task_struct *prev, struct task_struct *next)
{
        struct rq *rq = task_rq(prev);

        /*
         * prev now departs the cpu.  It's not interesting to record
         * stats about how efficient we were at scheduling the idle
         * process, however.
         */
        if (prev != rq->idle)           /* the outgoing task is not the idle task */
                sched_info_depart(prev);        /* update prev and its rq's accounting */

        if (next != rq->idle)           /* the incoming task is not the idle task */
                sched_info_arrive(next);        /* update next and its rq's accounting */
}
/*
 * Called when a process ceases being the active-running process, either
 * voluntarily or involuntarily.  Now we can calculate how long we ran.
 * Also, if the process is still in the TASK_RUNNING state, call
 * sched_info_queued() to mark that it has now again started waiting on
 * the runqueue.
 */
static inline void sched_info_depart(struct task_struct *t)
{
        /* how long the task ran on this rq */
        unsigned long long delta = task_rq(t)->clock -
                                        t->sched_info.last_arrival;

        /* add the CPU time consumed to the run queue's accumulated totals */
        rq_sched_info_depart(task_rq(t), delta);

        /*
         * If the outgoing task is still runnable, record rq->clock in
         * sched_info.last_queued: the moment it last started waiting
         * on the runqueue.
         */
        if (t->state == TASK_RUNNING)
                sched_info_queued(t);
}
/*
 * Called when a task finally hits the cpu.  We can now calculate how
 * long it was waiting to run.  We also note when it began so that we
 * can keep stats on how long its timeslice is.
 */
static void sched_info_arrive(struct task_struct *t)
{
        unsigned long long now = task_rq(t)->clock, delta = 0;

        if (t->sched_info.last_queued)  /* the task was queued before being switched in */
                delta = now - t->sched_info.last_queued;        /* time spent waiting in the queue */
        sched_info_reset_dequeued(t);   /* about to run, so clear last_queued */
        t->sched_info.run_delay += delta;       /* total time spent waiting on a runqueue */
        t->sched_info.last_arrival = now;       /* timestamp of this arrival on the cpu */
        t->sched_info.pcount++;                 /* one more timeslice on a cpu */

        /* update the matching counters in the rq's rq_sched_info */
        rq_sched_info_arrive(task_rq(t), delta);
}
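These per-task counters are visible from user space: on kernels with CONFIG_SCHEDSTATS enabled, /proc/<pid>/schedstat exposes three fields, of which the second is sched_info.run_delay and the third is sched_info.pcount, exactly the values sched_info_arrive() maintains. A minimal sketch of a reader:

/* Minimal sketch: read the counters that sched_info_arrive() updates.
 * Assumes CONFIG_SCHEDSTATS; the three fields of /proc/self/schedstat
 * are: time run on cpu (ns), run_delay (ns), pcount. */
#include <stdio.h>

int main(void)
{
        unsigned long long run_ns, run_delay_ns, pcount;
        FILE *f = fopen("/proc/self/schedstat", "r");

        if (!f || fscanf(f, "%llu %llu %llu",
                         &run_ns, &run_delay_ns, &pcount) != 3) {
                perror("schedstat");
                return 1;
        }
        fclose(f);
        printf("ran %llu ns, waited %llu ns over %llu timeslices\n",
               run_ns, run_delay_ns, pcount);
        return 0;
}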

context_switch() does the actual switching work: the address space and the hardware register/stack state.

/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 */
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
               struct task_struct *next)
{
        struct mm_struct *mm, *oldmm;

        prepare_task_switch(rq, prev, next);
        trace_sched_switch(rq, prev, next);
        mm = next->mm;
        oldmm = prev->active_mm;
        /*
         * For paravirt, this is coupled with an exit in switch_to to
         * combine the page table reload and the switch backend into
         * one hypercall.
         */
        arch_start_context_switch(prev);

        if (unlikely(!mm)) {                    /* the incoming task has no mm of its own */
                next->active_mm = oldmm;        /* borrow the outgoing task's active_mm */
                atomic_inc(&oldmm->mm_count);   /* one more user of it, bump the refcount */
                /* mark the per-cpu cpu_tlbstate as LAZY */
                enter_lazy_tlb(oldmm, next);
        } else                                  /* mm is not NULL: switch address spaces */
                switch_mm(oldmm, mm, next);

        if (unlikely(!prev->mm)) {
                /*
                 * The outgoing task has no mm of its own; as seen above it
                 * was borrowing the active_mm of an earlier task, so clear
                 * it here and remember it in rq->prev_mm so the reference
                 * can be dropped after the switch.
                 */
                prev->active_mm = NULL;
                rq->prev_mm = oldmm;            /* record the previous mm in the rq */
        }
        /*
         * Since the runqueue lock will be released by the next
         * task (which is an invalid locking op but in the case
         * of the scheduler it's an obvious special-case), so we
         * do an early lockdep release here:
         */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
        spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

        /* Here we just switch the register state and the stack. */
        switch_to(prev, next, prev);

        barrier();
        /*
         * this_rq must be evaluated again because prev may have moved
         * CPUs since it called schedule(), thus the 'rq' on its stack
         * frame will be invalid.
         */
        finish_task_switch(this_rq(), prev);
}
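Kernel threads have no address space of their own (mm == NULL), so they keep running on whatever page tables were last loaded; the code above only bumps mm_count on the borrowed mm so it cannot be freed underneath them, and returns the loan when the kernel thread is switched out. A toy user-space model of that borrow/return protocol (all names here, toy_mm, toy_task, switch_in, are hypothetical, not kernel API):

/* Toy model of the active_mm borrow in context_switch(). */
#include <assert.h>
#include <stdio.h>

struct toy_mm   { int mm_count; };
struct toy_task { struct toy_mm *mm, *active_mm; };

static struct toy_mm *prev_mm;          /* plays the role of rq->prev_mm */

static void switch_in(struct toy_task *prev, struct toy_task *next)
{
        if (!next->mm) {                        /* kernel thread: borrow */
                next->active_mm = prev->active_mm;
                next->active_mm->mm_count++;
        } else {
                next->active_mm = next->mm;     /* real switch_mm() goes here */
        }
        if (!prev->mm) {                        /* kernel thread leaving: return loan */
                prev_mm = prev->active_mm;      /* finish_task_switch() drops it */
                prev->active_mm = NULL;
        }
}

int main(void)
{
        struct toy_mm   user_mm = { .mm_count = 1 };
        struct toy_task user    = { .mm = &user_mm, .active_mm = &user_mm };
        struct toy_task kthread = { 0 };

        switch_in(&user, &kthread);     /* kthread borrows user_mm */
        assert(user_mm.mm_count == 2);
        switch_in(&kthread, &user);     /* loan recorded in prev_mm */
        assert(prev_mm == &user_mm);
        --prev_mm->mm_count;            /* the mmdrop() step */
        printf("mm_count back to %d\n", user_mm.mm_count);
        return 0;
}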
switch_mm() performs the address-space switch:

static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
                             struct task_struct *tsk)
{
        unsigned cpu = smp_processor_id();

        if (likely(prev != next)) {
                /* stop flush ipis for the previous mm */
                /* clear this cpu's bit in the outgoing mm's cpumask */
                cpumask_clear_cpu(cpu, mm_cpumask(prev));
#ifdef CONFIG_SMP
                /* update the per-cpu TLB state */
                percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
                percpu_write(cpu_tlbstate.active_mm, next);
#endif
                /* set this cpu's bit in the incoming mm's cpumask */
                cpumask_set_cpu(cpu, mm_cpumask(next));

                /* Re-load page tables: load the incoming task's pgd into cr3 */
                load_cr3(next->pgd);

                /*
                 * load the LDT, if the LDT is different:
                 */
                if (unlikely(prev->context.ldt != next->context.ldt))
                        load_LDT_nolock(&next->context);
        }
#ifdef CONFIG_SMP
        else {  /* the two tasks share the same mm */
                percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
                BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);

                if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
                        /* We were in lazy tlb mode and leave_mm disabled
                         * tlb flush IPI delivery. We must reload CR3
                         * to make sure to use no freed page tables.
                         */
                        load_cr3(next->pgd);
                        load_LDT_nolock(&next->context);
                }
        }
#endif
}
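The point of mm_cpumask(mm) is that it records which CPUs may still hold TLB entries for this mm, so a page-table change only has to send flush IPIs to those CPUs. A minimal sketch of that idea, with a plain bitmask standing in for cpumask_t and all names invented for illustration:

/* Toy model of mm_cpumask bookkeeping: TLB-flush IPIs go only to
 * CPUs whose bit is set.  Illustrative, not kernel API. */
#include <stdio.h>

#define NCPU 4
static unsigned long cpu_mask;          /* stands in for mm_cpumask(mm) */

static void mm_switch_in(int cpu)  { cpu_mask |=  (1UL << cpu); }
static void mm_switch_out(int cpu) { cpu_mask &= ~(1UL << cpu); }

static void flush_tlb_mm(void)
{
        for (int cpu = 0; cpu < NCPU; cpu++)
                if (cpu_mask & (1UL << cpu))
                        printf("send flush IPI to cpu %d\n", cpu);
}

int main(void)
{
        mm_switch_in(0);        /* the mm runs on cpu 0 */
        mm_switch_in(2);        /* and on cpu 2 */
        mm_switch_out(0);       /* cpu 0 switched to another mm */
        flush_tlb_mm();         /* only cpu 2 gets the IPI */
        return 0;
}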


The register-level part of the switch is done by switch_to(). This macro saves the various registers in assembly and then calls the C function __switch_to().

The assembly implements the switch itself:

/*
 * Saving eflags is important. It switches not only IOPL between tasks,
 * it also protects other tasks from NT leaking through sysenter etc.
 */
#define switch_to(prev, next, last)                                     \
do {                                                                    \
        /*                                                              \
         * Context-switching clobbers all registers, so we clobber     \
         * them explicitly, via unused output variables.                \
         * (EAX and EBP is not listed because EBP is saved/restored    \
         * explicitly for wchan access and EAX is the return value of  \
         * __switch_to())                                               \
         */                                                             \
        unsigned long ebx, ecx, edx, esi, edi;                          \
                                                                        \
        asm volatile("pushfl\n\t"               /* save    flags */     \
                     "pushl %%ebp\n\t"          /* save    EBP   */     \
                     "movl %%esp,%[prev_sp]\n\t"        /* save    ESP   */ \
                     "movl %[next_sp],%%esp\n\t"        /* restore ESP   */ \
                     "movl $1f,%[prev_ip]\n\t"  /* save    EIP   */     \
                     /* push next_ip, then jmp below: when control      \
                        returns to label 1 the switch has happened */   \
                     "pushl %[next_ip]\n\t"     /* restore EIP   */     \
                     __switch_canary                                    \
                     "jmp __switch_to\n"        /* regparm call  */     \
                     "1:\t"                                             \
                     /* first instruction executed as the new task */   \
                     "popl %%ebp\n\t"           /* restore EBP   */     \
                     "popfl\n"                  /* restore flags */     \
                                                                        \
                     /* output parameters */                            \
                     : [prev_sp] "=m" (prev->thread.sp),                \
                       [prev_ip] "=m" (prev->thread.ip),                \
                       "=a" (last),                                     \
                                                                        \
                       /* clobbered output registers: */                \
                       "=b" (ebx), "=c" (ecx), "=d" (edx),              \
                       "=S" (esi), "=D" (edi)                           \
                                                                        \
                       __switch_canary_oparam                           \
                                                                        \
                       /* input parameters: */                          \
                     : [next_sp]  "m" (next->thread.sp),                \
                       [next_ip]  "m" (next->thread.ip),                \
                                                                        \
                       /* regparm parameters for __switch_to(): */      \
                       [prev]     "a" (prev),                           \
                       [next]     "d" (next)                            \
                                                                        \
                       __switch_canary_iparam                           \
                                                                        \
                     : /* reloaded segment registers */                 \
                        "memory");                                      \
} while (0)
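The heart of the macro is swapping stack pointers and return addresses so that each task resumes exactly where it last left off. The same effect can be observed from user space with POSIX ucontext; the sketch below is not the kernel's mechanism, just the same save-SP/restore-IP idea:

/* Two user-space "tasks" switching via saved register/stack state,
 * analogous to what switch_to() does with ESP/EIP.  POSIX ucontext,
 * obsolescent but still available on Linux/glibc. */
#include <stdio.h>
#include <ucontext.h>

static ucontext_t main_ctx, task_ctx;
static char task_stack[64 * 1024];

static void task(void)
{
        puts("task: first run");
        swapcontext(&task_ctx, &main_ctx);      /* "schedule" back to main */
        puts("task: resumed where it left off");
}

int main(void)
{
        getcontext(&task_ctx);
        task_ctx.uc_stack.ss_sp   = task_stack;
        task_ctx.uc_stack.ss_size = sizeof(task_stack);
        task_ctx.uc_link = &main_ctx;           /* return here when task ends */
        makecontext(&task_ctx, task, 0);

        swapcontext(&main_ctx, &task_ctx);      /* switch to the task */
        puts("main: back after first switch");
        swapcontext(&main_ctx, &task_ctx);      /* resume task at its label "1:" */
        puts("main: task finished");
        return 0;
}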


/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * We fsave/fwait so that an exception goes off at the right time
 * (as a call from the fsave or fwait in effect) rather than to
 * the wrong process. Lazy FP saving no longer makes any sense
 * with modern CPU's, and this simplifies a lot of things (SMP
 * and UP become the same).
 *
 * NOTE! We used to use the x86 hardware context switching. The
 * reason for not using it any more becomes apparent when you
 * try to recover gracefully from saved state that is no longer
 * valid (stale segment register values in particular). With the
 * hardware task-switch, there is no way to fix up bad state in
 * a reasonable manner.
 *
 * The fact that Intel documents the hardware task-switching to
 * be slow is a fairly red herring - this code is not noticeably
 * faster. However, there _is_ some room for improvement here,
 * so the performance issues may eventually be a valid point.
 * More important, however, is the fact that this allows us much
 * more flexibility.
 *
 * The return value (in %ax) will be the "prev" task after
 * the task-switch, and shows up in ret_from_fork in entry.S,
 * for example.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread,
                             *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);       /* init_tss is a per-cpu variable */
        bool preload_fpu;

        /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */

        /*
         * If the task has used fpu the last 5 timeslices, just do a full
         * restore of the math state immediately to avoid the trap; the
         * chances of needing FPU soon are obviously high now
         */
        preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

        /* save the outgoing task's FPU registers */
        __unlazy_fpu(prev_p);

        /* we're going to use this soon, after a few expensive things */
        if (preload_fpu)
                prefetch(next->xstate);

        /*
         * Reload esp0: store next_p->thread.esp0 in the esp0 field of this
         * cpu's TSS; any user-to-kernel privilege transition raised by the
         * sysenter instruction copies this address into the esp register.
         */
        load_sp0(tss, next);

        /*
         * Save away %gs. No need to save %fs, as it was saved on the
         * stack on entry.  No need to save %es and %ds, as those are
         * always kernel segments while inside the kernel.  Doing this
         * before setting the new TLS descriptors avoids the situation
         * where we temporarily have non-reloadable segments in %fs
         * and %gs.  This could be an issue if the NMI handler ever
         * used %fs or %gs (it does not today), or if the kernel is
         * running inside of a hypervisor layer.
         */
        lazy_save_gs(prev->gs);

        /*
         * Load the per-thread Thread-Local Storage descriptors into this
         * CPU's GDT; the three segment selectors live in the tls_array
         * field of the process descriptor.
         */
        load_TLS(next, cpu);

        /*
         * Restore IOPL if needed.  In normal use, the flags restore
         * in the switch assembly will handle this.  But if the kernel
         * is running virtualized at a non-zero CPL, the popf will
         * not restore flags, so it must be done in a separate step.
         */
        if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
                set_iopl_mask(next->iopl);

        /*
         * Now maybe handle debug registers and/or IO bitmaps
         */
        if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
                     task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
                __switch_to_xtra(prev_p, next_p, tss);

        /* If we're going to preload the fpu context, make sure clts
           is run while we're batching the cpu state updates. */
        if (preload_fpu)
                clts();

        /*
         * Leave lazy mode, flushing any hypercalls made here.
         * This must be done before restoring TLS segments so
         * the GDT and LDT are properly updated, and must be
         * done before math_state_restore, so the TS bit is up
         * to date.
         */
        arch_end_context_switch(next_p);

        if (preload_fpu)
                __math_state_restore();         /* load the incoming task's FPU registers */

        /*
         * Restore %gs if needed (which is common)
         */
        if (prev->gs | next->gs)
                lazy_load_gs(next->gs);

        percpu_write(current_task, next_p);

        return prev_p;
}
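The preload decision trades a possibly wasted eager FPU load against the device-not-available trap that lazy restore would take on the task's first FPU instruction. The kernel keeps fpu_counter per task; this toy sketch (hypothetical names) just makes the heuristic explicit:

/* Toy model of the fpu_counter preload heuristic in __switch_to():
 * preload eagerly only when the task used the FPU in each of its
 * recent timeslices.  Illustrative only, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

struct toy_task { unsigned char fpu_counter; };

/* at each switch-out: did the task touch the FPU this timeslice? */
static void account_fpu(struct toy_task *t, bool used_fpu)
{
        if (used_fpu)
                t->fpu_counter++;
        else
                t->fpu_counter = 0;     /* one FPU-free slice resets the streak */
}

static bool should_preload(const struct toy_task *t)
{
        return t->fpu_counter > 5;      /* same threshold as the code above */
}

int main(void)
{
        struct toy_task t = { 0 };

        for (int slice = 1; slice <= 8; slice++) {
                account_fpu(&t, true);
                printf("slice %d: preload=%d\n", slice, should_preload(&t));
        }
        account_fpu(&t, false);         /* streak broken */
        printf("after idle slice: preload=%d\n", should_preload(&t));
        return 0;
}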
static inline void __unlazy_fpu(struct task_struct *tsk)
{
        /*
         * TS_USEDFPU lives in the status field of the thread_info
         * descriptor; it records whether the task has used the
         * FPU/MMX/XMM registers during its current execution.
         */
        if (task_thread_info(tsk)->status & TS_USEDFPU) {
                /*
                 * tsk executed FPU/MMX/SSE or SSE2 instructions this
                 * time around, so the kernel must save the relevant
                 * hardware context.
                 */
                __save_init_fpu(tsk);
                stts();
        } else
                tsk->fpu_counter = 0;
}


static inline void __save_init_fpu(struct task_struct *tsk)
{
        /* use xsave when the CPU supports it, otherwise fxsave */
        if (task_thread_info(tsk)->status & TS_XSAVE)
                xsave(tsk);
        else
                fxsave(tsk);
        clear_fpu_state(tsk);
        task_thread_info(tsk)->status &= ~TS_USEDFPU;   /* clear the TS_USEDFPU flag */
}
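fxsave is an unprivileged instruction, so the save half of this path can be observed from user space: it dumps the x87/SSE state into a 512-byte, 16-byte-aligned area whose layout (per the Intel SDM) puts FCW at offset 0, FSW at offset 2, and MXCSR at offset 24. A minimal sketch for x86/x86-64 with GCC:

/* Dump a few fields of the FXSAVE area from user space; the same
 * instruction fxsave(tsk) ultimately executes in the kernel. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        /* FXSAVE needs a 512-byte area aligned to 16 bytes */
        uint8_t area[512] __attribute__((aligned(16))) = { 0 };
        volatile double a = 1.0, b = 3.0;
        volatile double x = a / b;      /* touch the FPU/SSE unit first */
        uint16_t fcw, fsw;
        uint32_t mxcsr;

        (void)x;
        __asm__ volatile("fxsave %0" : "=m" (area));

        memcpy(&fcw,   &area[0],  sizeof(fcw));
        memcpy(&fsw,   &area[2],  sizeof(fsw));
        memcpy(&mxcsr, &area[24], sizeof(mxcsr));
        printf("FCW=0x%04x FSW=0x%04x MXCSR=0x%08x\n", fcw, fsw, mxcsr);
        return 0;
}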


