Linux进程管理和调度-基于linux3.10

来源：互联网发布：电子图纸软件下载编辑：程序博客网时间：2024/06/12 18:57

linux操作系统的主要目的是管理和分配硬件资源，并为应用层提供一个良好的抽象接口。不论是内存管理子系统还是文件子系统亦或是网络子系统等都是为应用程序提供服务的，而应用程序映射到linux内核中称之为进程，出于性能原因，多个进程可能会被简化并组织为若干线程。

Linux进程可以把linux内核的其它子系统串接在一起，其和linux内核各个子系统均有联系，本文着重点就是梳理linux进程和各个子系统之间的关系，把握linux的脉络。还是基于以前分析的linux版本3.10

进程表示

Linux中进程表示的数据结构是task_struct，该结构是如此之重要，所以不得不一行不落地抄在这里。这个结构体有点长，就来细细地品吧。

<include/linux/sched.h>1034 struct task_struct {1035     volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */1036     void *stack;1037     atomic_t usage;1038     unsigned int flags; /* per process flags, defined below */1039     unsigned int ptrace;1040 1041 #ifdef CONFIG_SMP1042     struct llist_node wake_entry;1043     int on_cpu;1044 #endif1045     int on_rq;1046 1047     int prio, static_prio, normal_prio;1048     unsigned int rt_priority;1049     const struct sched_class *sched_class;1050     struct sched_entity se;1051     struct sched_rt_entity rt;1052 #ifdef CONFIG_CGROUP_SCHED1053     struct task_group *sched_task_group;1054 #endif1055 1056 #ifdef CONFIG_PREEMPT_NOTIFIERS1057     /* 抢占情况下的通知链，哈希链*/1058     struct hlist_head preempt_notifiers;1059 #endif1060 1061     /*1062      * fpu_counter contains the number of consecutive context switches1063      * that the FPU is used. If this is over a threshold, the lazy fpu1064      * saving becomes unlazy to save the trap. 
This is an unsigned char1065      * so that after 256 times the counter wraps and the behavior turns1066      * lazy again; this to deal with bursty apps that only use FPU for1067      * a short time1068      */1069     unsigned char fpu_counter;1070 #ifdef CONFIG_BLK_DEV_IO_TRACE1071     unsigned int btrace_seq;1072 #endif1073 1074     unsigned int policy;1075     int nr_cpus_allowed;1076     cpumask_t cpus_allowed;1077 1078 #ifdef CONFIG_PREEMPT_RCU1079     int rcu_read_lock_nesting;1080     char rcu_read_unlock_special;1081     struct list_head rcu_node_entry;1082 #endif /* #ifdef CONFIG_PREEMPT_RCU */1083 #ifdef CONFIG_TREE_PREEMPT_RCU1084     struct rcu_node *rcu_blocked_node;1085 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */1086 #ifdef CONFIG_RCU_BOOST1087     struct rt_mutex *rcu_boost_mutex;1088 #endif /* #ifdef CONFIG_RCU_BOOST */1089 1090 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)1091     struct sched_info sched_info;1092 #endif1093 1094     struct list_head tasks;1095 #ifdef CONFIG_SMP1096     struct plist_node pushable_tasks;1097 #endif1098 1099     struct mm_struct *mm, *active_mm;1100 #ifdef CONFIG_COMPAT_BRK1101     unsigned brk_randomized:1;1102 #endif1103 #if defined(SPLIT_RSS_COUNTING)1104     struct task_rss_stat    rss_stat;1105 #endif1106 /* task state */1107     int exit_state;1108     int exit_code, exit_signal;1109     int pdeath_signal;  /*  The signal sent when the parent dies  */1110     unsigned int jobctl;    /* JOBCTL_*, siglock protected */1111 1112     /* Used for emulating ABI behavior of previous Linux versions */1113     unsigned int personality;1114 1115     unsigned did_exec:1;1116     unsigned in_execve:1;   /* Tell the LSMs that the process is doing an1117                  * execve */1118     unsigned in_iowait:1;1119 1120     /* task may not gain privileges */1121     unsigned no_new_privs:1;1122 1123     /* Revert to default priority/policy when forking */1124     unsigned sched_reset_on_fork:1;1125     
unsigned sched_contributes_to_load:1;1126 1127     pid_t pid;1128     pid_t tgid;1129 1130 #ifdef CONFIG_CC_STACKPROTECTOR1131     /* Canary value for the -fstack-protector gcc feature */1132     unsigned long stack_canary;1133 #endif1134     /*1135      * pointers to (original) parent process, youngest child, younger sibling,1136      * older sibling, respectively.  (p->father can be replaced with1137      * p->real_parent->pid)1138      */1139     struct task_struct __rcu *real_parent; /* real parent process */1140     struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */1141     /*1142      * children/sibling forms the list of my natural children1143      */1144     struct list_head children;  /* list of my children */1145     struct list_head sibling;   /* linkage in my parent's children list */1146     struct task_struct *group_leader;   /* threadgroup leader */1147 1148     /*1149      * ptraced is the list of tasks this task is using ptrace on.1150      * This includes both natural children and PTRACE_ATTACH targets.1151      * p->ptrace_entry is p's link on the p->parent->ptraced list.1152      */1153     struct list_head ptraced;1154     struct list_head ptrace_entry;1155 1156     /* PID/PID hash table linkage. 
*/1157     struct pid_link pids[PIDTYPE_MAX];1158     struct list_head thread_group;1159 1160     struct completion *vfork_done;      /* for vfork() */1161     int __user *set_child_tid;      /* CLONE_CHILD_SETTID */1162     int __user *clear_child_tid;        /* CLONE_CHILD_CLEARTID */1163 1164     cputime_t utime, stime, utimescaled, stimescaled;1165     cputime_t gtime;1166 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE1167     struct cputime prev_cputime;1168 #endif1169 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN1170     seqlock_t vtime_seqlock;1171     unsigned long long vtime_snap;1172     enum {1173         VTIME_SLEEPING = 0,1174         VTIME_USER,1175         VTIME_SYS,1176     } vtime_snap_whence;1177 #endif1178     unsigned long nvcsw, nivcsw; /*进程切换计数 */1179     struct timespec start_time;         /* monotonic time */1180     struct timespec real_start_time;    /*启动以来的时间 */1181 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */1182     unsigned long min_flt, maj_flt;1183 1184     struct task_cputime cputime_expires;1185     struct list_head cpu_timers[3];1186 1187 /* process credentials */1188     const struct cred __rcu *real_cred; /* objective and real subjective task1189                      * credentials (COW) */1190     const struct cred __rcu *cred;  /* effective (overridable) subjective task1191                      * credentials (COW) */1192     char comm[TASK_COMM_LEN]; /* executable name excluding path1193                      - access with [gs]et_task_comm (which lock1194                        it with task_lock())1195                      - initialized normally by setup_new_exec */1196 /* file system info */1197     int link_count, total_link_count;1198 #ifdef CONFIG_SYSVIPC1199 /* ipc stuff */1200     struct sysv_sem sysvsem;1201 #endif1202 #ifdef CONFIG_DETECT_HUNG_TASK1203 /* hung task detection */1204     unsigned long last_switch_count;1205 #endif1206 /* CPU-specific state of this task */1207     
struct thread_struct thread;1208 /* filesystem information */1209     struct fs_struct *fs;1210 /* open file information */1211     struct files_struct *files;1212 /* namespaces */1213     struct nsproxy *nsproxy;1214 /* signal handlers */1215     struct signal_struct *signal;1216     struct sighand_struct *sighand;1217 1218     sigset_t blocked, real_blocked;1219     sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */1220     struct sigpending pending;1221 1222     unsigned long sas_ss_sp;1223     size_t sas_ss_size;1224     int (*notifier)(void *priv);1225     void *notifier_data;1226     sigset_t *notifier_mask;1227     struct callback_head *task_works;1228 1229     struct audit_context *audit_context;1230 #ifdef CONFIG_AUDITSYSCALL1231     kuid_t loginuid;1232     unsigned int sessionid;1233 #endif1234     struct seccomp seccomp;1235 1236 /* Thread group tracking */1237     u32 parent_exec_id;1238     u32 self_exec_id;1239 /* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,1240  * mempolicy */1241     spinlock_t alloc_lock;1242 1243     /* Protection of the PI data structures: */1244     raw_spinlock_t pi_lock;1245 1246 #ifdef CONFIG_RT_MUTEXES1247     /* PI waiters blocked on a rt_mutex held by this task */1248     struct plist_head pi_waiters;1249     /* Deadlock detection and priority inheritance handling */1250     struct rt_mutex_waiter *pi_blocked_on;1251 #endif1252 1253 #ifdef CONFIG_DEBUG_MUTEXES1254     /* mutex deadlock detection */1255     struct mutex_waiter *blocked_on;1256 #endif1257 #ifdef CONFIG_TRACE_IRQFLAGS1258     unsigned int irq_events;1259     unsigned long hardirq_enable_ip;1260     unsigned long hardirq_disable_ip;1261     unsigned int hardirq_enable_event;1262     unsigned int hardirq_disable_event;1263     int hardirqs_enabled;1264     int hardirq_context;1265     unsigned long softirq_disable_ip;1266     unsigned long softirq_enable_ip;1267     unsigned int softirq_disable_event;1268     
unsigned int softirq_enable_event;1269     int softirqs_enabled;1270     int softirq_context;1271 #endif1272 #ifdef CONFIG_LOCKDEP1273 # define MAX_LOCK_DEPTH 48UL1274     u64 curr_chain_key;1275     int lockdep_depth;1276     unsigned int lockdep_recursion;1277     struct held_lock held_locks[MAX_LOCK_DEPTH];1278     gfp_t lockdep_reclaim_gfp;1279 #endif1280 1281 /* journalling filesystem info */1282     void *journal_info;1283 1284 /* stacked block device info */1285     struct bio_list *bio_list;1286 1287 #ifdef CONFIG_BLOCK1288 /* stack plugging */1289     struct blk_plug *plug;1290 #endif1291 1292 /* VM state */1293     struct reclaim_state *reclaim_state;1294 1295     struct backing_dev_info *backing_dev_info;1296 1297     struct io_context *io_context;1298 1299     unsigned long ptrace_message;1300     siginfo_t *last_siginfo; /* For ptrace use.  */1301     struct task_io_accounting ioac;1302 #if defined(CONFIG_TASK_XACCT)1303     u64 acct_rss_mem1;  /* accumulated rss usage */1304     u64 acct_vm_mem1;   /* accumulated virtual memory usage */1305     cputime_t acct_timexpd; /* stime + utime since last update */1306 #endif1307 #ifdef CONFIG_CPUSETS1308     nodemask_t mems_allowed;    /* Protected by alloc_lock */1309     seqcount_t mems_allowed_seq;    /* Seqence no to catch updates */1310     int cpuset_mem_spread_rotor;1311     int cpuset_slab_spread_rotor;1312 #endif1313 #ifdef CONFIG_CGROUPS1314     /* Control Group info protected by css_set_lock */1315     struct css_set __rcu *cgroups;1316     /* cg_list protected by css_set_lock and tsk->alloc_lock */1317     struct list_head cg_list;1318 #endif1319 #ifdef CONFIG_FUTEX1320     struct robust_list_head __user *robust_list;1321 #ifdef CONFIG_COMPAT1322     struct compat_robust_list_head __user *compat_robust_list;1323 #endif1324     struct list_head pi_state_list;1325     struct futex_pi_state *pi_state_cache;1326 #endif1327 #ifdef CONFIG_PERF_EVENTS1328     struct perf_event_context 
*perf_event_ctxp[perf_nr_task_contexts];1329     struct mutex perf_event_mutex;1330     struct list_head perf_event_list;1331 #endif1332 #ifdef CONFIG_NUMA1333     struct mempolicy *mempolicy;    /* Protected by alloc_lock */1334     short il_next;1335     short pref_node_fork;1336 #endif1337 #ifdef CONFIG_NUMA_BALANCING1338     int numa_scan_seq;1339     int numa_migrate_seq;1340     unsigned int numa_scan_period;1341     u64 node_stamp;         /* migration stamp  */1342     struct callback_head numa_work;1343 #endif /* CONFIG_NUMA_BALANCING */1344 1345     struct rcu_head rcu;1346 1347     /*1348      * cache last used pipe for splice1349      */1350     struct pipe_inode_info *splice_pipe;1351 1352     struct page_frag task_frag;1353 1354 #ifdef  CONFIG_TASK_DELAY_ACCT1355     struct task_delay_info *delays;1356 #endif1357 #ifdef CONFIG_FAULT_INJECTION1358     int make_it_fail;1359 #endif1360     /*1361      * when (nr_dirtied >= nr_dirtied_pause), it's time to call1362      * balance_dirty_pages() for some dirty throttling pause1363      */1364     int nr_dirtied;1365     int nr_dirtied_pause;1366     unsigned long dirty_paused_when; /* start of a write-and-pause period */1367 1368 #ifdef CONFIG_LATENCYTOP1369     int latency_record_count;1370     struct latency_record latency_record[LT_SAVECOUNT];1371 #endif1372     /*1373      * time slack values; these are used to round up poll() and1374      * select() etc timeout values. 
These are in nanoseconds.1375      */1376     unsigned long timer_slack_ns;1377     unsigned long default_timer_slack_ns;1378 1379 #ifdef CONFIG_FUNCTION_GRAPH_TRACER1380     /* Index of current stored address in ret_stack */1381     int curr_ret_stack;1382     /* Stack of return addresses for return function tracing */1383     struct ftrace_ret_stack *ret_stack;1384     /* time stamp for last schedule */1385     unsigned long long ftrace_timestamp;1386     /*1387      * Number of functions that haven't been traced1388      * because of depth overrun.1389      */1390     atomic_t trace_overrun;1391     /* Pause for the tracing */1392     atomic_t tracing_graph_pause;1393 #endif1394 #ifdef CONFIG_TRACING1395     /* state flags for use by tracers */1396     unsigned long trace;1397     /* bitmask and counter of trace recursion */1398     unsigned long trace_recursion;1399 #endif /* CONFIG_TRACING */1400 #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */1401     struct memcg_batch_info {1402         int do_batch;   /* incremented when batch uncharge started */1403         struct mem_cgroup *memcg; /* target memcg of uncharge */1404         unsigned long nr_pages; /* uncharged usage */1405         unsigned long memsw_nr_pages; /* uncharged mem+swap usage */1406     } memcg_batch;1407     unsigned int memcg_kmem_skip_account;1408 #endif1409 #ifdef CONFIG_HAVE_HW_BREAKPOINT1410     atomic_t ptrace_bp_refcnt;1411 #endif1412 #ifdef CONFIG_UPROBES1413     struct uprobe_task *utask;1414 #endif1415 #if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)1416     unsigned int    sequential_io;1417     unsigned int    sequential_io_avg;1418 #endif1419 };

进程状态

<include/linux/sched.h>//进程正在运行，current指针指向的内容#define TASK_RUNNING0  //分别是可中断和不可中断的等待状态，这类进程通常在等待资源或者某个事件发生#define TASK_INTERRUPTIBLE1#define TASK_UNINTERRUPTIBLE2//进程暂停，SIGSTOP、SIGTSTP、SIGTTIN、SIGTTOU这类信号会触发进程到这一状态#define __TASK_STOPPED4//不属于进程状态，用于从停止的进程中，将当前被调试的那些与常规的进程区分出来。#define __TASK_TRACED8/* in tsk->exit_state *///1107可能的状态，僵死状态，进程已终止，但是父进程没有执行wait()系统调用，终止进程的信息也没有回收。#define EXIT_ZOMBIE16//wait()系统调用已经发出，而进程完全从系统移除之前的状态。#define EXIT_DEAD32/* in tsk->state again */#define TASK_DEAD64#define TASK_WAKEKILL128#define TASK_WAKING256#define TASK_PARKED512#define TASK_STATE_MAX1024

1108， exit_code进程退出码，exit_signal进程退出时发送的信号，比如如果一个进程的退出引起一个进程组成为孤儿进程组，则一个SIGHUP信号将被发送给该进程组。

1109，pdeath_signal是父进程终止时向子进程发送的信号。

1110作业控制，其可选值定义于include/linux/sched.h的1691~1710行

进程调度信息

1047~1051行，1074~1076

prio当前任务的动态优先级，其值影响任务的调度顺序

normal_prio任务的常规优先级，基于static_prio和调度策略计算

static_prio静态优先级，在进程创建时分配，该值会影响分配给任务的时间片的长短和非实时任务的动态优先级的计算。

rt_priority实时优先级，0表示非实时任务，[1,99]表示实时任务，值越大则优先级越高。

sched_class调度类，该调度类支持的操作函数集。

se调度实体，对单个任务或任务组进行调度。

Policy调度策略。有以下四种：

#define SCHED_NORMAL0#define SCHED_FIFO1#define SCHED_RR2#define SCHED_BATCH3

nr_cpus_allowed多核情况下，允许最多在几个核上运行。

cpus_allowed限制其能在运行的CPU。

进程管理

1094~1097行

Tasks所有进程均串接在该链表上。

pushable_tasks SMP实时调度管理per-CPU任务。

内存

1099~1105行

1099描述进程内存分布结构体，text段，data段，堆均位于此。

1101启用随机分配内存基地址标志，

1104，rss_stat线程缓存的信息

ID管理

1127~1128行

Pid进程ID，tgid线程组ID

进程创建接口

fork，创建子进程，父进程的所有资源以适当方式复制到子进程。为减少与调用相关的工作量，使用写时复制技术（copy-on-write）

vfork类似fork，但是共享父进程的数据，COW技术使该技术不再具有优势。

Exec从一个可执行的二进制文件加载另一个程序来代替当前的进程。

Clone用于创建linux线程，线程库基于它实现。它可以对父子之间的资源共享进行精确的控制。

do_fork

Do_fork函数复制进程，如果复制成功会启动新进程并等待其完成。

<kernel/fork.c>1563 long do_fork(unsigned long clone_flags,1564           unsigned long stack_start,1565           unsigned long stack_size,1566           int __user *parent_tidptr,1567           int __user *child_tidptr)

以上是do_fork函数的定义，之所以单独列出来是因为该函数的参数值得逐一说明。clone_flags控制父子进程之间哪些属性被复制、哪些被共享。可选的参数如下，这些flag的标志后面都有对其意义的注释：

#define CSIGNAL0x000000ff/* signal mask to be sent at exit */#define CLONE_VM0x00000100/* set if VM shared between processes */#define CLONE_FS0x00000200/* set if fs info shared between processes */#define CLONE_FILES0x00000400/* set if open files shared between processes */#define CLONE_SIGHAND0x00000800/* set if signal handlers and blocked signals shared */#define CLONE_PTRACE0x00002000/* set if we want to let tracing continue on the child too */#define CLONE_VFORK0x00004000/* set if the parent wants the child to wake it up on mm_release */#define CLONE_PARENT0x00008000/* set if we want to have the same parent as the cloner */#define CLONE_THREAD0x00010000/* Same thread group? */#define CLONE_NEWNS0x00020000/* New namespace group? */#define CLONE_SYSVSEM0x00040000/* share system V SEM_UNDO semantics */#define CLONE_SETTLS0x00080000/* create a new TLS for the child */#define CLONE_PARENT_SETTID0x00100000/* set the TID in the parent */#define CLONE_CHILD_CLEARTID0x00200000/* clear the TID in the child */#define CLONE_DETACHED0x00400000/* Unused, ignored */#define CLONE_UNTRACED0x00800000/* set if the tracing process can't force CLONE_PTRACE on this clone */#define CLONE_CHILD_SETTID0x01000000/* set the TID in the child *//* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)   and is now available for re-use. */#define CLONE_NEWUTS0x04000000/* New utsname group? */#define CLONE_NEWIPC0x08000000/* New ipcs */#define CLONE_NEWUSER0x10000000/* New user namespace */#define CLONE_NEWPID0x20000000/* New pid namespace */#define CLONE_NEWNET0x40000000/* New network namespace */#define CLONE_IO0x80000000/* Clone io context */

Stack_start是用户态下栈的起始地址。

Regs是指向寄存器集合的指针，存放了调用参数。（注：该参数存在于较早版本的内核中，上面列出的3.10版本do_fork定义已不再包含regs参数。）

Stack_size是用户态下栈大小。

parent_tidptr和child_tidptr分别指向用户空间的父子进程的PID。

线程和进程创建的系统调用接口如下：

1641 pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)1642 {1643     return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,1644         (unsigned long)arg, NULL, NULL);1645 }1646 1647 #ifdef __ARCH_WANT_SYS_FORK1648 SYSCALL_DEFINE0(fork)1649 {1650 #ifdef CONFIG_MMU1651     return do_fork(SIGCHLD, 0, 0, NULL, NULL);1652 #else1653     /* can not support in nommu mode */1654     return(-EINVAL);1655 #endif1656 }1657 #endif1667 #ifdef __ARCH_WANT_SYS_CLONE1668 #ifdef CONFIG_CLONE_BACKWARDS1669 SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,1670          int __user *, parent_tidptr,1671          int, tls_val,1672          int __user *, child_tidptr)1673 #elif defined(CONFIG_CLONE_BACKWARDS2)1674 SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,1675          int __user *, parent_tidptr,1676          int __user *, child_tidptr,1677          int, tls_val)1678 #else1679 SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,1680          int __user *, parent_tidptr,1681          int __user *, child_tidptr,1682          int, tls_val)1683 #endif1684 {1685     return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);1686 }1687 #endif

上述函数将进程/线程的创建归结为对do_fork的调用。如果do_fork成功了，则返回进程的PID，否则返回错误码。创建一个新进程和以往相比麻烦的地方是对命名空间的处理。由于fork会返回新进程的PID，如果创建了新的PID命名空间，则会调用task_pid_nr_ns获取在父命名空间中为新进程选择的PID，即发出fork调用的那个命名空间。如果没有创建新的PID命名空间，则直接调用task_pid_vnr获取局部的PID返回就可以了。

图1.2.1do_fork函数调用流程

该函数首先调用copy_process执行新进程的创建工作，其返回值就是一个task_struct类型的指针，这里创建的意义是能复制，尽量复用父进程的task_struct是创建的精髓，在创建新的网络命名空间也是这么一个意义。

copy_process的第一个参数控制复制内容，主要对命名空间、线程、资源共享的处理。最后一个参数trace用于指示是否启用对进程监控的功能。copy_process完成了do_fork的大部分工作。函数的执行过程如下。

图1.2.2 copy_process函数执行过程

<kernel/fork.c>1132 static struct task_struct *copy_process(unsigned long clone_flags,1133                     unsigned long stack_start,1134                     unsigned long stack_size,1135                     int __user *child_tidptr,1136                     struct pid *pid,1137                     int trace)

对flags和安全的合法性检查这里就跳过了，current是当前进程的task_struct结构体指针，这里对其进行复制。

1186     p = dup_task_struct(current);

dup_task_struct主要完成task_struct和thread_info的内存分配以及thread_info的拷贝工作。

图1.2.3 dup_task_struct函数主要工作

为task_struct和thread_info分配内存不是本文重点，有兴趣可以参考《内存管理-之内核内存管理-基于linux3.10》，剩下的是拷贝fpu内容和thread_info内容。这里总结一下该函数主要完成的工作：

1、这里fpu和thread_info的内容拷贝完毕了。

2、如果启用了栈保护功能（CONFIG_CC_STACKPROTECTOR），还会调用get_random_int函数设置task_struct的stack_canary值。

3、将usage成员设置成2，当前创建的进程正在使用，另外release_task函数也会使用。

4、在复制完成之后，该函数会检查打开的进程数是否超限，如果超限则放弃创建进程，除非当前用户是root用户或者分配了CAP_SYS_RESOURCE或CAP_SYS_ADMIN权限。

1200     if (atomic_read(&p->real_cred->user->processes) >=1201             task_rlimit(p, RLIMIT_NPROC)) {1202         if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&1203             p->real_cred->user != INIT_USER)1204             goto bad_fork_free;1205     }

如果资源限制检查通过，则会初始化task_struct的一些成员，

…p->did_exec = 0;delayacct_tsk_init(p);/* Must remain after dup_task_struct() */copy_flags(clone_flags, p);INIT_LIST_HEAD(&p->children);INIT_LIST_HEAD(&p->sibling);rcu_copy_process(p);p->vfork_done = NULL;spin_lock_init(&p->alloc_lock);…

接着调用sched_fork执行一些调度相关的设置，这包括将进程设置为TASK_RUNNING状态，为进程选择合适的调度类型（实时、CFS）以及优先级。即将该进程分配给一个CPU（单核还是SMP情况）。

sched_fork首先调用__sched_fork对调度实体进行相应的初始化，该函数还根据编译配置选项进行适当的初始化，这些编译选项包括调度统计、调度组、抢占通知函数、NUMA是否启动等。调度实体的初始化如下：

        p->se.on_rq= 0;p->se.exec_start= 0;p->se.sum_exec_runtime= 0;p->se.prev_sum_exec_runtime= 0;p->se.nr_migrations= 0;p->se.vruntime= 0;INIT_LIST_HEAD(&p->se.group_node);

上述的工作完成之后，就将进程的状态设置成TASK_RUNNING,

p->state = TASK_RUNNING;

同时将父进程的优先级传递给子进程。

p->prio = current->normal_prio;

接下来判断子进程是否采用了实时调度方法，如果非实时调度方法，则将其调度类成员设置成公平调度类。

if (!rt_prio(p->prio))p->sched_class = &fair_sched_class;

接着在持有pi_lock自旋锁并关闭本地中断的情况下设置进程所在的CPU，这是由set_task_cpu完成的，在SMP情况下，这有点复杂，对于单核情况下，该函数就是空的。

        raw_spin_lock_irqsave(&p->pi_lock, flags);set_task_cpu(p, cpu);raw_spin_unlock_irqrestore(&p->pi_lock, flags);

set_task_cpu要判断的第一件事就是新创建的进程是否和父进程在同一个CPU核上，如果不在同一个核上，则说明发生了任务迁移（从一个核到另一个核），发生了迁移则需要设置迁移通知链并且需要将创建进程所在的CPU核号进行更新。

接下来会调用若干的copy函数来复制或者共享父进程所拥有的一些内核子系统资源。就以一个进程所拥有的文件资源为例说明复制和共享的差异。

931 static int copy_files(unsigned long clone_flags, struct task_struct *tsk) 932 { 933     struct files_struct *oldf, *newf; 934     int error = 0; 935  936     /* 937      * A background process may not have any files ... 938      */ 939     oldf = current->files; 940     if (!oldf) 941         goto out; 942  943     if (clone_flags & CLONE_FILES) { 944         atomic_inc(&oldf->count); 945         goto out; 946     } 947  948     newf = dup_fd(oldf, &error); 949     if (!newf) 950         goto out; 951  952     tsk->files = newf; 953     error = 0; 954 out: 955     return error; 956 }

939~940行判断进程是否拥有文件，对于后台进程是不需要的，如果设置了CLONE_FILES，则是共享文件，这时简单地将父进程的文件引用计数加1就可以了，如果没有设置CLONE_FILES标志，则会调用dup_fd复制父进程所有的文件。其它资源的处理方法类似。这些flags在1.2.1节的开始就列出来了。

copy_namespaces有些特殊，其不是对父进程资源进行控制，而是根据flags表示为子进程选择是否创建相应的命名空间，可以参考《linux namespace-之使用》《网络命名空间（内核源码实现）--基于Linux3.10》

copy_thread的语义就更特殊一些了，其是依赖于CPU架构的，复制和包含一些线程数据，这些数据比较偏寄存器级。

接下来分配struct pid实例，这个参数是copy_process函数的倒数第二个参数，如果创建的不是init的进程，则需要在进程的命名空间中分配一个pid实例。

1350     if (pid != &init_struct_pid) {1351         retval = -ENOMEM;1352         pid = alloc_pid(p->nsproxy->pid_ns);1353         if (!pid)1354             goto bad_fork_cleanup_io;1355     }

关于启动一个新程序execve系统调用，参考《linux应用程序如何运行》

调度器实现

Linux采用红黑树和虚拟时钟技术来实现调度器的管理，所有进程按时间在一个红黑树中排序，等待CPU时间最长的进程是树的最左下侧的那个节点。调度的方式有两种一种是进程自动放弃，另一种是周期性调度。

Task_struct中和调度有关的成员如下：

struct task_struct {int prio, static_prio, normal_prio;unsigned int rt_priority;const struct sched_class *sched_class;struct sched_entity se;struct sched_rt_entity rt;#ifdef CONFIG_CGROUP_SCHEDstruct task_group *sched_task_group;#endifunsigned int policy;int nr_cpus_allowed;cpumask_t cpus_allowed;}

prio,static_prio, normal_prio分别用于表示动态优先级、静态优先级和普通优先级。静态优先级在进程启动时分配。可以通过nice和sched_setscheduler系统调用修改。normal_prio则是基于静态优先级和调度策略计算而得的普通优先级。Prio是任务当前的动态优先级，其值影响调度顺序。

rt_priority是实时调度优先级，为0则表示非实时，[1,99]表示实时优先级，值越大优先级越高。

sched_class所属调度器类。

Se是调度实体，CFS（completely fair scheduler）使用。

Rt 实时调度实体。

sched_task_group，进程调度组，这些组里的进程具有同样的调度优先级。

Policy调度策略。可选值如下：

#define SCHED_NORMAL0//CFS使用，默认#define SCHED_FIFO1//实时进程先进先出调度#define SCHED_RR2//实时进程轮询调度#define SCHED_BATCH3//CFS使用，批处理情况使用。

nr_cpus_allowed该进程可以在多少个CPU上运行。

cpus_allowed该进程可以在哪些CPU上运行的掩码，这里举个例子，说明这一字段的意义，一个CPU内有8个核，核0正在跑一个应用程序，该应用程序正在等待网络传递的视频数据，假如这时核号为3的核收到了该数据，这时需要将数据拷贝到0核cache或者将进程切换到核号为3的那个核上，但是想一想，这样开销必然大，有没有办法提高效率呢？有，其中一种办法就是将接收网络视频数据的进程限制在0号核上，这时接收到的数据cache一致性将非常好。

调度类

调度类提供了通用调度器和各个调度方法之间的关联。

<kernel/sched/sched.h>struct sched_class {const struct sched_class *next;void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);void (*yield_task) (struct rq *rq);bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);struct task_struct * (*pick_next_task) (struct rq *rq);void (*put_prev_task) (struct rq *rq, struct task_struct *p);#ifdef CONFIG_SMPint  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);void (*migrate_task_rq)(struct task_struct *p, int next_cpu);void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);void (*post_schedule) (struct rq *this_rq);void (*task_waking) (struct task_struct *task);void (*task_woken) (struct rq *this_rq, struct task_struct *task);void (*set_cpus_allowed)(struct task_struct *p, const struct cpumask *newmask);void (*rq_online)(struct rq *rq);void (*rq_offline)(struct rq *rq);#endifvoid (*set_curr_task) (struct rq *rq);void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);void (*task_fork) (struct task_struct *p);void (*switched_from) (struct rq *this_rq, struct task_struct *task);void (*switched_to) (struct rq *this_rq, struct task_struct *task);void (*prio_changed) (struct rq *this_rq, struct task_struct *task,     int oldprio);unsigned int (*get_rr_interval) (struct rq *rq, struct task_struct *task);#ifdef CONFIG_FAIR_GROUP_SCHEDvoid (*task_move_group) (struct task_struct *p, int on_rq);#endif};

不论是实时调度类还是非实时调度类，层次上它们位于同一层，但是实时进程会在完全公平进程之前被处理。其next成员将不同调度类的sched_class实例串接起来。调度类各个方法还是比较明显的。

enqueue_task和dequeue_task分别向就绪队列添加、删除一个新进程。

yield_task和yield_to_task都是自主放弃CPU控制权，但是yield_to_task是将CPU控制权移交给特定的进程。

check_preempt_curr用一个新唤醒的进程来抢占当前进程，wake_up_new_task会调用该函数。

pick_next_task选择下一个将要运行的进程，在当前进程由其它进程替换之前put_prev_task则被调用。

set_curr_task调度策略改变时会调用。

task_tick每次周期性调度器被激活时会调用该函数。

调度实体

972 struct sched_entity { 973     struct load_weight  load;       /* for load-balancing */ 974     struct rb_node      run_node; 975     struct list_head    group_node; 976     unsigned int        on_rq; 977  978     u64         exec_start; 979     u64         sum_exec_runtime; 980     u64         vruntime; 981     u64         prev_sum_exec_runtime; 982  983     u64         nr_migrations; 984  985 #ifdef CONFIG_SCHEDSTATS 986     struct sched_statistics statistics; 987 #endif 988  989 #ifdef CONFIG_FAIR_GROUP_SCHED 990     struct sched_entity *parent; 991     /* rq on which this entity is (to be) queued: */ 992     struct cfs_rq       *cfs_rq; 993     /* rq "owned" by this entity/group: */ 994     struct cfs_rq       *my_q; 995 #endif 996  997 /* 998  * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be 999  * removed when useful for applications beyond shares distribution (e.g.1000  * load-balance).1001  */1002 #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)1003     /* Per-entity load-tracking */1004     struct sched_avg    avg;1005 #endif1006 };

load为每一个调度实体确定了一个权重，该权重决定了各个实体占队列总负荷的比例。

run_node标准的树节点，允许调度实体存储在一个红黑树上。

On_rq指明该实体是否在一个调度队列上。

exec_start、sum_exec_runtime用于记录消耗的CPU时间，完全公平调度器使用。当进程的CPU控制权失去时会调用将sum_exec_runtime记录的时间值存放到prev_sum_exec_runtime里。

Vruntime是虚拟进程执行期间虚拟时钟上流逝的时间和前一个进程总的消耗的时间。

进程优先级计算方法

图1.3.1 进程优先级

进程的优先级分为实时优先级和普通优先级，可以通过nice命令设置进程的静态优先级，nice值的范围是[-20,19]，进程的实际优先级是动态优先级和静态优先级之和。

<include/linux/sched/rt.h>#define MAX_USER_RT_PRIO100#define MAX_RT_PRIOMAX_USER_RT_PRIO#define MAX_PRIO(MAX_RT_PRIO + 40)#define DEFAULT_PRIO(MAX_RT_PRIO + 20) <kernel/sched/sched.h>#define NICE_TO_PRIO(nice)(MAX_RT_PRIO + (nice) + 20)#define PRIO_TO_NICE(prio)((prio) - MAX_RT_PRIO - 20)#define TASK_NICE(p)PRIO_TO_NICE((p)->static_prio)

进程优先级的计算由effective_prio方法完成，通过nice值设置静态优先级就是调用其完成优先级的变更的。Normal_prio计算进程的普通优先级，该函数首先判断进程使用的是否是实时进程，实时进程的调度策略要么是SCHED_FIFO，要么是SCHED_RR，如果是实时进程，则其动态优先级的值是MAX_RT_PRIO-1 - p->rt_priority;如果是普通进程，则其优先级实际上就是静态优先级。

<kernel/sched/core.c>900 static inline int normal_prio(struct task_struct *p) 901 { 902     int prio; 903  904     if (task_has_rt_policy(p)) 905         prio = MAX_RT_PRIO-1 - p->rt_priority; 906     else 907         prio = __normal_prio(p); 908     return prio; 909 }

effective_prio函数计算进程的优先级，首先通过normal_prio函数计算进程的普通优先级，然后调用rt_prio检查动态优先级是否在实时进程优先级范围中，如果在实时进程优先级范围中，则返回该动态优先级，否则返回普通优先级。

<kernel/sched/core.c>918 static int effective_prio(struct task_struct *p) 919 { 920     p->normal_prio = normal_prio(p); 921     /* 922      * If we are RT tasks or we were boosted to RT priority, 923      * keep the priority unchanged. Otherwise, update priority 924      * to the normal priority: 925      */ 926     if (!rt_prio(p->prio)) 927         return p->normal_prio; 928     return p->prio; 929 }

进程load计算

调度实体的load成员反映的是进程的权重。

<include/linux/sched.h>struct load_weight {unsigned long weight, inv_weight;};

进程的nice值每降低一个，则将多获得10%的CPU时间，为了实现优先级和能够使用的CPU时间挂钩，采用权重转换表，权重转换表相邻项之间的比值近似为1.25，如果一个进程多获得10%的CPU时间，则意味着另一个进程必须减少10%的CPU时间，这意味着相邻权重的差值约为25%。如果只有权重为1024和820的两个进程，这两个进程占用的CPU差值应该是10%，1024/(1024+820) = ~55%，而820/(1024+820) = ~ 45%。差值约是10%。

<include/linux/sched.h>static const int prio_to_weight[40] = { /* -20 */     88761,     71755,     56483,     46273,     36291, /* -15 */     29154,     23254,     18705,     14949,     11916, /* -10 */      9548,      7620,      6100,      4904,      3906, /*  -5 */      3121,      2501,      1991,      1586,      1277, /*   0 */      1024,       820,       655,       526,       423, /*   5 */       335,       272,       215,       172,       137, /*  10 */       110,        87,        70,        56,        45, /*  15 */        36,        29,        23,        18,        15,};

优先级和权重转换由set_load_weight函数完成。755行判断是否是idle进程，idle优先级是最低的。

<kernel/sched/core.c>747 static void set_load_weight(struct task_struct *p) 748 { 749     int prio = p->static_prio - MAX_RT_PRIO; 750     struct load_weight *load = &p->se.load; 751  752     /* 753      * SCHED_IDLE tasks get minimal weight: 754      */ 755     if (p->policy == SCHED_IDLE) { 756         load->weight = scale_load(WEIGHT_IDLEPRIO); 757         load->inv_weight = WMULT_IDLEPRIO; 758         return; 759     } 760  761     load->weight = scale_load(prio_to_weight[prio]); 762     load->inv_weight = prio_to_wmult[prio]; 763 }

761行就是去前面的数组中获取权重，而762行是761行权重的倒数。

主调度器schedule

__schedule

__schedule函数的进入可能源于以下时机：

1、显式阻塞：mutex、semaphore、waitqueue等

2、 TIF_NEED_RESCHED标志，中断或者用户空间程序返回检测该标志。

3、 Wakeup之类的函数，实际上并不直接调用schedule()，而是将进程添加到run-queue。如果添加到运行队列的进程抢占了当前进程，wakeup函数将设置TIF_NEED_RESCHED标志，schedule函数将在下列可能的场景被调用：

a) 如果内核支持抢占（CONFIG_PREEMPT=y），在syscall或者异常上下文，在最外层的preempt_enable()可能调用schedule。

b) 在中断上下文，中断函数返回到可抢占上下文

c) 如果内核未使能抢占，则发生在如下场景，cond_resched()、explicit schedule()、syscall或者用户空间异常、中断处理返回到用户空间。

<kernel/sched/core.c>2950 static void __sched __schedule(void)2951 {2952     struct task_struct *prev, *next;2953     unsigned long *switch_count;2954     struct rq *rq;2955     int cpu;

prev指向即将要调度出去的进程，next是调度要执行的进程。在某些情况下，二者可能是一样的，比如只有一个进程可以运行时，则调度器查找的进程将还是这个进程。

2957  need_resched:2958     preempt_disable();2959     cpu = smp_processor_id();2960     rq = cpu_rq(cpu);2961     rcu_note_context_switch(cpu);2962     prev = rq->curr;

2958行禁止抢占，2959获得进程所在的当前CPU号，这是因为运行队列使用了per-CPU变量方式，必须根据在取得CPU号的基础上获得对应的变量。

2960获得该CPU的rq，每个核号均对应一个rq，这是一种多核之间的一种免锁算法，内核中很多地方使用到。

2961行rcu_note_context_switch设置rcu调度标志

2962行prev指向当前CPU上运行的进程，因为该进程是要被调度出去的对象。

2969     raw_spin_lock_irq(&rq->lock);2970 2971     switch_count = &prev->nivcsw;2972     if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {2973         if (unlikely(signal_pending_state(prev->state, prev))) {2974             prev->state = TASK_RUNNING;2975         } else {2976             deactivate_task(rq, prev, DEQUEUE_SLEEP);2977             prev->on_rq = 0;2978 2979             /*2980              * If a worker went to sleep, notify and ask workqueue2981              * whether it wants to wake up a task to maintain2982              * concurrency.2983              */2984             if (prev->flags & PF_WQ_WORKER) {2985                 struct task_struct *to_wakeup;2986 2987                 to_wakeup = wq_worker_sleeping(prev, cpu);2988                 if (to_wakeup)2989                     try_to_wake_up_local(to_wakeup);2990             }2991         }2992         switch_count = &prev->nvcsw;2993     }

2969行，rq不允许并发修改，这里先获取保护rq的自旋锁。

2971行，切换计数。

2972行，如果prev->state不等于0，即意味着进程处于不可运行或停止状态；(preempt_count() & PREEMPT_ACTIVE)非零则意味着本次调度是由内核抢占触发的。这一行的意义就是：如果该进程处于非运行状态，并且本次调度不是由内核抢占触发的，这就意味着该进程是主动放弃CPU。

2973这行判断是否有信号等待处理，unlikely的意义是不太可能有信号需要处理，不过如果真的有信号要处理，只能将进程状态再次设置成TASK_RUNNING状态。否则，2976行将该进程从rq上deactivate，其方法是调度类的dequeue_task方法。

2979~2990是对worker的处理，如果worker将要休眠，则通知一个工作队列以判断其是否想要唤醒一个进程以维持并发。

2995     pre_schedule(rq, prev);2996 2997     if (unlikely(!rq->nr_running))2998         idle_balance(cpu, rq);2999 3000     put_prev_task(rq, prev);3001     next = pick_next_task(rq);3002     clear_tsk_need_resched(prev);3003     rq->skip_clock_update = 0;

2995行，不论进程是主动让出CPU还是被抢占，都会执行到这里，该行调用调度类的pre_schedule方法进行相应的设置。

2997~2998行，如果当前CPU上可以运行的进程数等于0，则调用idle_balance对CPU资源进行均衡，如果是单核，则情况较为简单，什么也不做，如果是SMP情况，那么就需要从其它可运行进程较多的CPU核上迁移一些进程到该CPU的rq上。

3000行调用调度类的put_prev_task方法通知调度类自己当前的进程要被替换掉。

3001行调用调度类的pick_next_task方法选取合适的可以运行的进程。

3002行将先前进程的TIF_NEED_RESCHED标志清掉。

3005     if (likely(prev != next)) {3006         rq->nr_switches++;3007         rq->curr = next;3008         ++*switch_count;3009 3010         context_switch(rq, prev, next); /* unlocks the rq */3011         /*3012          * The context switch have flipped the stack from under us3013          * and restored the local variables which were saved when3014          * this task called schedule() in the past. prev == current3015          * is still correct, but it can be moved to another cpu/rq.3016          */3017         cpu = smp_processor_id();3018         rq = cpu_rq(cpu);3019     } else3020         raw_spin_unlock_irq(&rq->lock);

3005行判断prev和next是否相等，这是可能的，如果其它进程均在休眠，只有当前的进程可以运行，则pick_next_task方法选取到的进程依然是当前的进程。如果相等则执行3020行释放自旋锁。

否则3006~3008行更新统计信息，3010行执行进程上下文切换。

3017获得该进程所在的CPU核号，然后3018行更新其rq。

3022     post_schedule(rq);3023 3024     sched_preempt_enable_no_resched();3025     if (need_resched())3026         goto need_resched;3027 }

3022行调用调度类的post_schedule方法，进行后处理。

3025行判断是否需要进行重新调度，就是判断当前线程的TIF_NEED_RESCHED标志是否被设置，如果设置则跳转到2957行再次调度。

context_switch

进程切换由context_switch完成，该函数和CPU架构是息息相关的，对于IA32和IA64而言，新进程的全局描述符表和局部描述符表需要更新的，另外寄存器的内容也要切换成新进程之前的值。

1978 static inline void1979 context_switch(struct rq *rq, struct task_struct *prev,1980            struct task_struct *next)1981 {1982     struct mm_struct *mm, *oldmm;1983 1984     prepare_task_switch(rq, prev, next);1985 1986     mm = next->mm;1987     oldmm = prev->active_mm;1988     /*1989      * For paravirt, this is coupled with an exit in switch_to to1990      * combine the page table reload and the switch backend into1991      * one hypercall.1992      */1993     arch_start_context_switch(prev);1994 1995     if (!mm) {1996         next->active_mm = oldmm;1997         atomic_inc(&oldmm->mm_count);1998         enter_lazy_tlb(oldmm, next);1999     } else2000         switch_mm(oldmm, mm, next);2001 2002     if (!prev->mm) {2003         prev->active_mm = NULL;2004         rq->prev_mm = oldmm;2005     }2006     /*2007      * Since the runqueue lock will be released by the next2008      * task (which is an invalid locking op but in the case2009      * of the scheduler it's an obvious special-case), so we2010      * do an early lockdep release here:2011      */2012 #ifndef __ARCH_WANT_UNLOCKED_CTXSW2013     spin_release(&rq->lock.dep_map, 1, _THIS_IP_);2014 #endif2015 2016     context_tracking_task_switch(prev, next);2017     /* Here we just switch the register state and the stack. */2018     switch_to(prev, next, prev);2019 2020     barrier();2021     /*2022      * this_rq must be evaluated again because prev may have moved2023      * CPUs since it called schedule(), thus the 'rq' on its stack2024      * frame will be invalid.2025      */2026     finish_task_switch(this_rq(), prev);2027 }

1984行，prepare_task_switch为进程切换之前做些准备工作，这是为不同的CPU而提供的，在正式切换之前可以做些CPU需要特定完成的事。

1986~1987行，mm含义是进程拥有的内存描述符，而active_mm是进程使用的内存描述符，这两个字段乍一看似乎应该是一样的，事实上对于一般意义上的进程这两个字段确实是相同的，但是对于内核线程而言，其没有自己的地址空间，所以其mm字段等于NULL。

1993行，arch_start_context_switch是为半虚拟化（paravirt）提供的钩子函数。

1995行，判断是否是内核线程。

1996~1998行，内核线程的处理方法，其active_mm使用之前进程的，使用计数原子加一。接着enter_lazy_tlb将TLB（translation lookaside buffer）设置为惰性模式，这种模式在SMP情况下才有意义，如果是单核，由于所有用户态进程和内核线程共享同一段内核地址空间且不存在并发访问，所以可以不刷新TLB，而在SMP情况下情况就复杂一点，但是惰性TLB技术还是可以延迟TLB刷新操作以期提高系统性能。

2002~2005行，判断前一个进程是否是内核线程，如果是内核线程则其使用的内存描述符指针active_mm必须置为NULL，并把prev原先使用的内存描述符的指针保存到运行队列的prev_mm字段。

2018行，switch_to切换栈和寄存器里的值以适应新调度运行进程的需要。

2020行的barrier()是编译器屏障，保证其后2026行的this_rq()重新从内存读取，而不使用寄存器中缓存的旧值。这是因为SMP情况下，prev进程可能调度到其它CPU（负载均衡技术）上运行了，其栈上保存的rq已经失效，所以需要通过this_rq()重新获取rq。

完全公平调度类

<kernel/sched/fair.c>6135 const struct sched_class fair_sched_class = {6136     .next           = &idle_sched_class,6137     .enqueue_task       = enqueue_task_fair,6138     .dequeue_task       = dequeue_task_fair,6139     .yield_task     = yield_task_fair,6140     .yield_to_task      = yield_to_task_fair,6141 6142     .check_preempt_curr = check_preempt_wakeup,6143 6144     .pick_next_task     = pick_next_task_fair,6145     .put_prev_task      = put_prev_task_fair,6146 6147 #ifdef CONFIG_SMP6148     .select_task_rq     = select_task_rq_fair,6149 #ifdef CONFIG_FAIR_GROUP_SCHED6150     .migrate_task_rq    = migrate_task_rq_fair,6151 #endif6152     .rq_online      = rq_online_fair,6153     .rq_offline     = rq_offline_fair,6154 6155     .task_waking        = task_waking_fair,6156 #endif6157 6158     .set_curr_task          = set_curr_task_fair,6159     .task_tick      = task_tick_fair,6160     .task_fork      = task_fork_fair,6161 6162     .prio_changed       = prio_changed_fair,6163     .switched_from      = switched_from_fair,6164     .switched_to        = switched_to_fair,6165 6166     .get_rr_interval    = get_rr_interval_fair,6167 6168 #ifdef CONFIG_FAIR_GROUP_SCHED6169     .task_move_group    = task_move_group_fair,6170 #endif6171 };

CFS虚拟时钟

完全公平调度算法依赖于虚拟时钟。与虚拟时钟相关的计算由update_curr()函数完成。该函数计算当前进程的执行时间并将该时间值存放于delta_exec变量，然后将该值作为参数传递给__update_curr()函数，该函数根据进程优先级（load值）对时间进行加权后重新计算。当前进程的vruntime按加权后的时间增加。

系统时钟会周期性调用update_curr()，当一个进程变成可运行、阻塞、或者变成不可运行时该函数也会被调用。使用这种方式，vruntime是运行时间的准确度量并且指示了下一个要运行的进程。

<kernel/sched/fair.c>686 static void update_curr(struct cfs_rq *cfs_rq) 687 { 688     struct sched_entity *curr = cfs_rq->curr; 689     u64 now = rq_of(cfs_rq)->clock_task; 690     unsigned long delta_exec; 691  692     if (unlikely(!curr)) 693         return; 694  695     /* 696      * Get the amount of time the current task was running 697      * since the last time we changed load (this cannot 698      * overflow on 32 bits): 699      */ 700     delta_exec = (unsigned long)(now - curr->exec_start); 701     if (!delta_exec) 702         return; 703  704     __update_curr(cfs_rq, curr, delta_exec); 705     curr->exec_start = now; 706  707     if (entity_is_task(curr)) { 708         struct task_struct *curtask = task_of(curr); 709  710         trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); 711         cpuacct_charge(curtask, delta_exec); 712         account_group_exec_runtime(curtask, delta_exec); 713     } 714  715     account_cfs_rq_runtime(cfs_rq, delta_exec); 716 }

686行，该函数的参数是CFS调度队列，689行获得当前调度队列的实际时钟值。

692行，如果当前就绪队列上没有进程在运行，则直接返回。

700行，注释里说的很明白是自上一次负载权重改变之后当前进程执行的时间。

704行的函数主要是更新运行时间统计信息，后面会细细分析。

705行，将当前调度实体（进程）的开始执行时间exec_start更新为当前运行队列的时钟值。

707行，判断该调度实体对应的是一个进程还是一个调度组。二者的区别在于，组调度实体有其自己的运行队列，而进程是没有运行队列的。

708~712行是针对进程情况进行的一些工作，这些工作包括统计信息变更，支持cgroup情况下的CPU统计信息，以及当前进程运行的总时间更新。

<kernel/sched/fair.c>669 static inline void 670 __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, 671           unsigned long delta_exec) 672 { 673     unsigned long delta_exec_weighted; 674  675     schedstat_set(curr->statistics.exec_max, 676               max((u64)delta_exec, curr->statistics.exec_max)); 677  678     curr->sum_exec_runtime += delta_exec; 679     schedstat_add(cfs_rq, exec_clock, delta_exec); 680     delta_exec_weighted = calc_delta_fair(delta_exec, curr); 681  682     curr->vruntime += delta_exec_weighted; 683     update_min_vruntime(cfs_rq); 684 }

675行将调度实体一次运行的最长时间值exec_max设置成delta_exec和当前最长时间值中较大的那个。

678行更新当前调度实体执行的总时间，总时间=以前执行时间之和+本次执行时间

679行在内核配置选项打开了调度统计后，该函数会将cfs_rq的exec_clock成员加上delta_exec以更新调度队列执行的总时间。

680行根据前一次调度实体运行的时间值重新计算时间。这个计算过程有点琐碎。

<kernel/sched/fair.c>597 static inline unsigned long 598 calc_delta_fair(unsigned long delta, struct sched_entity *se) 599 { 600     if (unlikely(se->load.weight != NICE_0_LOAD)) 601         delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); 602  603     return delta; 604 }

首先600行的NICE_0_LOAD，从字面意思来看是nice值等于0时的load值，该值在1.3.4节的prio_to_weight数组里定义好了，值为1024。所以这句话的意思是该调度实体的load值不太可能不等于NICE_0_LOAD，即大多数情况下进程的nice值等于0。如果确实是nice值等于0的进程，则直接返回上一个进程执行的时间值，否则进入601行。

calc_delta_mine的第一个参数是前一个进程执行的时间值，第二个参数是1024，第三个参数是调度实体的load值。

<kernel/sched/fair.c>179 static unsigned long 180 calc_delta_mine(unsigned long delta_exec, unsigned long weight, 181         struct load_weight *lw) 182 { 183     u64 tmp; 184  185     /* 186      * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched 187      * entities since MIN_SHARES = 2. Treat weight as 1 if less than 188      * 2^SCHED_LOAD_RESOLUTION. 189      */ 190     if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) 191         tmp = (u64)delta_exec * scale_load_down(weight); 192     else 193         tmp = (u64)delta_exec; 194  195     if (!lw->inv_weight) { 196         unsigned long w = scale_load_down(lw->weight); 197  198         if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) 199             lw->inv_weight = 1; 200         else if (unlikely(!w)) 201             lw->inv_weight = WMULT_CONST; 202         else 203             lw->inv_weight = WMULT_CONST / w; 204     } 205  206     /* 207      * Check whether we'd overflow the 64-bit multiplication: 208      */ 209     if (unlikely(tmp > WMULT_CONST)) 210         tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, 211             WMULT_SHIFT/2); 212     else 213         tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); 214  215     return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 216 }

190~193行，得到tmp= (u64)delta_exec * 1024；

195行的inv_weight变量从名称意义上看是权重的倒数，实际上是1.3.4节的prio_to_weight的倒数。这个inv_weight值的计算方法是：(2^32/load)，所以会得到一个求倒数后的数组。

static const u32 prio_to_wmult[40] = { /* -20 */     48388,     59856,     76040,     92818,    118348, /* -15 */    147320,    184698,    229616,    287308,    360437, /* -10 */    449829,    563644,    704093,    875809,   1099582, /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326, /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587, /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126, /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717, /*  15 */ 119304647,  148102320,  186737708,  238609294,  286331153,};

抛去195行~212行先不看，因为他们成立的概率比较小，213行的代码

SRR(tmp * lw->inv_weight, WMULT_SHIFT); WMULT_SHIFT的值等于32，经过翻译后可以得到如下的：

(u64)delta_exec * 1024 * (2^32/load)>>32

把上面的计算式整理一下可得delta_exec_weighted ≈ delta_exec * 1024 / load，并将其赋给真正使用这个值的地方。

curr->vruntime += delta_exec_weighted;

通过上面两行就可以知道虚拟运行时间是如何计算的。

0 0