Linux从用户层到内核层系列 - 进程管理系列1: 进程之子承父业

来源：互联网　发布：unity3d 遮挡透明　编辑：程序博客网　时间：2024/05/24 05:18

题记：本系列文章的目的是抛开书本从源代码和使用的角度分析Linux内核和相关源代码，byhankswang和你一起玩转linux开发

轻松搞定TCP/IP协议栈，原创文章欢迎交流, byhankswang@gmail.com

欢迎加入到CHLK - Linux开发交流群 QQ：327084515 讨论Linux开发相关问题

进程之子承父业

在创建用户进程、用户线程和内核线程的时候，不同的系统调用是如何让新创建的进程或线程继承父进程相关属性的？对于虚拟内存、文件系统、相关文件、信号量等属性又是如何区分的？

1.首先从克隆父进程时的FLAG说起

CLONE时的FLAG就好比父亲签署的文件，直接指定儿子可以继承父亲的哪些家产，哪些目前是父子共用的，哪些需要儿子独立打拼白手起家的，下面的这些FLAG注释简单明了，就不翻译了。

/* within file linux-3.9.3/include/uapi/linux/sched.h */

/*
 * cloning flags:
 *
 * Bit mask passed as clone_flags to do_fork()/copy_process(). The low byte
 * (CSIGNAL) carries the signal number sent when the child exits; each
 * remaining bit selects what the child shares with, or gets separately
 * from, its parent.
 */
#define CSIGNAL		0x000000ff	/* signal mask to be sent at exit */
#define CLONE_VM	0x00000100	/* set if VM shared between processes */
#define CLONE_FS	0x00000200	/* set if fs info shared between processes */
#define CLONE_FILES	0x00000400	/* set if open files shared between processes */
#define CLONE_SIGHAND	0x00000800	/* set if signal handlers and blocked signals shared */
#define CLONE_PTRACE	0x00002000	/* set if we want to let tracing continue on the child too */
#define CLONE_VFORK	0x00004000	/* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT	0x00008000	/* set if we want to have the same parent as the cloner */
#define CLONE_THREAD	0x00010000	/* Same thread group? */
#define CLONE_NEWNS	0x00020000	/* New namespace group? */
#define CLONE_SYSVSEM	0x00040000	/* share system V SEM_UNDO semantics */
#define CLONE_SETTLS	0x00080000	/* create a new TLS for the child */
#define CLONE_PARENT_SETTID	0x00100000	/* set the TID in the parent */
#define CLONE_CHILD_CLEARTID	0x00200000	/* clear the TID in the child */
#define CLONE_DETACHED		0x00400000	/* Unused, ignored */
#define CLONE_UNTRACED		0x00800000	/* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID	0x01000000	/* set the TID in the child */
/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
   and is now available for re-use. */
#define CLONE_NEWUTS		0x04000000	/* New utsname group? */
#define CLONE_NEWIPC		0x08000000	/* New ipcs */
#define CLONE_NEWUSER		0x10000000	/* New user namespace */
#define CLONE_NEWPID		0x20000000	/* New pid namespace */
#define CLONE_NEWNET		0x40000000	/* New network namespace */
#define CLONE_IO		0x80000000	/* Clone io context */

2.系统调用fork和vfork

在用户空间我们使用fork来创建一个进程，关于系统调用fork的实现，可以参考对系统调用的分析博文《Linux从用户层到内核层系列 - TCP/IP协议栈部分系列6:linux 系统调用中断向量表》和姊妹篇博文《Linux从用户层到内核层系列 - TCP/IP协议栈部分系列11: 再话Linux系统调用》，相信看完这两篇博文之后，对系统调用会有直观的理解。来点干货吧，我们从标记位的角度来看看系统调用fork是如何为子进程合法地获得父进程的资源的。

#ifdef __ARCH_WANT_SYS_FORK
/*
 * fork system call entry point.
 *
 * Thin wrapper around do_fork(): the only flag passed is SIGCHLD in the
 * CSIGNAL byte, and no CLONE_* sharing bit is set. Stack and TID pointer
 * arguments are all zero/NULL.
 *
 * Returns whatever do_fork() returns when CONFIG_MMU is set, and -EINVAL
 * when it is not.
 */
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
	return do_fork(SIGCHLD, 0, 0, NULL, NULL);
#else
	/* can not support in nommu mode */
	return(-EINVAL);
#endif
}
#endif

其实fork函数只是对do_fork函数的一个简单的封装，在与处理器平台无关的内核代码部分中，fork、vfork和kernel_thread都对do_fork进行了封装，略有区别的就是标记位的不同，我们再看一下vfork和kernel_thread对do_fork是如何使用标记位的。

#ifdef __ARCH_WANT_SYS_VFORKSYSCALL_DEFINE0(vfork){return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, 0, NULL, NULL);}#endif/* * Create a kernel thread. */pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags){return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,(unsigned long)arg, NULL, NULL);}

源码告诉了我们一切，从用户层开发角度，你说fork和vfork的区别千千万，不过直接看kernel的源码，一目了然。

fork使用的标记位：SIGCHLD //新创建的子进程终结的时候，发送信号通知其父进程

vfork使用的标记位：CLONE_VFORK | CLONE_VM | SIGCHLD //CLONE_VM表示新创建的子进程与父进程共享虚拟内存（包括数据段），CLONE_VFORK表示父进程需要子进程在mm_release时将其唤醒；至于如何copy请看内核源码copy_process函数

kernel_thread使用的标记位：flags | CLONE_VM | CLONE_UNTRACED //在调用者传入的flags基础上，共享虚拟内存，并且不允许跟踪进程对其强加CLONE_PTRACE

而且我们再回头看一遍kernel_thread的源码：

#ifdef __ARCH_WANT_SYS_VFORKSYSCALL_DEFINE0(vfork){return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, 0, NULL, NULL);}#endif/* * Create a kernel thread. */pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags){return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,(unsigned long)arg, NULL, NULL);}

可以看到，新创建的内核线程是需要指定起始执行函数和其参数的。

另外列上源码，看看do_fork的参数：

/*
 * Prototype of do_fork().
 *
 * @clone_flags:   CLONE_* bits OR-ed with the exit signal in the CSIGNAL byte
 * @stack_start:   forwarded to copy_process(); kernel_thread() passes the
 *                 thread function here
 * @stack_size:    forwarded to copy_process(); kernel_thread() passes the
 *                 thread function's argument here
 * @parent_tidptr: user pointer that receives the new task's id when
 *                 CLONE_PARENT_SETTID is set
 * @child_tidptr:  user pointer forwarded to copy_process() for
 *                 CLONE_CHILD_SETTID / CLONE_CHILD_CLEARTID
 */
long do_fork(unsigned long clone_flags,
	     unsigned long stack_start,
	     unsigned long stack_size,
	     int __user *parent_tidptr,
	     int __user *child_tidptr);

对于创建进程和线程比较重要的do_fork函数和copy_process函数，我们这里列出源代码，懒得去翻内核源码的童鞋直接读下面的代码即可。如何使用上面的这些标记位，下面的代码写得清清楚楚，请保持耐心，仔细品味。

/*
 * Common implementation behind fork, vfork and kernel_thread.
 *
 * Validates the flag combination, decides which ptrace event (if any) to
 * report, creates the new task with copy_process(), wakes it up and, for
 * CLONE_VFORK, waits for the child's vfork completion.
 *
 * @clone_flags:   CLONE_* bits OR-ed with the exit signal in the CSIGNAL byte
 * @stack_start:   forwarded unchanged to copy_process()
 * @stack_size:    forwarded unchanged to copy_process()
 * @parent_tidptr: user pointer written with the new task's id when
 *                 CLONE_PARENT_SETTID is set
 * @child_tidptr:  forwarded unchanged to copy_process()
 *
 * Returns task_pid_vnr() of the new task on success, -EINVAL for a rejected
 * flag combination, or PTR_ERR() of the copy_process() result on failure.
 */
long do_fork(unsigned long clone_flags,
	      unsigned long stack_start,
	      unsigned long stack_size,
	      int __user *parent_tidptr,
	      int __user *child_tidptr)
{
	struct task_struct *p;
	int trace = 0;
	long nr;

	/*
	 * Do some preliminary argument and permissions checking before we
	 * actually start allocating stuff
	 */
	if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) {
		if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
			return -EINVAL;
	}

	/*
	 * Determine whether and which event to report to ptracer.  When
	 * called from kernel_thread or CLONE_UNTRACED is explicitly
	 * requested, no event is reported; otherwise, report if the event
	 * for the type of forking is enabled.
	 */
	if (!(clone_flags & CLONE_UNTRACED)) {
		if (clone_flags & CLONE_VFORK)
			trace = PTRACE_EVENT_VFORK;
		else if ((clone_flags & CSIGNAL) != SIGCHLD)
			trace = PTRACE_EVENT_CLONE;
		else
			trace = PTRACE_EVENT_FORK;

		if (likely(!ptrace_event_enabled(current, trace)))
			trace = 0;
	}

	p = copy_process(clone_flags, stack_start, stack_size,
			 child_tidptr, NULL, trace);
	/*
	 * Do this prior waking up the new thread - the thread pointer
	 * might get invalid after that point, if the thread exits quickly.
	 */
	if (!IS_ERR(p)) {
		struct completion vfork;

		trace_sched_process_fork(current, p);

		nr = task_pid_vnr(p);

		if (clone_flags & CLONE_PARENT_SETTID)
			put_user(nr, parent_tidptr);

		if (clone_flags & CLONE_VFORK) {
			p->vfork_done = &vfork;
			init_completion(&vfork);
			get_task_struct(p);
		}

		wake_up_new_task(p);

		/* forking complete and child started to run, tell ptracer */
		if (unlikely(trace))
			ptrace_event(trace, nr);

		if (clone_flags & CLONE_VFORK) {
			if (!wait_for_vfork_done(p, &vfork))
				ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
		}
	} else {
		nr = PTR_ERR(p);
	}
	return nr;
}

copy_process函数源码，可以看到copy_process函数返回的是task_struct指针，这正是我们所关心的。

/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 *
 * @clone_flags:  CLONE_* bits plus exit signal; checked for invalid
 *                combinations and passed to every copy_*() helper
 * @stack_start:  forwarded to copy_thread()
 * @stack_size:   forwarded to copy_thread()
 * @child_tidptr: stored in set_child_tid / clear_child_tid when the
 *                matching CLONE_CHILD_* flag is set
 * @pid:          used as-is when it is &init_struct_pid; otherwise a new
 *                pid is allocated with alloc_pid()
 * @trace:        non-zero makes ptrace_init_task() treat the child as traced
 *
 * Returns the new task_struct, or an ERR_PTR() value after undoing the
 * steps already taken through the bad_fork_* labels.
 */
static struct task_struct *copy_process(unsigned long clone_flags,
					unsigned long stack_start,
					unsigned long stack_size,
					int __user *child_tidptr,
					struct pid *pid,
					int trace)
{
	int retval;
	struct task_struct *p;

	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
		return ERR_PTR(-EINVAL);

	if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
		return ERR_PTR(-EINVAL);

	/*
	 * Thread groups must share signals as well, and detached threads
	 * can only be started up within the thread group.
	 */
	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
		return ERR_PTR(-EINVAL);

	/*
	 * Shared signal handlers imply shared VM. By way of the above,
	 * thread groups also imply shared VM. Blocking this case allows
	 * for various simplifications in other code.
	 */
	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
		return ERR_PTR(-EINVAL);

	/*
	 * Siblings of global init remain as zombies on exit since they are
	 * not reaped by their parent (swapper). To solve this and to avoid
	 * multi-rooted process trees, prevent global and container-inits
	 * from creating siblings.
	 */
	if ((clone_flags & CLONE_PARENT) &&
				current->signal->flags & SIGNAL_UNKILLABLE)
		return ERR_PTR(-EINVAL);

	/*
	 * If the new process will be in a different pid namespace
	 * don't allow the creation of threads.
	 */
	if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) &&
	    (task_active_pid_ns(current) != current->nsproxy->pid_ns))
		return ERR_PTR(-EINVAL);

	retval = security_task_create(clone_flags);
	if (retval)
		goto fork_out;

	retval = -ENOMEM;
	p = dup_task_struct(current);
	if (!p)
		goto fork_out;

	ftrace_graph_init_task(p);
	get_seccomp_filter(p);

	rt_mutex_init_task(p);

#ifdef CONFIG_PROVE_LOCKING
	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
	retval = -EAGAIN;
	if (atomic_read(&p->real_cred->user->processes) >=
			task_rlimit(p, RLIMIT_NPROC)) {
		if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
		    p->real_cred->user != INIT_USER)
			goto bad_fork_free;
	}
	current->flags &= ~PF_NPROC_EXCEEDED;

	retval = copy_creds(p, clone_flags);
	if (retval < 0)
		goto bad_fork_free;

	/*
	 * If multiple threads are within copy_process(), then this check
	 * triggers too late. This doesn't hurt, the check is only there
	 * to stop root fork bombs.
	 */
	retval = -EAGAIN;
	if (nr_threads >= max_threads)
		goto bad_fork_cleanup_count;

	if (!try_module_get(task_thread_info(p)->exec_domain->module))
		goto bad_fork_cleanup_count;

	p->did_exec = 0;
	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
	copy_flags(clone_flags, p);
	INIT_LIST_HEAD(&p->children);
	INIT_LIST_HEAD(&p->sibling);
	rcu_copy_process(p);
	p->vfork_done = NULL;
	spin_lock_init(&p->alloc_lock);

	init_sigpending(&p->pending);

	p->utime = p->stime = p->gtime = 0;
	p->utimescaled = p->stimescaled = 0;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
	p->prev_cputime.utime = p->prev_cputime.stime = 0;
#endif
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
	seqlock_init(&p->vtime_seqlock);
	p->vtime_snap = 0;
	p->vtime_snap_whence = VTIME_SLEEPING;
#endif

#if defined(SPLIT_RSS_COUNTING)
	memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif

	p->default_timer_slack_ns = current->timer_slack_ns;

	task_io_accounting_init(&p->ioac);
	acct_clear_integrals(p);

	posix_cpu_timers_init(p);

	do_posix_clock_monotonic_gettime(&p->start_time);
	p->real_start_time = p->start_time;
	monotonic_to_bootbased(&p->real_start_time);
	p->io_context = NULL;
	p->audit_context = NULL;
	if (clone_flags & CLONE_THREAD)
		threadgroup_change_begin(current);
	cgroup_fork(p);
#ifdef CONFIG_NUMA
	p->mempolicy = mpol_dup(p->mempolicy);
	if (IS_ERR(p->mempolicy)) {
		retval = PTR_ERR(p->mempolicy);
		p->mempolicy = NULL;
		goto bad_fork_cleanup_cgroup;
	}
	mpol_fix_fork_child_flag(p);
#endif
#ifdef CONFIG_CPUSETS
	p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
	p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
	seqcount_init(&p->mems_allowed_seq);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
	p->irq_events = 0;
	p->hardirqs_enabled = 0;
	p->hardirq_enable_ip = 0;
	p->hardirq_enable_event = 0;
	p->hardirq_disable_ip = _THIS_IP_;
	p->hardirq_disable_event = 0;
	p->softirqs_enabled = 1;
	p->softirq_enable_ip = _THIS_IP_;
	p->softirq_enable_event = 0;
	p->softirq_disable_ip = 0;
	p->softirq_disable_event = 0;
	p->hardirq_context = 0;
	p->softirq_context = 0;
#endif
#ifdef CONFIG_LOCKDEP
	p->lockdep_depth = 0; /* no locks held yet */
	p->curr_chain_key = 0;
	p->lockdep_recursion = 0;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
	p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_MEMCG
	p->memcg_batch.do_batch = 0;
	p->memcg_batch.memcg = NULL;
#endif

	/* Perform scheduler related setup. Assign this task to a CPU. */
	sched_fork(p);

	retval = perf_event_init_task(p);
	if (retval)
		goto bad_fork_cleanup_policy;
	retval = audit_alloc(p);
	if (retval)
		goto bad_fork_cleanup_policy;
	/* copy all the process information */
	retval = copy_semundo(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_audit;
	retval = copy_files(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_semundo;
	retval = copy_fs(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_files;
	retval = copy_sighand(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_fs;
	retval = copy_signal(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_sighand;
	retval = copy_mm(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_signal;
	retval = copy_namespaces(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_mm;
	retval = copy_io(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_namespaces;
	retval = copy_thread(clone_flags, stack_start, stack_size, p);
	if (retval)
		goto bad_fork_cleanup_io;

	if (pid != &init_struct_pid) {
		retval = -ENOMEM;
		pid = alloc_pid(p->nsproxy->pid_ns);
		if (!pid)
			goto bad_fork_cleanup_io;
	}

	p->pid = pid_nr(pid);
	p->tgid = p->pid;
	if (clone_flags & CLONE_THREAD)
		p->tgid = current->tgid;

	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
	/*
	 * Clear TID on mm_release()?
	 */
	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
#ifdef CONFIG_BLOCK
	p->plug = NULL;
#endif
#ifdef CONFIG_FUTEX
	p->robust_list = NULL;
#ifdef CONFIG_COMPAT
	p->compat_robust_list = NULL;
#endif
	INIT_LIST_HEAD(&p->pi_state_list);
	p->pi_state_cache = NULL;
#endif
	uprobe_copy_process(p);
	/*
	 * sigaltstack should be cleared when sharing the same VM
	 */
	if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
		p->sas_ss_sp = p->sas_ss_size = 0;

	/*
	 * Syscall tracing and stepping should be turned off in the
	 * child regardless of CLONE_PTRACE.
	 */
	user_disable_single_step(p);
	clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
	clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
	clear_all_latency_tracing(p);

	/* ok, now we should be set up.. */
	if (clone_flags & CLONE_THREAD)
		p->exit_signal = -1;
	else if (clone_flags & CLONE_PARENT)
		p->exit_signal = current->group_leader->exit_signal;
	else
		p->exit_signal = (clone_flags & CSIGNAL);

	p->pdeath_signal = 0;
	p->exit_state = 0;

	p->nr_dirtied = 0;
	p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
	p->dirty_paused_when = 0;

	/*
	 * Ok, make it visible to the rest of the system.
	 * We dont wake it up yet.
	 */
	p->group_leader = p;
	INIT_LIST_HEAD(&p->thread_group);
	p->task_works = NULL;

	/* Need tasklist lock for parent etc handling! */
	write_lock_irq(&tasklist_lock);

	/* CLONE_PARENT re-uses the old parent */
	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
		p->real_parent = current->real_parent;
		p->parent_exec_id = current->parent_exec_id;
	} else {
		p->real_parent = current;
		p->parent_exec_id = current->self_exec_id;
	}

	spin_lock(&current->sighand->siglock);

	/*
	 * Process group and session signals need to be delivered to just the
	 * parent before the fork or both the parent and the child after the
	 * fork. Restart if a signal comes in before we add the new process to
	 * it's process group.
	 * A fatal signal pending means that current will exit, so the new
	 * thread can't slip out of an OOM kill (or normal SIGKILL).
	*/
	recalc_sigpending();
	if (signal_pending(current)) {
		spin_unlock(&current->sighand->siglock);
		write_unlock_irq(&tasklist_lock);
		retval = -ERESTARTNOINTR;
		goto bad_fork_free_pid;
	}

	if (clone_flags & CLONE_THREAD) {
		current->signal->nr_threads++;
		atomic_inc(&current->signal->live);
		atomic_inc(&current->signal->sigcnt);
		p->group_leader = current->group_leader;
		list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
	}

	if (likely(p->pid)) {
		ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);

		if (thread_group_leader(p)) {
			if (is_child_reaper(pid)) {
				ns_of_pid(pid)->child_reaper = p;
				p->signal->flags |= SIGNAL_UNKILLABLE;
			}

			p->signal->leader_pid = pid;
			p->signal->tty = tty_kref_get(current->signal->tty);
			attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
			attach_pid(p, PIDTYPE_SID, task_session(current));
			list_add_tail(&p->sibling, &p->real_parent->children);
			list_add_tail_rcu(&p->tasks, &init_task.tasks);
			__this_cpu_inc(process_counts);
		}
		attach_pid(p, PIDTYPE_PID, pid);
		nr_threads++;
	}

	total_forks++;
	spin_unlock(&current->sighand->siglock);
	write_unlock_irq(&tasklist_lock);
	proc_fork_connector(p);
	cgroup_post_fork(p);
	if (clone_flags & CLONE_THREAD)
		threadgroup_change_end(current);
	perf_event_fork(p);

	trace_task_newtask(p, clone_flags);

	return p;

	/*
	 * Error unwinding: each label undoes one setup step and falls
	 * through to the labels for the steps that preceded it.
	 */
bad_fork_free_pid:
	if (pid != &init_struct_pid)
		free_pid(pid);
bad_fork_cleanup_io:
	if (p->io_context)
		exit_io_context(p);
bad_fork_cleanup_namespaces:
	exit_task_namespaces(p);
bad_fork_cleanup_mm:
	if (p->mm)
		mmput(p->mm);
bad_fork_cleanup_signal:
	if (!(clone_flags & CLONE_THREAD))
		free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
	__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
	exit_fs(p); /* blocking */
bad_fork_cleanup_files:
	exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
	exit_sem(p);
bad_fork_cleanup_audit:
	audit_free(p);
bad_fork_cleanup_policy:
	perf_event_free_task(p);
#ifdef CONFIG_NUMA
	mpol_put(p->mempolicy);
bad_fork_cleanup_cgroup:
#endif
	if (clone_flags & CLONE_THREAD)
		threadgroup_change_end(current);
	cgroup_exit(p, 0);
	delayacct_tsk_free(p);
	module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
	atomic_dec(&p->cred->user->processes);
	exit_creds(p);
bad_fork_free:
	free_task(p);
fork_out:
	return ERR_PTR(retval);
}

进程管理的内容不是一篇博文就能说得明白的，尽可能每次从一个小的点入手，以点带面，娓娓道来。