Linux kernel Namespace源码分析

来源：互联网发布：php项目源码编辑：程序博客网时间：2024/05/16 09:10

学习一下linux kernel namespace的代码还是很有必要的，让你对docker容器的namespace隔离有更深的认识。我的源码分析，是基于Linux Kernel 4.4.19 (https://www.kernel.org/pub/linux/kernel/v4.x/patch-4.4.19.gz)版本的，由于namespace模块更新很少，因此其他相近版本之间雷同。User namespace由于与其他namespaces耦合在一起，比较难分析，我将在后续再作分析。

Kernel，Namespace，Process

Linux Namespace是一种Linux Kernel提供的资源隔离方案，提供Pid，Network，Ipc，Uts，Mount等资源的隔离，每个Namespace下的这些资源对于其他Namespace是不可见的。
注意，一个进程可以同时属于多个Namespace。Linux Kernel、Namespace、Process之间的关系可以用下图描述。
这里写图片描述

Begin with “task_struct”

众所周知，Linux Namespace是用来做进程资源隔离的，那么在进程描述符中，一定有对应的Namespaces Info。
在linux-4.4.19/include/linux/sched.h #1380 定义task_struct结构体，该结构体是Linux Process完整信息的集合，其中就包含了一个指向Namespace结构体的指针nsproxy。

struct task_struct {      ...      /* namespaces */      struct nsproxy *nsproxy;      ...}

nsproxy结构体的定义在linux-4.4.19/include/linux/nsproxy.h #29

/* * A structure to contain pointers to all per-process * namespaces - fs (mount), uts, network, sysvipc, etc. * * The pid namespace is an exception -- it's accessed using * task_active_pid_ns.  The pid namespace here is the * namespace that children will use. * * 'count' is the number of tasks holding a reference. * The count for each namespace, then, will be the number * of nsproxies pointing to it, not the number of tasks. * * The nsproxy is shared by tasks which share all namespaces. * As soon as a single namespace is cloned or unshared, the * nsproxy is copied. */struct nsproxy {      atomic_t count;      struct uts_namespace *uts_ns;      struct ipc_namespace *ipc_ns;      struct mnt_namespace *mnt_ns;      struct pid_namespace *pid_ns_for_children;      struct net         *net_ns;};

注意：正如上面的代码注释所述，nsproxy由共享全部namespace的task共用；只要其中任意一个namespace被clone或unshare，nsproxy就会被复制一份。
同时，nsproxy.h中定义了一些对namespace的操作，包括copy_namespaces等。

 int copy_namespaces(unsigned long flags, struct task_struct *tsk);void exit_task_namespaces(struct task_struct *tsk);void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);void free_nsproxy(struct nsproxy *ns);int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, struct cred *, struct fs_struct *);int __init nsproxy_cache_init(void);static inline void put_nsproxy(struct nsproxy *ns) { … }static inline void get_nsproxy(struct nsproxy *ns) { … }

uts_namespace

linux-4.4.19/include/linux/utsname.h #12

struct uts_namespace {       struct kref kref;       struct new_utsname name;       struct user_namespace *user_ns;       struct ns_common ns;};

ipc_namespace

linux-4.4.19/include/linux/ipc_namespace.h #21

struct ipc_namespace {       atomic_t      count;       struct ipc_ids      ids[3];       int          sem_ctls[4];       int          used_sems;       unsigned int msg_ctlmax;       unsigned int msg_ctlmnb;       unsigned int msg_ctlmni;       atomic_t      msg_bytes;       atomic_t      msg_hdrs;       size_t           shm_ctlmax;       size_t           shm_ctlall;       unsigned long     shm_tot;       int          shm_ctlmni;       /*        * Defines whether IPC_RMID is forced for _all_ shm segments regardless        * of shmctl()        */       int          shm_rmid_forced;       struct notifier_block ipcns_nb;       /* The kern_mount of the mqueuefs sb.  We take a ref on it */       struct vfsmount  *mq_mnt;       /* # queues in this ns, protected by mq_lock */       unsigned int    mq_queues_count;       /* next fields are set through sysctl */       unsigned int    mq_queues_max;   /* initialized to DFLT_QUEUESMAX */       unsigned int    mq_msg_max;      /* initialized to DFLT_MSGMAX */       unsigned int    mq_msgsize_max;  /* initialized to DFLT_MSGSIZEMAX */       unsigned int    mq_msg_default;       unsigned int    mq_msgsize_default;       /* user_ns which owns the ipc ns */       struct user_namespace *user_ns;       struct ns_common ns;};

mnt_namespace

linux-4.4.19/fs/mount.h #7

struct mnt_namespace {       atomic_t             count;       struct ns_common     ns;       struct mount *    root;       struct list_head   list;       struct user_namespace    *user_ns;       u64               seq; /* Sequence number to prevent loops */       wait_queue_head_t poll;       u64 event;};

pid_namespace

linux-4.4.19/include/linux/pid_namespace.h #24

struct pid_namespace {       struct kref kref;       struct pidmap pidmap[PIDMAP_ENTRIES];       struct rcu_head rcu;       int last_pid;       unsigned int nr_hashed;       struct task_struct *child_reaper;       struct kmem_cache *pid_cachep;       unsigned int level;       struct pid_namespace *parent;#ifdef CONFIG_PROC_FS       struct vfsmount *proc_mnt;       struct dentry *proc_self;       struct dentry *proc_thread_self;#endif#ifdef CONFIG_BSD_PROCESS_ACCT       struct fs_pin *bacct;#endif       struct user_namespace *user_ns;       struct work_struct proc_work;       kgid_t pid_gid;       int hide_pid;       int reboot;    /* group exit code if this pidns was rebooted */       struct ns_common ns;};

net_namespace

linux-4.4.19/include/net/net_namespace.h #47

struct net {       atomic_t             passive; /* To decided when the network                                           * namespace should be freed.                                           */       atomic_t             count;           /* To decided when the network                                           *  namespace should be shut down.                                           */       spinlock_t            rules_mod_lock;       atomic64_t         cookie_gen;       struct list_head   list;        /* list of network namespaces */       struct list_head   cleanup_list; /* namespaces on death row */       struct list_head   exit_list;       /* Use only net_mutex */       struct user_namespace   *user_ns;   /* Owning user namespace */       spinlock_t            nsid_lock;       struct idr             netns_ids;       struct ns_common     ns;       struct proc_dir_entry       *proc_net;       struct proc_dir_entry       *proc_net_stat;#ifdef CONFIG_SYSCTL       struct ctl_table_set   sysctls;#endif       struct sock          *rtnl;                   /* rtnetlink socket */       struct sock           *genl_sock;       struct list_head dev_base_head;       struct hlist_head       *dev_name_head;       struct hlist_head *dev_index_head;       unsigned int        dev_base_seq;    /* protected by rtnl_mutex */       int                 ifindex;       unsigned int        dev_unreg_count;       /* core fib_rules */       struct list_head   rules_ops;       struct net_device       *loopback_dev;          /* The loopback */       struct netns_core       core;       struct netns_mib mib;       struct netns_packet   packet;       struct netns_unix       unx;       struct netns_ipv4       ipv4;#if IS_ENABLED(CONFIG_IPV6)       struct netns_ipv6       ipv6;#endif#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)       struct netns_ieee802154_lowpan  ieee802154_lowpan;#endif#if defined(CONFIG_IP_SCTP) || defined(CONFIG_IP_SCTP_MODULE)       struct netns_sctp       sctp;#endif#if 
defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE)       struct netns_dccp      dccp;#endif#ifdef CONFIG_NETFILTER       struct netns_nf          nf;       struct netns_xt           xt;#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)       struct netns_ct           ct;#endif#if defined(CONFIG_NF_TABLES) || defined(CONFIG_NF_TABLES_MODULE)       struct netns_nftables nft;#endif#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)       struct netns_nf_frag  nf_frag;#endif       struct sock           *nfnl;       struct sock           *nfnl_stash;#if IS_ENABLED(CONFIG_NETFILTER_NETLINK_ACCT)       struct list_head        nfnl_acct_list;#endif#endif#ifdef CONFIG_WEXT_CORE       struct sk_buff_head   wext_nlevents;#endif       struct net_generic __rcu  *gen;       /* Note : following structs are cache line aligned */#ifdef CONFIG_XFRM       struct netns_xfrm      xfrm;#endif#if IS_ENABLED(CONFIG_IP_VS)       struct netns_ipvs *ipvs;#endif#if IS_ENABLED(CONFIG_MPLS)       struct netns_mpls      mpls;#endif       struct sock           *diag_nlsk;       atomic_t             fnhe_genid;};

task_struct, nsproxy, uts_ns, ipc_ns，….之间的结构关系如下：
这里写图片描述

各个namespace的初始化

Kernel中有一个默认的nsproxy为init_nsproxy，init task在初始化时其nsproxy指针会指向它。INIT_TASK宏中对init_nsproxy的引用在linux-4.4.19/include/linux/init_task.h #232

#define INIT_TASK(tsk)   {…nsproxy  = &init_nsproxy,…}

继续跟进init_nsproxy的定义，@ linux-4.4.19/kernel/nsproxy.c #31

struct nsproxy init_nsproxy = {
       .count                  = ATOMIC_INIT(1),
       .uts_ns                = &init_uts_ns,
#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
       .ipc_ns                 = &init_ipc_ns,
#endif
       .mnt_ns               = NULL,
       .pid_ns_for_children  = &init_pid_ns,
#ifdef CONFIG_NET
       .net_ns                = &init_net,
#endif
};

可见，系统初始化task时，完成了对uts, ipc, pid, net的默认初始化工作，唯独mount没有。具体的各个namespace的initial code location如下：

init_pid_ns —— linux-4.4.19/kernel/pid.c #70
init_uts_ns —— linux-4.4.19/init/version.c #25
init_ipc_ns —— linux-4.4.19/ipc/msgutil.c #31
init_net —— linux-4.4.19/net/core/net_namespace.c #35

The workflow of Create Namespace

系统如何Create New Namespace？
The answer is:
int clone(int (*fn)(void *), void *child_stack, int flags, void *arg, ...)
clone()是libc库中封装的函数，我们不对其进行深究。在linux kernel中，fork/vfork/clone三个系统调用都是对_do_fork()的封装，代码在linux/linux-4.4.19/kernel/fork.c #1808-1833

#ifdef __ARCH_WANT_SYS_FORKSYSCALL_DEFINE0(fork){#ifdef CONFIG_MMU       return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);#else       /* can not support in nommu mode */       return -EINVAL;#endif}#endif#ifdef __ARCH_WANT_SYS_VFORKSYSCALL_DEFINE0(vfork){       return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,                     0, NULL, NULL, 0);}#endif#ifdef __ARCH_WANT_SYS_CLONE#ifdef CONFIG_CLONE_BACKWARDSSYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,               int __user *, parent_tidptr,               unsigned long, tls,               int __user *, child_tidptr)#elif defined(CONFIG_CLONE_BACKWARDS2)SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,               int __user *, parent_tidptr,               int __user *, child_tidptr,               unsigned long, tls)#elif defined(CONFIG_CLONE_BACKWARDS3)SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,              int, stack_size,              int __user *, parent_tidptr,              int __user *, child_tidptr,              unsigned long, tls)#elseSYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,               int __user *, parent_tidptr,               int __user *, child_tidptr,               unsigned long, tls)#endif{       return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);}

fork 通过0x80中断（系统调用）来陷入内核，由系统提供的相应系统调用来完成进程的创建。
上面显示，不论是通过fork、vfork还是clone来创建新进程，最终都是通过 _do_fork()来负责实现。
跟踪到linux/linux-4.4.19/kernel/fork.c #1693，查看_do_fork()函数的实现：

long _do_fork(unsigned long clone_flags,             unsigned long stack_start,             unsigned long stack_size,             int __user *parent_tidptr,             int __user *child_tidptr,             unsigned long tls){       //创建进程描述符指针       struct task_struct *p;       int trace = 0;       long nr;       if (!(clone_flags & CLONE_UNTRACED)) {              if (clone_flags & CLONE_VFORK)                     trace = PTRACE_EVENT_VFORK;              else if ((clone_flags & CSIGNAL) != SIGCHLD)                     trace = PTRACE_EVENT_CLONE;              else                     trace = PTRACE_EVENT_FORK;              if (likely(!ptrace_event_enabled(current, trace)))                     trace = 0;       }       //复制进程描述符，copy_process()的返回值是一个 task_struct 指针。       p = copy_process(clone_flags, stack_start, stack_size,                      child_tidptr, NULL, trace, tls);       if (!IS_ERR(p)) {              struct completion vfork;              struct pid *pid;              trace_sched_process_fork(current, p);              //得到新创建的进程描述符中的pid              pid = get_task_pid(p, PIDTYPE_PID);              nr = pid_vnr(pid);              if (clone_flags & CLONE_PARENT_SETTID)                     put_user(nr, parent_tidptr);              //如果调用的 vfork()方法，初始化 vfork 完成处理信息。              if (clone_flags & CLONE_VFORK) {                     p->vfork_done = &vfork;                     init_completion(&vfork);                     get_task_struct(p);              }              //将子进程加入到调度器中，为其分配 CPU，准备执行              wake_up_new_task(p);              //fork 完成，子进程即将开始运行              if (unlikely(trace))                     ptrace_event_pid(trace, pid);               //如果是 vfork，将父进程加入至等待队列，等待子进程完成              if (clone_flags & CLONE_VFORK) {                     if (!wait_for_vfork_done(p, &vfork))                            ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);              }              put_pid(pid);       } else {              nr = PTR_ERR(p);       }    
   return nr;}

_do_fork 流程

调用 copy_process 为子进程复制出一份进程信息
如果是 vfork 初始化完成处理信息
调用 wake_up_new_task 将子进程加入调度器，为之分配 CPU
如果是 vfork，父进程等待子进程完成 exec 替换自己的地址空间

copy_process源码分析

linux/linux-4.4.19/kernel/fork.c #1243中，定义了copy_process()函数的实现：

static struct task_struct *copy_process(unsigned long clone_flags,                                   unsigned long stack_start,                                   unsigned long stack_size,                                   int __user *child_tidptr,                                   struct pid *pid,                                   int trace,                                   unsigned long tls){       int retval;        //创建进程描述符指针       struct task_struct *p;       void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};        // 检查clone flags的合法性，比如CLONE_NEWNS与CLONE_FS是互斥的；       if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))              return ERR_PTR(-EINVAL);       if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))              return ERR_PTR(-EINVAL);       if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))              return ERR_PTR(-EINVAL);       if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))              return ERR_PTR(-EINVAL);       // 比如CLONE_PARENT时得检查当前signal flags是否为SIGNAL_UNKILLABLE，防止kill init进程。       if ((clone_flags & CLONE_PARENT) &&                            current->signal->flags & SIGNAL_UNKILLABLE)              return ERR_PTR(-EINVAL);       if (clone_flags & CLONE_THREAD) {              if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||                  (task_active_pid_ns(current) !=                            current->nsproxy->pid_ns_for_children))                     return ERR_PTR(-EINVAL);       }       retval = security_task_create(clone_flags);       if (retval)              goto fork_out;       retval = -ENOMEM;       // 复制当前的task_struct       p = dup_task_struct(current);       if (!p)              goto fork_out;       ftrace_graph_init_task(p);       // 初始化互斥变量       rt_mutex_init_task(p);#ifdef CONFIG_PROVE_LOCKING       DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);       DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);#endif       retval = -EAGAIN;       // 
检查进程数是否是超过限制，由OS定义       if (atomic_read(&p->real_cred->user->processes) >=                     task_rlimit(p, RLIMIT_NPROC)) {              if (p->real_cred->user != INIT_USER &&                  !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))                     goto bad_fork_free;       }       current->flags &= ~PF_NPROC_EXCEEDED;       retval = copy_creds(p, clone_flags);       if (retval < 0)              goto bad_fork_free;       retval = -EAGAIN;        //检查进程数是否超过 max_threads 由内存大小决定       if (nr_threads >= max_threads)              goto bad_fork_cleanup_count;       delayacct_tsk_init(p);       /* Must remain after dup_task_struct() */       p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);       p->flags |= PF_FORKNOEXEC;       INIT_LIST_HEAD(&p->children);       INIT_LIST_HEAD(&p->sibling);       rcu_copy_process(p);       p->vfork_done = NULL;       //初始化自旋锁       spin_lock_init(&p->alloc_lock);       //初始化挂起信号       init_sigpending(&p->pending);       p->utime = p->stime = p->gtime = 0;       p->utimescaled = p->stimescaled = 0;       prev_cputime_init(&p->prev_cputime);#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN       seqlock_init(&p->vtime_seqlock);       p->vtime_snap = 0;       p->vtime_snap_whence = VTIME_SLEEPING;#endif#if defined(SPLIT_RSS_COUNTING)       memset(&p->rss_stat, 0, sizeof(p->rss_stat));#endif       p->default_timer_slack_ns = current->timer_slack_ns;       task_io_accounting_init(&p->ioac);       acct_clear_integrals(p);       // 初始化CPU定时器       posix_cpu_timers_init(p);       p->start_time = ktime_get_ns();       p->real_start_time = ktime_get_boot_ns();       p->io_context = NULL;       p->audit_context = NULL;       threadgroup_change_begin(current);       cgroup_fork(p);        //......        
//初始化进程数据结构，并把进程状态设置为 TASK_RUNNING       retval = sched_fork(clone_flags, p);        //复制所有进程信息，包括文件系统、信号处理函数、信号、内存管理等       if (retval)              goto bad_fork_cleanup_policy;       retval = perf_event_init_task(p);       if (retval)              goto bad_fork_cleanup_policy;       retval = audit_alloc(p);       if (retval)              goto bad_fork_cleanup_perf;       /* copy all the process information */       shm_init_task(p);       retval = copy_semundo(clone_flags, p);       if (retval)              goto bad_fork_cleanup_audit;       retval = copy_files(clone_flags, p);       if (retval)              goto bad_fork_cleanup_semundo;       retval = copy_fs(clone_flags, p);       if (retval)              goto bad_fork_cleanup_files;       retval = copy_sighand(clone_flags, p);       if (retval)              goto bad_fork_cleanup_fs;       retval = copy_signal(clone_flags, p);       if (retval)              goto bad_fork_cleanup_sighand;       retval = copy_mm(clone_flags, p);       if (retval)              goto bad_fork_cleanup_signal;       // 复制namespaces       retval = copy_namespaces(clone_flags, p);       if (retval)              goto bad_fork_cleanup_mm;       retval = copy_io(clone_flags, p);       if (retval)              goto bad_fork_cleanup_namespaces;       // 初始化子进程内核栈       retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);       if (retval)              goto bad_fork_cleanup_io;        //为新进程分配新的 pid       if (pid != &init_struct_pid) {              pid = alloc_pid(p->nsproxy->pid_ns_for_children);              if (IS_ERR(pid)) {                     retval = PTR_ERR(pid);                     goto bad_fork_cleanup_io;              }       }       p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;       p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;        //......        //设置子进程 pid              p->pid = pid_nr(pid);        //......       
/*        * Ensure that the cgroup subsystem policies allow the new process to be        * forked. It should be noted the the new process's css_set can be changed        * between here and cgroup_post_fork() if an organisation operation is in        * progress.        */       retval = cgroup_can_fork(p, cgrp_ss_priv);       if (retval)              goto bad_fork_free_pid;       /*        * Make it visible to the rest of the system, but dont wake it up yet.        * Need tasklist lock for parent etc handling!        */       write_lock_irq(&tasklist_lock);       /* CLONE_PARENT re-uses the old parent */       if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {              p->real_parent = current->real_parent;              p->parent_exec_id = current->parent_exec_id;       } else {              p->real_parent = current;              p->parent_exec_id = current->self_exec_id;       }       spin_lock(&current->sighand->siglock);       // 复制seccommp配置       copy_seccomp(p);       /*        * Process group and session signals need to be delivered to just the        * parent before the fork or both the parent and the child after the        * fork. Restart if a signal comes in before we add the new process to        * it's process group.        * A fatal signal pending means that current will exit, so the new        * thread can't slip out of an OOM kill (or normal SIGKILL).       */       recalc_sigpending();       if (signal_pending(current)) {              spin_unlock(&current->sighand->siglock);              write_unlock_irq(&tasklist_lock);              retval = -ERESTARTNOINTR;              goto bad_fork_cancel_cgroup;       }        //......       
total_forks++;       spin_unlock(&current->sighand->siglock);       syscall_tracepoint_update(p);       write_unlock_irq(&tasklist_lock);       proc_fork_connector(p);       cgroup_post_fork(p, cgrp_ss_priv);       threadgroup_change_end(current);       perf_event_fork(p);       trace_task_newtask(p, clone_flags);       uprobe_copy_process(p, clone_flags);       // 返回结构体p       return p;}

copy_process流程：

调用 dup_task_struct 复制当前的 task_struct，为新进程分配了新的堆栈
检查进程数是否超过限制
初始化自旋锁、挂起信号、CPU 定时器等
调用 sched_fork 初始化进程数据结构，并把进程状态设置为 TASK_RUNNING
复制所有进程信息，包括文件系统、信号处理函数、信号、内存管理等
调用 copy_namespaces 复制 namespaces
调用 copy_thread_tls 初始化子进程内核栈，将父进程的寄存器上下文copy给了子进程
为新进程分配并设置新的 pid

dup_task_struct源码分析

dup_task_struct()函数的定义在linux-4.4.19/kernel/fork.c #334

static struct task_struct *dup_task_struct(struct task_struct *orig){       struct task_struct *tsk;       struct thread_info *ti;       int node = tsk_fork_get_node(orig);       int err;       //分配一个 task_struct 节点       tsk = alloc_task_struct_node(node);       if (!tsk)              return NULL;       //分配一个 thread_info 节点，包含进程的内核栈，ti 为栈底       ti = alloc_thread_info_node(tsk, node);       if (!ti)              goto free_tsk;       //将栈底的值赋给新节点的栈       tsk->stack = ti;       //……       return tsk;}

dup_task_struct流程：

调用alloc_task_struct_node分配一个 task_struct 节点
调用alloc_thread_info_node分配一个 thread_info 节点，其实是分配了一个thread_union联合体,将栈底返回给 ti

union thread_union {   struct thread_info thread_info;  unsigned long stack[THREAD_SIZE/sizeof(long)];};

最后将栈底的值 ti 赋值给新节点的栈
最终执行完dup_task_struct之后，子进程除了tsk->stack指针不同之外，其余内容与父进程完全一致。

sched_fork源码分析

linux-4.4.19/kernel/sched/core.c #2187

int sched_fork(unsigned long clone_flags, struct task_struct *p){       unsigned long flags;       int cpu = get_cpu();       __sched_fork(clone_flags, p);       //将子进程状态设置为 TASK_RUNNING       p->state = TASK_RUNNING;       //……       //为子进程分配 CPU       set_task_cpu(p, cpu);       put_cpu();       return 0;}

我们可以看到sched_fork大致完成了两项重要工作，一是将子进程状态设置为 TASK_RUNNING，二是为其分配 CPU

copy_thread_tls源码分析

linux-4.4.19/arch/x86/kernel/process_64.c #156

int copy_thread_tls(unsigned long clone_flags, unsigned long sp,              unsigned long arg, struct task_struct *p, unsigned long tls){       int err;       struct pt_regs *childregs;       struct task_struct *me = current;       p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;       // 获取寄存器信息       childregs = task_pt_regs(p);       p->thread.sp = (unsigned long) childregs;       set_tsk_thread_flag(p, TIF_FORK);       p->thread.io_bitmap_ptr = NULL;       savesegment(gs, p->thread.gsindex);       p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;       savesegment(fs, p->thread.fsindex);       p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;       savesegment(es, p->thread.es);       savesegment(ds, p->thread.ds);       memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));       if (unlikely(p->flags & PF_KTHREAD)) {              /* kernel thread */              memset(childregs, 0, sizeof(struct pt_regs));              childregs->sp = (unsigned long)childregs;              childregs->ss = __KERNEL_DS;              childregs->bx = sp; /* function */              childregs->bp = arg;              childregs->orig_ax = -1;              childregs->cs = __KERNEL_CS | get_kernel_rpl();              childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;              return 0;       }       // 将当前寄存器信息复制给子进程       *childregs = *current_pt_regs();       //子进程 eax 置 0，因此fork 在子进程返回0       childregs->ax = 0;       if (sp)              childregs->sp = sp;       err = -ENOMEM;       if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {              p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,                                            IO_BITMAP_BYTES, GFP_KERNEL);              if (!p->thread.io_bitmap_ptr) {                     p->thread.io_bitmap_max = 0;                     return -ENOMEM;              }              set_tsk_thread_flag(p, TIF_IO_BITMAP);       }       /*        * Set a new TLS for the child thread?      
  */       if (clone_flags & CLONE_SETTLS) {#ifdef CONFIG_IA32_EMULATION              if (is_ia32_task())                     err = do_set_thread_area(p, -1,                            (struct user_desc __user *)tls, 0);              else#endif                     err = do_arch_prctl(p, ARCH_SET_FS, tls);              if (err)                     goto out;       }       err = 0;out:       if (err && p->thread.io_bitmap_ptr) {              kfree(p->thread.io_bitmap_ptr);              p->thread.io_bitmap_max = 0;       }       return err;}

copy_namespaces源码分析

copy_namespaces()函数的定义在linux-4.4.19/kernel/nsproxy.c #124

int copy_namespaces(unsigned long flags, struct task_struct *tsk){       struct nsproxy *old_ns = tsk->nsproxy;       struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);       struct nsproxy *new_ns;       if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |                           CLONE_NEWPID | CLONE_NEWNET)))) {              get_nsproxy(old_ns);              return 0;       }       if (!ns_capable(user_ns, CAP_SYS_ADMIN))              return -EPERM;       /*        * CLONE_NEWIPC must detach from the undolist: after switching        * to a new ipc namespace, the semaphore arrays from the old        * namespace are unreachable.  In clone parlance, CLONE_SYSVSEM        * means share undolist with parent, so we must forbid using        * it along with CLONE_NEWIPC.        */       if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) ==              (CLONE_NEWIPC | CLONE_SYSVSEM))              return -EINVAL;       new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);       if (IS_ERR(new_ns))              return  PTR_ERR(new_ns);       tsk->nsproxy = new_ns;       return 0;}

create_new_namespaces源码分析

create_new_namespaces()函数实现定义在linux-4.4.19/kernel/nsproxy.c #59

/* * Create new nsproxy and all of its the associated namespaces. * Return the newly created nsproxy.  Do not attach this to the task, * leave it to the caller to do proper locking and attach it to task. */static struct nsproxy *create_new_namespaces(unsigned long flags,       struct task_struct *tsk, struct user_namespace *user_ns,       struct fs_struct *new_fs){       struct nsproxy *new_nsp;       int err;       new_nsp = create_nsproxy();       if (!new_nsp)              return ERR_PTR(-ENOMEM);       new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);       if (IS_ERR(new_nsp->mnt_ns)) {              err = PTR_ERR(new_nsp->mnt_ns);              goto out_ns;       }       new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);       if (IS_ERR(new_nsp->uts_ns)) {              err = PTR_ERR(new_nsp->uts_ns);              goto out_uts;       }       new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);       if (IS_ERR(new_nsp->ipc_ns)) {              err = PTR_ERR(new_nsp->ipc_ns);              goto out_ipc;       }       new_nsp->pid_ns_for_children =              copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children);       if (IS_ERR(new_nsp->pid_ns_for_children)) {              err = PTR_ERR(new_nsp->pid_ns_for_children);              goto out_pid;       }       new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);       if (IS_ERR(new_nsp->net_ns)) {              err = PTR_ERR(new_nsp->net_ns);              goto out_net;       }       return new_nsp;out_net:       if (new_nsp->pid_ns_for_children)              put_pid_ns(new_nsp->pid_ns_for_children);out_pid:       if (new_nsp->ipc_ns)              put_ipc_ns(new_nsp->ipc_ns);out_ipc:       if (new_nsp->uts_ns)              put_uts_ns(new_nsp->uts_ns);out_uts:       if (new_nsp->mnt_ns)              put_mnt_ns(new_nsp->mnt_ns);out_ns:       kmem_cache_free(nsproxy_cachep, new_nsp);       return ERR_PTR(err);}

在create_new_namespaces()中，先调用create_nsproxy()分配新的nsproxy，再依次调用copy_mnt_ns(), copy_utsname(), copy_ipcs(), copy_pid_ns(), copy_net_ns()，具体的实现请参考如下索引。
create_nsproxy() —— linux-4.4.19/kernel/nsproxy.c #44
copy_utsname() —— linux-4.4.19/kernel/utsname.c #66
copy_mnt_ns() —— linux-4.4.19/fs/namespace.c #2775
copy_ipcs() —— linux-4.4.19/ipc/namespace.c #54
copy_pid_ns() —— linux-4.4.19/kernel/pid_namespace.c #153
copy_net_ns() —— linux-4.4.19/net/core/net_namespace.c #351
create new namespace代码流程图：
这里写图片描述

1 0