Linux kernel Namespace源码分析
来源:互联网 发布:php项目源码 编辑:程序博客网 时间:2024/05/16 09:10
学习一下linux kernel namespace的代码还是很有必要的,让你对docker容器的namespace隔离有更深的认识。我的源码分析,是基于Linux Kernel 4.4.19 (https://www.kernel.org/pub/linux/kernel/v4.x/patch-4.4.19.gz)版本的,由于namespace模块更新很少,因此其他相近版本之间雷同。User namespace由于与其他namespaces耦合在一起,比较难分析,我将在后续再作分析。
Kernel,Namespace,Process
Linux Namespace是一种Linux Kernel提供的资源隔离方案,提供Pid,Network,Ipc,Uts,Mount等资源的隔离,每个Namespace下的这些资源对于其他Namespace是不可见的。
注意,一个进程可以同时属于多个Namespace。Linux Kernel、Namespace、Process之间的关系可以用下图描述。
Begin with “task_struct”
As u know, Linux Namespace是用来做进程资源隔离的,那么在进程描述符中,一定有对应的Namespaces Info。
在linux-4.4.19/include/linux/sched.h #1380 定义task_struct结构体,该结构体是Linux Process完整信息的集合,其中就包含了一个指向Namespace结构体的指针nsproxy。
struct task_struct { ... /* namespaces */ struct nsproxy *nsproxy; ...}
nsproxy结构体的定义在linux-4.4.19/include/linux/nsproxy.h #29
/*
 * A structure to contain pointers to all per-process
 * namespaces - fs (mount), uts, network, sysvipc, etc.
 *
 * The pid namespace is an exception -- it's accessed using
 * task_active_pid_ns. The pid namespace here is the
 * namespace that children will use.
 *
 * 'count' is the number of tasks holding a reference.
 * The count for each namespace, then, will be the number
 * of nsproxies pointing to it, not the number of tasks.
 *
 * The nsproxy is shared by tasks which share all namespaces.
 * As soon as a single namespace is cloned or unshared, the
 * nsproxy is copied.
 */
struct nsproxy {
	atomic_t count;				/* tasks holding a reference to this nsproxy */
	struct uts_namespace *uts_ns;		/* UTS namespace */
	struct ipc_namespace *ipc_ns;		/* IPC (sysvipc) namespace */
	struct mnt_namespace *mnt_ns;		/* mount (fs) namespace */
	struct pid_namespace *pid_ns_for_children; /* pid namespace that children will use */
	struct net *net_ns;			/* network namespace */
};
注意:正如上面的代码注释所述,只要有任何一个namespace被clone或unshare,nsproxy就会被复制一份。
同时,nsproxy.h中定义了一些对namespace的操作,包括copy_namespaces等。
int copy_namespaces(unsigned long flags, struct task_struct *tsk);void exit_task_namespaces(struct task_struct *tsk);void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);void free_nsproxy(struct nsproxy *ns);int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, struct cred *, struct fs_struct *);int __init nsproxy_cache_init(void);static inline void put_nsproxy(struct nsproxy *ns) { … }static inline void get_nsproxy(struct nsproxy *ns) { … }
uts_namespace
linux-4.4.19/include/linux/utsname.h #12
/*
 * Per-namespace UTS state, pointed to by nsproxy->uts_ns.
 * NOTE(review): the field notes below are inferred from the member types
 * and names only; the helper types are defined outside this excerpt.
 */
struct uts_namespace {
	struct kref kref;		/* reference count (struct kref) */
	struct new_utsname name;	/* the utsname data itself — presumably host/domain name etc.; confirm */
	struct user_namespace *user_ns;	/* presumably the owning user namespace — confirm */
	struct ns_common ns;		/* common namespace bookkeeping shared by all namespace types */
};
ipc_namespace
linux-4.4.19/include/linux/ipc_namespace.h #21
/*
 * Per-namespace IPC state, pointed to by nsproxy->ipc_ns.
 * The sem_*, msg_*, shm_* and mq_* members hold the per-namespace
 * limits/counters for the respective IPC mechanisms.
 * NOTE(review): members without an original comment are not described
 * further here; their exact semantics live in ipc/ and are not visible
 * in this excerpt.
 */
struct ipc_namespace {
	atomic_t	count;		/* reference count */
	struct ipc_ids	ids[3];		/* presumably one id table each for sem/msg/shm — confirm */

	int		sem_ctls[4];
	int		used_sems;

	unsigned int	msg_ctlmax;
	unsigned int	msg_ctlmnb;
	unsigned int	msg_ctlmni;
	atomic_t	msg_bytes;
	atomic_t	msg_hdrs;

	size_t		shm_ctlmax;
	size_t		shm_ctlall;
	unsigned long	shm_tot;
	int		shm_ctlmni;
	/*
	 * Defines whether IPC_RMID is forced for _all_ shm segments regardless
	 * of shmctl()
	 */
	int		shm_rmid_forced;

	struct notifier_block ipcns_nb;

	/* The kern_mount of the mqueuefs sb. We take a ref on it */
	struct vfsmount	*mq_mnt;

	/* # queues in this ns, protected by mq_lock */
	unsigned int    mq_queues_count;

	/* next fields are set through sysctl */
	unsigned int    mq_queues_max;   /* initialized to DFLT_QUEUESMAX */
	unsigned int    mq_msg_max;      /* initialized to DFLT_MSGMAX */
	unsigned int    mq_msgsize_max;  /* initialized to DFLT_MSGSIZEMAX */
	unsigned int    mq_msg_default;
	unsigned int    mq_msgsize_default;

	/* user_ns which owns the ipc ns */
	struct user_namespace *user_ns;

	struct ns_common ns;		/* common namespace bookkeeping */
};
mnt_namespace
linux-4.4.19/fs/mount.h #7
/*
 * Per-namespace mount state, pointed to by nsproxy->mnt_ns.
 * NOTE(review): only 'seq' carries an upstream comment; the other notes
 * are inferred from member names/types and should be confirmed against
 * fs/namespace.c.
 */
struct mnt_namespace {
	atomic_t		count;	/* reference count */
	struct ns_common	ns;	/* common namespace bookkeeping */
	struct mount *	root;		/* presumably the root mount of this namespace — confirm */
	struct list_head	list;	/* presumably the list of mounts in this namespace — confirm */
	struct user_namespace	*user_ns; /* presumably the owning user namespace — confirm */
	u64			seq;	/* Sequence number to prevent loops */
	wait_queue_head_t poll;
	u64 event;
};
pid_namespace
linux-4.4.19/include/linux/pid_namespace.h #24
struct pid_namespace { struct kref kref; struct pidmap pidmap[PIDMAP_ENTRIES]; struct rcu_head rcu; int last_pid; unsigned int nr_hashed; struct task_struct *child_reaper; struct kmem_cache *pid_cachep; unsigned int level; struct pid_namespace *parent;#ifdef CONFIG_PROC_FS struct vfsmount *proc_mnt; struct dentry *proc_self; struct dentry *proc_thread_self;#endif#ifdef CONFIG_BSD_PROCESS_ACCT struct fs_pin *bacct;#endif struct user_namespace *user_ns; struct work_struct proc_work; kgid_t pid_gid; int hide_pid; int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns;};
net_namespace
linux-4.4.19/include/net/net_namespace.h #47
struct net { atomic_t passive; /* To decided when the network * namespace should be freed. */ atomic_t count; /* To decided when the network * namespace should be shut down. */ spinlock_t rules_mod_lock; atomic64_t cookie_gen; struct list_head list; /* list of network namespaces */ struct list_head cleanup_list; /* namespaces on death row */ struct list_head exit_list; /* Use only net_mutex */ struct user_namespace *user_ns; /* Owning user namespace */ spinlock_t nsid_lock; struct idr netns_ids; struct ns_common ns; struct proc_dir_entry *proc_net; struct proc_dir_entry *proc_net_stat;#ifdef CONFIG_SYSCTL struct ctl_table_set sysctls;#endif struct sock *rtnl; /* rtnetlink socket */ struct sock *genl_sock; struct list_head dev_base_head; struct hlist_head *dev_name_head; struct hlist_head *dev_index_head; unsigned int dev_base_seq; /* protected by rtnl_mutex */ int ifindex; unsigned int dev_unreg_count; /* core fib_rules */ struct list_head rules_ops; struct net_device *loopback_dev; /* The loopback */ struct netns_core core; struct netns_mib mib; struct netns_packet packet; struct netns_unix unx; struct netns_ipv4 ipv4;#if IS_ENABLED(CONFIG_IPV6) struct netns_ipv6 ipv6;#endif#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) struct netns_ieee802154_lowpan ieee802154_lowpan;#endif#if defined(CONFIG_IP_SCTP) || defined(CONFIG_IP_SCTP_MODULE) struct netns_sctp sctp;#endif#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) struct netns_dccp dccp;#endif#ifdef CONFIG_NETFILTER struct netns_nf nf; struct netns_xt xt;#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct netns_ct ct;#endif#if defined(CONFIG_NF_TABLES) || defined(CONFIG_NF_TABLES_MODULE) struct netns_nftables nft;#endif#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) struct netns_nf_frag nf_frag;#endif struct sock *nfnl; struct sock *nfnl_stash;#if IS_ENABLED(CONFIG_NETFILTER_NETLINK_ACCT) struct list_head nfnl_acct_list;#endif#endif#ifdef CONFIG_WEXT_CORE struct sk_buff_head 
wext_nlevents;#endif struct net_generic __rcu *gen; /* Note : following structs are cache line aligned */#ifdef CONFIG_XFRM struct netns_xfrm xfrm;#endif#if IS_ENABLED(CONFIG_IP_VS) struct netns_ipvs *ipvs;#endif#if IS_ENABLED(CONFIG_MPLS) struct netns_mpls mpls;#endif struct sock *diag_nlsk; atomic_t fnhe_genid;};
task_struct, nsproxy, uts_ns, ipc_ns,….之间的结构关系如下:
各个namespace的初始化
Kernel中有一个默认的nsproxy为init_nsproxy, init_nsproxy在task initialize的时候会被初始化。INIT_TASK宏中对init_nsproxy的引用位于linux-4.4.19/include/linux/init_task.h #232
#define INIT_TASK(tsk) \
{ \
	… \
	.nsproxy = &init_nsproxy, \
	… \
}

继续跟进init_nsproxy的定义,位于 linux-4.4.19/kernel/nsproxy.c #31:

struct nsproxy init_nsproxy = {
	.count = ATOMIC_INIT(1),
	.uts_ns = &init_uts_ns,
#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
	.ipc_ns = &init_ipc_ns,
#endif
	.mnt_ns = NULL,
	.pid_ns_for_children = &init_pid_ns,
#ifdef CONFIG_NET
	.net_ns = &init_net,
#endif
};
可见,系统初始化task时,完成了对uts, ipc, pid, net的默认初始化工作,唯独mount没有。具体的各个namespace的initial code location如下:
- init_pid_ns —— linux-4.4.19/kernel/pid.c #70
- init_uts_ns —— linux-4.4.19/init/version.c #25
- init_ipc_ns —— linux-4.4.19/ipc/msgutil.c #31
- init_net —— linux-4.4.19/net/core/net_namespace.c #35
The workflow of Create Namespace
系统如何Create New Namespace?
The answer is:
int clone(int (*fn)(void *), void *child_stack, int flags, void *arg, ...);
clone()是libc库中封装的函数,我们不对其进行深究。在linux kernel中,fork、vfork、clone三个系统调用都是对_do_fork()的封装,代码在linux/linux-4.4.19/kernel/fork.c #1808-1833
/*
 * fork(), vfork() and clone() system call entry points.
 *
 * All three are thin wrappers: each one only translates its own
 * arguments into a call to _do_fork() and returns its result.
 */
#ifdef __ARCH_WANT_SYS_FORK
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
	/* Plain fork: no sharing flags, parent is notified with SIGCHLD. */
	return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
#else
	/* can not support in nommu mode */
	return -EINVAL;
#endif
}
#endif

#ifdef __ARCH_WANT_SYS_VFORK
SYSCALL_DEFINE0(vfork)
{
	/* vfork: CLONE_VM plus CLONE_VFORK, on top of the fork() defaults. */
	return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
			0, NULL, NULL, 0);
}
#endif

#ifdef __ARCH_WANT_SYS_CLONE
/*
 * The argument order of clone() differs between architectures; the
 * CONFIG_CLONE_BACKWARDS* options select the matching prototype.
 * The body is shared by all variants.
 */
#ifdef CONFIG_CLONE_BACKWARDS
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
		 int __user *, parent_tidptr,
		 unsigned long, tls,
		 int __user *, child_tidptr)
#elif defined(CONFIG_CLONE_BACKWARDS2)
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
		 int __user *, parent_tidptr,
		 int __user *, child_tidptr,
		 unsigned long, tls)
#elif defined(CONFIG_CLONE_BACKWARDS3)
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
		int, stack_size,
		int __user *, parent_tidptr,
		int __user *, child_tidptr,
		unsigned long, tls)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
		 int __user *, parent_tidptr,
		 int __user *, child_tidptr,
		 unsigned long, tls)
#endif
{
	return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
}
#endif	/* __ARCH_WANT_SYS_CLONE */
fork 通过0x80中断(系统调用)来陷入内核,由系统提供的相应系统调用来完成进程的创建。
上面显示,不论是通过vfork、fork还是clone来创建新进程,最终都是通过 _do_fork()来负责实现。
跟踪到linux/linux-4.4.19/kernel/fork.c #1693,查看_do_fork()函数的实现:
/*
 * _do_fork - common back end of fork(), vfork() and clone().
 *
 * @clone_flags:   CLONE_* flags, plus the exit signal in the CSIGNAL bits
 * @stack_start:   passed through to copy_process()
 * @stack_size:    passed through to copy_process()
 * @parent_tidptr: user pointer that receives the child's pid when
 *                 CLONE_PARENT_SETTID is set
 * @child_tidptr:  passed through to copy_process()
 * @tls:           passed through to copy_process()
 *
 * Returns the child's pid as seen by the caller (pid_vnr()), or the
 * negative error code carried by the ERR_PTR that copy_process() returned.
 */
long _do_fork(unsigned long clone_flags,
	      unsigned long stack_start,
	      unsigned long stack_size,
	      int __user *parent_tidptr,
	      int __user *child_tidptr,
	      unsigned long tls)
{
	/* task descriptor of the child being created */
	struct task_struct *p;
	int trace = 0;
	long nr;

	/*
	 * Unless CLONE_UNTRACED is given, pick the ptrace event that matches
	 * the kind of fork; drop it again if the tracer did not enable it.
	 */
	if (!(clone_flags & CLONE_UNTRACED)) {
		if (clone_flags & CLONE_VFORK)
			trace = PTRACE_EVENT_VFORK;
		else if ((clone_flags & CSIGNAL) != SIGCHLD)
			trace = PTRACE_EVENT_CLONE;
		else
			trace = PTRACE_EVENT_FORK;

		if (likely(!ptrace_event_enabled(current, trace)))
			trace = 0;
	}

	/* Build the child; the result is a task_struct pointer or an ERR_PTR. */
	p = copy_process(clone_flags, stack_start, stack_size,
			 child_tidptr, NULL, trace, tls);
	if (!IS_ERR(p)) {
		struct completion vfork;
		struct pid *pid;

		trace_sched_process_fork(current, p);

		/* Fetch the pid of the newly created task. */
		pid = get_task_pid(p, PIDTYPE_PID);
		nr = pid_vnr(pid);

		if (clone_flags & CLONE_PARENT_SETTID)
			put_user(nr, parent_tidptr);

		/* For vfork, set up the completion the parent will wait on. */
		if (clone_flags & CLONE_VFORK) {
			p->vfork_done = &vfork;
			init_completion(&vfork);
			get_task_struct(p);
		}

		/* Hand the child to the scheduler so it can start running. */
		wake_up_new_task(p);

		/* forking complete and child started to run, tell ptracer */
		if (unlikely(trace))
			ptrace_event_pid(trace, pid);

		/* For vfork, the parent waits here until the child is done. */
		if (clone_flags & CLONE_VFORK) {
			if (!wait_for_vfork_done(p, &vfork))
				ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
		}

		put_pid(pid);
	} else {
		nr = PTR_ERR(p);
	}
	return nr;
}
_do_fork 流程
- 调用 copy_process 为子进程复制出一份进程信息
- 如果是 vfork 初始化完成处理信息
- 调用 wake_up_new_task 将子进程加入调度器,为之分配 CPU
- 如果是 vfork,父进程等待子进程完成 exec 替换自己的地址空间
copy_process源码分析
linux/linux-4.4.19/kernel/fork.c #1243中,定义了copy_process()函数的实现:
/*
 * copy_process - create a new task as a copy of the current one.
 *
 * Validates @clone_flags, duplicates the current task_struct, then
 * copies or shares each per-process resource (credentials, files, fs,
 * signal handlers, mm, namespaces, io context, thread state) as the
 * flags dictate, allocates a pid and links the child to its parent.
 * The child is NOT woken up here.
 *
 * Returns the new task_struct, or an ERR_PTR() on failure.
 *
 * NOTE: this is an abridged excerpt. Parts of the upstream function are
 * elided (marked "......"), including the error-unwinding labels
 * (fork_out, bad_fork_*) that the goto statements below jump to.
 */
static struct task_struct *copy_process(unsigned long clone_flags,
					unsigned long stack_start,
					unsigned long stack_size,
					int __user *child_tidptr,
					struct pid *pid,
					int trace,
					unsigned long tls)
{
	int retval;
	/* task descriptor of the child being created */
	struct task_struct *p;
	void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};

	/*
	 * Reject invalid flag combinations, e.g. CLONE_NEWNS and CLONE_FS
	 * are mutually exclusive.
	 */
	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
		return ERR_PTR(-EINVAL);

	if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
		return ERR_PTR(-EINVAL);

	/* CLONE_THREAD requires CLONE_SIGHAND. */
	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
		return ERR_PTR(-EINVAL);

	/* CLONE_SIGHAND requires CLONE_VM. */
	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
		return ERR_PTR(-EINVAL);

	/*
	 * A task flagged SIGNAL_UNKILLABLE (e.g. init) must not create
	 * siblings via CLONE_PARENT.
	 */
	if ((clone_flags & CLONE_PARENT) &&
				current->signal->flags & SIGNAL_UNKILLABLE)
		return ERR_PTR(-EINVAL);

	/*
	 * A new thread may neither get a new user/pid namespace nor be
	 * created while the caller's children are destined for a pid
	 * namespace other than the caller's active one.
	 */
	if (clone_flags & CLONE_THREAD) {
		if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
		    (task_active_pid_ns(current) !=
				current->nsproxy->pid_ns_for_children))
			return ERR_PTR(-EINVAL);
	}

	retval = security_task_create(clone_flags);
	if (retval)
		goto fork_out;

	retval = -ENOMEM;
	/* Duplicate the current task_struct. */
	p = dup_task_struct(current);
	if (!p)
		goto fork_out;

	ftrace_graph_init_task(p);

	/* Initialise the child's rt-mutex state. */
	rt_mutex_init_task(p);

#ifdef CONFIG_PROVE_LOCKING
	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
	retval = -EAGAIN;
	/*
	 * Enforce RLIMIT_NPROC: once the user's process count has reached
	 * the limit, fail unless the user is INIT_USER or the caller has
	 * CAP_SYS_RESOURCE or CAP_SYS_ADMIN.
	 */
	if (atomic_read(&p->real_cred->user->processes) >=
			task_rlimit(p, RLIMIT_NPROC)) {
		if (p->real_cred->user != INIT_USER &&
		    !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
			goto bad_fork_free;
	}
	current->flags &= ~PF_NPROC_EXCEEDED;

	retval = copy_creds(p, clone_flags);
	if (retval < 0)
		goto bad_fork_free;

	retval = -EAGAIN;
	/* Enforce the system-wide max_threads limit. */
	if (nr_threads >= max_threads)
		goto bad_fork_cleanup_count;

	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
	p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
	p->flags |= PF_FORKNOEXEC;
	INIT_LIST_HEAD(&p->children);
	INIT_LIST_HEAD(&p->sibling);
	rcu_copy_process(p);
	p->vfork_done = NULL;
	/* Initialise the child's alloc_lock spinlock. */
	spin_lock_init(&p->alloc_lock);

	/* Start the child with an empty pending-signal set. */
	init_sigpending(&p->pending);

	p->utime = p->stime = p->gtime = 0;
	p->utimescaled = p->stimescaled = 0;
	prev_cputime_init(&p->prev_cputime);

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
	seqlock_init(&p->vtime_seqlock);
	p->vtime_snap = 0;
	p->vtime_snap_whence = VTIME_SLEEPING;
#endif

#if defined(SPLIT_RSS_COUNTING)
	memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif

	p->default_timer_slack_ns = current->timer_slack_ns;

	task_io_accounting_init(&p->ioac);
	acct_clear_integrals(p);

	/* Initialise the POSIX CPU timers. */
	posix_cpu_timers_init(p);

	p->start_time = ktime_get_ns();
	p->real_start_time = ktime_get_boot_ns();
	p->io_context = NULL;
	p->audit_context = NULL;
	threadgroup_change_begin(current);
	cgroup_fork(p);
	/* ...... */

	/*
	 * Perform scheduler related setup; sched_fork() also puts the
	 * child into TASK_RUNNING state.
	 */
	retval = sched_fork(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_policy;

	retval = perf_event_init_task(p);
	if (retval)
		goto bad_fork_cleanup_policy;
	retval = audit_alloc(p);
	if (retval)
		goto bad_fork_cleanup_perf;
	/*
	 * copy all the process information: semaphore undo state, open
	 * files, fs info, signal handlers, signals, address space, ...
	 */
	shm_init_task(p);
	retval = copy_semundo(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_audit;
	retval = copy_files(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_semundo;
	retval = copy_fs(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_files;
	retval = copy_sighand(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_fs;
	retval = copy_signal(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_sighand;
	retval = copy_mm(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_signal;
	/* Share or create the namespaces according to the CLONE_NEW* flags. */
	retval = copy_namespaces(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_mm;
	retval = copy_io(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_namespaces;
	/* Set up the child's kernel stack / thread state (arch specific). */
	retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
	if (retval)
		goto bad_fork_cleanup_io;

	/*
	 * Allocate a pid in the pid namespace selected for the child,
	 * unless the caller passed in init_struct_pid.
	 */
	if (pid != &init_struct_pid) {
		pid = alloc_pid(p->nsproxy->pid_ns_for_children);
		if (IS_ERR(pid)) {
			retval = PTR_ERR(pid);
			goto bad_fork_cleanup_io;
		}
	}

	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
	/* ...... */

	/* Record the child's pid number. */
	p->pid = pid_nr(pid);
	/* ...... */

	/*
	 * Ensure that the cgroup subsystem policies allow the new process to be
	 * forked. It should be noted the the new process's css_set can be changed
	 * between here and cgroup_post_fork() if an organisation operation is in
	 * progress.
	 */
	retval = cgroup_can_fork(p, cgrp_ss_priv);
	if (retval)
		goto bad_fork_free_pid;

	/*
	 * Make it visible to the rest of the system, but dont wake it up yet.
	 * Need tasklist lock for parent etc handling!
	 */
	write_lock_irq(&tasklist_lock);

	/* CLONE_PARENT re-uses the old parent */
	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
		p->real_parent = current->real_parent;
		p->parent_exec_id = current->parent_exec_id;
	} else {
		p->real_parent = current;
		p->parent_exec_id = current->self_exec_id;
	}

	spin_lock(&current->sighand->siglock);

	/* Copy the seccomp configuration to the child. */
	copy_seccomp(p);

	/*
	 * Process group and session signals need to be delivered to just the
	 * parent before the fork or both the parent and the child after the
	 * fork. Restart if a signal comes in before we add the new process to
	 * it's process group.
	 * A fatal signal pending means that current will exit, so the new
	 * thread can't slip out of an OOM kill (or normal SIGKILL).
	 */
	recalc_sigpending();
	if (signal_pending(current)) {
		spin_unlock(&current->sighand->siglock);
		write_unlock_irq(&tasklist_lock);
		retval = -ERESTARTNOINTR;
		goto bad_fork_cancel_cgroup;
	}
	/* ...... */

	total_forks++;
	spin_unlock(&current->sighand->siglock);
	syscall_tracepoint_update(p);
	write_unlock_irq(&tasklist_lock);

	proc_fork_connector(p);
	cgroup_post_fork(p, cgrp_ss_priv);
	threadgroup_change_end(current);
	perf_event_fork(p);

	trace_task_newtask(p, clone_flags);
	uprobe_copy_process(p, clone_flags);

	/* Success: hand the new task back to the caller. */
	return p;
}
copy_process流程:
- 调用 dup_task_struct 复制当前的 task_struct,为新进程分配了新的堆栈
- 检查进程数是否超过限制
- 初始化自旋锁、挂起信号、CPU 定时器等
- 调用 sched_fork 初始化进程数据结构,并把进程状态设置为 TASK_RUNNING
- 复制所有进程信息,包括文件系统、信号处理函数、信号、内存管理等
- 调用copy_namespaces复制namespaces
- 调用 copy_thread_tls 初始化子进程内核栈,将父进程的寄存器上下文copy给了子进程
- 为新进程分配并设置新的 pid
dup_task_struct源码分析
dup_task_struct()函数的定义在linux-4.4.19/kernel/fork.c #334
static struct task_struct *dup_task_struct(struct task_struct *orig){ struct task_struct *tsk; struct thread_info *ti; int node = tsk_fork_get_node(orig); int err; //分配一个 task_struct 节点 tsk = alloc_task_struct_node(node); if (!tsk) return NULL; //分配一个 thread_info 节点,包含进程的内核栈,ti 为栈底 ti = alloc_thread_info_node(tsk, node); if (!ti) goto free_tsk; //将栈底的值赋给新节点的栈 tsk->stack = ti; //…… return tsk;}
dup_task_struct流程:
- 调用alloc_task_struct_node分配一个 task_struct 节点
- 调用alloc_thread_info_node分配一个 thread_info 节点,其实是分配了一个thread_union联合体,将栈底返回给 ti
/*
 * Overlays a struct thread_info with an array of longs that is
 * THREAD_SIZE bytes large: both members share the same storage.
 */
union thread_union {
	struct thread_info thread_info;
	unsigned long stack[THREAD_SIZE/sizeof(long)];
};
- 最后将栈底的值 ti 赋值给新节点的栈
- 最终执行完dup_task_struct之后,子进程除了tsk->stack指针不同之外,其余内容与父进程完全相同。
sched_fork源码分析
linux-4.4.19/kernel/sched/core.c #2187
int sched_fork(unsigned long clone_flags, struct task_struct *p){ unsigned long flags; int cpu = get_cpu(); __sched_fork(clone_flags, p); //将子进程状态设置为 TASK_RUNNING p->state = TASK_RUNNING; //…… //为子进程分配 CPU set_task_cpu(p, cpu); put_cpu(); return 0;}
我们可以看到sched_fork大致完成了两项重要工作,一是将子进程状态设置为 TASK_RUNNING,二是为其分配 CPU
copy_thread_tls源码分析
linux-4.4.19/arch/x86/kernel/process_64.c #156
int copy_thread_tls(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct *p, unsigned long tls){ int err; struct pt_regs *childregs; struct task_struct *me = current; p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; // 获取寄存器信息 childregs = task_pt_regs(p); p->thread.sp = (unsigned long) childregs; set_tsk_thread_flag(p, TIF_FORK); p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs; savesegment(fs, p->thread.fsindex); p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs; savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); childregs->sp = (unsigned long)childregs; childregs->ss = __KERNEL_DS; childregs->bx = sp; /* function */ childregs->bp = arg; childregs->orig_ax = -1; childregs->cs = __KERNEL_CS | get_kernel_rpl(); childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; return 0; } // 将当前寄存器信息复制给子进程 *childregs = *current_pt_regs(); //子进程 eax 置 0,因此fork 在子进程返回0 childregs->ax = 0; if (sp) childregs->sp = sp; err = -ENOMEM; if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr, IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { p->thread.io_bitmap_max = 0; return -ENOMEM; } set_tsk_thread_flag(p, TIF_IO_BITMAP); } /* * Set a new TLS for the child thread? */ if (clone_flags & CLONE_SETTLS) {#ifdef CONFIG_IA32_EMULATION if (is_ia32_task()) err = do_set_thread_area(p, -1, (struct user_desc __user *)tls, 0); else#endif err = do_arch_prctl(p, ARCH_SET_FS, tls); if (err) goto out; } err = 0;out: if (err && p->thread.io_bitmap_ptr) { kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } return err;}
copy_namespaces源码分析
copy_namespaces()函数的定义在linux-4.4.19/kernel/nsproxy.c #124
/*
 * copy_namespaces - set up the nsproxy of a task that is being created.
 *
 * @flags: clone flags; the CLONE_NEW* bits select which namespaces are new
 * @tsk:   the task under construction (its nsproxy still points at the
 *         one it was duplicated with)
 *
 * If none of the CLONE_NEW* flags tested below is set, the existing
 * nsproxy is shared: only its reference count is raised. Otherwise a
 * new nsproxy is built by create_new_namespaces() and installed in @tsk.
 *
 * Returns 0 on success, -EPERM without CAP_SYS_ADMIN in the task's user
 * namespace, -EINVAL for CLONE_NEWIPC combined with CLONE_SYSVSEM, or
 * the error from create_new_namespaces().
 */
int copy_namespaces(unsigned long flags, struct task_struct *tsk)
{
	struct nsproxy *old_ns = tsk->nsproxy;
	struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
	struct nsproxy *new_ns;

	/* Common case: no new namespace requested, share the old nsproxy. */
	if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
			      CLONE_NEWPID | CLONE_NEWNET)))) {
		get_nsproxy(old_ns);
		return 0;
	}

	/* Creating any of these namespaces requires CAP_SYS_ADMIN. */
	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
		return -EPERM;

	/*
	 * CLONE_NEWIPC must detach from the undolist: after switching
	 * to a new ipc namespace, the semaphore arrays from the old
	 * namespace are unreachable.  In clone parlance, CLONE_SYSVSEM
	 * means share undolist with parent, so we must forbid using
	 * it along with CLONE_NEWIPC.
	 */
	if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) ==
		(CLONE_NEWIPC | CLONE_SYSVSEM))
		return -EINVAL;

	new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
	if (IS_ERR(new_ns))
		return PTR_ERR(new_ns);

	tsk->nsproxy = new_ns;
	return 0;
}
create_new_namespaces源码分析
create_new_namespaces()函数实现定义在linux-4.4.19/kernel/nsproxy.c #59
/*
 * Create new nsproxy and all of its the associated namespaces.
 * Return the newly created nsproxy.  Do not attach this to the task,
 * leave it to the caller to do proper locking and attach it to task.
 *
 * Each namespace is obtained from its copy_*() helper, which is handed
 * @flags and the corresponding namespace of @tsk. On failure the
 * namespaces obtained so far are released in reverse order and an
 * ERR_PTR() carrying the failing helper's error is returned;
 * -ENOMEM is returned if the nsproxy itself cannot be allocated.
 */
static struct nsproxy *create_new_namespaces(unsigned long flags,
	struct task_struct *tsk, struct user_namespace *user_ns,
	struct fs_struct *new_fs)
{
	struct nsproxy *new_nsp;
	int err;

	new_nsp = create_nsproxy();
	if (!new_nsp)
		return ERR_PTR(-ENOMEM);

	/* mount namespace */
	new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
	if (IS_ERR(new_nsp->mnt_ns)) {
		err = PTR_ERR(new_nsp->mnt_ns);
		goto out_ns;
	}

	/* UTS namespace */
	new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
	if (IS_ERR(new_nsp->uts_ns)) {
		err = PTR_ERR(new_nsp->uts_ns);
		goto out_uts;
	}

	/* IPC namespace */
	new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);
	if (IS_ERR(new_nsp->ipc_ns)) {
		err = PTR_ERR(new_nsp->ipc_ns);
		goto out_ipc;
	}

	/* pid namespace that the task's children will use */
	new_nsp->pid_ns_for_children =
		copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children);
	if (IS_ERR(new_nsp->pid_ns_for_children)) {
		err = PTR_ERR(new_nsp->pid_ns_for_children);
		goto out_pid;
	}

	/* network namespace */
	new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
	if (IS_ERR(new_nsp->net_ns)) {
		err = PTR_ERR(new_nsp->net_ns);
		goto out_net;
	}

	return new_nsp;

	/*
	 * Error unwinding: each label drops the namespace acquired in the
	 * step before the one that failed, then falls through to the next.
	 */
out_net:
	if (new_nsp->pid_ns_for_children)
		put_pid_ns(new_nsp->pid_ns_for_children);
out_pid:
	if (new_nsp->ipc_ns)
		put_ipc_ns(new_nsp->ipc_ns);
out_ipc:
	if (new_nsp->uts_ns)
		put_uts_ns(new_nsp->uts_ns);
out_uts:
	if (new_nsp->mnt_ns)
		put_mnt_ns(new_nsp->mnt_ns);
out_ns:
	kmem_cache_free(nsproxy_cachep, new_nsp);
	return ERR_PTR(err);
}
在create_new_namespaces()中,分别调用create_nsproxy(), copy_mnt_ns(), copy_utsname(), copy_ipcs(), copy_pid_ns(), copy_net_ns(),具体的实现请参考如下索引。
create_nsproxy() —— linux-4.4.19/kernel/nsproxy.c #44
copy_utsname() —— linux-4.4.19/kernel/utsname.c #66
copy_mnt_ns() —— linux-4.4.19/fs/namespace.c #2775
copy_ipcs() —— linux-4.4.19/ipc/namespace.c #54
copy_pid_ns() —— linux-4.4.19/kernel/pid_namespace.c #153
copy_net_ns() —— linux-4.4.19/net/core/net_namespace.c #351
create new namespace代码流程图:
- Linux kernel Namespace源码分析
- linux kernel network namespace
- Linux Kernel Namespace实现: namespace API介绍
- Linux Kernel devm_* API源码分析
- 【runc 源码分析】namespace 源码分析
- android kernel 源码 分析
- S3C24x0 kernel 源码分析
- linux源码分析之cpu初始化 kernel/head.s
- linux源码分析之cpu初始化 kernel/head.s
- Linux kernel 3.10内核源码分析--进程上下文切换
- Linux kernel 3.10内核源码分析--进程退出exit_code
- linux kernel 源码
- linux kernel初始化分析
- linux kernel 启动分析
- Linux Kernel Interrupt 分析
- runC源码分析——namespace
- 查看linux kernel源码网站
- Linux Kernel Cgroups源码浅析
- HTM-16.2代码(9)——fillMvpCand
- WCF SOAP用法
- Android IntentService 和Service 的区别
- youcomplete 添加 python 支持
- [POJ1226]Substrings(后缀数组+二分)
- Linux kernel Namespace源码分析
- c语言中关于头文件重复包含
- 各个iPhone尺寸和分辨率
- python 学习笔记-列表
- JavaScript 内置对象(一):Array 对象(构造函数、属性和方法)
- Java8中的default方法
- Python发邮件时报错 554
- Mac OSX网络诊断命令
- TEC1401.Report开发技术总结 - 第三章 使用Oracle Reports开发报表-创建一个分组报表(2/4)