Linux进程线程源码浅析

来源：互联网发布：西方意识形态知乎编辑：程序博客网时间：2024/06/16 09:19

内核版本3.13

概述

Linux内核中，进程通过数据结构task_struct（也称为进程描述符）被表示成任务（task），不像其他的操作系统会区别进程、轻量级进程和线程（下边就统称进程吧），Linux系统用 task_struct 数据结构来表示所有的执行上下文。对于每一个进程，一个类型为task_struct的进程描述符始终存在于内存中。它包含了内核管理全部进程所需的重要信息，如调度参数、已打开的文件描述符列表等。进程描述符从进程被创建开始就一直存在于内核堆栈之中。
Linux对进程标识符（PID）和任务标识符（TID）进行了区分。这两个分量都存储在任务数据结构task_struct 中。当调用clone函数创建一个新进程而不需要和旧进程共享任何信息时，会设置一个新的PID，否则，任务得到一个新的任务标识符TID，但是PID不变。这样一来，一个进程中所有的线程都会拥有与该进程中第一个线程相同的PID。

创建过程介绍

创建一个新进程时，内核会为其创建一个新的进程描述符和用户空间，然后从父进程复制大量的内容：子进程被赋予一个PID，并建立它的内存映射，同时它也被赋予了访问属于父进程文件的权利。然后，它的寄存器内容被初始化并准备运行。
当系统调用fork执行的时候，调用fork函数的进程陷入内核并且创建一个task_struct结构和其他相关的数据结构，如内核堆栈和thread_info结构。其中thread_info结构位于进程堆栈栈底固定偏移量的地方，包含一些进程参数，以及进程描述符的地址。把进程描述符的地址存储在一个固定的地方，使得Linux系统只需要很少的操作就可以找到一个运行中进程的task_struct。
进程描述符的主要内容根据父进程的进程描述符来填充。Linux系统只需要寻找一个可用的PID，更新进程标识符散列表的表项使之指向新的任务数据结构即可。如果散列表发生冲突，相同键值的进程描述符会被组成链表。它会把task_struct 结构中的一些分量设置为指向任务数组中相应进程的前一/后一进程的指针。
理论上，现在就应该为子进程分配数据段、堆栈段，并且对父进程的段进行复制，因为fork函数意味着父、子进程之间不共享内存。其中如果代码段是只读的，可以复制也可以共享。然后，子进程就可以运行了。但是，实际上复制内存的代价相当昂贵，所以现代Linux系统都使用了欺骗手段。在最开始主要依赖于父进程来创建子进程用户空间，在创建的过程中所做的工作仅仅是建立mm_struct结构、vm_area_struct结构以及页目录和页表，并没有真正地复制一个物理页面。它们赋予子进程属于它自己的页表，但是这些页表都指向父进程的页面，同时把这些页面标记成只读。当子进程试图向某一页面中写入数据的时候，它会收到写保护的错误。内核发现子进程的写入行为之后，会为子进程分配一个该页面的新副本，并将这个副本标记为可读、可写，也就是为子进程分配一个对应的物理页面。通过这种方式，使得只有需要写入数据的页面才会被复制。这种机制称为写时复制机制（Copy-on-Write）。它所带来的好处就是不需要在内存中维护同一个程序的两个副本，从而节省了内存（RAM）。

一步一步源码分析

linux中创建进程和线程一般都是使用fork()和pthread_create()，接下来就可以对其分别使用strace命令进行追踪，确定其系统调用函数。

//fork创建进程strace追踪
tiany@tiany-desktop:~/program/C/pthread$ strace ./fork.o
......
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD,child_tidptr=0x7fb1e8fe7a10) = 3599
 
 
//pthread_create创建线程
tiany@tiany-desktop:~/program/C/pthread$ strace ./pthread_create.o
……
clone(child_stack=0x7f683d393fb0, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, parent_tidptr=0x7f683d3949d0, tls=0x7f683d394700, child_tidptr=0x7f683d3949d0) = 3878

由strace结果可以看到，无论是fork创建进程还是pthread_create创建线程，最终都是使用系统调用clone来实现的。两者主要就是参数不一致，特别是clone_flags标志。接下来就进入内核进行深入的分析吧。先看下刚刚提到的clone_flags标志，如下。

/*
 * cloning flags:
 *
 * Bit flags carried in the clone_flags argument. The low byte (CSIGNAL)
 * holds the signal to be sent when the child exits; every other bit selects
 * something the new task shares with, or gets separately from, its creator.
 */
#define CSIGNAL     0x000000ff  /* signal mask to be sent at exit */
#define CLONE_VM    0x00000100  /* set if VM shared between processes */
#define CLONE_FS    0x00000200  /* set if fs info shared between processes */
#define CLONE_FILES 0x00000400  /* set if open files shared between processes */
#define CLONE_SIGHAND   0x00000800  /* set if signal handlers and blocked signals shared */
#define CLONE_PTRACE    0x00002000  /* set if we want to let tracing continue on the child too */
#define CLONE_VFORK 0x00004000  /* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT    0x00008000  /* set if we want to have the same parent as the cloner */
#define CLONE_THREAD    0x00010000  /* Same thread group? */
#define CLONE_NEWNS 0x00020000  /* New namespace group? */
#define CLONE_SYSVSEM   0x00040000  /* share system V SEM_UNDO semantics */
#define CLONE_SETTLS    0x00080000  /* create a new TLS for the child */
#define CLONE_PARENT_SETTID 0x00100000  /* set the TID in the parent */
#define CLONE_CHILD_CLEARTID    0x00200000  /* clear the TID in the child */
#define CLONE_DETACHED      0x00400000  /* Unused, ignored */
#define CLONE_UNTRACED      0x00800000  /* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID  0x01000000  /* set the TID in the child */
/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
   and is now available for re-use. */
#define CLONE_NEWUTS        0x04000000  /* New utsname group? */
#define CLONE_NEWIPC        0x08000000  /* New ipcs */
#define CLONE_NEWUSER       0x10000000  /* New user namespace */
#define CLONE_NEWPID        0x20000000  /* New pid namespace */
#define CLONE_NEWNET        0x40000000  /* New network namespace */
#define CLONE_IO        0x80000000  /* Clone io context */

这些flag在创建进程线程时是非常重要的，通过这些标志一般就基本上可以确定创建的是进程还是线程。接下来就真正进入内核，看看fork、vfork、clone等函数的实现，如下：

/*
 * fork.c: entry point of the clone() system call.
 *
 * The order of the user-visible arguments depends on which
 * CONFIG_CLONE_BACKWARDS* option is defined; the four signatures below
 * differ only in that order and, for CONFIG_CLONE_BACKWARDS3, in an extra
 * stack_size argument. Whichever signature is selected, the shared body
 * forwards clone_flags, newsp and the two TID pointers to do_fork() with a
 * stack size of 0.
 *
 * NOTE(review): tls_val (and stack_size in the BACKWARDS3 variant) is not
 * passed on by this body - presumably it is picked up elsewhere; confirm.
 */
#ifdef __ARCH_WANT_SYS_CLONE
#ifdef CONFIG_CLONE_BACKWARDS
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
         int __user *, parent_tidptr,
         int, tls_val,
         int __user *, child_tidptr)
#elif defined(CONFIG_CLONE_BACKWARDS2)
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
         int __user *, parent_tidptr,
         int __user *, child_tidptr,
         int, tls_val)
#elif defined(CONFIG_CLONE_BACKWARDS3)
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
        int, stack_size,
        int __user *, parent_tidptr,
        int __user *, child_tidptr,
        int, tls_val)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
         int __user *, parent_tidptr,
         int __user *, child_tidptr,
         int, tls_val)
#endif
{
    return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
}
#endif

do_fork函数

实际上边的那些函数最终都是调用do_fork()函数来实现的。

/*
 * Main fork routine: builds the new task with copy_process() and, if that
 * succeeds, wakes it up. With CLONE_VFORK it also waits on the vfork
 * completion before returning.
 *
 * @clone_flags:   the low byte (CSIGNAL) holds the signal sent to the parent
 *                 when the child exits (usually SIGCHLD); the higher bits
 *                 carry the CLONE_* flags such as CLONE_VM
 * @stack_start:   start address of the user-mode stack
 * @stack_size:    forwarded unchanged to copy_process(); the clone() system
 *                 call passes 0
 * @parent_tidptr: user-space pointer that receives the new task's id when
 *                 CLONE_PARENT_SETTID is set
 * @child_tidptr:  user-space pointer handed on to copy_process()
 *
 * Returns the value of task_pid_vnr() for the new task on success, or the
 * error code carried by the pointer that copy_process() returned.
 */
long do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr)
{
    struct task_struct *p;
    int trace = 0;
    long nr;
 
    /*
     * Determine whether and which event to report to the ptracer. When
     * CLONE_UNTRACED is set (explicitly, or because the call comes from
     * kernel_thread), no event is reported; otherwise an event is chosen
     * from the kind of fork and reported only if it is enabled.
     * Note: this block only selects the ptrace event; it does not validate
     * the flag combination (copy_process() does that).
     */
    if (!(clone_flags & CLONE_UNTRACED)) {
        if (clone_flags & CLONE_VFORK)
            trace = PTRACE_EVENT_VFORK;
        else if ((clone_flags & CSIGNAL) != SIGCHLD)
            trace = PTRACE_EVENT_CLONE;
        else
            trace = PTRACE_EVENT_FORK;
 
        if (likely(!ptrace_event_enabled(current, trace)))
            trace = 0;
    }
 
    p = copy_process(clone_flags, stack_start, stack_size,
             child_tidptr, NULL, trace);
    /*
     * Do this prior waking up the new thread - the thread pointer
     * might get invalid after that point, if the thread exits quickly.
     */
    if (!IS_ERR(p)) {
        struct completion vfork;
 
        trace_sched_process_fork(current, p);
 
        nr = task_pid_vnr(p); // id of the new task; this is the return value
 
        /* the result of put_user() is not checked here */
        if (clone_flags & CLONE_PARENT_SETTID)
            put_user(nr, parent_tidptr);
 
        /* set up the completion before the child can run and signal it */
        if (clone_flags & CLONE_VFORK) {
            p->vfork_done = &vfork;
            init_completion(&vfork);
            get_task_struct(p);
        }
 
        // copy_process() succeeded: wake the newly created task so it can run
        wake_up_new_task(p);
 
        /* forking complete and child started to run, tell ptracer */
        if (unlikely(trace))
            ptrace_event(trace, nr);
 
        if (clone_flags & CLONE_VFORK) {
            if (!wait_for_vfork_done(p, &vfork))
                ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
        }
    } else {
        nr = PTR_ERR(p);
    }
    return nr;
}

copy_process函数

do_fork函数中主要调用了copy_process函数实现从父进程拷贝一个新的进程，其中设置有子进程的私有数据、内存空间等资源

 
/*
 * Creates a new task as a copy of the current one, but does not start it.
 * Copies the registers and all the relevant parts of the process
 * environment, as selected by the clone flags.
 *
 * Invalid flag combinations are rejected up front with ERR_PTR(-EINVAL).
 * Later failures jump to cleanup labels that lie in the part of the
 * function elided from this excerpt.
 */
static struct task_struct *copy_process(unsigned long clone_flags,
                    unsigned long stack_start,
                    unsigned long stack_size,
                    int __user *child_tidptr,
                    struct pid *pid,
                    int trace)
{
    int retval;
    struct task_struct *p;
 
    // CLONE_NEWNS and CLONE_FS must not be set together; reject with -EINVAL
    if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
        return ERR_PTR(-EINVAL);
 
    // CLONE_NEWUSER and CLONE_FS must not be set together either
    if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
        return ERR_PTR(-EINVAL);
 
    /*
     * Thread groups must share signals as well, and detached threads
     * can only be started up within the thread group.
     */
    // CLONE_THREAD requires CLONE_SIGHAND; otherwise fail
    if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
        return ERR_PTR(-EINVAL);
 
    /*
     * Shared signal handlers imply shared VM. By way of the above,
     * thread groups also imply shared VM. Blocking this case allows
     * for various simplifications in other code.
     */
    // CLONE_SIGHAND requires CLONE_VM; otherwise fail
    if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
        return ERR_PTR(-EINVAL);
 
    /*
     * Siblings of global init remain as zombies on exit since they are
     * not reaped by their parent (swapper). To solve this and to avoid
     * multi-rooted process trees, prevent global and container-inits
     * from creating siblings.
     */
    if ((clone_flags & CLONE_PARENT) &&
                current->signal->flags & SIGNAL_UNKILLABLE)
        return ERR_PTR(-EINVAL);
 
    /*
     * If the new process will be in a different pid or user namespace
     * do not allow it to share a thread group or signal handlers or
     * parent with the forking task.
     */
    if (clone_flags & CLONE_SIGHAND) {
        if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
            (task_active_pid_ns(current) !=
                current->nsproxy->pid_ns_for_children))
            return ERR_PTR(-EINVAL);
    }
 
    // security hook consulted before creating the task; non-zero aborts the fork
    retval = security_task_create(clone_flags);
    if (retval)
        goto fork_out;
 
    retval = -ENOMEM;
    p = dup_task_struct(current);  // duplicate current's task_struct for the child; NULL is treated as out of memory
    if (!p)
        goto fork_out;
 
    ftrace_graph_init_task(p);
    get_seccomp_filter(p);
 
    rt_mutex_init_task(p);
 
#ifdef CONFIG_PROVE_LOCKING
    DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
    DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
    retval = -EAGAIN;
 
    /*
     * Per-user process limit: if the user's process count has reached
     * RLIMIT_NPROC, the fork fails with -EAGAIN unless the user is
     * INIT_USER or the caller has CAP_SYS_RESOURCE or CAP_SYS_ADMIN.
     */
    if (atomic_read(&p->real_cred->user->processes) >=
            task_rlimit(p, RLIMIT_NPROC)) {
        if (p->real_cred->user != INIT_USER &&
            !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
            goto bad_fork_free;
    }
    current->flags &= ~PF_NPROC_EXCEEDED;
 
    retval = copy_creds(p, clone_flags);
    if (retval < 0)
        goto bad_fork_free;
 
    /*
     * If multiple threads are within copy_process(), then this check
     * triggers too late. This doesn't hurt, the check is only there
     * to stop root fork bombs.
     */
    retval = -EAGAIN;
    // system-wide limit: fail once nr_threads has reached max_threads
    if (nr_threads >= max_threads)
        goto bad_fork_cleanup_count;
 
    // take a reference on the module behind the task's exec_domain; fail if that is not possible
    if (!try_module_get(task_thread_info(p)->exec_domain->module))
        goto bad_fork_cleanup_count;
 
    /*
     * Reset per-task state that must not be inherited: did_exec is
     * cleared, and the children/sibling lists, the pending signals and the
     * time accounting fields below are re-initialised.
     * NOTE(review): did_exec presumably records whether the task has
     * called exec - confirm.
     */
    p->did_exec = 0;
    delayacct_tsk_init(p);  /* Must remain after dup_task_struct() */
    /*
     * copy_flags() updates the flags of the new task.
     * NOTE(review): it is defined elsewhere; presumably it clears
     * PF_SUPERPRIV and sets PF_FORKNOEXEC - confirm.
     */
    copy_flags(clone_flags, p);
    INIT_LIST_HEAD(&p->children);
    INIT_LIST_HEAD(&p->sibling);
    rcu_copy_process(p);
    p->vfork_done = NULL;
    spin_lock_init(&p->alloc_lock);
 
    init_sigpending(&p->pending);
 
    p->utime = p->stime = p->gtime = 0;
    p->utimescaled = p->stimescaled = 0;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
    p->prev_cputime.utime = p->prev_cputime.stime = 0;
#endif
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
    seqlock_init(&p->vtime_seqlock);
    p->vtime_snap = 0;
    p->vtime_snap_whence = VTIME_SLEEPING;
#endif
 
#if defined(SPLIT_RSS_COUNTING)
    memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif
   …………
    /* Perform scheduler related setup. Assign this task to a CPU. */
    sched_fork(clone_flags, p);
    /*
     * The copy_*() helpers below receive clone_flags and, depending on the
     * flags, either share or duplicate the parent's resources: open files,
     * filesystem info, signal handlers, address space, namespaces and so
     * on. Each failure unwinds through the matching cleanup label.
     * NOTE(review): only copy_mm() is shown in this file; the share-vs-copy
     * behaviour of the other helpers is assumed from their names.
     */
    retval = perf_event_init_task(p);
    if (retval)
        goto bad_fork_cleanup_policy;
    retval = audit_alloc(p);
    if (retval)
        goto bad_fork_cleanup_policy;
    /* copy all the process information */
    retval = copy_semundo(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_audit;
    retval = copy_files(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_semundo;
    retval = copy_fs(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_files;
    retval = copy_sighand(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_fs;
    retval = copy_signal(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_sighand;
    // copy_mm() shares the address space (CLONE_VM) or duplicates it
    retval = copy_mm(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_signal;
    retval = copy_namespaces(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_mm;
    retval = copy_io(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_namespaces;
    retval = copy_thread(clone_flags, stack_start, stack_size, p);
    if (retval)
        goto bad_fork_cleanup_io;
 
    /*
     * Allocate a PID for the new task, unless the caller passed
     * &init_struct_pid.
     */
    if (pid != &init_struct_pid) {
        retval = -ENOMEM;
        pid = alloc_pid(p->nsproxy->pid_ns_for_children);
        if (!pid)
            goto bad_fork_cleanup_io;
    }
 
    // keep the user pointer for the child's TID only if CLONE_CHILD_SETTID is set
    p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
    /*
     * Clear TID on mm_release()?
     */
    p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
#ifdef CONFIG_BLOCK
    p->plug = NULL;
#endif
#ifdef CONFIG_FUTEX
    p->robust_list = NULL;
#ifdef CONFIG_COMPAT
    p->compat_robust_list = NULL;
#endif
    INIT_LIST_HEAD(&p->pi_state_list);
    p->pi_state_cache = NULL;
#endif
    /*
     * sigaltstack should be cleared when sharing the same VM
     */
    if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
        p->sas_ss_sp = p->sas_ss_size = 0;
 
    /*
     * Syscall tracing and stepping should be turned off in the
     * child regardless of CLONE_PTRACE.
     */
    user_disable_single_step(p);
    clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
    clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
    clear_all_latency_tracing(p);
 
    /* ok, now we should be set up.. */
    p->pid = pid_nr(pid);
    /*
     * A CLONE_THREAD task joins the caller's thread group: it takes the
     * caller's tgid and group leader and has no exit signal (-1).
     * Otherwise the new task leads its own group and its tgid is its pid.
     */
    if (clone_flags & CLONE_THREAD) {
        p->exit_signal = -1;
        p->group_leader = current->group_leader;
        p->tgid = current->tgid;
    } else {
        if (clone_flags & CLONE_PARENT)
            p->exit_signal = current->group_leader->exit_signal;
        else
            p->exit_signal = (clone_flags & CSIGNAL);
        p->group_leader = p;
        p->tgid = p->pid;
    }
 
    p->pdeath_signal = 0;
    p->exit_state = 0;
 
    p->nr_dirtied = 0;
    p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
    p->dirty_paused_when = 0;
 
    INIT_LIST_HEAD(&p->thread_group);
    p->task_works = NULL;
 …………
}

copy_process函数中首先调用dup_task_struct()函数为子进程创建task_struct结构体等信息，然后根据clone_flags集合中的标志值，设置共享或者复制父进程打开的文件、文件系统信息、信号处理函数、进程地址空间、命名空间等资源，其中copy_mm函数实现父进程地址空间的拷贝，也就是fork创建子进程时的写时复制机制的核心处了，接下来看看这个函数的实现。

父进程地址空间的拷贝copy_mm()

/*
 * Give the new task its address space.
 *
 * @clone_flags: clone flags; only CLONE_VM is examined here
 * @tsk:         task_struct of the task being created
 *
 * With CLONE_VM the task shares the caller's mm (one more user is counted
 * on it); without it the mm is duplicated through dup_mm(). When the caller
 * has no mm, the task is left with no mm and the call succeeds.
 *
 * Returns 0 on success, -ENOMEM if dup_mm() fails.
 */
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
{
    struct mm_struct *parent_mm;
    struct mm_struct *child_mm;

    /* fault and context-switch counters start from zero */
    tsk->min_flt = tsk->maj_flt = 0;
    tsk->nvcsw = tsk->nivcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
    tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
#endif

    tsk->mm = NULL;
    tsk->active_mm = NULL;

    /*
     * Are we cloning a kernel thread?
     * We need to steal a active VM for that..
     */
    parent_mm = current->mm;
    if (!parent_mm)
        return 0;

    if (clone_flags & CLONE_VM) {
        /* shared address space (the pthread_create case): add a user */
        atomic_inc(&parent_mm->mm_users);
        child_mm = parent_mm;
    } else {
        /* separate address space (the fork case): duplicate the mm */
        child_mm = dup_mm(tsk);
        if (!child_mm)
            return -ENOMEM;
    }

    tsk->mm = child_mm;
    tsk->active_mm = child_mm;
    return 0;
}

copy_mm()中检查clone_flags中是否有CLONE_VM标志，若有，两个进程之间共享VM，也就是创建轻量级进程(线程)；否则，就是fork创建进程，从而调用dup_mm()函数为子进程分配一个新的mm_struct结构体。

子进程的写时复制机制

/*
 * Allocate a new mm structure and fill it from the mm of the current task
 * (current->mm); @tsk is the task the new mm is being set up for.
 *
 * Returns the new mm, or NULL when current has no mm or any step fails.
 */
struct mm_struct *dup_mm(struct task_struct *tsk)
{
    struct mm_struct *mm, *oldmm = current->mm;
    int err;
 
    if (!oldmm)
        return NULL;
 
    mm = allocate_mm();  // allocate the new mm_struct (NOTE(review): presumably from a slab cache - confirm)
    if (!mm)
        goto fail_nomem;
 
    memcpy(mm, oldmm, sizeof(*mm));  // start from a byte-for-byte copy of the old mm
    mm_init_cpumask(mm);
 
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
    mm->pmd_huge_pte = NULL;
#endif
    /* NOTE(review): mm is not freed on this path here - presumably mm_init() releases it on failure; confirm */
    if (!mm_init(mm, tsk))
        goto fail_nomem;
 
    if (init_new_context(tsk, mm))
        goto fail_nocontext;
 
    dup_mm_exe_file(oldmm, mm);  // presumably carries over the old mm's executable file reference - confirm
 
    err = dup_mmap(mm, oldmm);  // duplicate the old mm's mappings into the new mm
    if (err)
        goto free_pt;
 
    mm->hiwater_rss = get_mm_rss(mm);
    mm->hiwater_vm = mm->total_vm;
 
    if (mm->binfmt && !try_module_get(mm->binfmt->module))
        goto free_pt;
 
    return mm;
 
free_pt:
    /* don't put binfmt in mmput, we haven't got module yet */
    mm->binfmt = NULL;
    mmput(mm);
 
fail_nomem:
    return NULL;
 
fail_nocontext:
    /*
     * If init_new_context() failed, we cannot use mmput() to free the mm
     * because it calls destroy_context()
     */
    mm_free_pgd(mm);
    free_mm(mm);
    return NULL;
}

使用dup_mmap()函数为子进程拷贝父进程地址空间，其中调用copy_page_range()函数进行页表的拷贝，由于Linux中采用四级分页机制，分别是pgd、pud、pmd、pte，因而依次对其进行拷贝，最终在拷贝pte的函数copy_pte_range中调用copy_one_pte函数实现真正的写时复制。

copy_one_pte()函数

 
/*
 * Copy a single page-table entry from src_mm to dst_mm.
 *
 * copy one vm_area from one task to the other. Assumes the page tables
 * already present in the new task to be cleared in the whole range
 * covered by this vma.
 *
 * @dst_mm, @src_mm:   destination and source address spaces
 * @dst_pte, @src_pte: destination and source entries for @addr
 * @vma:               the area being copied; its vm_flags decide the
 *                     write-protect and clean handling below
 * @addr:              virtual address the entries map
 * @rss:               counters indexed by MM_*; the one matching the copied
 *                     entry is incremented
 *
 * Returns 0 once the destination entry has been set. If the source holds a
 * swap entry and swap_duplicate() fails, returns entry.val without setting
 * the destination entry.
 */
 
static inline unsigned long
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
        unsigned long addr, int *rss)
{
    unsigned long vm_flags = vma->vm_flags;
    pte_t pte = *src_pte;
    struct page *page;
 
    /* pte contains position in swap or file, so copy. */
    if (unlikely(!pte_present(pte))) {
        if (!pte_file(pte)) {
            swp_entry_t entry = pte_to_swp_entry(pte);
 
            if (swap_duplicate(entry) < 0)
                return entry.val;
 
            /* make sure dst_mm is on swapoff's mmlist. */
            if (unlikely(list_empty(&dst_mm->mmlist))) {
                spin_lock(&mmlist_lock);
                /* re-check under the lock before adding */
                if (list_empty(&dst_mm->mmlist))
                    list_add(&dst_mm->mmlist,
                         &src_mm->mmlist);
                spin_unlock(&mmlist_lock);
            }
            if (likely(!non_swap_entry(entry)))
                rss[MM_SWAPENTS]++;
            else if (is_migration_entry(entry)) {
                page = migration_entry_to_page(entry);
 
                if (PageAnon(page))
                    rss[MM_ANONPAGES]++;
                else
                    rss[MM_FILEPAGES]++;
 
                if (is_write_migration_entry(entry) &&
                    is_cow_mapping(vm_flags)) {
                    /*
                     * COW mappings require pages in both
                     * parent and child to be set to read.
                     * The source entry is rewritten as a
                     * read migration entry as well.
                     */
                    make_migration_entry_read(&entry);
                    pte = swp_entry_to_pte(entry);
                    if (pte_swp_soft_dirty(*src_pte))
                        pte = pte_swp_mksoft_dirty(pte);
                    set_pte_at(src_mm, addr, src_pte, pte);
                }
            }
        }
        goto out_set_pte;
    }
 
    /*
     * If it's a COW mapping, write protect it both
     * in the parent and the child
     */
    if (is_cow_mapping(vm_flags)) {
        ptep_set_wrprotect(src_mm, addr, src_pte);
        pte = pte_wrprotect(pte);
    }
 
    /*
     * If it's a shared mapping, mark it clean in
     * the child
     */
    if (vm_flags & VM_SHARED)
        pte = pte_mkclean(pte);
    pte = pte_mkold(pte);
 
    /* take a page reference and count the page in the matching rss slot */
    page = vm_normal_page(vma, addr, pte);
    if (page) {
        get_page(page);
        page_dup_rmap(page);
        if (PageAnon(page))
            rss[MM_ANONPAGES]++;
        else
            rss[MM_FILEPAGES]++;
    }
 
out_set_pte:
    set_pte_at(dst_mm, addr, dst_pte, pte);
    return 0;
}

在该函数中判断页是否支持写时复制，若支持就给其添加写保护，在写操作发生时，发生写保护错误，从而为子进程新分配一块内存。以上就是自己对创建进程线程的一些理解，如有错误，还望不吝指教。

阅读全文

0 0