6、分析Linux内核创建一个新进程的过程

来源:互联网 发布:软件导刊 编辑:程序博客网 时间:2024/05/16 05:58

姓名:周毅。原创作品,转载请注明出处。《Linux内核分析》MOOC课程:http://mooc.study.163.com/course/USTC-1000029000
这篇文章主要分析用户进程调用fork系统调用时,Linux内核依次执行了哪些过程。
一、fork系统调用代码分析
fork()允许在用户态下创建新的进程。fork创建的子进程复制了父进程的资源,包括内存的内容和task_struct的内容;新旧进程使用同一代码段,并复制数据段和堆栈段。在Linux内核中,用户态创建进程的fork()、clone()、vfork()分别对应系统调用处理函数sys_fork()、sys_clone()、sys_vfork()。这三个函数都是通过调用内核函数do_fork()来实现的,区别只在于传给do_fork()的clone_flags等参数不同。
do_fork定义在linux-3.18.6/kernel/fork.c中:

1623long do_fork(unsigned long clone_flags,1624          unsigned long stack_start,1625          unsigned long stack_size,1626          int __user *parent_tidptr,1627          int __user *child_tidptr)1628{1629    struct task_struct *p; //创建新的进程描述符1630    int trace = 0;1631    long nr;16321633    /*1634     * Determine whether and which event to report to ptracer.  When1635     * called from kernel_thread or CLONE_UNTRACED is explicitly1636     * requested, no event is reported; otherwise, report if the event1637     * for the type of forking is enabled.1638     */1639    if (!(clone_flags & CLONE_UNTRACED)) {1640        if (clone_flags & CLONE_VFORK)1641            trace = PTRACE_EVENT_VFORK;1642        else if ((clone_flags & CSIGNAL) != SIGCHLD)1643            trace = PTRACE_EVENT_CLONE;1644        else1645            trace = PTRACE_EVENT_FORK;16461647        if (likely(!ptrace_event_enabled(current, trace)))1648            trace = 0;1649    }1650    //复制父进程的进程数据1651    p = copy_process(clone_flags, stack_start, stack_size,1652             child_tidptr, NULL, trace);1653    /*1654     * Do this prior waking up the new thread - the thread pointer1655     * might get invalid after that point, if the thread exits quickly.1656     */1657    if (!IS_ERR(p)) {1658        struct completion vfork;1659        struct pid *pid;16601661        trace_sched_process_fork(current, p);16621663        pid = get_task_pid(p, PIDTYPE_PID);1664        nr = pid_vnr(pid);16651666        if (clone_flags & CLONE_PARENT_SETTID)1667            put_user(nr, parent_tidptr);1668        1669        if (clone_flags & CLONE_VFORK) {1670            p->vfork_done = &vfork;1671            init_completion(&vfork);1672            get_task_struct(p);1673        }1674        //将子进程添加到调度器的队列,使得子进程有机会获得CPU1675        wake_up_new_task(p);16761677        /* forking complete and child started to run, tell ptracer */1678        if (unlikely(trace))1679            ptrace_event_pid(trace, pid);16801681        
if (clone_flags & CLONE_VFORK) {1682            if (!wait_for_vfork_done(p, &vfork))1683                ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);1684        }16851686        put_pid(pid);1687    } else {1688        nr = PTR_ERR(p);1689    }1690    return nr;1691}

copy_process负责复制父进程并创建子进程,其实现如下:

1182static struct task_struct *copy_process(unsigned long clone_flags,1183                    unsigned long stack_start,1184                    unsigned long stack_size,1185                    int __user *child_tidptr,1186                    struct pid *pid,1187                    int trace)1188{1189    int retval;1190    struct task_struct *p;//创建子进程PCB结构体11911192    if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))1193        return ERR_PTR(-EINVAL);11941195    if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))1196        return ERR_PTR(-EINVAL);11971198    /*1199     * Thread groups must share signals as well, and detached threads1200     * can only be started up within the thread group.1201     */1202    if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))1203        return ERR_PTR(-EINVAL);12041205    /*1206     * Shared signal handlers imply shared VM. By way of the above,1207     * thread groups also imply shared VM. Blocking this case allows1208     * for various simplifications in other code.1209     */1210    if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))1211        return ERR_PTR(-EINVAL);12121213    /*1214     * Siblings of global init remain as zombies on exit since they are1215     * not reaped by their parent (swapper). 
To solve this and to avoid1216     * multi-rooted process trees, prevent global and container-inits1217     * from creating siblings.1218     */1219    if ((clone_flags & CLONE_PARENT) &&1220                current->signal->flags & SIGNAL_UNKILLABLE)1221        return ERR_PTR(-EINVAL);12221223    /*1224     * If the new process will be in a different pid or user namespace1225     * do not allow it to share a thread group or signal handlers or1226     * parent with the forking task.1227     */1228    if (clone_flags & CLONE_SIGHAND) {1229        if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||1230            (task_active_pid_ns(current) !=1231                current->nsproxy->pid_ns_for_children))1232            return ERR_PTR(-EINVAL);1233    }12341235    retval = security_task_create(clone_flags);1236    if (retval)1237        goto fork_out;12381239    retval = -ENOMEM;1240    p = dup_task_struct(current);  //current为当前进程,复制当前进程PCB1241    if (!p)1242        goto fork_out;12431244    ftrace_graph_init_task(p);12451246    rt_mutex_init_task(p);12471248#ifdef CONFIG_PROVE_LOCKING1249    DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);1250    DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);1251#endif1252    retval = -EAGAIN;1253    if (atomic_read(&p->real_cred->user->processes) >=1254            task_rlimit(p, RLIMIT_NPROC)) {1255        if (p->real_cred->user != INIT_USER &&1256            !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))1257            goto bad_fork_free;1258    }1259    current->flags &= ~PF_NPROC_EXCEEDED;12601261    retval = copy_creds(p, clone_flags);1262    if (retval < 0)1263        goto bad_fork_free;12641265    /*1266     * If multiple threads are within copy_process(), then this check1267     * triggers too late. 
This doesn't hurt, the check is only there1268     * to stop root fork bombs.1269     */1270    retval = -EAGAIN;1271    if (nr_threads >= max_threads)1272        goto bad_fork_cleanup_count;12731274    if (!try_module_get(task_thread_info(p)->exec_domain->module))1275        goto bad_fork_cleanup_count;12761277    delayacct_tsk_init(p);  /* Must remain after dup_task_struct() */1278    p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);1279    p->flags |= PF_FORKNOEXEC;1280    INIT_LIST_HEAD(&p->children);1281    INIT_LIST_HEAD(&p->sibling);1282    rcu_copy_process(p);1283    p->vfork_done = NULL;1284    spin_lock_init(&p->alloc_lock);12851286    init_sigpending(&p->pending);1287    //复制完后我们需要修改子进程p内部的一系列数据1288    p->utime = p->stime = p->gtime = 0;1289    p->utimescaled = p->stimescaled = 0;1290#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE1291    p->prev_cputime.utime = p->prev_cputime.stime = 0;1292#endif1293#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN1294    seqlock_init(&p->vtime_seqlock);1295    p->vtime_snap = 0;1296    p->vtime_snap_whence = VTIME_SLEEPING;1297#endif12981299#if defined(SPLIT_RSS_COUNTING)1300    memset(&p->rss_stat, 0, sizeof(p->rss_stat));1301#endif13021303    p->default_timer_slack_ns = current->timer_slack_ns;13041305    task_io_accounting_init(&p->ioac);1306    acct_clear_integrals(p);13071308    posix_cpu_timers_init(p);13091310    p->start_time = ktime_get_ns();1311    p->real_start_time = ktime_get_boot_ns();1312    p->io_context = NULL;1313    p->audit_context = NULL;1314    if (clone_flags & CLONE_THREAD)1315        threadgroup_change_begin(current);1316    cgroup_fork(p);1317#ifdef CONFIG_NUMA1318    p->mempolicy = mpol_dup(p->mempolicy);1319    if (IS_ERR(p->mempolicy)) {1320        retval = PTR_ERR(p->mempolicy);1321        p->mempolicy = NULL;1322        goto bad_fork_cleanup_threadgroup_lock;1323    }1324#endif1325#ifdef CONFIG_CPUSETS1326    p->cpuset_mem_spread_rotor = NUMA_NO_NODE;1327    p->cpuset_slab_spread_rotor = NUMA_NO_NODE;1328    
seqcount_init(&p->mems_allowed_seq);1329#endif1330#ifdef CONFIG_TRACE_IRQFLAGS1331    p->irq_events = 0;1332    p->hardirqs_enabled = 0;1333    p->hardirq_enable_ip = 0;1334    p->hardirq_enable_event = 0;1335    p->hardirq_disable_ip = _THIS_IP_;1336    p->hardirq_disable_event = 0;1337    p->softirqs_enabled = 1;1338    p->softirq_enable_ip = _THIS_IP_;1339    p->softirq_enable_event = 0;1340    p->softirq_disable_ip = 0;1341    p->softirq_disable_event = 0;1342    p->hardirq_context = 0;1343    p->softirq_context = 0;1344#endif1345#ifdef CONFIG_LOCKDEP1346    p->lockdep_depth = 0; /* no locks held yet */1347    p->curr_chain_key = 0;1348    p->lockdep_recursion = 0;1349#endif13501351#ifdef CONFIG_DEBUG_MUTEXES1352    p->blocked_on = NULL; /* not blocked yet */1353#endif1354#ifdef CONFIG_BCACHE1355    p->sequential_io    = 0;1356    p->sequential_io_avg    = 0;1357#endif13581359    /* Perform scheduler related setup. Assign this task to a CPU. */1360    retval = sched_fork(clone_flags, p);1361    if (retval)1362        goto bad_fork_cleanup_policy;13631364    retval = perf_event_init_task(p);1365    if (retval)1366        goto bad_fork_cleanup_policy;1367    retval = audit_alloc(p);1368    if (retval)1369        goto bad_fork_cleanup_perf;1370    /* copy all the process information */1371    shm_init_task(p);1372    retval = copy_semundo(clone_flags, p);1373    if (retval)1374        goto bad_fork_cleanup_audit;1375    retval = copy_files(clone_flags, p);1376    if (retval)1377        goto bad_fork_cleanup_semundo;1378    retval = copy_fs(clone_flags, p);1379    if (retval)1380        goto bad_fork_cleanup_files;1381    retval = copy_sighand(clone_flags, p);1382    if (retval)1383        goto bad_fork_cleanup_fs;1384    retval = copy_signal(clone_flags, p);1385    if (retval)1386        goto bad_fork_cleanup_sighand;1387    retval = copy_mm(clone_flags, p);1388    if (retval)1389        goto bad_fork_cleanup_signal;1390    retval = copy_namespaces(clone_flags, 
p);1391    if (retval)1392        goto bad_fork_cleanup_mm;1393    retval = copy_io(clone_flags, p);1394    if (retval)1395        goto bad_fork_cleanup_namespaces;1396    retval = copy_thread(clone_flags, stack_start, stack_size, p);//此过程复制了一些关键数据,下面介绍1397    if (retval)1398        goto bad_fork_cleanup_io;13991400    if (pid != &init_struct_pid) {1401        retval = -ENOMEM;1402        pid = alloc_pid(p->nsproxy->pid_ns_for_children);1403        if (!pid)1404            goto bad_fork_cleanup_io;1405    }14061407    p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;1408    /*1409     * Clear TID on mm_release()?1410     */1411    p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;1412#ifdef CONFIG_BLOCK1413    p->plug = NULL;1414#endif1415#ifdef CONFIG_FUTEX1416    p->robust_list = NULL;1417#ifdef CONFIG_COMPAT1418    p->compat_robust_list = NULL;1419#endif1420    INIT_LIST_HEAD(&p->pi_state_list);1421    p->pi_state_cache = NULL;1422#endif1423    /*1424     * sigaltstack should be cleared when sharing the same VM1425     */1426    if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)1427        p->sas_ss_sp = p->sas_ss_size = 0;14281429    /*1430     * Syscall tracing and stepping should be turned off in the1431     * child regardless of CLONE_PTRACE.1432     */1433    user_disable_single_step(p);1434    clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);1435#ifdef TIF_SYSCALL_EMU1436    clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);1437#endif1438    clear_all_latency_tracing(p);14391440    /* ok, now we should be set up.. 
*/1441    p->pid = pid_nr(pid);1442    if (clone_flags & CLONE_THREAD) {1443        p->exit_signal = -1;1444        p->group_leader = current->group_leader;1445        p->tgid = current->tgid;1446    } else {1447        if (clone_flags & CLONE_PARENT)1448            p->exit_signal = current->group_leader->exit_signal;1449        else1450            p->exit_signal = (clone_flags & CSIGNAL);1451        p->group_leader = p;1452        p->tgid = p->pid;1453    }14541455    p->nr_dirtied = 0;1456    p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);1457    p->dirty_paused_when = 0;14581459    p->pdeath_signal = 0;1460    INIT_LIST_HEAD(&p->thread_group);1461    p->task_works = NULL;14621463    /*1464     * Make it visible to the rest of the system, but dont wake it up yet.1465     * Need tasklist lock for parent etc handling!1466     */1467    write_lock_irq(&tasklist_lock);14681469    /* CLONE_PARENT re-uses the old parent */1470    if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {1471        p->real_parent = current->real_parent;1472        p->parent_exec_id = current->parent_exec_id;1473    } else {1474        p->real_parent = current;1475        p->parent_exec_id = current->self_exec_id;1476    }14771478    spin_lock(&current->sighand->siglock);14791480    /*1481     * Copy seccomp details explicitly here, in case they were changed1482     * before holding sighand lock.1483     */1484    copy_seccomp(p);14851486    /*1487     * Process group and session signals need to be delivered to just the1488     * parent before the fork or both the parent and the child after the1489     * fork. 
Restart if a signal comes in before we add the new process to1490     * it's process group.1491     * A fatal signal pending means that current will exit, so the new1492     * thread can't slip out of an OOM kill (or normal SIGKILL).1493    */1494    recalc_sigpending();1495    if (signal_pending(current)) {1496        spin_unlock(&current->sighand->siglock);1497        write_unlock_irq(&tasklist_lock);1498        retval = -ERESTARTNOINTR;1499        goto bad_fork_free_pid;1500    }15011502    if (likely(p->pid)) {1503        ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);15041505        init_task_pid(p, PIDTYPE_PID, pid);1506        if (thread_group_leader(p)) {1507            init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));1508            init_task_pid(p, PIDTYPE_SID, task_session(current));15091510            if (is_child_reaper(pid)) {1511                ns_of_pid(pid)->child_reaper = p;1512                p->signal->flags |= SIGNAL_UNKILLABLE;1513            }15141515            p->signal->leader_pid = pid;1516            p->signal->tty = tty_kref_get(current->signal->tty);1517            list_add_tail(&p->sibling, &p->real_parent->children);1518            list_add_tail_rcu(&p->tasks, &init_task.tasks);1519            attach_pid(p, PIDTYPE_PGID);1520            attach_pid(p, PIDTYPE_SID);1521            __this_cpu_inc(process_counts);1522        } else {1523            current->signal->nr_threads++;1524            atomic_inc(&current->signal->live);1525            atomic_inc(&current->signal->sigcnt);1526            list_add_tail_rcu(&p->thread_group,1527                      &p->group_leader->thread_group);1528            list_add_tail_rcu(&p->thread_node,1529                      &p->signal->thread_head);1530        }1531        attach_pid(p, PIDTYPE_PID);1532        nr_threads++;1533    }15341535    total_forks++;1536    spin_unlock(&current->sighand->siglock);1537    syscall_tracepoint_update(p);1538    
write_unlock_irq(&tasklist_lock);15391540    proc_fork_connector(p);1541    cgroup_post_fork(p);1542    if (clone_flags & CLONE_THREAD)1543        threadgroup_change_end(current);1544    perf_event_fork(p);15451546    trace_task_newtask(p, clone_flags);1547    uprobe_copy_process(p, clone_flags);1548    //返回修改后的子进程1549    return p;15501551bad_fork_free_pid:1552    if (pid != &init_struct_pid)1553        free_pid(pid);1554bad_fork_cleanup_io:1555    if (p->io_context)1556        exit_io_context(p);1557bad_fork_cleanup_namespaces:1558    exit_task_namespaces(p);1559bad_fork_cleanup_mm:1560    if (p->mm)1561        mmput(p->mm);1562bad_fork_cleanup_signal:1563    if (!(clone_flags & CLONE_THREAD))1564        free_signal_struct(p->signal);1565bad_fork_cleanup_sighand:1566    __cleanup_sighand(p->sighand);1567bad_fork_cleanup_fs:1568    exit_fs(p); /* blocking */1569bad_fork_cleanup_files:1570    exit_files(p); /* blocking */1571bad_fork_cleanup_semundo:1572    exit_sem(p);1573bad_fork_cleanup_audit:1574    audit_free(p);1575bad_fork_cleanup_perf:1576    perf_event_free_task(p);1577bad_fork_cleanup_policy:1578#ifdef CONFIG_NUMA1579    mpol_put(p->mempolicy);1580bad_fork_cleanup_threadgroup_lock:1581#endif1582    if (clone_flags & CLONE_THREAD)1583        threadgroup_change_end(current);1584    delayacct_tsk_free(p);1585    module_put(task_thread_info(p)->exec_domain->module);1586bad_fork_cleanup_count:1587    atomic_dec(&p->cred->user->processes);1588    exit_creds(p);1589bad_fork_free:1590    free_task(p);1591fork_out:1592    return ERR_PTR(retval);1593}

dup_task_struct的具体实现如下:

305static struct task_struct *dup_task_struct(struct task_struct *orig)306{307 struct task_struct *tsk;308 struct thread_info *ti;309 int node = tsk_fork_get_node(orig);310 int err;311312 tsk = alloc_task_struct_node(node);//分配一个task_struct313 if (!tsk)314     return NULL;315316 ti = alloc_thread_info_node(tsk, node);//分配堆栈空间317 if (!ti)318     goto free_tsk;319320 err = arch_dup_task_struct(tsk, orig);//从orig复制task_struct至tsk321 if (err)322     goto free_ti;323324 tsk->stack = ti;//tsk的堆栈指向ti325#ifdef CONFIG_SECCOMP326 /*327  * We must handle setting up seccomp filters once we're under328  * the sighand lock in case orig has changed between now and329  * then. Until then, filter must be NULL to avoid messing up330  * the usage counts on the error path calling free_task.331  */332 tsk->seccomp.filter = NULL;333#endif334335 setup_thread_stack(tsk, orig);//复制堆栈内容336 clear_user_return_notifier(tsk);337 clear_tsk_need_resched(tsk);338 set_task_stack_end_magic(tsk);339340#ifdef CONFIG_CC_STACKPROTECTOR341 tsk->stack_canary = get_random_int();342#endif343344 /*345  * One for us, one for whoever does the "release_task()" (usually346  * parent)347  */348 atomic_set(&tsk->usage, 2);349#ifdef CONFIG_BLK_DEV_IO_TRACE350 tsk->btrace_seq = 0;351#endif352 tsk->splice_pipe = NULL;353 tsk->task_frag.page = NULL;354355 account_kernel_stack(ti, 1);356357 return tsk;358359free_ti:360 free_thread_info(ti);361free_tsk:362 free_task_struct(tsk);363 return NULL;364}

arch_dup_task_struct的实现如下:

290int __weak arch_dup_task_struct(struct task_struct *dst,291                        struct task_struct *src)292{293 *dst = *src;  //实际上就是把src的内容复制到dst294 return 0;295}

copy_thread定义在linux-3.18.6/arch/x86/kernel/process_32.c 中:

132int copy_thread(unsigned long clone_flags, unsigned long sp,133 unsigned long arg, struct task_struct *p)134{135 struct pt_regs *childregs = task_pt_regs(p);//栈顶地址136 struct task_struct *tsk;137 int err;138139 p->thread.sp = (unsigned long) childregs;//栈顶地址赋给sp140 p->thread.sp0 = (unsigned long) (childregs+1);141 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));142143 if (unlikely(p->flags & PF_KTHREAD)) {144     /* kernel thread */145     memset(childregs, 0, sizeof(struct pt_regs));146     p->thread.ip = (unsigned long) ret_from_kernel_thread;147     task_user_gs(p) = __KERNEL_STACK_CANARY;148     childregs->ds = __USER_DS;149     childregs->es = __USER_DS;150     childregs->fs = __KERNEL_PERCPU;151     childregs->bx = sp; /* function */152     childregs->bp = arg;153     childregs->orig_ax = -1;154     childregs->cs = __KERNEL_CS | get_kernel_rpl();155     childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;156     p->thread.io_bitmap_ptr = NULL;157     return 0;158 }159 *childregs = *current_pt_regs();//父进程的栈内容复制给子进程160 childregs->ax = 0;//子进程返回值为0161 if (sp)162     childregs->sp = sp;163164 p->thread.ip = (unsigned long) ret_from_fork;//子进程的运行入口165 task_user_gs(p) = get_user_gs(current_pt_regs());166167 p->thread.io_bitmap_ptr = NULL;168 tsk = current;169 err = -ENOMEM;170171 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {172     p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,173                     IO_BITMAP_BYTES, GFP_KERNEL);174     if (!p->thread.io_bitmap_ptr) {175         p->thread.io_bitmap_max = 0;176         return -ENOMEM;177     }178     set_tsk_thread_flag(p, TIF_IO_BITMAP);179 }180181 err = 0;182183 /*184  * Set a new TLS for the child thread?185  */186 if (clone_flags & CLONE_SETTLS)187     err = do_set_thread_area(p, -1,188         (struct user_desc __user *)childregs->si, 0);189190 if (err && p->thread.io_bitmap_ptr) {191     kfree(p->thread.io_bitmap_ptr);192     p->thread.io_bitmap_max = 0;193 }194 
return err;195}

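struct pt_regs定义在linux-3.18.6/arch/x86/include/uapi/asm/ptrace.h中。注意:下面列出的是该文件中供用户态使用的32位版本,字段名为ebx、eax、esp等;内核内部(例如上面的copy_thread)使用的是linux-3.18.6/arch/x86/include/asm/ptrace.h中的定义,字段顺序相同,但名称为bx、ax、sp等,所以代码中写的是childregs->ax、childregs->sp: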

17struct pt_regs {18  long ebx;19  long ecx;20  long edx;21  long esi;22  long edi;23  long ebp;24  long eax;25  int  xds;26  int  xes;27  int  xfs;28  int  xgs;29  long orig_eax;30  long eip;31  int  xcs;32  long eflags;33  long esp;34  int  xss;35};36

二、使用GDB跟踪fork调用
根据上面的分析,我们分别在sys_clone、do_fork、copy_process、copy_thread、ret_from_fork处设置断点(实验环境仍是前几篇文章用到的menu系统):
这里写图片描述
使用fork调用后,首先执行到sys_clone处(glibc的fork()封装实际发起的是clone系统调用,所以命中的是sys_clone而不是sys_fork):
这里写图片描述
接着执行到do_fork处:
这里写图片描述
copy_process:
这里写图片描述
copy_thread:
这里写图片描述
ret_from_fork:
这里写图片描述
发现最终只能追踪到syscall_exit处:
这里写图片描述

三、总结
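Linux创建一个新的进程是从复制父进程开始的:do_fork调用copy_process,后者首先通过dup_task_struct拷贝父进程的进程描述符(task_struct)并为子进程分配内核栈,然后通过copy_files、copy_mm等函数处理文件、内存(页表项)等资源,并根据子进程自己的情况修改相应的参数;接着copy_thread把父进程的pt_regs复制到子进程的内核栈,将返回值ax置为0,并把子进程的执行入口设为ret_from_fork;之后为子进程分配自己的进程号;最后do_fork通过wake_up_new_task把子进程加入调度队列,子进程被调度后即从ret_from_fork开始执行。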

0 0
原创粉丝点击