7、Linux内核如何装载和启动一个可执行程序

来源:互联网 发布:网络客户服务的优势有 编辑:程序博客网 时间:2024/06/13 11:51

姓名:周毅原创作品转载请注明出处 《Linux内核分析》MOOC课程http://mooc.study.163.com/course/USTC-1000029000
一、可执行文件的创建——预处理、编译和链接
这里写图片描述
预处理,替换宏定义等等:gcc -E -o hello.cpp hello.c
编译,编译成汇编文本代码:gcc -x cpp-output -S -o hello.s hello.cpp
汇编,汇编成目标文件:gcc -x assembler -c hello.s -o hello.o
链接共享库,得到可执行文件:gcc -o hello hello.o
静态链接:gcc -o hello.static hello.o -static
二、查看ELF目标文件格式
ELF(Executable and Linkable Format)是UNIX系统实验室(USL)作为应用程序二进制接口(Application Binary Interface,ABI)的一部分而开发和发布的,也是Linux的主要可执行文件格式。

#define EI_NIDENT  typedef struct{  unsigned char e_ident[EI_NIDENT];  Elf32_Half e_type;  Elf32_Half e_machine;  Elf32_Word e_version;  Elf32_Addr e_entry;  Elf32_Off e_phoff;  Elf32_Off e_shoff;  Elf32_Word e_flags;  Elf32_Half e_ehsize;  Elf32_Half e_phentsize;  Elf32_Half e_phnum;  Elf32_Half e_shentsize;  Elf32_Half e_shnum;  Elf32_Half e_shstrndx;  }Elf32_Ehdr;

e_type 它标识的是该文件的类型。
e_machine 表明运行该程序需要的体系结构。
e_version 表示文件的版本。
e_entry 程序的入口地址。
e_phoff 表示Program header table 在文件中的偏移量(以字节计数)。
e_shoff 表示Section header table 在文件中的偏移量(以字节计数)。
e_flags 对IA32而言,此项为0。
e_ehsize 表示ELF header大小(以字节计数)。
e_phentsize 表示Program header table中每一个条目的大小。
e_phnum 表示Program header table中有多少个条目。
e_shentsize 表示Section header table中的每一个条目的大小。 
e_shnum 表示Section header table中有多少个条目。
e_shstrndx 表示包含节名称的字符串表是第几个节(从零开始计数)。
这里写图片描述
可以看到elf头的各种信息。
三、可执行程序的装载
Shell本身不限制命令行参数的个数, 命令行参数的个数受限于命令自身
例如,int main(int argc, char *argv[])
又如, int main(int argc, char *argv[], char *envp[])
Shell会调用execve将命令行参数和环境参数传递给可执行程序的main函数
int execve(const char * filename,char * const argv[ ],char * const envp[ ]);
库函数exec*都是execve的封装例程
下面通过内核代码看看sys_execve的执行过程:
在linux-3.18.6/fs/exec.c中我们找到sys_execve的系统调用,实际上执行的是do_execve:

/*
 * execve system call entry point.
 *
 * Converts the user-space path pointer into the struct filename that
 * do_execve() takes, and forwards the still user-space argv/envp arrays
 * unchanged.  Returns whatever do_execve() returns.
 */
SYSCALL_DEFINE3(execve,
        const char __user *, filename,
        const char __user *const __user *, argv,
        const char __user *const __user *, envp)
{
    struct filename *kname = getname(filename);

    return do_execve(kname, argv, envp);
}

继续找到do_execve,它最终执行的是do_execve_common:

/*
 * Wrap the raw user-space argument and environment pointer arrays in
 * struct user_arg_ptr (through its .ptr.native member) and pass them,
 * together with filename, to do_execve_common().
 *
 * Returns the result of do_execve_common().
 */
int do_execve(struct filename *filename,
    const char __user *const __user *__argv,
    const char __user *const __user *__envp)
{
    struct user_arg_ptr native_args = { .ptr.native = __argv };
    struct user_arg_ptr native_env = { .ptr.native = __envp };

    return do_execve_common(filename, native_args, native_env);
}

找到do_execve_common:

static int do_execve_common(struct filename *filename,                struct user_arg_ptr argv,                struct user_arg_ptr envp){    struct linux_binprm *bprm;// linux_binprm结构体用于维护程序执行过程中所使用的各种数据。     struct file *file;    struct files_struct *displaced;    int retval;    if (IS_ERR(filename))        return PTR_ERR(filename);    /*     * We move the actual failure in case of RLIMIT_NPROC excess from     * set*uid() to execve() because too many poorly written programs     * don't check setuid() return code.  Here we additionally recheck     * whether NPROC limit is still exceeded.     */    if ((current->flags & PF_NPROC_EXCEEDED) &&        atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {        retval = -EAGAIN;        goto out_ret;    }   //下面是初始化bprm的过程    /* We're below the limit (still or again), so we don't want to make     * further execve() calls fail. */    current->flags &= ~PF_NPROC_EXCEEDED;    retval = unshare_files(&displaced);    if (retval)        goto out_ret;    retval = -ENOMEM;    bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);    if (!bprm)        goto out_files;    retval = prepare_bprm_creds(bprm);    if (retval)        goto out_free;    check_unsafe_exec(bprm);    current->in_execve = 1;    file = do_open_exec(filename);    retval = PTR_ERR(file);    if (IS_ERR(file))        goto out_unmark;    sched_exec();    bprm->file = file;    bprm->filename = bprm->interp = filename->name;    retval = bprm_mm_init(bprm);    if (retval)        goto out_unmark;    bprm->argc = count(argv, MAX_ARG_STRINGS);    if ((retval = bprm->argc) < 0)        goto out;    bprm->envc = count(envp, MAX_ARG_STRINGS);    if ((retval = bprm->envc) < 0)        goto out;    retval = prepare_binprm(bprm);    if (retval < 0)        goto out;    retval = copy_strings_kernel(1, &bprm->filename, bprm);    if (retval < 0)        goto out;    bprm->exec = bprm->p;    retval = copy_strings(bprm->envc, envp, bprm);    if (retval < 0)        goto out;    retval = 
copy_strings(bprm->argc, argv, bprm);    if (retval < 0)        goto out;    retval = exec_binprm(bprm);//加载程序    if (retval < 0)        goto out;    /* execve succeeded */    current->fs->in_exec = 0;    current->in_execve = 0;    acct_update_integrals(current);    task_numa_free(current);    free_bprm(bprm);    putname(filename);    if (displaced)        put_files_struct(displaced);    return retval;out:    if (bprm->mm) {        acct_arg_size(bprm, 0);        mmput(bprm->mm);    }out_unmark:    current->fs->in_exec = 0;    current->in_execve = 0;out_free:    free_bprm(bprm);out_files:    if (displaced)        reset_files_struct(displaced);out_ret:    putname(filename);    return retval;}

我们找到exec_binprm:

static int exec_binprm(struct linux_binprm *bprm){    pid_t old_pid, old_vpid;    int ret;    /* Need to fetch pid before load_binary changes it */    old_pid = current->pid;    rcu_read_lock();    old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));    rcu_read_unlock();    ret = search_binary_handler(bprm);//寻找文件格式对应的解析模块    if (ret >= 0) {        audit_bprm(bprm);        trace_sched_process_exec(current, old_pid, bprm);        ptrace_event(PTRACE_EVENT_EXEC, old_vpid);        proc_exec_connector(current);    }    return ret;}

找到search_binary_handler:

int search_binary_handler(struct linux_binprm *bprm){    bool need_retry = IS_ENABLED(CONFIG_MODULES);    struct linux_binfmt *fmt;    int retval;    /* This allows 4 levels of binfmt rewrites before failing hard. */    if (bprm->recursion_depth > 5)        return -ELOOP;    retval = security_bprm_check(bprm);    if (retval)        return retval;    retval = -ENOENT; retry:    read_lock(&binfmt_lock);    list_for_each_entry(fmt, &formats, lh) {        if (!try_module_get(fmt->module))            continue;        read_unlock(&binfmt_lock);        bprm->recursion_depth++;        retval = fmt->load_binary(bprm);//寻找文件格式对应的解析模块        read_lock(&binfmt_lock);        put_binfmt(fmt);        bprm->recursion_depth--;        if (retval < 0 && !bprm->mm) {            /* we got to flush_old_exec() and failed after it */            read_unlock(&binfmt_lock);            force_sigsegv(SIGSEGV, current);            return retval;        }        if (retval != -ENOEXEC || !bprm->file) {            read_unlock(&binfmt_lock);            return retval;        }    }    read_unlock(&binfmt_lock);    if (need_retry) {        if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&            printable(bprm->buf[2]) && printable(bprm->buf[3]))            return retval;        if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0)            return retval;        need_retry = false;        goto retry;    }    return retval;}

对于ELF格式的可执行文件,fmt->load_binary(bprm)实际执行的应该是load_elf_binary,其内部主要是对ELF文件格式的解析:

static int load_elf_binary(struct linux_binprm *bprm){    ///解析elf文件格式    struct file *interpreter = NULL; /* to shut gcc up */    unsigned long load_addr = 0, load_bias = 0;    int load_addr_set = 0;    char * elf_interpreter = NULL;    unsigned long error;    struct elf_phdr *elf_ppnt, *elf_phdata;    unsigned long elf_bss, elf_brk;    int retval, i;    unsigned int size;    unsigned long elf_entry;    unsigned long interp_load_addr = 0;    unsigned long start_code, end_code, start_data, end_data;    unsigned long reloc_func_desc __maybe_unused = 0;    int executable_stack = EXSTACK_DEFAULT;    struct pt_regs *regs = current_pt_regs();    struct {        struct elfhdr elf_ex;        struct elfhdr interp_elf_ex;    } *loc;    loc = kmalloc(sizeof(*loc), GFP_KERNEL);    if (!loc) {        retval = -ENOMEM;        goto out_ret;    }    /* Get the exec-header */    loc->elf_ex = *((struct elfhdr *)bprm->buf);    retval = -ENOEXEC;    /* First of all, some simple consistency checks */    if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0)        goto out;    if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN)        goto out;    if (!elf_check_arch(&loc->elf_ex))        goto out;    if (!bprm->file->f_op->mmap)        goto out;    /* Now read in all of the header information */    if (loc->elf_ex.e_phentsize != sizeof(struct elf_phdr))        goto out;    if (loc->elf_ex.e_phnum < 1 ||        loc->elf_ex.e_phnum > 65536U / sizeof(struct elf_phdr))        goto out;    size = loc->elf_ex.e_phnum * sizeof(struct elf_phdr);    retval = -ENOMEM;    elf_phdata = kmalloc(size, GFP_KERNEL);    if (!elf_phdata)        goto out;    retval = kernel_read(bprm->file, loc->elf_ex.e_phoff,                 (char *)elf_phdata, size);    if (retval != size) {        if (retval >= 0)            retval = -EIO;        goto out_free_ph;    }    elf_ppnt = elf_phdata;    elf_bss = 0;    elf_brk = 0;    start_code = ~0UL;    end_code = 0;    start_data = 0;    end_data = 0; 
   for (i = 0; i < loc->elf_ex.e_phnum; i++) {        if (elf_ppnt->p_type == PT_INTERP) {            /* This is the program interpreter used for             * shared libraries - for now assume that this             * is an a.out format binary             */            retval = -ENOEXEC;            if (elf_ppnt->p_filesz > PATH_MAX ||                 elf_ppnt->p_filesz < 2)                goto out_free_ph;            retval = -ENOMEM;            elf_interpreter = kmalloc(elf_ppnt->p_filesz,                          GFP_KERNEL);            if (!elf_interpreter)                goto out_free_ph;            retval = kernel_read(bprm->file, elf_ppnt->p_offset,                         elf_interpreter,                         elf_ppnt->p_filesz);            if (retval != elf_ppnt->p_filesz) {                if (retval >= 0)                    retval = -EIO;                goto out_free_interp;            }            /* make sure path is NULL terminated */            retval = -ENOEXEC;            if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')                goto out_free_interp;            interpreter = open_exec(elf_interpreter);            retval = PTR_ERR(interpreter);            if (IS_ERR(interpreter))                goto out_free_interp;            /*             * If the binary is not readable then enforce             * mm->dumpable = 0 regardless of the interpreter's             * permissions.             
*/            would_dump(bprm, interpreter);            retval = kernel_read(interpreter, 0, bprm->buf,                         BINPRM_BUF_SIZE);            if (retval != BINPRM_BUF_SIZE) {                if (retval >= 0)                    retval = -EIO;                goto out_free_dentry;            }            /* Get the exec headers */            loc->interp_elf_ex = *((struct elfhdr *)bprm->buf);            break;        }        elf_ppnt++;    }    elf_ppnt = elf_phdata;    for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++)        if (elf_ppnt->p_type == PT_GNU_STACK) {            if (elf_ppnt->p_flags & PF_X)                executable_stack = EXSTACK_ENABLE_X;            else                executable_stack = EXSTACK_DISABLE_X;            break;        }    /* Some simple consistency checks for the interpreter */    if (elf_interpreter) {        retval = -ELIBBAD;        /* Not an ELF interpreter */        if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0)            goto out_free_dentry;        /* Verify the interpreter has a valid arch */        if (!elf_check_arch(&loc->interp_elf_ex))            goto out_free_dentry;    }    /* Flush all traces of the currently running executable */    retval = flush_old_exec(bprm);    if (retval)        goto out_free_dentry;    /* Do this immediately, since STACK_TOP as used in setup_arg_pages       may depend on the personality.  */    SET_PERSONALITY(loc->elf_ex);    if (elf_read_implies_exec(loc->elf_ex, executable_stack))        current->personality |= READ_IMPLIES_EXEC;    if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)        current->flags |= PF_RANDOMIZE;    setup_new_exec(bprm);    /* Do this so that we can load the interpreter, if need be.  
We will       change some of these later */    retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),                 executable_stack);    if (retval < 0)        goto out_free_dentry;    current->mm->start_stack = bprm->p;    /* Now we do a little grungy work by mmapping the ELF image into       the correct location in memory. */    for(i = 0, elf_ppnt = elf_phdata;        i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {        int elf_prot = 0, elf_flags;        unsigned long k, vaddr;        if (elf_ppnt->p_type != PT_LOAD)            continue;        if (unlikely (elf_brk > elf_bss)) {            unsigned long nbyte;            /* There was a PT_LOAD segment with p_memsz > p_filesz               before this one. Map anonymous pages, if needed,               and clear the area.  */            retval = set_brk(elf_bss + load_bias,                     elf_brk + load_bias);            if (retval)                goto out_free_dentry;            nbyte = ELF_PAGEOFFSET(elf_bss);            if (nbyte) {                nbyte = ELF_MIN_ALIGN - nbyte;                if (nbyte > elf_brk - elf_bss)                    nbyte = elf_brk - elf_bss;                if (clear_user((void __user *)elf_bss +                            load_bias, nbyte)) {                    /*                     * This bss-zeroing can fail if the ELF                     * file specifies odd protections. 
So                     * we don't check the return value                     */                }            }        }        if (elf_ppnt->p_flags & PF_R)            elf_prot |= PROT_READ;        if (elf_ppnt->p_flags & PF_W)            elf_prot |= PROT_WRITE;        if (elf_ppnt->p_flags & PF_X)            elf_prot |= PROT_EXEC;        elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE;        vaddr = elf_ppnt->p_vaddr;        if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {            elf_flags |= MAP_FIXED;        } else if (loc->elf_ex.e_type == ET_DYN) {            /* Try and get dynamic programs out of the way of the             * default mmap base, as well as whatever program they             * might try to exec.  This is because the brk will             * follow the loader, and is not movable.  */#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE            /* Memory randomization might have been switched off             * in runtime via sysctl or explicit setting of             * personality flags.             * If that is the case, retain the original non-zero             * load_bias value in order to establish proper             * non-randomized mappings.             */            if (current->flags & PF_RANDOMIZE)                load_bias = 0;            else                load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);#else            load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);#endif        }        error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,                elf_prot, elf_flags, 0); //目标文件映射到地址空间        if (BAD_ADDR(error)) {            retval = IS_ERR((void *)error) ?                
PTR_ERR((void*)error) : -EINVAL;            goto out_free_dentry;        }        if (!load_addr_set) {            load_addr_set = 1;            load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset);            if (loc->elf_ex.e_type == ET_DYN) {                load_bias += error -                             ELF_PAGESTART(load_bias + vaddr);                load_addr += load_bias;                reloc_func_desc = load_bias;            }        }        k = elf_ppnt->p_vaddr;        if (k < start_code)            start_code = k;        if (start_data < k)            start_data = k;        /*         * Check to see if the section's size will overflow the         * allowed task size. Note that p_filesz must always be         * <= p_memsz so it is only necessary to check p_memsz.         */        if (BAD_ADDR(k) || elf_ppnt->p_filesz > elf_ppnt->p_memsz ||            elf_ppnt->p_memsz > TASK_SIZE ||            TASK_SIZE - elf_ppnt->p_memsz < k) {            /* set_brk can never work. Avoid overflows. */            retval = -EINVAL;            goto out_free_dentry;        }        k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz;        if (k > elf_bss)            elf_bss = k;        if ((elf_ppnt->p_flags & PF_X) && end_code < k)            end_code = k;        if (end_data < k)            end_data = k;        k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz;        if (k > elf_brk)            elf_brk = k;    }    loc->elf_ex.e_entry += load_bias;    elf_bss += load_bias;    elf_brk += load_bias;    start_code += load_bias;    end_code += load_bias;    start_data += load_bias;    end_data += load_bias;    /* Calling set_brk effectively mmaps the pages that we need     * for the bss and break sections.  We must do this before     * mapping in the interpreter, to make sure it doesn't wind     * up getting placed where the bss needs to go.     
*/    retval = set_brk(elf_bss, elf_brk);    if (retval)        goto out_free_dentry;    if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) {        retval = -EFAULT; /* Nobody gets to see this, but.. */        goto out_free_dentry;    }    if (elf_interpreter) {//动态链接时的入口        unsigned long interp_map_addr = 0;        elf_entry = load_elf_interp(&loc->interp_elf_ex,                        interpreter,                        &interp_map_addr,                        load_bias);        if (!IS_ERR((void *)elf_entry)) {            /*             * load_elf_interp() returns relocation             * adjustment             */            interp_load_addr = elf_entry;            elf_entry += loc->interp_elf_ex.e_entry;        }        if (BAD_ADDR(elf_entry)) {            retval = IS_ERR((void *)elf_entry) ?                    (int)elf_entry : -EINVAL;            goto out_free_dentry;        }        reloc_func_desc = interp_load_addr;        allow_write_access(interpreter);        fput(interpreter);        kfree(elf_interpreter);    } else {        elf_entry = loc->elf_ex.e_entry;//无动态链接的程序入口        if (BAD_ADDR(elf_entry)) {            retval = -EINVAL;            goto out_free_dentry;        }    }    kfree(elf_phdata);    set_binfmt(&elf_format);#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES    retval = arch_setup_additional_pages(bprm, !!elf_interpreter);    if (retval < 0)        goto out;#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */    install_exec_creds(bprm);    retval = create_elf_tables(bprm, &loc->elf_ex,              load_addr, interp_load_addr);    if (retval < 0)        goto out;    /* N.B. passed_fileno might not be initialized? 
*/    current->mm->end_code = end_code;    current->mm->start_code = start_code;    current->mm->start_data = start_data;    current->mm->end_data = end_data;    current->mm->start_stack = bprm->p;#ifdef arch_randomize_brk    if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {        current->mm->brk = current->mm->start_brk =            arch_randomize_brk(current->mm);#ifdef CONFIG_COMPAT_BRK        current->brk_randomized = 1;#endif    }#endif    if (current->personality & MMAP_PAGE_ZERO) {        /* Why this, you ask???  Well SVr4 maps page 0 as read-only,           and some applications "depend" upon this behavior.           Since we do not have the power to recompile these, we           emulate the SVr4 behavior. Sigh. */        error = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,                MAP_FIXED | MAP_PRIVATE, 0);    }#ifdef ELF_PLAT_INIT    /*     * The ABI may specify that certain registers be set up in special     * ways (on i386 %edx is the address of a DT_FINI function, for     * example.  In addition, it may also specify (eg, PowerPC64 ELF)     * that the e_entry field is the address of the function descriptor     * for the startup routine, rather than the address of the startup     * routine itself.  This macro performs whatever initialization to     * the regs structure is required as well as any relocations to the     * function descriptor entries when executing dynamically links apps.     */    ELF_PLAT_INIT(regs, reloc_func_desc);#endif    start_thread(regs, elf_entry, bprm->p);//执行程序    retval = 0;out:    kfree(loc);out_ret:    return retval;    /* error cleanup */out_free_dentry:    allow_write_access(interpreter);    if (interpreter)        fput(interpreter);out_free_interp:    kfree(elf_interpreter);out_free_ph:    kfree(elf_phdata);    goto out;}

start_thread修改了寄存器的值:

start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp){    set_user_gs(regs, 0);    regs->fs        = 0;    regs->ds        = __USER_DS;    regs->es        = __USER_DS;    regs->ss        = __USER_DS;    regs->cs        = __USER_CS;    regs->ip        = new_ip;//返回用户态的位置从int 0x80的下一条指令的位置变成新加载的可执行文件的entry位置(new_ip)。    regs->sp        = new_sp;    regs->flags     = X86_EFLAGS_IF;    /*     * force it to the iret return path by making it look as if there was     * some work pending.     */    set_thread_flag(TIF_NOTIFY_RESUME);}

这样,程序返回用户态后将从new_ip(新加载程序的入口地址)开始执行。

0 0