linux内核学习(13)我们开始慢慢地往上爬...(上)

来源:互联网 发布:百度的默认端口号 编辑:程序博客网 时间:2024/05/16 10:06

这个题目取得好,好像底层离我们越来越远了,因为内核在一步一步脱壳,很快漂亮的形状就会展现在我们眼前,在这之前,我们得屏气凝神,静静地等待这一时刻的到来。进入真实内核的第一个文件是arch/x86/kernel/head_32.S。

/*
 * 32-bit kernel entrypoint; only used by the boot CPU.  On entry,
 * %esi points to the real-mode code as a 32-bit pointer.
 * CS and DS must be 4 GB flat segments, but we don't depend on
 * any particular GDT layout, because we load our own as soon as we
 * can.
 */
__HEAD
ENTRY(startup_32)
    /* test KEEP_SEGMENTS flag to see if the bootloader is asking
        us to not reload segments */
    testb $(1<<6), BP_loadflags(%esi)    # bit 6 of loadflags set => skip the segment reload below
    jnz 2f

/*
 * Set segments to known values.
 */
    lgdt pa(boot_gdt_descr)     # load GDTR from boot_gdt_descr (defined outside this excerpt)
    movl $(__BOOT_DS),%eax
    movl %eax,%ds
    movl %eax,%es
    movl %eax,%fs
    movl %eax,%gs
2:

/*
 * Clear BSS first so that there are no surprises...
 */
# Zero the BSS: store %eax (0) into (__bss_stop - __bss_start) / 4 dwords.
    cld             /* DF=0: string instructions increment %edi/%esi */
    xorl %eax,%eax
    movl $pa(__bss_start),%edi
    movl $pa(__bss_stop),%ecx
    subl %edi,%ecx
    shrl $2,%ecx
    rep ; stosl
/*
 * Copy bootup parameters out of the way.
 * Note: %esi still has the pointer to the real-mode data.
 * With the kexec as boot loader, parameter segment might be loaded beyond
 * kernel image and might not even be addressable by early boot page tables.
 * (kexec on panic case). Hence copy out the parameters before initializing
 * page tables.
 */
# Copy the real-mode boot parameter block (pointed to by %esi) into boot_params.
    movl $pa(boot_params),%edi
    movl $(PARAM_SIZE/4),%ecx    /* dword count; PARAM_SIZE is presumably sizeof(struct boot_params) */
    cld
    rep
    movsl           /* copy %ds:%esi -> %es:%edi */
# Copy the kernel command line into boot_command_line.
# First fetch the command-line pointer from the copy just made.
    movl pa(boot_params) + NEW_CL_POINTER,%esi  /* presumably boot_params.hdr.cmd_line_ptr */
    andl %esi,%esi                               /* ZF is set when the pointer is NULL */
    jz 1f            # No command line
    movl $pa(boot_command_line),%edi        /* per the original note, defined in init/main.c */
    movl $(COMMAND_LINE_SIZE/4),%ecx
    rep
    movsl
1:
# OLPC Open Firmware support; can be skipped on a first read.
#ifdef CONFIG_OLPC_OPENFIRMWARE         /* firmware */
    /* save OFW's pgdir table for later use when calling into OFW */
    movl %cr3, %eax
    movl %eax, pa(olpc_ofw_pgd)
#endif
# Paravirtualized boot: pick the entry point from boot_params.hardware_subarch.
# Can also be skipped when reading the plain PC boot path.
#ifdef CONFIG_PARAVIRT          /* virtual environment */
    /* This is can only trip for a broken bootloader... */
    cmpw $0x207, pa(boot_params + BP_version)
    jb default_entry

    /* Paravirt-compatible boot parameters.  Look to see what architecture
        we're booting under. */
    movl pa(boot_params + BP_hardware_subarch), %eax
    cmpl $num_subarch_entries, %eax
    jae bad_subarch         /* unsigned: taken when %eax >= num_subarch_entries */

    movl pa(subarch_entries)(,%eax,4), %eax   /* %eax = subarch_entries[%eax] (4-byte entries) */
    subl $__PAGE_OFFSET, %eax                 /* presumably converts the linked address to a physical one; paging is still off */
    jmp *%eax

bad_subarch:
WEAK(lguest_entry)
WEAK(xen_entry)
    /* Unknown implementation; there's really
       nothing we can do at this point. */
    ud2a

    __INITDATA

subarch_entries:
    .long default_entry        /* normal x86/PC */     /* index 0: hardware_subarch == 0 */
    .long lguest_entry        /* lguest hypervisor */  /* index 1 */
    .long xen_entry            /* Xen hypervisor */     /* index 2 */
    .long default_entry        /* Moorestown MID */     /* index 3 */
num_subarch_entries = (. - subarch_entries) / 4
.previous
#endif /* CONFIG_PARAVIRT */

/*
 * Initialize page tables.  This creates a PDE and a set of page
 * tables, which are located immediately beyond __brk_base.  The variable
 * _brk_end is set up to point to the first "safe" location.
 * Mappings are created both at virtual address 0 (identity mapping)
 * and PAGE_OFFSET for up to _end.
 *
 * Note that the stack is not yet set up!
 */
# Key section: up to this point protected mode runs without paging.
# The code below fills in the page directory and the page tables;
# paging itself is switched on later (see the Enable paging step).
default_entry:
# PAE (physical address extension, up to 64 GB of physical memory).
# This branch can be skipped when studying the common non-PAE case.
#ifdef CONFIG_X86_PAE           /*physical address extend , 64G */

    /*
     * In PAE mode swapper_pg_dir is statically defined to contain enough
     * entries to cover the VMSPLIT option (that is the top 1, 2 or 3
     * entries). The identity mapping is handled by pointing two PGD
     * entries to the first kernel PMD.
     *
     * Note the upper half of each PMD or PTE are always zero at
     * this stage.
     */

#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
                /*__PAGE_OFFSET = 0xc0000000*/
    xorl %ebx,%ebx                /* %ebx is kept at zero */

    movl $pa(__brk_base), %edi
    movl $pa(swapper_pg_pmd), %edx
    movl $PTE_IDENT_ATTR, %eax
10:
    leal PDE_IDENT_ATTR(%edi),%ecx        /* Create PMD entry */
    movl %ecx,(%edx)            /* Store PMD entry */
                        /* Upper half already zero */
    addl $8,%edx                /* PAE entries are 8 bytes wide */
    movl $512,%ecx              /* 512 eight-byte PTEs per 4 KB page table */
11:
    stosl                       /* low dword of the PTE */
    xchgl %eax,%ebx
    stosl                       /* high dword of the PTE (zero, from %ebx) */
    xchgl %eax,%ebx
    addl $0x1000,%eax
    loop 11b

    /*
     * End condition: we must map up to the end + MAPPING_BEYOND_END.
     */
    movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
    cmpl %ebp,%eax
    jb 10b
1:
    addl $__PAGE_OFFSET, %edi
    movl %edi, pa(_brk_end)
    shrl $12, %eax
    movl %eax, pa(max_pfn_mapped)

    /* Do early initialization of the fixmap area */
    movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
    movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
#else    /* Not PAE */

# Without PAE the linear address space is 4 GB; this is the usual case.
                        /* 4 GB address space */
# page_pde_offset is the byte offset, inside the page directory, of the
# entry that maps __PAGE_OFFSET (the start of kernel space).
# In Linux the process address space is 0-3 GB and the top 1 GB belongs
# to the kernel, i.e. everything from 3 GB upward.
# The page directory is 4 KB with 4-byte entries, each covering 4 MB,
# so the byte offset is (__PAGE_OFFSET >> 22) * 4 == __PAGE_OFFSET >> 20.
# With __PAGE_OFFSET = 0xc0000000 (value taken from the note in the PAE
# branch above) this gives 0xc00: 0xc00 / 4 * 4 MB = 3 GB.
page_pde_offset = (__PAGE_OFFSET >> 20);   /* 0xc00 (3 KB into the directory) <=> 3 GB */

# __brk_base comes from vmlinux.lds.S. The excerpt below is quoted from
# that linker script for reference only (it is not part of head_32.S).
# After linking, __brk_base marks where the page tables are written.
. = ALIGN(PAGE_SIZE);
    .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
        __brk_base = .;
        . += 64 * 1024;        /* 64k alignment slop space */
        *(.brk_reservation)    /* areas brk users have reserved */
        __brk_limit = .;
    }

    _end = .;
# The excerpt reserves 64 KB (plus any .brk_reservation input).
# 64 KB holds 16 page tables of 4 KB each, i.e. 64 MB worth of mappings.
    movl $pa(__brk_base), %edi             /* %edi = where the page tables are written */
    movl $pa(swapper_pg_dir), %edx    # %edx = current slot in the page directory
# PTE_IDENT_ATTR is only an attribute mask.
# Page directory entries and page table entries are both 4 bytes;
# the low 12 bits of an entry carry the attributes.
    movl $PTE_IDENT_ATTR, %eax              /* per the original note, PTE_IDENT_ATTR = 0x00000067 */
10:
# Outer loop: one iteration per page directory entry.
# %edi is the physical address of the page table about to be filled.
    leal PDE_IDENT_ATTR(%edi),%ecx        /* Create PDE entry */
# Store the new entry twice: once for the identity mapping and once at
# page_pde_offset for the kernel mapping. %edx points into the directory.
    movl %ecx,(%edx)            /* Store identity PDE entry */
    movl %ecx,page_pde_offset(%edx)        /* Store kernel PDE entry */
# Advance to the next page directory slot.
    addl $4,%edx
    movl $1024, %ecx
11:                     /* fill one 4 KB page table (1024 entries) */
# Inner loop: fill the page table.
# %edi is the current PTE slot, %eax the PTE value to store.
    stosl           /* store %eax at %es:(%edi), then %edi += 4 */
    addl $0x1000,%eax  # next 4 KB page frame: low 12 bits are attributes, upper 20 bits the page address
    loop 11b
    /*
     * End condition: we must map up to the end + MAPPING_BEYOND_END.
     */
# %eax is the next PTE value: the physical address of the next page to be
# mapped plus PTE_IDENT_ATTR. Another page table is built as long as that
# address is below pa(_end) + MAPPING_BEYOND_END, so the test is about how
# much memory has been mapped, not about whether page-table space is left.
# NOTE(review): the original annotation computed MAPPING_BEYOND_END as
# 0x0c400000 and was unsure about it; the symbol is defined outside this
# excerpt, so confirm the value against its definition.
    movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
    cmpl %ebp,%eax
    jb 10b          /* unsigned: loop while %eax < %ebp */
# Record the virtual address just past the last page table in _brk_end.
    addl $__PAGE_OFFSET, %edi
    movl %edi, pa(_brk_end)         /* _brk_end = %edi + __PAGE_OFFSET */
# %eax >> 12 drops the attribute bits and leaves the number of page frames
# mapped so far; save it.
    shrl $12, %eax
    movl %eax, pa(max_pfn_mapped)   /* page frames (4 KB each) mapped so far */

    /* Do early initialization of the fixmap area */
# One more page directory entry is added here, pointing at the
# swapper_pg_fixmap page table. Offset 0xffc is the last 4-byte entry
# of the 4 KB page directory.
    movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
    movl %eax,pa(swapper_pg_dir+0xffc)      /* last page directory entry -> fixmap page table */
#endif
    jmp 3f
/*
 * Non-boot CPU entry point; entered from trampoline.S
 * We can't lgdt here, because lgdt itself uses a data segment, but
 * we know the trampoline has already loaded the boot_gdt for us.
 *
 * If cpu hotplug is not supported then this code can go in init section
 * which will be freed later
 */

__CPUINIT
# Multiprocessor (SMP) builds only: reload the data segment registers
# with __BOOT_DS, then fall through to label 3 shared with the boot CPU.
#ifdef CONFIG_SMP
ENTRY(startup_32_smp)
    cld
    movl $(__BOOT_DS),%eax
    movl %eax,%ds
    movl %eax,%es
    movl %eax,%fs
    movl %eax,%gs
#endif /* CONFIG_SMP */
3:

/*
 *    New page tables may be in 4Mbyte page mode and may
 *    be using the global pages.
 *
 *    NOTE! If we are on a 486 we may have no cr4 at all!
 *    So we do not try to touch it unless we really have
 *    some bits in it to set.  This won't work if the BSP
 *    implements cr4 but this AP does not -- very unlikely
 *    but be warned!  The same applies to the pse feature
 *    if not equally supported. --macro
 *
 *    NOTE! We have to correct for the fact that we're
 *    not yet offset PAGE_OFFSET..
 */
# mmu_cr4_features holds the CR4 bits to switch on (for example PSE,
# which allows 4 MB pages). When it is zero the whole block is skipped
# and execution continues at label 6.
#define cr4_bits pa(mmu_cr4_features)           /* physical address of the requested CR4 bits */
    movl cr4_bits,%edx
    andl %edx,%edx          /* ZF is set when no CR4 bits are requested */
    jz 6f                   /* nothing to set: leave %cr4 untouched */
    movl %cr4,%eax        # Turn on paging options (PSE,PAE,..)
    orl %edx,%eax
    movl %eax,%cr4

    testb $X86_CR4_PAE, %al        # check if PAE is enabled
    jz 6f

    /* Check if extended functions are implemented */
    movl $0x80000000, %eax
    cpuid
    /* Value must be in the range 0x80000001 to 0x8000ffff */
    subl $0x80000001, %eax
    cmpl $(0x8000ffff-0x80000001), %eax
    ja 6f
    mov $0x80000001, %eax
    cpuid
    /* Execute Disable bit supported? */
    btl $(X86_FEATURE_NX & 31), %edx
    jnc 6f

    /* Setup EFER (Extended Feature Enable Register) */
    movl $MSR_EFER, %ecx
    rdmsr

    btsl $_EFER_NX, %eax
    /* Make changes effective */
    wrmsr

6:

/*
 * Enable paging
 */
# Paging is switched on here (protected mode itself is already active).
# The operand below has no $ prefix, so it is a memory read: the dword
# stored at initial_page_table is loaded and written to %cr3.
# Presumably that dword is the physical address of the page directory;
# initial_page_table is defined outside this excerpt.
    movl pa(initial_page_table), %eax
    movl %eax,%cr3        /* set the page table pointer.. */
# Set the PG bit in %cr0 to turn paging on.
    movl %cr0,%eax
    orl  $X86_CR0_PG,%eax   /*X86_CR0_PG=0x80000000*/
    movl %eax,%cr0        /* ..and set paging (PG) bit */

# A far jump follows immediately, now that paging is on.
    ljmp $__BOOT_CS,$1f    /* Clear prefetch and normalize %eip */
1:                              /* paging is enabled from here on */
    /* Set up the stack pointer */
# Load %ss:%esp from stack_start.
    lss stack_start,%esp
/*
 * Initialize eflags.  Some BIOS's leave bits like NT set.  This would
 * confuse the debugger if this code is traced.
 * XXX - best to initialize before switching to protected mode.
 */
# Clear the flags register by popping a zero into it.
    pushl $0
    popfl

# SMP only: a non-zero ready byte skips setup_idt and goes straight to
# checkCPUtype. Can be skipped when reading the single-CPU path.
#ifdef CONFIG_SMP
    cmpb $0, ready
    jz  1f                /* Initial CPU cleans BSS */
    jmp checkCPUtype
1:
#endif /* CONFIG_SMP */

/*
 * start system 32-bit setup. We need to re-do some of the things done
 * in 16-bit mode for the "real" operations.
 */
# Build the IDT (interrupt descriptor table). This matters a lot later,
# but at this stage the entries only point at a default handler.
    call setup_idt

# Detect the CPU type (386, 486, or a CPU with CPUID) and record the
# results in X86, X86_CPUID, X86_VENDOR_ID, X86_MODEL, X86_MASK and
# X86_CAPABILITY.
checkCPUtype:

    movl $-1,X86_CPUID        #  -1 for no CPUID initially

/* check if it is 486 or 386. */
/*
 * XXX - this does a lot of unnecessary setup.  Alignment checks don't
 * apply at our cpl of 0 and the stack ought to be aligned already, and
 * we don't need to preserve eflags.
 */
# The detection sequence can be skimmed on a first read; the flow
# continues at the is486 / is386 labels below.
    movb $3,X86        # at least 386
    pushfl            # push EFLAGS
    popl %eax        # get EFLAGS
    movl %eax,%ecx        # save original EFLAGS
    xorl $0x240000,%eax    # flip AC and ID bits in EFLAGS
    pushl %eax        # copy to EFLAGS
    popfl            # set EFLAGS
    pushfl            # get new EFLAGS
    popl %eax        # put it in eax
    xorl %ecx,%eax        # change in flags
    pushl %ecx        # restore original EFLAGS
    popfl
    testl $0x40000,%eax    # check if AC bit changed
    je is386

    movb $4,X86        # at least 486
    testl $0x200000,%eax    # check if ID bit changed
    je is486

    /* get vendor info */
    xorl %eax,%eax            # call CPUID with 0 -> return vendor ID
    cpuid
    movl %eax,X86_CPUID        # save CPUID level
    movl %ebx,X86_VENDOR_ID        # lo 4 chars
    movl %edx,X86_VENDOR_ID+4    # next 4 chars
    movl %ecx,X86_VENDOR_ID+8    # last 4 chars

    orl %eax,%eax            # do we have processor info as well?
    je is486

    movl $1,%eax        # Use the CPUID instruction to get CPU type
    cpuid
    movb %al,%cl        # save reg for future use
    andb $0x0f,%ah        # mask processor family
    movb %ah,X86
    andb $0xf0,%al        # mask model
    shrb $4,%al
    movb %al,X86_MODEL
    andb $0x0f,%cl        # mask revision
    movb %cl,X86_MASK
    movl %edx,X86_CAPABILITY

is486:    movl $0x50022,%ecx    # set AM, WP, NE and MP
    jmp 2f

# From here on the code essentially loads the final GDT and IDT,
# reloads the segment registers and jumps through initial_code.
is386:    movl $2,%ecx        # set MP
2:    movl %cr0,%eax
    andl $0x80000011,%eax    # Save PG,PE,ET
    orl %ecx,%eax
    movl %eax,%cr0

    call check_x87
    lgdt early_gdt_descr
    lidt idt_descr

    ljmp $(__KERNEL_CS),$1f
1:    movl $(__KERNEL_DS),%eax    # reload all the segment registers
    movl %eax,%ss            # after changing gdt.

    movl $(__USER_DS),%eax        # DS/ES contains default USER segment
    movl %eax,%ds
    movl %eax,%es

    movl $(__KERNEL_PERCPU), %eax
    movl %eax,%fs            # set this cpu's percpu

# Stack-protector support; can be skipped on a first read.
#ifdef CONFIG_CC_STACKPROTECTOR
    /*
     * The linker can't handle this by relocation.  Manually set
     * base address in stack canary segment descriptor.
     */
    cmpb $0,ready
    jne 1f
    movl $gdt_page,%eax
    movl $stack_canary,%ecx
    movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
    shrl $16, %ecx
    movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
    movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
1:
#endif
# Execution continues here whether or not the block above was compiled in.
    movl $(__KERNEL_STACK_CANARY),%eax
    movl %eax,%gs

    xorl %eax,%eax            # Clear LDT
    lldt %ax

    cld            # gcc2 wants the direction flag cleared at all times
    pushl $0        # fake return address for unwinder
# SMP again; can be skipped when reading the single-CPU path.
#ifdef CONFIG_SMP
    movb ready, %cl
    movb $1, ready
    cmpb $0,%cl        # the first CPU calls start_kernel
    je   1f
    movl (stack_start), %esp
1:
#endif /* CONFIG_SMP */
# Key point: indirect jump through the pointer stored in initial_code.
# Per the original annotation this is where control enters C code.
    jmp *(initial_code)

/*
 * We depend on ET to be correct. This checks for 287/387.
 */
# Called from the common path after is486 / is386 (label 2 above).
# Sets X86_HARD_MATH to 1 when the status word reads back with a zero
# low byte after fninit, otherwise leaves it 0 and flips the EM bit in %cr0.
check_x87:
    movb $0,X86_HARD_MATH
    clts
    fninit
    fstsw %ax
    cmpb $0,%al
    je 1f
    movl %cr0,%eax        /* no coprocessor: have to set bits */
    xorl $4,%eax        /* set EM */
    movl %eax,%cr0
    ret
    ALIGN
1:    movb $1,X86_HARD_MATH
    .byte 0xDB,0xE4        /* fsetpm for 287, ignored by 387 */
    ret

/*
 *  setup_idt
 *
 *  sets up a idt with 256 entries pointing to
 *  ignore_int, interrupt gates. It doesn't actually load
 *  idt - that can be done only after paging has been enabled
 *  and the kernel moved to PAGE_OFFSET. Interrupts
 *  are enabled elsewhere, when we can be relatively
 *  sure everything is ok.
 *
 *  Warning: %esi is live across this function.
 */
# 这就是设置IDT表了
setup_idt:
# 默认中断处理程序
    lea ignore_int,%edx
# 注意得用代码段选择子,而且肯定是内核代码段
    movl $(__KERNEL_CS << 16),%eax  /*set selector in the GDT*/
    movw %dx,%ax        /* selector = 0x0010 = cs */
    movw $0x8E00,%dx    /* interrupt gate - dpl=0, present */

    lea idt_table,%edi    # idt_table为idt表的首地址
    mov $256,%ecx   # 总共设置256项
rp_sidt:
    movl %eax,(%edi)
    movl %edx,4(%edi)
    addl $8,%edi
    dec %ecx
    jne rp_sidt