From kernel startup entry point to start_kernel (3)


On the creation of the temporary page tables and the master kernel page table, swapper_pg_dir.

1. How the address of swapper_pg_dir is determined


.equ    swapper_pg_dir, KERNEL_RAM_VADDR - PG_DIR_SIZE
Here KERNEL_RAM_VADDR == KERNEL_START, and KERNEL_START = 0xc0008000;
also PG_DIR_SIZE = 0x4000,
so swapper_pg_dir = 0xc0008000 - 0x4000 = 0xc0004000, i.e. the 16 KB
immediately below the kernel text section.

/*
 * swapper_pg_dir is the virtual address of the initial page table.
 * We place the page tables 16K below KERNEL_RAM_VADDR.  Therefore, we must
 * make sure that KERNEL_RAM_VADDR is correctly set.  Currently, we expect
 * the least significant 16 bits to be 0x8000, but we could probably
 * relax this restriction to KERNEL_RAM_VADDR >= PAGE_OFFSET + 0x4000.
 */
    .globl    swapper_pg_dir
    .equ    swapper_pg_dir, KERNEL_RAM_VADDR - PG_DIR_SIZE

#define PG_DIR_SIZE    0x4000
#define PMD_ORDER    2

Given that the address of swapper_pg_dir is 0xc0004000, we can deduce KERNEL_RAM_VADDR = 0xc0008000;
#define KERNEL_START    KERNEL_RAM_VADDR,
so KERNEL_START = 0xc0008000.

This is also the kernel entry address:
ENTRY(stext)
so the entry point of the code is stext.
crash> sym stext
c0008000 (T) stext
crash> dis stext
0xc0008000 <.head.text>:        msr     CPSR_c, #211    ; 0xd3

2. How the hardware-specific RAM address is made known to the kernel


0xc0008014 <stext+0x14>:        add     r3, pc, #44     ; 0x2c
0xc0008018 <stext+0x18>:        ldm     r3, {r4, r8}
0xc000801c <stext+0x1c>:        sub     r4, r3, r4
0xc0008020 <stext+0x20>:        add     r8, r8, r4

r3 is derived from pc and points at the physical address of the data words below; at this stage pc holds a physical address, and the addressing is pc-relative.
The two data words are loaded into r4 and r8: r4 receives the link-time virtual address corresponding to r3, and from this linear relationship the physical address corresponding to PAGE_OFFSET, namely PHYS_OFFSET, can be derived:
    .long    .
    .long    PAGE_OFFSET
This shows that PHYS_OFFSET is not passed in directly as a parameter; it is computed by the method above.
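
The same arithmetic can be written as a minimal C sketch; the function and parameter names are hypothetical, and the register each parameter stands for is noted in the comments:

unsigned long compute_phys_offset(unsigned long lit_phys,    /* r3: physical address of the literal pool */
                                  unsigned long lit_virt,    /* r4: its link-time virtual address (.long .) */
                                  unsigned long page_offset) /* r8: PAGE_OFFSET (.long PAGE_OFFSET) */
{
    unsigned long v2p = lit_phys - lit_virt; /* sub r4, r3, r4 */
    return page_offset + v2p;                /* add r8, r8, r4 -> PHYS_OFFSET */
}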

3. __create_page_tables

3.1 Clear the swapper page table


r8: the physical address PHYS_OFFSET
r4: the physical address of swapper_pg_dir (computed by the first add below)
This code clears the 16 KB (0x4000-byte) swapper_pg_dir region:
0xc0008050 <__create_page_tables>:      add     r4, r8, #16384  ; 0x4000
0xc0008054 <__create_page_tables+0x4>:  mov     r0, r4
0xc0008058 <__create_page_tables+0x8>:  mov     r3, #0
0xc000805c <__create_page_tables+0xc>:  add     r6, r0, #16384  ; 0x4000
0xc0008060 <__create_page_tables+0x10>: str     r3, [r0], #4
0xc0008064 <__create_page_tables+0x14>: str     r3, [r0], #4
0xc0008068 <__create_page_tables+0x18>: str     r3, [r0], #4
0xc000806c <__create_page_tables+0x1c>: str     r3, [r0], #4
0xc0008070 <__create_page_tables+0x20>: teq     r0, r6
0xc0008074 <__create_page_tables+0x24>: bne     0xc0008060
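
A C rendering of this clearing loop (a sketch for illustration; the real code runs with the MMU still off, directly on physical addresses):

void clear_swapper_pg_dir(unsigned int *pgdir) /* r4: phys swapper_pg_dir */
{
    unsigned int *p   = pgdir;                       /* mov r0, r4 */
    unsigned int *end = pgdir + 0x4000 / sizeof(*p); /* add r6, r0, #0x4000 */

    while (p != end) {   /* teq r0, r6; bne */
        *p++ = 0;        /* the four unrolled "str r3, [r0], #4" stores */
        *p++ = 0;
        *p++ = 0;
        *p++ = 0;
    }
}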

3.2 Create identity mapping to cater for __enable_mmu

0xc0008078 <__create_page_tables+0x28>: ldr     r7, [r10, #8]

/* Address via the physical pc to load the variables' link-time virtual addresses; the linear relationship then yields their physical addresses */
0xc000807c <__create_page_tables+0x2c>: add     r0, pc, #128    ; 0x80
0xc0008080 <__create_page_tables+0x30>: ldm     r0, {r3, r5, r6}
/*virt->phys offset*/
0xc0008084 <__create_page_tables+0x34>: sub     r0, r0, r3
/*phys __turn_mmu_on, phys __turn_mmu_on_end*/
0xc0008088 <__create_page_tables+0x38>: add     r5, r5, r0
0xc000808c <__create_page_tables+0x3c>: add     r6, r6, r0
/* Shift the physical address right by 20 bits, then left by 2 bits, to get the
 * byte offset of the corresponding table entry (shouldn't the index come from
 * the virtual address? For an identity mapping VA == PA, so they coincide).
 * base address + offset gives the address of the descriptor.
 *
 * The descriptor contents come from PROCINFO_MM_MMUFLAGS.
 */
0xc0008090 <__create_page_tables+0x40>: lsr     r5, r5, #20
0xc0008094 <__create_page_tables+0x44>: lsr     r6, r6, #20
0xc0008098 <__create_page_tables+0x48>: orr     r3, r7, r5, lsl #20
0xc000809c <__create_page_tables+0x4c>: str     r3, [r4, r5, lsl #2]
0xc00080a0 <__create_page_tables+0x50>: cmp     r5, r6
0xc00080a4 <__create_page_tables+0x54>: addcc   r5, r5, #1
0xc00080a8 <__create_page_tables+0x58>: bcc     0xc0008098 <__create_page_tables+72>
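
The identity-mapping loop in C, as a sketch with hypothetical names; mmuflags corresponds to the PROCINFO_MM_MMUFLAGS value loaded into r7:

void identity_map(unsigned int *pgdir,      /* r4: phys swapper_pg_dir */
                  unsigned long start_phys, /* r5: phys __turn_mmu_on */
                  unsigned long end_phys,   /* r6: phys __turn_mmu_on_end */
                  unsigned int  mmuflags)   /* r7 */
{
    unsigned long idx = start_phys >> 20;    /* section index; VA == PA here */
    unsigned long end = end_phys >> 20;

    for (; idx <= end; idx++)                /* cmp r5, r6; addcc; bcc */
        pgdir[idx] = (idx << 20) | mmuflags; /* 1 MB section descriptor */
}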

3.3 Set up the page tables for our kernel direct-mapped region


0xc00080ac <__create_page_tables+0x5c>: mov     r3, pc /* pc is a physical address here */
0xc00080b0 <__create_page_tables+0x60>: lsr     r3, r3, #20
0xc00080b4 <__create_page_tables+0x64>: orr     r3, r7, r3, lsl #20
0xc00080b8 <__create_page_tables+0x68>: add     r0, r4, #12288  ; 0x3000
0xc00080bc <__create_page_tables+0x6c>: str     r3, [r0, #0]!

/* r0: descriptor address, r4: page-table base
 * r6 holds a virtual address (the end of the kernel image)
 * crash> rd 0xc0008100: c0008100:  c0815e33
 * the 0xc0000000 part of c0815e33, shifted right by 18, gives 0x3000, which
 * matches the "add r0, r4, #12288 ; 0x3000" above
 */
0xc00080c0 <__create_page_tables+0x70>: ldr     r6, [pc, #56]   ; 0xc0008100
0xc00080c4 <__create_page_tables+0x74>: add     r0, r0, #4
0xc00080c8 <__create_page_tables+0x78>: add     r6, r4, r6, lsr #18

0xc00080cc <__create_page_tables+0x7c>: cmp     r0, r6
0xc00080d0 <__create_page_tables+0x80>: add     r3, r3, #1048576        ; 0x100000
0xc00080d4 <__create_page_tables+0x84>: strls   r3, [r0], #4
0xc00080d8 <__create_page_tables+0x88>: bls     0xc00080cc <__create_page_tables+124>

The procedure for building a page-table entry is:

1. shift the physical address right by 20 bits, then left by 20 bits, clearing its low 20 bits;
2. obtain the control bits;
3. ORR the physical address with the control bits;
4. write the result to the corresponding descriptor address; the slot is 4-byte aligned, so its two least-significant bits are 00. A C sketch of these steps follows.
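
Put together (names hypothetical):

unsigned int make_section_desc(unsigned long phys, unsigned int mmuflags)
{
    /* steps 1-3: clear the low 20 bits of the physical address and
       ORR in the control bits */
    return (phys & ~0xfffffUL) | mmuflags;
}

/* step 4: the descriptor is stored at pgdir[phys >> 20]; each slot is
   4 bytes wide, so the slot address always ends in two zero bits */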

4. Page-table creation after entering C code

paging_init -> create_mapping -> pgd = pgd_offset_k(addr);
/* to find an entry in a kernel page-table-directory */
#define pgd_offset_k(addr)    pgd_offset(&init_mm, addr)

#define pgd_offset(mm, addr)    ((mm)->pgd + pgd_index(addr))
struct mm_struct init_mm = {
    .mm_rb        = RB_ROOT,
    .pgd        = swapper_pg_dir,
    .mm_users    = ATOMIC_INIT(2),
    .mm_count    = ATOMIC_INIT(1),
    .mmap_sem    = __RWSEM_INITIALIZER(init_mm.mmap_sem),
    .page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
    .mmlist        = LIST_HEAD_INIT(init_mm.mmlist),
    INIT_MM_CONTEXT(init_mm)
};
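
A worked expansion of pgd_offset_k, as a sketch assuming the classic ARM 2-level layout in which pgd_t is 8 bytes (each pgd slot covers two 1 MB L1 entries):

/* pgd_offset_k(0xc0000000), with PGDIR_SHIFT = 21: */
pgd_t *pgd = init_mm.pgd + (0xc0000000UL >> 21);
/* = swapper_pg_dir + 0x600 * sizeof(pgd_t)
   = 0xc0004000 + 0x3000 = 0xc0007000,
   matching the "add r0, r4, #12288 ; 0x3000" seen in section 3.3 */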


5. The procedure of page-table creation

setup_arch:
    parse_early_param();
    sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), meminfo_cmp, NULL);
    sanity_check_meminfo();
    arm_memblock_init(&meminfo, mdesc);


struct memblock {
    phys_addr_t current_limit;
    struct memblock_type memory;
    struct memblock_type reserved;
};
There are two memblock_type lists, memory and reserved; the reserved list holds regions that already have a fixed purpose.

void __init arm_memblock_init(struct meminfo *mi, struct machine_desc *mdesc)
{
    int i;
    /*1. regions defined in meminfo*/
    for (i = 0; i < mi->nr_banks; i++)
        memblock_add(mi->bank[i].start, mi->bank[i].size);

    /*2. Register the kernel text, kernel data and initrd with memblock. */
    memblock_reserve(__pa(_stext), _end - _stext);
    arm_mm_memblock_reserve();
    arm_dt_memblock_reserve();

    /*3. reserve any platform specific memblock areas */
    if (mdesc->reserve)
        mdesc->reserve();

    /*4.
     * reserve memory for DMA contiguous allocations,
     * must come from DMA area inside low memory
     */
    dma_contiguous_reserve(min(arm_dma_limit, arm_lowmem_limit));

    arm_memblock_steal_permitted = false;
    memblock_allow_resize();
    memblock_dump_all();
}
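
A minimal usage sketch of the memblock calls above (all addresses hypothetical):

memblock_add(0x80000000, 0x10000000);   /* one 256 MB RAM bank -> memblock.memory */
memblock_reserve(0x80008000, 0x400000); /* kernel image inside it -> memblock.reserved */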

/*
 * paging_init() sets up the page tables, initialises the zone memory
 * maps, and sets up the zero page, bad page and bad page tables.
 */
void __init paging_init(struct machine_desc *mdesc)
{
    void *zero_page;

    memblock_set_current_limit(arm_lowmem_limit);
    build_mem_type_table();
    prepare_page_table();
    map_lowmem();
    dma_contiguous_remap();
    devicemaps_init(mdesc);
    kmap_init();
    top_pmd = pmd_off_k(0xffff0000);

    /* allocate the zero page. */
    zero_page = early_alloc(PAGE_SIZE);

    bootmem_init();

    empty_zero_page = virt_to_page(zero_page);
    __flush_dcache_page(NULL, empty_zero_page);
}

struct mem_type {
    pteval_t prot_pte;
    pmdval_t prot_l1;
    pmdval_t prot_sect;
    unsigned int domain;
};

typedef u32 pteval_t;
typedef u32 pmdval_t;

/*
 * Architecture ioremap implementation.
 */
#define MT_DEVICE        0
#define MT_DEVICE_NONSHARED    1
#define MT_DEVICE_CACHED    2
#define MT_DEVICE_WC        3

/* types 0-3 are defined in asm/io.h */
#define MT_UNCACHED        4
#define MT_CACHECLEAN        5
#define MT_MINICLEAN        6
#define MT_LOW_VECTORS        7
#define MT_HIGH_VECTORS        8
#define MT_MEMORY        9
#define MT_ROM            10
#define MT_MEMORY_NONCACHED    11
#define MT_MEMORY_DTCM        12
#define MT_MEMORY_ITCM        13
#define MT_MEMORY_SO        14
#define MT_MEMORY_DMA_READY    15

crash> p mem_types
mem_types = $18 =
 {{
    prot_pte = 0x653,
    prot_l1 = 0x41,
    prot_sect = 0x11452,
    domain = 0x2
  }, {
    prot_pte = 0x273,
    prot_l1 = 0x41,
    prot_sect = 0x1452,
    domain = 0x2
  }, {
    prot_pte = 0x66f,
    prot_l1 = 0x41,
    prot_sect = 0x1045e,
    domain = 0x2
  }, {
    prot_pte = 0x667,
    prot_l1 = 0x41,
    prot_sect = 0x10456,
    domain = 0x2
  }, {
    prot_pte = 0x243,
    prot_l1 = 0x41,
    prot_sect = 0x52,
    domain = 0x2
  }, {
    prot_pte = 0x0,
    prot_l1 = 0x0,
    prot_sect = 0x841e,
    domain = 0x0
  }, {
    prot_pte = 0x0,
    prot_l1 = 0x0,
    prot_sect = 0x941a,
    domain = 0x0
  }, {
    prot_pte = 0x4df,
    prot_l1 = 0x21,
    prot_sect = 0x0,
    domain = 0x1
  }, {
    prot_pte = 0x5df,
    prot_l1 = 0x21,
    prot_sect = 0x0,
    domain = 0x1
  }, {
    prot_pte = 0x45f,
    prot_l1 = 0x1,
    prot_sect = 0x1140e,
    domain = 0x0
  }, {
    prot_pte = 0x0,
    prot_l1 = 0x0,
    prot_sect = 0x940e,
    domain = 0x0
  }, {
    prot_pte = 0x447,
    prot_l1 = 0x1,
    prot_sect = 0x10406,
    domain = 0x0
  }, {
    prot_pte = 0x243,
    prot_l1 = 0x1,
    prot_sect = 0x12,
    domain = 0x0
  }, {
    prot_pte = 0x43,
    prot_l1 = 0x1,
    prot_sect = 0x0,
    domain = 0x0
  }, {
    prot_pte = 0x43,
    prot_l1 = 0x1,
    prot_sect = 0x10412,
    domain = 0x0
  }, {
    prot_pte = 0x45f,
    prot_l1 = 0x1,
    prot_sect = 0x0,
    domain = 0x0
  }}

static inline void prepare_page_table(void)
{
    unsigned long addr;
    phys_addr_t end;

    /*
     * Clear out all the mappings below the kernel image.
     */
    for (addr = 0; addr < MODULES_VADDR; addr += PMD_SIZE)
        pmd_clear(pmd_off_k(addr));


    for ( ; addr < PAGE_OFFSET; addr += PMD_SIZE)
        pmd_clear(pmd_off_k(addr));

    /*
     * Find the end of the first block of lowmem.
     */
    end = memblock.memory.regions[0].base + memblock.memory.regions[0].size;
    if (end >= arm_lowmem_limit)
        end = arm_lowmem_limit;

    /*
     * Clear out all the kernel space mappings, except for the first
     * memory bank, up to the vmalloc region.
     */
    for (addr = __phys_to_virt(end);
         addr < VMALLOC_START; addr += PMD_SIZE)
        pmd_clear(pmd_off_k(addr));
}

static inline pmd_t *pmd_off_k(unsigned long virt)
{
    return pmd_offset(pud_offset(pgd_offset_k(virt), virt), virt);
}

/* to find an entry in a kernel page-table-directory */
#define pgd_offset_k(addr)    pgd_offset(&init_mm, addr)
#define pgd_offset(mm, addr)    ((mm)->pgd + pgd_index(addr))
/* to find an entry in a page-table-directory */
#define pgd_index(addr)        ((addr) >> PGDIR_SHIFT)
#define PMD_SHIFT        21
#define PGDIR_SHIFT        21

static inline pud_t * pud_offset(pgd_t * pgd, unsigned long address)
{
    return (pud_t *)pgd;
}
static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
{
    return (pmd_t *)pud;
}

Working from the outside in: pud_offset(pgd_offset_k(virt), virt) -> pgd_offset_k(virt) -> pgd_offset(&init_mm, addr)
       -> ((mm)->pgd + pgd_index(addr))
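
For top_pmd = pmd_off_k(0xffff0000) in paging_init above, the expansion works out as follows (a sketch, again assuming the 8-byte pgd_t of the 2-level layout):

pmd_t *top = (pmd_t *)(init_mm.pgd + (0xffff0000UL >> 21));
/* = swapper_pg_dir + 0x7ff * 8 = 0xc0004000 + 0x3ff8 = 0xc0007ff8 */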

#define pmd_clear(pmdp)            \
    do {                \
        pmdp[0] = __pmd(0);    \
        pmdp[1] = __pmd(0);    \
        clean_pmd_entry(pmdp);    \
    } while (0)
static inline void clean_pmd_entry(void *pmd)
{
    const unsigned int __tlb_flag = __cpu_tlb_flags;

    tlb_op(TLB_DCLEAN, "c7, c10, 1    @ flush_pmd", pmd);
    tlb_l2_op(TLB_L2CLEAN_FR, "c15, c9, 1  @ L2 flush_pmd", pmd);
}
#define __pte(x)        (x)
#define __pmd(x)        (x)
#define __pgprot(x)     (x)

static void __init map_lowmem(void)
{
    struct memblock_region *reg;
    phys_addr_t start;
    phys_addr_t end;
    struct map_desc map;

    /* Map all the lowmem memory banks. */
    for_each_memblock(memory, reg) {
        start = reg->base;
        end = start + reg->size;

        if (end > arm_lowmem_limit)
            end = arm_lowmem_limit;
        if (start >= end)
            break;

        map.pfn = __phys_to_pfn(start);
        map.virtual = __phys_to_virt(start);
        map.length = end - start;
        map.type = MT_MEMORY;

        create_mapping(&map, false);
    }
}
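
For a hypothetical bank at physical 0x80000000 of 256 MB, assuming PHYS_OFFSET = 0x80000000 and PAGE_OFFSET = 0xc0000000, the map_desc would be filled in as:

map.pfn     = 0x80000;    /* __phys_to_pfn(0x80000000), PAGE_SHIFT = 12 */
map.virtual = 0xc0000000; /* __phys_to_virt(0x80000000) */
map.length  = 0x10000000; /* 256 MB */
map.type    = MT_MEMORY;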

/*
 * Create the page directory entries and any necessary
 * page tables for the mapping specified by `md'.  We
 * are able to cope here with varying sizes and address
 * offsets, and we take full advantage of sections and
 * supersections.
 */
static void __init create_mapping(struct map_desc *md, bool force_pages)
{
    unsigned long addr, length, end;
    phys_addr_t phys;
    const struct mem_type *type;
    pgd_t *pgd;

    type = &mem_types[md->type];

    addr = md->virtual & PAGE_MASK;
    phys = __pfn_to_phys(md->pfn);
    length = PAGE_ALIGN(md->length + (md->virtual & ~PAGE_MASK));

    pgd = pgd_offset_k(addr);
    end = addr + length;
    do {
        unsigned long next = pgd_addr_end(addr, end);

        alloc_init_pud(pgd, addr, next, phys, type, force_pages);

        phys += next - addr;
        addr = next;
    } while (pgd++, addr != end);
}


static void __init alloc_init_section(pud_t *pud, unsigned long addr,
                      unsigned long end, phys_addr_t phys,
                      const struct mem_type *type,
                      bool force_pages)
{
    pmd_t *pmd = pmd_offset(pud, addr);

    /*
     * Try a section mapping - end, addr and phys must all be aligned
     * to a section boundary.  Note that PMDs refer to the individual
     * L1 entries, whereas PGDs refer to a group of L1 entries making
     * up one logical pointer to an L2 table.
     */
    if (type->prot_sect && ((addr | end | phys) & ~SECTION_MASK) == 0 &&
        !force_pages) {
        pmd_t *p = pmd;


        if (addr & SECTION_SIZE)
            pmd++;

        do {
            *pmd = __pmd(phys | type->prot_sect);
            phys += SECTION_SIZE;
        } while (pmd++, addr += SECTION_SIZE, addr != end);

        flush_pmd_entry(p);
    } else {
        /*
         * No need to loop; pte's aren't interested in the
         * individual L1 entries.
         */
        alloc_init_pte(pmd, addr, end, __phys_to_pfn(phys), type);
    }
}
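
Taking MT_MEMORY from the mem_types dump in section 5 (prot_sect = 0x1140e), a section mapping of a hypothetical phys = 0x80000000 would store:

*pmd = __pmd(0x80000000 | 0x1140e); /* = 0x8001140e, one 1 MB section entry */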

static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr,
                  unsigned long end, unsigned long pfn,
                  const struct mem_type *type)
{
    pte_t *start_pte = early_pte_alloc(pmd);
    pte_t *pte = start_pte + pte_index(addr);

    /* If replacing a section mapping, the whole section must be replaced */
    BUG_ON(pmd_bad(*pmd) && ((addr | end) & ~PMD_MASK));

    do {
        set_pte_ext(pte, pfn_pte(pfn, __pgprot(type->prot_pte)), 0);
        pfn++;
    } while (pte++, addr += PAGE_SIZE, addr != end);
    early_pte_install(pmd, start_pte, type->prot_l1);
}

arch/arm/include/asm/glue-proc.h:

#ifdef CONFIG_CPU_V7
# ifdef CPU_NAME
#  undef  MULTI_CPU
#  define MULTI_CPU
# else
#  define CPU_NAME cpu_v7
# endif
#endif

#ifndef MULTI_CPU
#define cpu_proc_init            __glue(CPU_NAME,_proc_init)
#define cpu_proc_fin            __glue(CPU_NAME,_proc_fin)
#define cpu_reset            __glue(CPU_NAME,_reset)
#define cpu_do_idle            __glue(CPU_NAME,_do_idle)
#define cpu_dcache_clean_area        __glue(CPU_NAME,_dcache_clean_area)
#define cpu_do_switch_mm        __glue(CPU_NAME,_switch_mm)
#define cpu_set_pte_ext            __glue(CPU_NAME,_set_pte_ext)
#define cpu_suspend_size        __glue(CPU_NAME,_suspend_size)
#define cpu_do_suspend            __glue(CPU_NAME,_do_suspend)
#define cpu_do_resume            __glue(CPU_NAME,_do_resume)
#endif

arch/arm/mm/proc-v7-2level.S
/*
 *    cpu_v7_set_pte_ext(ptep, pte)
 *
 *    Set a level 2 translation table entry.
 *
 *    - ptep  - pointer to level 2 translation table entry
 *          (hardware version is stored at +2048 bytes)
 *    - pte   - PTE value to store
 *    - ext    - value for extended PTE bits
 */
ENTRY(cpu_v7_set_pte_ext)

    str    r1, [r0]            @ linux version

    bic    r3, r1, #0x000003f0
    bic    r3, r3, #PTE_TYPE_MASK
    orr    r3, r3, r2
    orr    r3, r3, #PTE_EXT_AP0 | 2

    tst    r1, #1 << 4
    orrne    r3, r3, #PTE_EXT_TEX(1)

    eor    r1, r1, #L_PTE_DIRTY
    tst    r1, #L_PTE_RDONLY | L_PTE_DIRTY
    orrne    r3, r3, #PTE_EXT_APX

    tst    r1, #L_PTE_USER
    orrne    r3, r3, #PTE_EXT_AP1


    tst    r1, #L_PTE_XN
    orrne    r3, r3, #PTE_EXT_XN

    tst    r1, #L_PTE_YOUNG
    tstne    r1, #L_PTE_PRESENT
    moveq    r3, #0

 ARM(    str    r3, [r0, #2048]! )

    mcr    p15, 0, r0, c7, c10, 1        @ flush_pte

    mov    pc, lr
ENDPROC(cpu_v7_set_pte_ext)
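
The "str r3, [r0, #2048]!" reflects the classic ARM 2-level PTE page layout: the Linux view of each PTE lives in the first half of the page, and the hardware descriptor sits 2048 bytes above it. A C sketch of the two stores (hw_pte stands for the value assembled in r3):

void set_pte_pair(unsigned int *ptep, unsigned int linux_pte, unsigned int hw_pte)
{
    ptep[0]        = linux_pte; /* str r1, [r0]         @ linux version */
    ptep[2048 / 4] = hw_pte;    /* str r3, [r0, #2048]! @ hardware version */
    /* followed by "mcr p15, 0, r0, c7, c10, 1" to clean the cache line */
}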