用户空间缺页异常handle_pte_fault()分析--(上)

来源:互联网 发布:java主函数的写法 编辑:程序博客网 时间:2024/06/05 08:51

       前面简单的分析了内核处理用户空间缺页异常的流程,进入到了handle_mm_fault()函数,该函数为触发缺页异常的地址address分配各级的页目录,也就是说现在已经拥有了一个和address配对的pte了,但是这个pte如何去映射物理页框,内核又得根据pte的状态进行分类和判断,而这个过程又会牵扯出一些其他的概念……这也是初读linux内核源码的最大障碍吧,在一些复杂的处理中,一个点往往可以延伸出一个面,容易让人迷失方向……因此后面打算分几次将这个函数分析完,自己也没有完全理解透,所以不到位的地方欢迎大家指出,一起交流~

/*
 * handle_pte_fault - dispatch a fault according to the state of the
 * page-table entry covering @address.
 *
 * Only the "page not present" half of the function is reproduced here:
 *   - empty pte, vma has a vm_ops->fault handler -> do_linear_fault()
 *   - empty pte otherwise                        -> do_anonymous_page()
 *   - pte_file() entry                           -> do_nonlinear_fault()
 *   - anything else that is not present          -> do_swap_page()
 *
 * Returns whatever the selected handler returns.
 */
static inline int handle_pte_fault(struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, pmd_t *pmd, unsigned int flags)
{
	pte_t entry;
	spinlock_t *ptl;	/* only used by the part of the function elided below */

	entry = *pte;
	if (!pte_present(entry)) {	/* the page is not in main memory */
		if (pte_none(entry)) {	/* empty entry: the process never touched this page */
			/*
			 * Both vm_ops and its fault handler are set, so this
			 * is a file-backed mapping.
			 */
			if (vma->vm_ops) {
				if (likely(vma->vm_ops->fault))
					return do_linear_fault(mm, vma, address,
						pte, pmd, flags, entry);
			}
			/* Otherwise hand out an anonymous page. */
			return do_anonymous_page(mm, vma, address,
						 pte, pmd, flags);
		}
		/* Non-linear file mapping whose page has been evicted. */
		if (pte_file(entry))
			return do_nonlinear_fault(mm, vma, address,
					pte, pmd, flags, entry);
		/*
		 * Not present, but the entry still carries information:
		 * the page was swapped out and has to be swapped back in.
		 */
		return do_swap_page(mm, vma, address,
					pte, pmd, flags, entry);
	}

	/* ... remainder of the function is omitted in the original text ... */
}

 

首先要确定的一点就是pte对应的页是否驻留在主存中,因为pte有可能之前映射了页,但是该页被换出了。上面的代码给出了pte对应的页没有驻留在主存中的情况。如果pte对应的页没有驻留在主存中,且没有映射任何页,即pte_present()返回0,pte_none()返回非0(真),则要判断要分配一个匿名页还是一个映射页。在Linux虚拟内存中,如果页对应的vma映射的是文件,则称为映射页,如果不是映射的文件,则称为匿名页。两者最大的区别体现在页和vma的组织上,因为在页框回收处理时要通过页来逆向搜索映射了该页的vma。对于匿名页的逆映射,vma都是通过vma结构体中的anon_vma_node(链表节点)和anon_vma(链表头)组织起来,再把该链表头的信息保存在页描述符中;而映射页和vma的组织是通过vma中的优先树节点和页描述符中的mapping->i_mmap优先树树根进行组织的,具体可以参看ULK3。

来看基于文件的映射的处理:

/*
 * do_linear_fault - handle a fault on an empty pte of a file-backed vma.
 *
 * Computes the page offset for @address (the page index of @address
 * inside the vma plus vma->vm_pgoff), drops the mapping of @page_table
 * and lets __do_fault() do the real work.
 *
 * Returns the result of __do_fault().
 */
static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags, pte_t orig_pte)
{
	pgoff_t pgoff = (((address & PAGE_MASK)
			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

	/* Release the temporary kernel mapping of page_table, if one was set up. */
	pte_unmap(page_table);
	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}

关键函数__do_fault():

/*
 * __do_fault - resolve a fault on a file-backed mapping.
 *
 * Steps, in order:
 *   1. call vma->vm_ops->fault() to obtain the page (vmf.page);
 *   2. make sure that page is locked;
 *   3. for a write fault on a private (!VM_SHARED) mapping, allocate a new
 *      page and copy the data into it; for a write fault on a shared
 *      mapping, call vma->vm_ops->page_mkwrite() if the vma provides it;
 *   4. re-take the pte lock and, if the pte still equals @orig_pte,
 *      install the new entry; otherwise undo the work done in step 3;
 *   5. dirty/unlock/release pages as required on the way out.
 *
 * Returns the VM_FAULT_* value accumulated in ret.
 */
static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd,
		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
	pte_t *page_table;
	spinlock_t *ptl;
	struct page *page;
	pte_t entry;
	int anon = 0;
	int charged = 0;
	struct page *dirty_page = NULL;
	struct vm_fault vmf;
	int ret;
	int page_mkwrite = 0;

	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
	vmf.pgoff = pgoff;
	vmf.flags = flags;
	vmf.page = NULL;

	/*
	 * Call the mapping's fault handler so that the required file data
	 * is read into the mapped page.
	 */
	ret = vma->vm_ops->fault(vma, &vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
		return ret;

	if (unlikely(PageHWPoison(vmf.page))) {
		if (ret & VM_FAULT_LOCKED)
			unlock_page(vmf.page);
		return VM_FAULT_HWPOISON;
	}

	/*
	 * For consistency in subsequent calls, make the faulted page always
	 * locked.
	 */
	if (unlikely(!(ret & VM_FAULT_LOCKED)))
		lock_page(vmf.page);
	else
		VM_BUG_ON(!PageLocked(vmf.page));

	/*
	 * Should we do an early C-O-W break?
	 */
	page = vmf.page;
	if (flags & FAULT_FLAG_WRITE) {		/* write access */
		if (!(vma->vm_flags & VM_SHARED)) {
			/*
			 * Private mapping: create a copy of the page
			 * (copy-on-write) and treat it as anonymous.
			 */
			anon = 1;
			/* Make sure the vma has an anon_vma instance. */
			if (unlikely(anon_vma_prepare(vma))) {
				ret = VM_FAULT_OOM;
				goto out;
			}
			/* Allocate the page that will hold the copy. */
			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
						vma, address);
			if (!page) {
				ret = VM_FAULT_OOM;
				goto out;
			}
			if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
				ret = VM_FAULT_OOM;
				page_cache_release(page);
				goto out;
			}
			charged = 1;
			/*
			 * Don't let another task, with possibly unlocked vma,
			 * keep the mlocked page.
			 */
			if (vma->vm_flags & VM_LOCKED)
				clear_page_mlock(vmf.page);
			/* Copy the data into the newly allocated page. */
			copy_user_highpage(page, vmf.page, address, vma);
			__SetPageUptodate(page);
		} else {
			/*
			 * If the page will be shareable, see if the backing
			 * address space wants to know that the page is about
			 * to become writable
			 */
			if (vma->vm_ops->page_mkwrite) {
				int tmp;

				unlock_page(page);
				vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
				tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
				if (unlikely(tmp &
					  (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
					ret = tmp;
					goto unwritable_page;
				}
				if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
					lock_page(page);
					if (!page->mapping) {
						ret = 0; /* retry the fault */
						unlock_page(page);
						goto unwritable_page;
					}
				} else
					VM_BUG_ON(!PageLocked(page));
				page_mkwrite = 1;
			}
		}

	}

	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);

	/*
	 * This silly early PAGE_DIRTY setting removes a race
	 * due to the bad i386 page protection. But it's valid
	 * for other architectures too.
	 *
	 * Note that if FAULT_FLAG_WRITE is set, we either now have
	 * an exclusive copy of the page, or this is a shared mapping,
	 * so we can make it writable and dirty to avoid having to
	 * handle that later.
	 */
	/* Only go through if we didn't race with anybody else... */
	if (likely(pte_same(*page_table, orig_pte))) {
		/* No race: the pte still holds what it held at fault time. */
		flush_icache_page(vma, page);
		/* Build the entry that points at the physical page. */
		entry = mk_pte(page, vma->vm_page_prot);
		/*
		 * On a write fault, mark the entry dirty and pass it
		 * through maybe_mkwrite().
		 */
		if (flags & FAULT_FLAG_WRITE)
			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		if (anon) {
			/* The copied page is anonymous: add it to the anon reverse map. */
			inc_mm_counter(mm, anon_rss);
			page_add_new_anon_rmap(page, vma, address);
		} else {
			/* File page: account it in the file reverse map. */
			inc_mm_counter(mm, file_rss);
			page_add_file_rmap(page);
			if (flags & FAULT_FLAG_WRITE) {
				dirty_page = page;
				get_page(dirty_page);
			}
		}
		/* Install the entry in the page table. */
		set_pte_at(mm, address, page_table, entry);

		/* no need to invalidate: a not-present page won't be cached */
		update_mmu_cache(vma, address, entry);
	} else {
		if (charged)
			mem_cgroup_uncharge_page(page);
		if (anon)
			page_cache_release(page);
		else
			anon = 1; /* no anon but release faulted_page */
	}

	pte_unmap_unlock(page_table, ptl);

out:
	if (dirty_page) {
		struct address_space *mapping = page->mapping;

		if (set_page_dirty(dirty_page))
			page_mkwrite = 1;
		unlock_page(dirty_page);
		put_page(dirty_page);
		if (page_mkwrite && mapping) {
			/*
			 * Some device drivers do not set page.mapping but still
			 * dirty their pages
			 */
			balance_dirty_pages_ratelimited(mapping);
		}

		/* file_update_time outside page_lock */
		if (vma->vm_file)
			file_update_time(vma->vm_file);
	} else {
		unlock_page(vmf.page);
		if (anon)
			page_cache_release(vmf.page);
	}

	return ret;

unwritable_page:
	page_cache_release(page);
	return ret;
}



首先要做的就是调用vma->vm_ops中定义好的fault()函数,将所需的数据从文件读入到映射页中。需要注意的是,vma并不是在这里才插入mapping->i_mmap优先树的,而是在建立文件映射(mmap)时就已经插入了;缺页处理中只是在后面通过page_add_file_rmap()为该页登记逆向映射。

文件一般以共享的方式进行映射,接下来就要判断触发异常的操作是否包含写操作,如果是写操作并且该vma不是以共享的方式映射该页,则要进行写时复制,也就是创建一个新的页来供该vma读写,此时会申请一个匿名页,并将数据拷贝到该匿名页中。

接下来就要计算出page对应的pte值是多少,并将page_table指向的pte以该值进行填充,这样就完成了页表项到物理页的映射

 

再来看分配匿名页的处理

/*
 * do_anonymous_page - handle a fault on an empty pte of an anonymous vma.
 *
 * Read fault:  map the shared zero page (my_zero_pfn()) with a special
 *              pte; no page is allocated.
 * Write fault: allocate a zero-filled page, charge it to the memory
 *              cgroup, add it to the anonymous reverse map and map it.
 *
 * In both cases the pte is only written if it is still empty once the
 * pte lock is held.
 *
 * Returns 0 on success (including the case where the pte was filled in
 * the meantime), VM_FAULT_SIGBUS if check_stack_guard_page() fails, or
 * VM_FAULT_OOM when an allocation or the cgroup charge fails.
 */
static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags)
{
	struct page *page;
	spinlock_t *ptl;
	pte_t entry;

	pte_unmap(page_table);

	/* Check if we need to add a guard page to the stack */
	if (check_stack_guard_page(vma, address) < 0)
		return VM_FAULT_SIGBUS;

	/* Use the zero-page for reads */
	/*
	 * For a read, point the entry at the existing zero-filled page.
	 * This is the first access to the page, so its contents do not
	 * matter, and the allocation of a real page is deferred further.
	 */
	if (!(flags & FAULT_FLAG_WRITE)) {
		entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
						vma->vm_page_prot));
		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
		if (!pte_none(*page_table))
			goto unlock;
		goto setpte;
	}

	/* For a write, a new page has to be allocated. */
	/* Allocate our own private page. */
	if (unlikely(anon_vma_prepare(vma)))	/* make sure the vma has an anon_vma */
		goto oom;
	/* Allocate a zero-filled page. */
	page = alloc_zeroed_user_highpage_movable(vma, address);
	if (!page)
		goto oom;
	__SetPageUptodate(page);

	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
		goto oom_free_page;

	/* Build the pte value for the new page. */
	entry = mk_pte(page, vma->vm_page_prot);
	/*
	 * If the vma permits writing (VM_WRITE), make the entry writable
	 * and dirty. Note that this tests the vma's permission, not the
	 * type of access; the access type was already checked above.
	 */
	if (vma->vm_flags & VM_WRITE)
		entry = pte_mkwrite(pte_mkdirty(entry));

	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (!pte_none(*page_table))
		goto release;

	inc_mm_counter(mm, anon_rss);
	/* Set up the reverse mapping between the vma and the anonymous page. */
	page_add_new_anon_rmap(page, vma, address);
setpte:
	/* Write the entry into the page table. */
	set_pte_at(mm, address, page_table, entry);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, address, entry);
unlock:
	pte_unmap_unlock(page_table, ptl);
	return 0;
release:
	mem_cgroup_uncharge_page(page);
	page_cache_release(page);
	goto unlock;
oom_free_page:
	page_cache_release(page);
oom:
	return VM_FAULT_OOM;
}

匿名页分配的工作和__do_fault()中分配匿名页差不多,只不过前面多了一个读写的判断,如果是读的话,不会分配匿名页,而是让pte指向一个被0填充的页,这样就进一步推迟了页的分配。也许你会觉得奇怪,既然要读数据怎么可以分配一个事先准备好的全0的页,其实仔细想想就会明白,缺页异常处理进行到这里,一定是第一次访问相应的内存时才会触发,匿名页对应的一般都是堆,栈这些区域,对这些区域的访问通常先是写而不是读;即便先读,由于进程从未向该页写入过数据,匿名内存的语义也保证读到的全是0(例如未初始化的.bss段),所以让pte指向全0页既正确又安全,等到真正发生写操作时再通过写时复制分配新页。

 

如果不是这两种情况的话,也就是说pte_none()返回的是0,那就说明pte之前映射过页,只是该页已被换出

如果该页之前是用来进行非线性文件映射的话,其处理的主体函数就是上面介绍过的__do_fault()

/*
 * do_nonlinear_fault - handle a fault on a pte_file() entry.
 *
 * Sets FAULT_FLAG_NONLINEAR, verifies that the pte still equals @orig_pte
 * (returning 0 if it does not), checks that the vma really is a
 * VM_NONLINEAR one, decodes the file offset stored in the pte and passes
 * it to __do_fault().
 *
 * Returns 0 if the pte changed, VM_FAULT_SIGBUS if the vma is not
 * non-linear, otherwise the result of __do_fault().
 */
static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags, pte_t orig_pte)
{
	pgoff_t pgoff;

	flags |= FAULT_FLAG_NONLINEAR;

	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
		return 0;

	/* Make sure the vma really has the non-linear mapping attribute. */
	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
		/*
		 * Page table corrupted: show pte and kill process.
		 */
		print_bad_pte(vma, address, orig_pte, NULL);
		return VM_FAULT_SIGBUS;
	}

	/* Recover the file offset that was encoded in the pte. */
	pgoff = pte_to_pgoff(orig_pte);
	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}

pte_to_pgoff()这个函数是和pgoff_to_pte()相对的一组操作。在非线性文件映射的页被换出时,其映射文件的偏移会以PAGE_SIZE为单位进行编码,存储到其pte中,所以当要重新换入该页时,要进行相应的解码计算出pgoff,再由__do_fault()进行处理!

对于页没有驻留在主存的情况中的最后一种处理方式,do_swap_page(),留在下次再做分析!

 

 


      

原创粉丝点击