do_fork->copy_process->copy_mm

来源:互联网 发布:性格温柔的男生知乎 编辑:程序博客网 时间:2024/06/05 09:09
static int copy_mm(unsigned long clone_flags, struct task_struct * tsk){struct mm_struct * mm, *oldmm;int retval;tsk->min_flt = tsk->maj_flt = 0;tsk->nvcsw = tsk->nivcsw = 0;#ifdef CONFIG_DETECT_HUNG_TASKtsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;#endiftsk->mm = NULL;tsk->active_mm = NULL;/* * Are we cloning a kernel thread? * * We need to steal a active VM for that.. */oldmm = current->mm; if (!oldmm)return 0;if (clone_flags & CLONE_VM) {atomic_inc(&oldmm->mm_users);mm = oldmm;goto good_mm;}retval = -ENOMEM;mm = dup_mm(tsk);if (!mm)goto fail_nomem;good_mm:/* Initializing for Swap token stuff */mm->token_priority = 0;mm->last_interval = 0;if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)atomic_inc(&mm->oom_disable_count);tsk->mm = mm;tsk->active_mm = mm;return 0;fail_nomem:return retval;}
这段代码需要注意两个问题,一个是task_struct中的mm和active_mm指针的指向问题,在前面的文章中已经讨论过了;我们重点看第二个问题,也是这个函数的核心,即
mm=dup_mm(tsk)

dup_mm的代码如下:

/* * Allocate a new mm structure and copy contents from the * mm structure of the passed in task structure. */struct mm_struct *dup_mm(struct task_struct *tsk){struct mm_struct *mm, *oldmm = current->mm;int err;if (!oldmm)return NULL;mm = allocate_mm();  //在专用高速缓存中分配一个mm结构if (!mm)goto fail_nomem;memcpy(mm, oldmm, sizeof(*mm)); //将父进程的mm拷贝给子进程mm_init_cpumask(mm);/* Initializing for Swap token stuff */mm->token_priority = 0;mm->last_interval = 0;#ifdef CONFIG_TRANSPARENT_HUGEPAGEmm->pmd_huge_pte = NULL;#endifif (!mm_init(mm, tsk)) goto fail_nomem;if (init_new_context(tsk, mm))goto fail_nocontext;dup_mm_exe_file(oldmm, mm);err = dup_mmap(mm, oldmm);if (err)goto free_pt;mm->hiwater_rss = get_mm_rss(mm);mm->hiwater_vm = mm->total_vm;if (mm->binfmt && !try_module_get(mm->binfmt->module))goto free_pt;return mm;free_pt:/* don't put binfmt in mmput, we haven't got module yet */mm->binfmt = NULL;mmput(mm);fail_nomem:return NULL;fail_nocontext:/* * If init_new_context() failed, we cannot use mmput() to free the mm * because it calls destroy_context() */mm_free_pgd(mm);free_mm(mm);return NULL;}
先来看函数中的关键操作mm_init(mm,tsk);

static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p){atomic_set(&mm->mm_users, 1);atomic_set(&mm->mm_count, 1);init_rwsem(&mm->mmap_sem);INIT_LIST_HEAD(&mm->mmlist);mm->flags = (current->mm) ?(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;mm->core_state = NULL;mm->nr_ptes = 0;memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));spin_lock_init(&mm->page_table_lock);mm->free_area_cache = TASK_UNMAPPED_BASE;mm->cached_hole_size = ~0UL;mm_init_aio(mm);mm_init_owner(mm, p);atomic_set(&mm->oom_disable_count, 0);          /*以上代码初始化mm的数据成员*/        /*关键是下面的mm_alloc_pgd(mm)*/if (likely(!mm_alloc_pgd(mm))) {  //allocate pgd, and copy the kernel space  items to the gdbmm->def_flags = 0;mmu_notifier_mm_init(mm);return mm;}free_mm(mm);return NULL;}
关键操作mm_alloc_pgd(mm),这个操作可以跟踪到pgd_alloc函数:

pgd_t *pgd_alloc(struct mm_struct *mm){  /*由于未开启PAE机制,因此不用关心所有有关pmd的操作*/pgd_t *pgd;pmd_t *pmds[PREALLOCATED_PMDS];pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); //为pgd分配一个页面if (pgd == NULL)goto out;mm->pgd = pgd;  //令mm的pgd成员变量指向新申请的pgd页面if (preallocate_pmds(pmds) != 0)goto out_free_pgd;if (paravirt_pgd_alloc(mm) != 0)goto out_free_pmds;/* * Make sure that pre-populating the pmds is atomic with * respect to anything walking the pgd_list, so that they * never see a partially populated pgd. */spin_lock(&pgd_lock);pgd_ctor(mm, pgd);   //关键代码,将指向内核空间的页目录项拷贝到新分配的pgd对应的项目中。pgd_prepopulate_pmd(mm, pgd, pmds);spin_unlock(&pgd_lock);return pgd;out_free_pmds:free_pmds(pmds);out_free_pgd:free_page((unsigned long)pgd);out:return NULL;}
这个函数的关键就是
pgd_ctor(mm, pgd);   //关键代码,将指向内核空间的页目录项拷贝到新分配的pgd对应的项目中。
我们来看一下:
/*
 * pgd_ctor - populate the kernel-space part of a new pgd.
 *
 * Copies the page-directory entries that map kernel space (from
 * KERNEL_PGD_BOUNDARY up) out of swapper_pg_dir, so every process
 * shares one kernel mapping.
 */
static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
	/* If the pgd points to a shared pagetable level (either the
	   ptes in non-PAE, or shared PMD in PAE), then just copy the
	   references from swapper_pg_dir. */
	if (PAGETABLE_LEVELS == 2 ||
	    (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
	    PAGETABLE_LEVELS == 4) {
		/* entries pointing into kernel space */
		clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
				swapper_pg_dir + KERNEL_PGD_BOUNDARY,
				KERNEL_PGD_PTRS);
	}

	/* list required to sync kernel mapping updates */
	if (!SHARED_KERNEL_PMD) {
		pgd_set_mm(pgd, mm);
		pgd_list_add(pgd);
	}
}
关键代码:

clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,    //item points to the kernel space
swapper_pg_dir + KERNEL_PGD_BOUNDARY,  KERNEL_PGD_PTRS);
clone_pgd_range的源代码

/*
 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
 *
 *  dst - pointer to pgd range anywhere on a pgd page
 *  src - ""
 *  count - the number of pgds to copy.
 *
 * dst and src can be on the same page, but the range must not overlap,
 * and must not cross a page boundary.
 */
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
	/* A pgd entry is plain data, so a raw memcpy duplicates it. */
	memcpy(dst, src, count * sizeof(pgd_t));
}

然后看一下传给clone_pgd_range的参数:

第一个参数:

pgd + KERNEL_PGD_BOUNDARY
pgd是分配得到的页目录的基地址,经过跟踪计算KERNEL_PGD_BOUNDARY=768,也就是指向pgd的第768项,这个项目正好是内核空间的开始。

第二个参数:


swapper_pg_dir + KERNEL_PGD_BOUNDARY,

首先来看一下swapper_pg_dir是什么?(详细的解释请参看:http://blog.csdn.net/sunnybeike/article/details/6897819)

swapper_pg_dir这个东西其实就是一个页目录的指针。swapper_pg_dir只是在内核初始化的时候被载入到cr3指示内存映射信息,之后在init进程启动后就成了idle内核线程的页目录指针了,/sbin/init由一个叫做init的内核线程exec而成,而init内核线程是原始的内核也就是后来的idle线程do_fork而成的,而在do_fork中会为新生的进程重新分配一个页目录指针,由此可见swapper_pg_dir只是在idle和内核线程中被使用,可是它的作用却不只是为idle进程指示内存映射信息,更多的,它作为一个内核空间的内存映射模板而存在,在linux中,任何进程在内核空间就不分彼此了,所有的进程都会公用一份内核空间的内存映射,因此,内核空间是所有进程共享的,每当一个新的进程建立的时候,都会将swapper_pg_dir的768项以后的信息全部复制到新进程页目录的768项以后,代表内核空间。另外在操作3G+896M以上的虚拟内存时,只会更改swapper_pg_dir的映射信息,当别的进程访问到这些页面的时候会发生缺页,在缺页处理中会与swapper_pg_dir同步。

因此第二个参数指向的是页目录表模板的第768项,也就是指向内核空间的页目录项。

第三个参数:

KERNEL_PGD_PTRS);
顾名思义,是表示内核目录项的个数。

因此,整个clone_pgd_range所做的,就是用swapper_pg_dir模板初始化新页目录中属于内核空间的部分。

这样回到mm_init中,下面一个重要的函数就是:

/* * we do not have to muck with descriptors here, that is * done in switch_mm() as needed. */int init_new_context(struct task_struct *tsk, struct mm_struct *mm){struct mm_struct *old_mm;int retval = 0;mutex_init(&mm->context.lock); //初始化锁mm->context.size = 0;old_mm = current->mm;if (old_mm && old_mm->context.size > 0) {mutex_lock(&old_mm->context.lock);retval = copy_ldt(&mm->context, &old_mm->context);mutex_unlock(&old_mm->context.lock);}return retval;}
显然,这里面比较重要的操作就是:

copy_ldt(&mm->context, &old_mm->context);
但是只有VM86模式才会有LDT,因此我们并不关心这个操作。

来看下面一个函数:

/*
 * dup_mm_exe_file - make the child's mm reference the parent's
 * executable file (the target of /proc/<pid>/exe).
 */
static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
{
	/* It's safe to write the exe_file pointer without exe_file_lock because
	 * this is called during fork when the task is not yet in /proc */
	newmm->exe_file = get_mm_exe_file(oldmm);
}

首先task_struct中的exe_file字段,在task_struct结构中的定义如下:

    /* store ref to file /proc/<pid>/exe symlink points to */         struct file *exe_file;
可以看到,它是指向可执行文件的。所以,dup_mm_exe_file就是将mm中的exe_file字段指向父进程的可执行文件。

最后一个重要的函数是dup_mmap函数:

static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm){struct vm_area_struct *mpnt, *tmp, *prev, **pprev;struct rb_node **rb_link, *rb_parent;int retval;unsigned long charge;struct mempolicy *pol;down_write(&oldmm->mmap_sem);flush_cache_dup_mm(oldmm);/* * Not linked in yet - no deadlock potential: */down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);mm->locked_vm = 0;mm->mmap = NULL;mm->mmap_cache = NULL;mm->free_area_cache = oldmm->mmap_base;mm->cached_hole_size = ~0UL;mm->map_count = 0;cpumask_clear(mm_cpumask(mm));mm->mm_rb = RB_ROOT;rb_link = &mm->mm_rb.rb_node;rb_parent = NULL;pprev = &mm->mmap;retval = ksm_fork(mm, oldmm);if (retval)goto out;retval = khugepaged_fork(mm, oldmm);if (retval)goto out;prev = NULL;for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {  //对的vm_area_struct和页面映射表进行循环复制struct file *file;if (mpnt->vm_flags & VM_DONTCOPY) {  //do not copy this vm on fork.long pages = vma_pages(mpnt); //这个vm中包含的页面数mm->total_vm -= pages;   //将mm所拥有的页面数减去这些没有拷贝的页面的数量vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,-pages); //主要对映射到这个vm中的file相关的标志做设置continue;}charge = 0;if (mpnt->vm_flags & VM_ACCOUNT) {unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;if (security_vm_enough_memory(len))goto fail_nomem;charge = len;}tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);if (!tmp)goto fail_nomem;*tmp = *mpnt;  //***************************完全拷贝当前vmINIT_LIST_HEAD(&tmp->anon_vma_chain);pol = mpol_dup(vma_policy(mpnt));retval = PTR_ERR(pol);if (IS_ERR(pol))goto fail_nomem_policy;vma_set_policy(tmp, pol);tmp->vm_mm = mm;if (anon_vma_fork(tmp, mpnt))goto fail_nomem_anon_vma_fork;tmp->vm_flags &= ~VM_LOCKED;tmp->vm_next = tmp->vm_prev = NULL;file = tmp->vm_file;if (file) {struct inode *inode = file->f_path.dentry->d_inode;struct address_space *mapping = file->f_mapping;get_file(file);if (tmp->vm_flags & VM_DENYWRITE)atomic_dec(&inode->i_writecount);mutex_lock(&mapping->i_mmap_mutex);if (tmp->vm_flags & 
VM_SHARED)mapping->i_mmap_writable++;flush_dcache_mmap_lock(mapping);/* insert tmp into the share list, just after mpnt */vma_prio_tree_add(tmp, mpnt);flush_dcache_mmap_unlock(mapping);mutex_unlock(&mapping->i_mmap_mutex);}/* * Clear hugetlb-related page reserves for children. This only * affects MAP_PRIVATE mappings. Faults generated by the child * are not guaranteed to succeed, even if read-only */if (is_vm_hugetlb_page(tmp))reset_vma_resv_huge_pages(tmp);/* * Link in the new vma and copy the page table entries. */*pprev = tmp;pprev = &tmp->vm_next;tmp->vm_prev = prev;prev = tmp;__vma_link_rb(mm, tmp, rb_link, rb_parent);rb_link = &tmp->vm_rb.rb_right;rb_parent = &tmp->vm_rb;mm->map_count++;retval = copy_page_range(mm, oldmm, mpnt);  //*****************这个操作是整个循环中最值得我们关注的函数。if (tmp->vm_ops && tmp->vm_ops->open)tmp->vm_ops->open(tmp);if (retval)goto out;}/* a new mm has just been created */arch_dup_mmap(oldmm, mm);retval = 0;out:up_write(&mm->mmap_sem);flush_tlb_mm(oldmm);up_write(&oldmm->mmap_sem);return retval;fail_nomem_anon_vma_fork:mpol_put(pol);fail_nomem_policy:kmem_cache_free(vm_area_cachep, tmp);fail_nomem:retval = -ENOMEM;vm_unacct_memory(charge);goto out;}
下面来看copy_page_range:

这个函数逐层处理页目录项和页表项

int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,struct vm_area_struct *vma){pgd_t *src_pgd, *dst_pgd;unsigned long next;unsigned long addr = vma->vm_start;unsigned long end = vma->vm_end;int ret;/* * Don't copy ptes where a page fault will fill them correctly. * Fork becomes much lighter when there are big shared or private * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. */if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {if (!vma->anon_vma)return 0;}if (is_vm_hugetlb_page(vma))  //如果是基于某些体系结构支持的巨型页,就调用另外的处理方法,显然我们这里是不需要的。return copy_hugetlb_page_range(dst_mm, src_mm, vma);if (unlikely(is_pfn_mapping(vma))) {  //如果是纯PFN机制的,则也不是我们需要关注的/* * We do not free on error cases below as remove_vma * gets called on error from higher level routine */ret = track_pfn_vma_copy(vma);if (ret)return ret;}/* * We need to invalidate the secondary MMU mappings only when * there could be a permission downgrade on the ptes of the * parent mm. And a permission downgrade will only happen if * is_cow_mapping() returns true. */if (is_cow_mapping(vma->vm_flags))  mmu_notifier_invalidate_range_start(src_mm, addr, end);       /*以下代码是需要认真关注的*/ret = 0;dst_pgd = pgd_offset(dst_mm, addr);   //获得dst_mm中addr地址在页目录表中的页目录项src_pgd = pgd_offset(src_mm, addr);   //获得src_mm中addr地址在页目录表中的页目录项do {next = pgd_addr_end(addr, end);  //获取下一个目录项所指的地址if (pgd_none_or_clear_bad(src_pgd)) //如果页面的映射尚未建立,则无需做任何事情;如果是坏的页目录项,则将其清空continue;if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,      vma, addr, next))) {  //这个操作是关键,它会复制当前页目录项指向的页表ret = -ENOMEM;break;}} while (dst_pgd++, src_pgd++, addr = next, addr != end);  //一直循环,直至将当前vm的目录项遍历完毕if (is_cow_mapping(vma->vm_flags))mmu_notifier_invalidate_range_end(src_mm,  vma->vm_start, end);return ret;}
copy_pud_range是对页表的复制:

/*
 * copy_pud_range - one level below copy_page_range(): allocate the
 * child's pud under @dst_pgd and copy each populated entry, recursing
 * into copy_pmd_range(). Mirrors the pgd-level loop structure.
 */
static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
		unsigned long addr, unsigned long end)
{
	pud_t *src_pud, *dst_pud;
	unsigned long next;

	dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
	if (!dst_pud)
		return -ENOMEM;
	src_pud = pud_offset(src_pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(src_pud))
			continue;
		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
						vma, addr, next))
			return -ENOMEM;
	} while (dst_pud++, src_pud++, addr = next, addr != end);
	return 0;
}

copy_mm就是对虚存页目录和页表的拷贝,因此涉及到内存管理,很多东西没有讲清楚,待看完内存管理之后再来详看!

到这里,我们知道了copy_mm的所做的事情如下:

1.从swapper_pg_dir拷贝内核空间的页目录

2.从父进程中拷贝用户空间的页目录和页表。名义上这是一次内存页的拷贝过程,但由于COW(写时复制)机制的应用,实际上一页都没有拷贝:只是将可写的用户空间对应的页表项设置为只读,而对于本来就只读的用户空间则直接共享。