linux mmap内核实现

来源:互联网 发布:linux开机动画 编辑:程序博客网 时间:2024/06/05 19:01

用户态的 mmap 最终会进入内核的 mmap_pgoff 系统调用,其原型如下:

SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, unsigned long, pgoff)

接下来逐个分析该系统调用及其调用到的各个函数的实现过程。


/*
 * mmap_pgoff system call: resolve the object to be mapped (a regular file,
 * a hugetlbfs file, or nothing for plain anonymous memory) and hand the
 * request to vm_mmap_pgoff().
 *
 * Returns the value produced by vm_mmap_pgoff(), or a negative errno cast
 * to unsigned long:
 *   -EBADF   fd does not refer to an open file (file-backed request)
 *   -EINVAL  MAP_HUGETLB on a file-backed request, or an unknown huge
 *            page size encoded in flags
 *   PTR_ERR  whatever hugetlb_file_setup() reports on failure
 */
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
		unsigned long, prot, unsigned long, flags,
		unsigned long, fd, unsigned long, pgoff)
{
	struct file *file = NULL;
	unsigned long retval = -EBADF;

	if (!(flags & MAP_ANONYMOUS)) {
		/* File-backed mapping: map the file into the calling process. */
		audit_mmap_fd(fd, flags);
		if (unlikely(flags & MAP_HUGETLB))
			return -EINVAL;
		/*
		 * Take a reference on the struct file behind fd; it is
		 * dropped by the fput() at the end of this function.  The
		 * mapping itself takes its own reference (see get_file() in
		 * mmap_region()), which is why user space may close fd and
		 * keep using the mapping.
		 */
		file = fget(fd);
		if (!file)
			goto out;	/* retval is still -EBADF */
		if (is_file_hugepages(file))
			len = ALIGN(len, huge_page_size(hstate_file(file)));
	} else if (flags & MAP_HUGETLB) {
		struct user_struct *user = NULL;
		/*
		 * The huge page size selector lives in the mmap() flags, so
		 * decode it with MAP_HUGE_MASK here exactly as it is decoded
		 * for hugetlb_file_setup() below (the original mixed in
		 * SHM_HUGE_MASK for this one use).
		 */
		struct hstate *hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) &
						   MAP_HUGE_MASK);

		if (!hs)
			return -EINVAL;

		len = ALIGN(len, huge_page_size(hs));
		/*
		 * VM_NORESERVE is used because the reservations will be
		 * taken when vm_ops->mmap() is called
		 * A dummy user value is used because we are not locking
		 * memory so no accounting is necessary
		 */
		file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
				VM_NORESERVE,
				&user, HUGETLB_ANONHUGE_INODE,
				(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
		if (IS_ERR(file))
			return PTR_ERR(file);
	}

	/* These two flags are stripped before the request goes any further. */
	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);

	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
	if (file)
		fput(file);
out:
	return retval;
}

//做完安全权限检查,调用主要实现函数do_mmap_pgoffunsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,unsigned long len, unsigned long prot,unsigned long flag, unsigned long pgoff){unsigned long ret;struct mm_struct *mm = current->mm;unsigned long populate;//映射权限检查ret = security_mmap_file(file, prot, flag);if (!ret) {down_write(&mm->mmap_sem);ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,    &populate);up_write(&mm->mmap_sem);if (populate)mm_populate(ret, populate);}return ret;}

/*
 * Validate an mmap request and turn it into a call to mmap_region().
 *
 * In order: pick/verify an address range of the requested length, derive
 * vm_flags from prot and flags, enforce the mlock limit, check that the
 * file's open mode and mount options are compatible with the requested
 * mapping (file mappings only), and finally call mmap_region().
 *
 * @populate is always written: 0 on entry, and len when the mapping
 * succeeded and is either VM_LOCKED or requested with MAP_POPULATE but
 * without MAP_NONBLOCK.
 *
 * Returns the value of mmap_region(), or a negative errno (as unsigned
 * long) from one of the checks below.
 */
unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
			unsigned long len, unsigned long prot,
			unsigned long flags, unsigned long pgoff,
			unsigned long *populate)
{
	struct mm_struct * mm = current->mm;
	struct inode *inode;
	vm_flags_t vm_flags;

	*populate = 0;

	/*
	 * Does the application expect PROT_READ to imply PROT_EXEC?
	 *
	 * (the exception is when the underlying filesystem is noexec
	 *  mounted, in which case we dont add PROT_EXEC.)
	 */
	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
		if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
			prot |= PROT_EXEC;

	if (!len)
		return -EINVAL;

	/*
	 * Only when MAP_FIXED is NOT set is addr a mere hint that may be
	 * adjusted; presumably round_hint_to_min() raises hints that lie
	 * below the minimum mappable address -- confirm against its
	 * definition.
	 */
	if (!(flags & MAP_FIXED))
		addr = round_hint_to_min(addr);

	/* Careful about overflows.. */
	/* Page-align len; a result of 0 means the alignment wrapped around. */
	len = PAGE_ALIGN(len);
	if (!len)
		return -ENOMEM;

	/* offset overflow? */
	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
               return -EOVERFLOW;

	/* Too many mappings? */
	if (mm->map_count > sysctl_max_map_count)
		return -ENOMEM;

	/* Obtain the address to map to. we verify (or select) it and ensure
	 * that it represents a valid section of the address space.
	 */
	/* Find a currently unmapped range for the request. */
	addr = get_unmapped_area(file, addr, len, pgoff, flags);
	/*
	 * A result that is not page aligned is passed straight back to the
	 * caller (NOTE(review): presumably an errno from get_unmapped_area).
	 */
	if (addr & ~PAGE_MASK)
		return addr;

	/* Do simple checking here so the lower-level routines won't have
	 * to. we assume access permissions have been handled by the open
	 * of the memory object, so we don't do any here.
	 */
	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;

	if (flags & MAP_LOCKED)
		if (!can_do_mlock())
			return -EPERM;

	/* mlock MCL_FUTURE? */
	if (vm_flags & VM_LOCKED) {
		unsigned long locked, lock_limit;
		/* Pages locked after this mapping vs. RLIMIT_MEMLOCK in pages. */
		locked = len >> PAGE_SHIFT;
		locked += mm->locked_vm;
		lock_limit = rlimit(RLIMIT_MEMLOCK);
		lock_limit >>= PAGE_SHIFT;
		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
			return -EAGAIN;
	}

	/* File-backed or anonymous mapping: inode is NULL for the latter. */
	inode = file ? file_inode(file) : NULL;

	if (file) {
		switch (flags & MAP_TYPE) {
		case MAP_SHARED:
			/*
			 * A writable shared mapping needs a file that was
			 * opened for writing.
			 */
			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
				return -EACCES;

			/*
			 * Make sure we don't allow writing to an append-only
			 * file..
			 */
			if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
				return -EACCES;

			/*
			 * Make sure there are no mandatory locks on the file.
			 */
			if (locks_verify_locked(inode))
				return -EAGAIN;

			vm_flags |= VM_SHARED | VM_MAYSHARE;
			/* Not opened for writing: drop VM_MAYWRITE and VM_SHARED. */
			if (!(file->f_mode & FMODE_WRITE))
				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);

			/* fall through */
		case MAP_PRIVATE:
			/* The file must have been opened for reading. */
			if (!(file->f_mode & FMODE_READ))
				return -EACCES;
			/* noexec mount: refuse VM_EXEC and forbid adding it later. */
			if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
				if (vm_flags & VM_EXEC)
					return -EPERM;
				vm_flags &= ~VM_MAYEXEC;
			}

			/* The file must provide an mmap operation. */
			if (!file->f_op || !file->f_op->mmap)
				return -ENODEV;
			break;

		default:
			return -EINVAL;
		}
	} else {
		switch (flags & MAP_TYPE) {
		case MAP_SHARED:
			/*
			 * Ignore pgoff.
			 */
			pgoff = 0;
			vm_flags |= VM_SHARED | VM_MAYSHARE;
			break;
		case MAP_PRIVATE:
			/*
			 * Set pgoff according to addr for anon_vma.
			 */
			pgoff = addr >> PAGE_SHIFT;
			break;
		default:
			return -EINVAL;
		}
	}

	/*
	 * Set 'VM_NORESERVE' if we should not account for the
	 * memory use of this mapping.
	 */
	if (flags & MAP_NORESERVE) {
		/* We honor MAP_NORESERVE if allowed to overcommit */
		if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
			vm_flags |= VM_NORESERVE;

		/* hugetlb applies strict overcommit unless MAP_NORESERVE */
		if (file && is_file_hugepages(file))
			vm_flags |= VM_NORESERVE;
	}

	addr = mmap_region(file, addr, len, vm_flags, pgoff);
	/*
	 * Ask the caller to populate the range when the mapping succeeded
	 * and is either locked or MAP_POPULATE without MAP_NONBLOCK.
	 */
	if (!IS_ERR_VALUE(addr) &&
	    ((vm_flags & VM_LOCKED) ||
	     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
		*populate = len;
	return addr;
}

/*
 * Create the mapping [addr, addr + len) in current->mm.
 *
 * Steps, in order:
 *   1. check the request against the address space limit, discounting
 *      pages of existing mappings in the range that will be replaced;
 *   2. unmap anything that currently overlaps the range;
 *   3. charge memory for accountable mappings;
 *   4. try to merge with the neighbouring vma found in step 2; if that is
 *      not possible, allocate a new vma, let the file's ->mmap method (or
 *      shmem_zero_setup() for shared anonymous memory) set it up, and
 *      link it into the mm.
 *
 * Original author's note (not verifiable from this function alone): only
 * the vma bookkeeping and the association with the file happen here;
 * physical pages are provided later by the page fault path, and reading
 * file contents is left to the filesystem.
 *
 * Returns the mapped address on success or a negative errno (as unsigned
 * long): -ENOMEM for limit/allocation/accounting failures, -EINVAL for
 * growable (VM_GROWSDOWN/VM_GROWSUP) file or shared mappings, or the
 * error from deny_write_access(), ->mmap or shmem_zero_setup().
 */
unsigned long mmap_region(struct file *file, unsigned long addr,
		unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	int correct_wcount = 0;
	int error;
	struct rb_node **rb_link, *rb_parent;
	unsigned long charged = 0;
	struct inode *inode =  file ? file_inode(file) : NULL;

	/* Check against address space limit. */
	if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
		unsigned long nr_pages;

		/*
		 * A fixed-address request may remove pages of mappings that
		 * intersect with the requested range. Account for the pages
		 * it would unmap.
		 *
		 * The original code first bailed out with
		 * "if (!(vm_flags & MAP_FIXED)) return -ENOMEM;".  vm_flags
		 * holds VM_* bits while MAP_FIXED is an mmap() MAP_* flag,
		 * so that test mixed two unrelated bit namespaces and did
		 * not test what its comment claimed.  It has been dropped;
		 * when nothing overlaps, nr_pages is 0 and the second
		 * may_expand_vm() call below rejects the request just like
		 * the first one did.
		 */
		nr_pages = count_vma_pages_range(mm, addr, addr + len);

		if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
			return -ENOMEM;
	}

	/* Clear old maps */
	error = -ENOMEM;
munmap_back:
	/*
	 * Locate the insertion point (prev, rb_link, rb_parent) for the new
	 * range.  A non-zero result means something still overlaps: unmap it
	 * and look again.
	 */
	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
		if (do_munmap(mm, addr, len))
			return -ENOMEM;
		goto munmap_back;
	}

	/*
	 * Private writable mapping: check memory availability
	 */
	if (accountable_mapping(file, vm_flags)) {
		charged = len >> PAGE_SHIFT;
		if (security_vm_enough_memory_mm(mm, charged))
			return -ENOMEM;
		vm_flags |= VM_ACCOUNT;
	}

	/*
	 * Can we just expand an old mapping?
	 * (i.e. can the new range be merged with the existing prev vma)
	 */
	vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff,
			NULL, NULL);
	if (vma)
		goto out;

	/*
	 * Determine the object being mapped and call the appropriate
	 * specific mapper. the address has already been validated, but
	 * not unmapped, but the maps are removed from the list.
	 *
	 * No merge was possible: allocate a fresh vma and link it into the
	 * mm further down.
	 */
	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
	if (!vma) {
		error = -ENOMEM;
		goto unacct_error;
	}

	vma->vm_mm = mm;
	vma->vm_start = addr;
	vma->vm_end = addr + len;
	vma->vm_flags = vm_flags;
	vma->vm_page_prot = vm_get_page_prot(vm_flags);
	vma->vm_pgoff = pgoff;
	INIT_LIST_HEAD(&vma->anon_vma_chain);

	error = -EINVAL;	/* when rejecting VM_GROWSDOWN|VM_GROWSUP */

	if (file) {
		/* File mapping: a file-backed range is not allowed to grow. */
		if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
			goto free_vma;
		if (vm_flags & VM_DENYWRITE) {
			/*
			 * Temporarily deny write access to the file; the
			 * matching atomic_inc() on i_writecount below undoes
			 * this once the vma is set up (or on failure).
			 */
			error = deny_write_access(file);
			if (error)
				goto free_vma;
			correct_wcount = 1;
		}
		/* The vma holds its own reference on the mapped file. */
		vma->vm_file = get_file(file);
		/* Let the filesystem/driver set up the mapping. */
		error = file->f_op->mmap(file, vma);
		if (error)
			goto unmap_and_free_vma;

		/* Can addr have changed??
		 *
		 * Answer: Yes, several device drivers can do it in their
		 *         f_op->mmap method. -DaveM
		 * Bug: If addr is changed, prev, rb_link, rb_parent should
		 *      be updated for vma_link()
		 */
		WARN_ON_ONCE(addr != vma->vm_start);

		addr = vma->vm_start;
		pgoff = vma->vm_pgoff;
		vm_flags = vma->vm_flags;
	} else if (vm_flags & VM_SHARED) {
		if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP)))
			goto free_vma;
		error = shmem_zero_setup(vma);
		if (error)
			goto free_vma;
	}

	if (vma_wants_writenotify(vma)) {
		pgprot_t pprot = vma->vm_page_prot;

		/* Can vma->vm_page_prot have changed??
		 *
		 * Answer: Yes, drivers may have changed it in their
		 *         f_op->mmap method.
		 *
		 * Ensures that vmas marked as uncached stay that way.
		 */
		vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
		if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot)))
			vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	}

	/* Insert the new vma at the position found by find_vma_links(). */
	vma_link(mm, vma, prev, rb_link, rb_parent);
	file = vma->vm_file;

	/* Once vma denies write, undo our temporary denial count */
	if (correct_wcount)
		atomic_inc(&inode->i_writecount);
out:
	perf_event_mmap(vma);

	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
	if (vm_flags & VM_LOCKED) {
		/*
		 * Count the pages as locked, except for special, hugetlb and
		 * gate vmas, which instead lose VM_LOCKED.
		 */
		if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
					vma == get_gate_vma(current->mm)))
			mm->locked_vm += (len >> PAGE_SHIFT);
		else
			vma->vm_flags &= ~VM_LOCKED;
	}

	if (file)
		uprobe_mmap(vma);

	return addr;

unmap_and_free_vma:
	if (correct_wcount)
		atomic_inc(&inode->i_writecount);
	vma->vm_file = NULL;
	fput(file);

	/* Undo any partial mapping done by a device driver. */
	unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
	/* Skip vm_unacct_memory() below on this path. */
	charged = 0;
free_vma:
	kmem_cache_free(vm_area_cachep, vma);
unacct_error:
	if (charged)
		vm_unacct_memory(charged);
	return error;
}


0 0