linux内存管理之内存映射
来源:互联网 发布:北斗卫星定位精度知乎 编辑:程序博客网 时间:2024/05/21 11:26
之前讲了那么多内存的东西,那么都离不开内存映射,不论虚拟地址到物理地址,还是用户空间地址到内核空间。关于映射用户空间最常用的是mmap来映射设备的io空间,直接访问,来提高io效率。内核的有ioremap映射设备io地址空间以供内核访问,kmap映射申请的高端内存,还有DMA ,dma主要用的多的是网卡驱动里ring buffer机制.
下面就说说mmap:
函数原型:void* mmap ( void * start , size_t len , int prot , int flags , int fd , off_t offset )
参数说明:
MAP_HUGETLB (since Linux 2.6.32)
Allocate the mapping using "huge pages." See the kernel source file Documentation/vm/hugetlbpage.txt for further information.
返回值:
成功执行时,mmap()返回被映射区的指针;失败时,mmap()返回MAP_FAILED[其值为(void *)-1],并设置errno。
在include/linux/fs.h:
点击(此处)折叠或打开
- struct file_operations {
- struct module *owner;
- loff_t (*llseek) (struct file *, loff_t, int);
- ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
- ...
- long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
- long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
- int (*mmap) (struct file *, struct vm_area_struct *);
那么需要看一下mmap系统调用的实现:mm/mmap.c:
点击(此处)折叠或打开
- SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
- unsigned long, prot, unsigned long, flags,
- unsigned long, fd, unsigned long, pgoff)
- { // mmap_pgoff system call body: determines the backing file (if any), then delegates to vm_mmap_pgoff()
- struct file *file = NULL;
- unsigned long retval = -EBADF; // value returned through "out" when fget() yields no file
- if (!(flags & MAP_ANONYMOUS)) { // MAP_ANONYMOUS is not set: file-backed mapping
- audit_mmap_fd(fd, flags);
- if (unlikely(flags & MAP_HUGETLB)) // MAP_HUGETLB is rejected unless MAP_ANONYMOUS is also set
- return -EINVAL;
- file = fget(fd);
- if (!file)
- goto out;
- if (is_file_hugepages(file))
- len = ALIGN(len, huge_page_size(hstate_file(file))); // round len up to the huge page size of this file
- } else if (flags & MAP_HUGETLB) { // anonymous hugetlb (huge page) mapping
- struct user_struct *user = NULL;
- len = ALIGN(len, huge_page_size(hstate_sizelog(
- (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK)));
- /*
- * VM_NORESERVE is used because the reservations will be
- * taken when vm_ops->mmap() is called
- * A dummy user value is used because we are not locking
- * memory so no accounting is necessary
- */
- file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
- VM_NORESERVE,
- &user, HUGETLB_ANONHUGE_INODE,
- (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
- if (IS_ERR(file))
- return PTR_ERR(file);
- }
- flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); // these two flags are cleared before the request is passed on
- retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
- if (file) // file stays NULL for an anonymous mapping without MAP_HUGETLB
- fput(file);
- out:
- return retval;
- }
其实之前文章已经说过这个布局。
我们看include/linux/mm_types.h:
点击(此处)折叠或打开
- /*
- * This struct defines a memory VMM memory area. There is one of these
- * per VM-area/task. A VM area is any part of the process virtual memory
- * space that has a special rule for the page-fault handlers (ie a shared
- * library, the executable area etc).
- */
- struct vm_area_struct {
- /* The first cache line has the info for VMA tree walking. */
- unsigned long vm_start; /* Our start address within vm_mm. */
- unsigned long vm_end; /* The first byte after our end address
- within vm_mm. */
- /* linked list of VM areas per task, sorted by address */
- struct vm_area_struct *vm_next, *vm_prev;
- struct rb_node vm_rb; // presumably this VMA's node in the per-mm red-black tree - confirm against mm_struct
- /*
- * Largest free memory gap in bytes to the left of this VMA.
- * Either between this VMA and vma->vm_prev, or between one of the
- * VMAs below us in the VMA rbtree and its ->vm_prev. This helps
- * get_unmapped_area find a free area of the right size.
- */
- unsigned long rb_subtree_gap;
- /* Second cache line starts here. */
- struct mm_struct *vm_mm; /* The address space we belong to. */
- pgprot_t vm_page_prot; /* Access permissions of this VMA. */
- unsigned long vm_flags; /* Flags, see mm.h. */
- /*
- * For areas with an address space and backing store,
- * linkage into the address_space->i_mmap interval tree, or
- * linkage of vma in the address_space->i_mmap_nonlinear list.
- */
- union {
- struct {
- struct rb_node rb;
- unsigned long rb_subtree_last;
- } linear;
- struct list_head nonlinear;
- } shared;
- /*
- * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
- * list, after a COW of one of the file pages. A MAP_SHARED vma
- * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
- * or brk vma (with NULL file) can only be in an anon_vma list.
- */
- struct list_head anon_vma_chain; /* Serialized by mmap_sem &
- * page_table_lock */
- struct anon_vma *anon_vma; /* Serialized by page_table_lock */
- /* Function pointers to deal with this struct. */
- const struct vm_operations_struct *vm_ops;
- /* Information about our backing store: */
- unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE
- units, *not* PAGE_CACHE_SIZE */
- struct file * vm_file; /* File we map to (can be NULL). */
- void * vm_private_data; /* was vm_pte (shared mem) */
- #ifndef CONFIG_MMU
- struct vm_region *vm_region; /* NOMMU mapping region */
- #endif
- #ifdef CONFIG_NUMA
- struct mempolicy *vm_policy; /* NUMA policy for the VMA */
- #endif
- } // NOTE(review): a struct definition needs a terminating ';' which this excerpt does not show
struct vm_area_struct用红黑树来管理。是不是和vmalloc里的一些结构很相似?但是别搞混了。
内核中每一个这样的对象都表示用户进程地址空间的一段区域。
当linux 运行一个应用程序时,系统调用exec通过load_elf_binary函数把elf加载到用户虚拟空间。前面我们已经说了栈和堆。Text不用多解释。
那么基本流程就是:
1. 用户调用mmap系统调用
2. 内核在用户空间mmap区域分配一个空闲的vm_area_struct对象。
3. 然后修改页目录表项把对象的地址和设备的内存对应起来
那么在用户空间,mmap系统调用函数原型为:
void *mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
它能够起作用的前提是打开的设备文件的驱动里实现了mmap。
看看mmap系统调用内核实现,
1.找到fd对应的struct file;
2 do_mmap_pgoff完成映射的工作。
细说do_mmap_pgoff函数
(1) 调用get_unmapped_area获得一段未被映射的虚拟地址区间(返回其首地址),稍后用它来填充vm_area_struct
(2) 后续是mmap_region
(3) 调用到驱动file->mmap的具体实现
(4) 具体驱动层mmap的实现
在具体实现驱动层的mmap前,linux内核已经实现了页表映射的接口api供我们使用。
remap_pfn_range(mm/memory.c),此外也有其他延伸接口。
参数fd:要映射到内存中的文件描述符。如果使用匿名内存映射时,即flags中设置了MAP_ANONYMOUS,fd设为-1。有
些系统不支持匿名内存映射,则可以使用open打开/dev/zero文件,然后对得到的文件描述符进行映射,可以同样达到匿名内存映射的效果。
显然正常的mmap调用流程会走入第一个if语句获取file指针。
点击(此处)折叠或打开
- if (!(flags & MAP_ANONYMOUS)) {
- audit_mmap_fd(fd, flags);
- if (unlikely(flags & MAP_HUGETLB))
- return -EINVAL;
- file = fget(fd);
- if (!file)
- goto out;
- if (is_file_hugepages(file))
- len = ALIGN(len, huge_page_size(hstate_file(file)));
- }
点击(此处)折叠或打开
- unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
- unsigned long len, unsigned long prot,
- unsigned long flag, unsigned long pgoff)
- { // Runs the security hook, then calls do_mmap_pgoff() with the current mm's mmap_sem held for writing.
- unsigned long ret;
- struct mm_struct *mm = current->mm;
- ret = security_mmap_file(file, prot, flag);
- if (!ret) { // the mapping is attempted only when security_mmap_file() returned 0
- down_write(&mm->mmap_sem);
- ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff);
- up_write(&mm->mmap_sem);
- }
- return ret; // either the nonzero security_mmap_file() result or the do_mmap_pgoff() result
- }
点击(此处)折叠或打开
- /*
- * Validates an mmap request and hands it to mmap_region(): adjusts prot,
- * checks len and pgoff, selects the address with get_unmapped_area(),
- * builds vm_flags, and applies the MAP_SHARED/MAP_PRIVATE checks for the
- * file-backed and the anonymous (file == NULL) case.
- * Returns the mmap_region() result, or a negative errno value (converted
- * to unsigned long) on the early-exit paths.
- *
- * The caller must hold down_write(&current->mm->mmap_sem).
- */
- unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
- unsigned long len, unsigned long prot,
- unsigned long flags, unsigned long pgoff)
- {
- struct mm_struct * mm = current->mm;
- struct inode *inode;
- vm_flags_t vm_flags;
- /*
- * Does the application expect PROT_READ to imply PROT_EXEC?
- *
- * (the exception is when the underlying filesystem is noexec
- * mounted, in which case we dont add PROT_EXEC.)
- */
- if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
- if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
- prot |= PROT_EXEC;
- if (!len)
- return -EINVAL;
- if (!(flags & MAP_FIXED))
- addr = round_hint_to_min(addr);
- /* Careful about overflows.. */
- len = PAGE_ALIGN(len);
- if (!len) // a zero result here means PAGE_ALIGN() wrapped around
- return -ENOMEM;
- /* offset overflow? */
- if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
- return -EOVERFLOW;
- /* Too many mappings? */
- if (mm->map_count > sysctl_max_map_count)
- return -ENOMEM;
- /* Obtain the address to map to. we verify (or select) it and ensure
- * that it represents a valid section of the address space.
- */
- addr = get_unmapped_area(file, addr, len, pgoff, flags); // picks a free range in the user-space mmap area and returns its start address; it is later stored in the vma (struct vm_area_struct)
- if (addr & ~PAGE_MASK) // a result that is not page aligned is returned as-is (presumably an error code - confirm against get_unmapped_area())
- return addr;
- /* Do simple checking here so the lower-level routines won't have
- * to. we assume access permissions have been handled by the open
- * of the memory object, so we don't do any here.
- */
- vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
- mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
- if (flags & MAP_LOCKED)
- if (!can_do_mlock())
- return -EPERM;
- /* mlock MCL_FUTURE? */
- if (vm_flags & VM_LOCKED) {
- unsigned long locked, lock_limit;
- locked = len >> PAGE_SHIFT;
- locked += mm->locked_vm;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- return -EAGAIN;
- }
- inode = file ? file->f_path.dentry->d_inode : NULL; // inode of the backing file, NULL for an anonymous mapping
- if (file) {
- switch (flags & MAP_TYPE) {
- case MAP_SHARED:
- if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
- return -EACCES;
- /*
- * Make sure we don't allow writing to an append-only
- * file..
- */
- if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
- return -EACCES;
- /*
- * Make sure there are no mandatory locks on the file.
- */
- if (locks_verify_locked(inode))
- return -EAGAIN;
- vm_flags |= VM_SHARED | VM_MAYSHARE;
- if (!(file->f_mode & FMODE_WRITE))
- vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
- /* fall through */
- case MAP_PRIVATE:
- if (!(file->f_mode & FMODE_READ))
- return -EACCES;
- if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
- if (vm_flags & VM_EXEC)
- return -EPERM;
- vm_flags &= ~VM_MAYEXEC;
- }
- if (!file->f_op || !file->f_op->mmap) // the file's driver or filesystem must provide an mmap method
- return -ENODEV;
- break;
- default:
- return -EINVAL;
- }
- } else {
- switch (flags & MAP_TYPE) {
- case MAP_SHARED:
- /*
- * Ignore pgoff.
- */
- pgoff = 0;
- vm_flags |= VM_SHARED | VM_MAYSHARE;
- break;
- case MAP_PRIVATE:
- /*
- * Set pgoff according to addr for anon_vma.
- */
- pgoff = addr >> PAGE_SHIFT;
- break;
- default:
- return -EINVAL;
- }
- }
- return mmap_region(file, addr, len, flags, vm_flags, pgoff);
- }
在mmap_region中:
点击(此处)折叠或打开
- unsigned long mmap_region(struct file *file, unsigned long addr,
- unsigned long len, unsigned long flags,
- vm_flags_t vm_flags, unsigned long pgoff)
- {
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev;
- int correct_wcount = 0;
- int error;
- struct rb_node **rb_link, *rb_parent;
- unsigned long charged = 0;
- struct inode *inode = file ? file->f_path.dentry->d_inode : NULL;
- ...
- /*
- * Can we just expand an old mapping?
- */
- vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
- if (vma)
- goto out;
- /*
- * Determine the object being mapped and call the appropriate
- * specific mapper. the address has already been validated, but
- * not unmapped, but the maps are removed from the list.
- */
- vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); // 申请 vma 并初始化
- if (!vma) {
- error = -ENOMEM;
- goto unacct_error;
- }
- vma->vm_mm = mm;
- vma->vm_start = addr;
- vma->vm_end = addr + len;
- vma->vm_flags = vm_flags;
- vma->vm_page_prot = vm_get_page_prot(vm_flags);
- vma->vm_pgoff = pgoff;
- INIT_LIST_HEAD(&vma->anon_vma_chain);
- error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */
- if (file) {
- if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
- goto free_vma;
- if (vm_flags & VM_DENYWRITE) {
- error = deny_write_access(file);
- if (error)
- goto free_vma;
- correct_wcount = 1;
- }
- vma->vm_file = get_file(file);
- error = file->f_op->mmap(file, vma); //调用open文件的mmap实现
- if (error)
- goto unmap_and_free_vma;
- /* Can addr have changed??
- *
- * Answer: Yes, several device drivers can do it in their
- * f_op->mmap method. -DaveM
- * Bug: If addr is changed, prev, rb_link, rb_parent should
- * be updated for vma_link()
- */
- WARN_ON_ONCE(addr != vma->vm_start);
- addr = vma->vm_start;
- pgoff = vma->vm_pgoff;
- vm_flags = vma->vm_flags;
- } else if (vm_flags & VM_SHARED) {
- if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP)))
- goto free_vma;
- error = shmem_zero_setup(vma);
- if (error)
- goto free_vma;
- }
- ...
- }
这样整个流程就清晰了,驱动开发人员只需要关注设备驱动里file操作中mmap实现就可以了。
关于可执行文件的映射我们可以参考几个图:
struct vm_area_struct * mmap; /* list of VMAs */
它保存了进程所有映射的区域,之前我们提到过每个vma(即结构vm_area_struct都代表用户空间的一个映射)。那么它在这里连接起来。vma_link(mm, vma, prev, rb_link, rb_parent); 即把申请的vma加入管理中.
这里需要说明库文件的map和设备驱动的映射不太一样,前者不要求物理地址连续,但是后者要求,因为设备io空间默认是连续的.
对于任何一个普通文件,对应的file->f_op中的mmap操作是什么呢?
这个跟fs有关系:
.mmap=generic_file_mmap // filemap.c
我们也可以通过proc来查看:#cat /proc/pid/maps
而查看静态的bin可以通过nm和objdump:nm查看bin的符号,objdump可以查看elf信息,也可以通过file和readelf查看。
1. mmap共享内存:
(1)使用普通文件提供的内存映射:
适用于任何进程之间。此时,需要打开或创建一个文件,然后再调用mmap()
典型调用代码如下:
fd=open(name, flag, mode); if(fd<0) ...
ptr=mmap(NULL, len , PROT_READ|PROT_WRITE, MAP_SHARED , fd , 0);
通过mmap()实现共享内存的通信方式有许多特点和要注意的地方,可以参看UNIX网络编程第二卷。
(2)使用特殊文件提供匿名内存映射:
适用于具有亲缘关系的进程之间。由于父子进程特殊的亲缘关系,在父进程中先调用mmap(),然后调用 fork()。那么在调用fork()之后,子进程继承父进程匿名映射后的地址空间,同样也继承mmap()返回的地址,这样,父子进程就可以通过映射区 域进行通信了。注意,这里不是一般的继承关系。一般来说,子进程单独维护从父进程继承下来的一些变量。而mmap()返回的地址,却由父子进程共同维护。 对于具有亲缘关系的进程实现共享内存最好的方式应该是采用匿名内存映射的方式。此时,不必指定具体的文件,只要设置相应的标志即可。
2. 提高文件访问效率
3. 映射设备
实现映射设备的函数mmap的时候,需要用到remap_pfn_range
remap_pfn_range不能映射常规内存,只能存取保留页和在物理内存顶之上的物理地址。因为保留页和在物理
内存顶之上的物理地址,内存管理系统的各个子模块管理不到。640 KB 到 1MB 之间的区域是保留页,可以映射;设备I/O
内存也可以映射。如果想把kmalloc()申请的内存映射到用户空间,则可以通过mem_map_reserve()(2.4内核的接口;2.6及以后的内核使用SetPageReserved())把相应
的内存设置为保留后就可以。
remap_pfn_range常用于设备内存映射,而nopage()(较新的内核中已被vm_operations_struct的fault()取代)常用于RAM映射。
调用mmap()时就决定了映射大小,不能再增加。换句话说,映射不能改变文件的大小。反过来,由文件被映射部分,而不是由文件大小来决定进程可访问内存空间范围(映射时,指定的offset必须是内存页面大小的整数倍)。通常使用mmap()有三种情况:提高I/O效率、匿名内存映射、共享内存进程通信。
在kernel里,通常有3种申请内存的方式:vmalloc, kmalloc, alloc_pages。kmalloc与alloc_pages类似,均是申请连续的地址空间。而vmalloc则可以申请一段不连续的物理地址空间,并将其映射到连续的线性地址上。每次vmalloc之后,内核会创建一个vm_struct,用以映射分配到的不连续的内存区域。vm_struct类似vma,但是又不是一回事。vma是将物理内存映射到进程的虚拟地址空间。而vm_struct是将物理内存映射到内核的线性地址空间。 既然vmalloc拿到的不是连续的物理内存,那么将这些内存映射到vma时,就不能直接利用remap_pfn_range()了。此时可以采用两种方法,一种是实现vm_operations_struct的fault()方法,用以在缺页时再映射需要的页。此方法操作起来较为麻烦。另一种方法是直接使用remap_vmalloc_range()函数。该函数的原型为:
int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
unsigned long pgoff)
其中参数vma是mmap系统调用传下来的,addr即为vmalloc()所分配内存的起始地址。而pgoff则为mmap()系统调用里的偏移参数,可以通过vma->vm_pgoff获得。该函数成功执行后,返回值为0。如果返回值为负数,则说明出错了,通常是由于所传的参数不正确。
需要注意的是,需要映射到用户空间的内存段,不能直接利用vmalloc()分配,而应该使用vmalloc_user()函数。该函数除了分配内存之外,还会将相应的vm_struct结构标记为VM_USERMAP。否则,remap_vmalloc_range将返回错误。
下面附上自己设备映射的测试代码(由于是测试只映射内核内存,用了两种方式一种是kmalloc 一种是vmalloc,而映射设备的时候直接传递设备io地址)用户空间程序:
点击(此处)折叠或打开
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include <sys/types.h>
- #include <sys/stat.h>
- #include <fcntl.h>
- #include <unistd.h>
- #include <sys/mman.h>
- /*
-  * Test client: maps the first 4096 bytes of /dev/my_mmap and prints the
-  * string found at the start of the mapping.
-  *
-  * Returns EXIT_SUCCESS, or EXIT_FAILURE when the device cannot be opened
-  * or mapped.
-  */
- int main(void)
- {
-     const size_t map_len = 4096;    /* length requested from the driver */
-     int fd;
-     char *p;
-
-     fd = open("/dev/my_mmap", O_RDWR);
-     if (fd < 0) {
-         printf("open my dev failed \n");
-         return EXIT_FAILURE;
-     }
-
-     p = mmap(NULL, map_len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-     if (p == MAP_FAILED) {
-         /* mmap() signals failure with MAP_FAILED, not NULL; printing it would crash. */
-         perror("mmap");
-         close(fd);
-         return EXIT_FAILURE;
-     }
-     printf("p..is %s.uuu..\n", p);
-
-     munmap(p, map_len);
-     close(fd);
-     return EXIT_SUCCESS;
- }
点击(此处)折叠或打开
- #include <linux/kernel.h>
- #include <linux/init.h>
- #include <linux/module.h>
- #include <linux/device.h>
- #include <linux/cdev.h>
- #include <linux/fs.h>
- #include <linux/fcntl.h>
- #include <linux/string.h>
- #include <linux/gfp.h>
- #include <linux/mm_types.h>
- #include <linux/mm.h>
- #include <linux/highmem.h>
- #include <linux/slab.h>
- static struct cdev *my_dev;    /* character device created in hello_init() */
- static dev_t md;               /* device number obtained from alloc_chrdev_region() */
- static void *mp;               /* kernel buffer that my_mmap() exposes; static so the two-letter name stays file-local */
- static int my_open(struct inode *inode, struct file *filp) // open handler of the test device; neither argument is used
- {
- return 0; // nothing to set up per open, always reports success
- }
- /*
-  * mmap handler: maps the one-page kernel buffer mp into the calling process.
-  *
-  * Only a request of at most PAGE_SIZE at file offset 0 is accepted; a larger
-  * or shifted mapping would hand user space the physical memory that happens
-  * to follow the buffer.
-  *
-  * Returns 0 on success, -EINVAL for an out-of-range request, or the error
-  * reported by remap_pfn_range().
-  */
- static int my_mmap(struct file *filp, struct vm_area_struct *vma)
- {
-     unsigned long size = vma->vm_end - vma->vm_start;
-     unsigned long pfn;
-
-     if (vma->vm_pgoff != 0 || size > PAGE_SIZE)
-         return -EINVAL;
-
-     /*
-      * Variant 2 (kmalloc): the buffer is physically contiguous, so its page
-      * frame number can be passed to remap_pfn_range() directly.
-      * Variant 1 (vmalloc_user) would instead use
-      * remap_vmalloc_range(vma, mp, 0).
-      */
-     pfn = virt_to_phys(mp) >> PAGE_SHIFT;
-     return remap_pfn_range(vma, vma->vm_start, pfn, size, vma->vm_page_prot);
- }
- /* File operations of the mmap test device; const because the table is never modified at run time. */
- static const struct file_operations mmap_fops = {
-     .owner = THIS_MODULE,
-     .open  = my_open,
-     .mmap  = my_mmap,
- };
- /*
-  * Module init: allocates the buffer that my_mmap() exposes, stores the test
-  * string in it, and registers a character device with one minor and a
-  * dynamically assigned major number.
-  *
-  * Returns 0 on success or a negative errno value; on failure every step
-  * that already succeeded is undone in reverse order.
-  */
- static int __init hello_init(void)
- {
-     int err;
-
-     printk("hello ko ..\n");
-
-     /*
-      * Variant 1 (vmalloc) would be: mp = vmalloc_user(PAGE_SIZE);
-      * Variant 2 (kmalloc): a whole zeroed page, so that no stale kernel
-      * data behind the test string becomes readable from user space.
-      */
-     mp = kzalloc(PAGE_SIZE, GFP_KERNEL);
-     if (!mp)
-         return -ENOMEM;
-     SetPageReserved(virt_to_page(mp));
-
-     strcpy(mp, "hello");
-     printk("p is %s....\n", (char *)mp);
-
-     err = alloc_chrdev_region(&md, 0, 1, "mmap_dev");
-     if (err)
-         goto err_free_buf;
-     printk("major=%d,minor=%d...\n", MAJOR(md), MINOR(md));
-
-     /*
-      * cdev_alloc() returns an already initialised object. Calling
-      * cdev_init() on it would replace the kobject type that frees the
-      * structure, so only the fields are filled in here.
-      */
-     my_dev = cdev_alloc();
-     if (!my_dev) {
-         err = -ENOMEM;
-         goto err_unregister;
-     }
-     my_dev->ops = &mmap_fops;
-     my_dev->owner = THIS_MODULE;
-
-     err = cdev_add(my_dev, md, 1);
-     if (err)
-         goto err_put_cdev;
-
-     return 0;
-
- err_put_cdev:
-     kobject_put(&my_dev->kobj);    /* releases the cdev that was never added */
- err_unregister:
-     unregister_chrdev_region(md, 1);
- err_free_buf:
-     ClearPageReserved(virt_to_page(mp));
-     kfree(mp);
-     return err;
- }
- /*
-  * Module exit: removes the character device first, then releases the device
-  * number, and only afterwards frees the buffer the device was exposing.
-  */
- static void __exit hello_exit(void)
- {
-     printk("hello exit...\n");
-     cdev_del(my_dev);    /* my_dev already is a struct cdev *; &my_dev had the wrong type */
-     unregister_chrdev_region(md, 1);
-     /* Undo SetPageReserved() from hello_init() before the page goes back to the allocator. */
-     ClearPageReserved(virt_to_page(mp));
-     kfree(mp);
- }
- module_init(hello_init);
- module_exit(hello_exit);
- /* Without a license declaration the module taints the kernel and cannot use GPL-only symbols. */
- MODULE_LICENSE("GPL");
obj-m:=hello.o
make -C /usr/src/linux M=`pwd` modules // /usr/src/linux是内核路径或者内核头文件路径
安装 insmod hello.ko // 还需要自己查询设备号来创建设备文件.
- linux内存管理之内存映射
- linux内存管理总结之内存分配
- Linux 内存管理之内存零头
- linux内存管理之内存回收机制
- linux内存管理之内存管理区
- 内存管理之内存分配
- Linux编程C++内存管理之内存分配详解
- Linux编程C++内存管理之内存分配详解
- Linux编程内存管理之内存分配详解
- 浅析linux内核内存管理之内存池
- Linux编程C++内存管理之内存分配详解
- Linux编程C++内存管理之内存分配详解
- Linux编程C++内存管理之内存分配详解
- Linux内核内存管理之内存结构概述(一)
- 框架管理基础之内存映射
- Linux核心之内存管理
- Linux 之内存管理(1)
- linux之内存管理(2)
- Android进程和线程
- linux内存管理之malloc
- Canonical Correlation Analysis 典型相关分析
- Mybatis系列(七)关联映射
- C++ core guidelines -- P.1. -- 直接在代码中表达你的想法
- linux内存管理之内存映射
- linux内存管理之DMA
- 【Leetcode算法】- Move Zeroes
- HDOJ 题目5442 Favorite Donut(后缀数组)
- Robert C. Martin列举的专业软件开发人员必须精通的技能
- 1205 Constructing Roads In JGShining's Kingdom【lis】
- java环境变量的作用与配置
- 浅谈socket
- android增强ImageView