Linux SysV IPC Shared Memory (shm) Implementation


Shared memory lets multiple processes share the same region of memory. Since no data has to be copied between processes, it is the fastest form of IPC.
When several processes access shared memory they need a synchronization mechanism: for example, while process A is writing to the shared memory, process B must not use it. A semaphore is usually used to synchronize the processes' access to the shared memory.

The shared memory implementation boils down to the following:
1. Allocate physical memory.
2. Map the physical memory into the process address space; by modifying the process page tables, the physical memory can then be accessed directly through virtual addresses.
3. When a process no longer uses the shared memory, remove the mapping of the physical memory from its address space.

The tmpfs file system keeps all of its files in memory rather than on disk or other media; tmpfs stores everything in the kernel caches, grows and shrinks automatically with the files it holds, and can swap out pages that are not in use.
Linux shared memory is built on tmpfs and mmap file mapping: physical memory is obtained by creating a file in tmpfs, and once that file is mapped into the process address space the shared memory can be accessed through virtual addresses.
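
Before diving into the kernel side, here is a minimal user-space sketch of the API that sits on top of this machinery (error handling mostly omitted; the key value is arbitrary):

#include <stdio.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
        key_t key = 0x1234;                     /* illustrative key */
        int shmid;
        char *p;

        /* Create (or open) a one-page segment; the kernel backs it with a tmpfs file. */
        shmid = shmget(key, 4096, IPC_CREAT | 0666);
        if (shmid < 0) {
                perror("shmget");
                return 1;
        }

        /* Attach: the tmpfs-backed file is mapped into our address space. */
        p = shmat(shmid, NULL, 0);
        if (p == (char *)-1) {
                perror("shmat");
                return 1;
        }

        strcpy(p, "hello shm");                 /* first touch goes through shm_fault */
        printf("%s\n", p);

        shmdt(p);                               /* detach: shm_close decrements shm_nattch */
        shmctl(shmid, IPC_RMID, NULL);          /* remove: do_shm_rmid / shm_destroy */
        return 0;
}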


I. Data Structures
include/linux/shm.h

struct shmid_kernel /* private to the kernel */
{
        struct kern_ipc_perm    shm_perm;       /* operation perms */
        struct file *           shm_file;       /* tmpfs file */
        unsigned long           shm_nattch;     /* no. of current attaches */
        unsigned long           shm_segsz;      /* size of segment (bytes) */
        time_t                  shm_atim;       /* last attach time */
        time_t                  shm_dtim;       /* last detach time */
        time_t                  shm_ctim;       /* last change time */
        pid_t                   shm_cprid;      /* pid of creator */
        pid_t                   shm_lprid;      /* pid of last operator */
        struct user_struct      *mlock_user;
};

shmid_kernel holds the information for one shared memory segment.
Note:
  shm_file stores the in-memory file created in tmpfs, which is what the physical memory is allocated through; tmpfs's file mapping facility is used to map the shared memory directly into the process address space.
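
Most of these fields are visible from user space: shmctl(IPC_STAT) copies them out into struct shmid_ds. A minimal sketch (assuming shmid refers to an existing segment; the helper name is ours):

#include <stdio.h>
#include <sys/ipc.h>
#include <sys/shm.h>

/* Dump a few of the fields that shmid_kernel backs for an existing segment. */
void print_shm_info(int shmid)
{
        struct shmid_ds ds;

        if (shmctl(shmid, IPC_STAT, &ds) == 0) {
                printf("segment size : %lu bytes\n", (unsigned long)ds.shm_segsz);  /* shm_segsz  */
                printf("attach count : %lu\n", (unsigned long)ds.shm_nattch);       /* shm_nattch */
                printf("creator pid  : %d\n", (int)ds.shm_cpid);                    /* shm_cprid  */
                printf("last attach  : %ld\n", (long)ds.shm_atime);                 /* shm_atim   */
        }
}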

 

ipc/shm.c

struct shm_file_data {
        int id;
        struct ipc_namespace *ns;
        struct file *file;
        const struct vm_operations_struct *vm_ops;
};

shm_file_data is mainly used to save vm_ops, the virtual memory operation table of the (tmpfs) file mapping, so that it can be extended; this extension is what makes it possible to release the shared memory IPC resources properly once some process has called IPC_RMID and all other processes have detached.
Note:
  Shared memory involves two kinds of files, the tmpfs file and the shm file. One shared memory segment corresponds to exactly one tmpfs file, while there is one shm file for every process attached to the segment.
  Why layer an shm file on top of the tmpfs file at all? Wouldn't mapping the tmpfs file into several process address spaces already give shared memory?
  Indeed, the tmpfs file can be mapped into several address spaces and that does share the memory. But there is a special case: while several processes are attached, one of them may delete the segment. To let the other processes keep using it, the segment's IPC resources cannot be released at that point; and once all processes have detached, an munmap of the bare tmpfs file has no way to release those IPC resources.
  So an shm file is placed on top of the tmpfs file in order to extend the vm_ops of the tmpfs mapping, which is what allows the segment's IPC resources to be released when the last process detaches (a sketch of shm_vm_ops follows).
  For the details see do_shm_rmid and shm_close.
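
For orientation, the vm_operations_struct that the shm layer installs over the tmpfs one looks roughly like this in ipc/shm.c of kernels from this generation (NUMA policy hooks omitted):

/* Rough sketch of shm_vm_ops in ipc/shm.c: shm_open/shm_close bracket every
 * attach and detach so the last detach can free the IPC resources, while
 * shm_fault delegates to the tmpfs fault handler saved in shm_file_data. */
static const struct vm_operations_struct shm_vm_ops = {
        .open   = shm_open,     /* a new mapping of the segment appears (attach, fork, ...) */
        .close  = shm_close,    /* a mapping disappears (shmdt, munmap, exit) */
        .fault  = shm_fault,    /* forward page faults to the tmpfs file */
};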
 


II. Shared Memory Creation

/**
 * newseg - Create a new shared memory segment
 * @ns: namespace
 * @params: ptr to the structure that contains key, size and shmflg
 *
 * Called with shm_ids.rw_mutex held as a writer.
 */
static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
{
        key_t key = params->key;
        int shmflg = params->flg;
        size_t size = params->u.size;
        int error;
        struct shmid_kernel *shp;
        int numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
        struct file *file;
        char name[13];
        int id;
        int acctflag = 0;

        if (size < SHMMIN || size > ns->shm_ctlmax)
                return -EINVAL;

        if (ns->shm_tot + numpages > ns->shm_ctlall)
                return -ENOSPC;

        shp = ipc_rcu_alloc(sizeof(*shp));
        if (!shp)
                return -ENOMEM;

        shp->shm_perm.key = key;
        shp->shm_perm.mode = (shmflg & S_IRWXUGO);
        shp->mlock_user = NULL;

        shp->shm_perm.security = NULL;
        error = security_shm_alloc(shp);
        if (error) {
                ipc_rcu_putref(shp);
                return error;
        }

        sprintf(name, "SYSV%08x", key);
        if (shmflg & SHM_HUGETLB) {
                /* hugetlb_file_setup applies strict accounting */
                if (shmflg & SHM_NORESERVE)
                        acctflag = VM_NORESERVE;
                file = hugetlb_file_setup(name, size, acctflag,
                                        &shp->mlock_user, HUGETLB_SHMFS_INODE);
        } else {
                /*
                 * Do not allow no accounting for OVERCOMMIT_NEVER, even
                 * if it's asked for.
                 */
                if ((shmflg & SHM_NORESERVE) &&
                                sysctl_overcommit_memory != OVERCOMMIT_NEVER)
                        acctflag = VM_NORESERVE;
                file = shmem_file_setup(name, size, acctflag);
        }
        error = PTR_ERR(file);
        if (IS_ERR(file))
                goto no_file;

        id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
        if (id < 0) {
                error = id;
                goto no_id;
        }

        shp->shm_cprid = task_tgid_vnr(current);
        shp->shm_lprid = 0;
        shp->shm_atim = shp->shm_dtim = 0;
        shp->shm_ctim = get_seconds();
        shp->shm_segsz = size;
        shp->shm_nattch = 0;
        shp->shm_file = file;
        /*
         * shmid gets reported as "inode#" in /proc/pid/maps.
         * proc-ps tools use this. Changing this will break them.
         */
        file->f_dentry->d_inode->i_ino = shp->shm_perm.id;

        ns->shm_tot += numpages;
        error = shp->shm_perm.id;
        shm_unlock(shp);
        return error;

no_id:
        if (is_file_hugepages(file) && shp->mlock_user)
                user_shm_unlock(size, shp->mlock_user);
        fput(file);
no_file:
        security_shm_free(shp);
        ipc_rcu_putref(shp);
        return error;
}

1. Check the parameters against the system-wide shared memory limits (the page rounding involved is sketched right after this list).
2. Allocate the shared memory management structure shmid_kernel.
3. Create the shared memory file in tmpfs, through which the physical memory is obtained.
4. Add the shmid_kernel to the radix tree of shared memory segments and obtain its tree id.
5. Initialize the shmid_kernel structure.
6. Return the shared memory IPC id.
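
A small sketch of the page accounting done in step 1 (PAGE_SHIFT of 12, i.e. 4 KiB pages, is an assumption; the real limits come from ns->shm_ctlmax and ns->shm_ctlall, i.e. the shmmax/shmall sysctls):

#include <stdio.h>

#define PAGE_SHIFT 12                   /* assumption: 4 KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Mirrors newseg(): a request is rounded up to whole pages before it is
 * charged against the system-wide shm_ctlall limit. */
static unsigned long shm_pages(unsigned long size)
{
        return (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

int main(void)
{
        printf("%lu\n", shm_pages(1));          /* 1: even one byte costs a full page */
        printf("%lu\n", shm_pages(4096));       /* 1 */
        printf("%lu\n", shm_pages(4097));       /* 2 */
        return 0;
}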

 

III. Mapping Shared Memory into the Process Address Space
i. do_shmat

/*
 * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists.
 *
 * NOTE! Despite the name, this is NOT a direct system call entrypoint. The
 * "raddr" thing points to kernel space, and there has to be a wrapper around
 * this.
 */
long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr)
{
        struct shmid_kernel *shp;
        unsigned long addr;
        unsigned long size;
        struct file *file;
        int    err;
        unsigned long flags;
        unsigned long prot;
        int acc_mode;
        unsigned long user_addr;
        struct ipc_namespace *ns;
        struct shm_file_data *sfd;
        struct path path;
        fmode_t f_mode;

        err = -EINVAL;
        if (shmid < 0)
                goto out;
        else if ((addr = (ulong)shmaddr)) {
                if (addr & (SHMLBA-1)) {
                        if (shmflg & SHM_RND)
                                addr &= ~(SHMLBA-1);       /* round down */
                        else
#ifndef __ARCH_FORCE_SHMLBA
                                if (addr & ~PAGE_MASK)
#endif
                                        goto out;
                }
                flags = MAP_SHARED | MAP_FIXED;
        } else {
                if ((shmflg & SHM_REMAP))
                        goto out;

                flags = MAP_SHARED;
        }

        if (shmflg & SHM_RDONLY) {
                prot = PROT_READ;
                acc_mode = S_IRUGO;
                f_mode = FMODE_READ;
        } else {
                prot = PROT_READ | PROT_WRITE;
                acc_mode = S_IRUGO | S_IWUGO;
                f_mode = FMODE_READ | FMODE_WRITE;
        }
        if (shmflg & SHM_EXEC) {
                prot |= PROT_EXEC;
                acc_mode |= S_IXUGO;
        }

        /*
         * We cannot rely on the fs check since SYSV IPC does have an
         * additional creator id...
         */
        ns = current->nsproxy->ipc_ns;
        shp = shm_lock_check(ns, shmid);
        if (IS_ERR(shp)) {
                err = PTR_ERR(shp);
                goto out;
        }

        err = -EACCES;
        if (ipcperms(&shp->shm_perm, acc_mode))
                goto out_unlock;

        err = security_shm_shmat(shp, shmaddr, shmflg);
        if (err)
                goto out_unlock;

        path.dentry = dget(shp->shm_file->f_path.dentry);
        path.mnt    = shp->shm_file->f_path.mnt;
        shp->shm_nattch++;
        size = i_size_read(path.dentry->d_inode);
        shm_unlock(shp);

        err = -ENOMEM;
        sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
        if (!sfd)
                goto out_put_dentry;

        file = alloc_file(path.mnt, path.dentry, f_mode,
                        is_file_hugepages(shp->shm_file) ?
                                &shm_file_operations_huge :
                                &shm_file_operations);
        if (!file)
                goto out_free;
        ima_counts_get(file);

        file->private_data = sfd;
        file->f_mapping = shp->shm_file->f_mapping;
        sfd->id = shp->shm_perm.id;
        sfd->ns = get_ipc_ns(ns);
        sfd->file = shp->shm_file;
        sfd->vm_ops = NULL;

        down_write(&current->mm->mmap_sem);
        if (addr && !(shmflg & SHM_REMAP)) {
                err = -EINVAL;
                if (find_vma_intersection(current->mm, addr, addr + size))
                        goto invalid;
                /*
                 * If shm segment goes below stack, make sure there is some
                 * space left for the stack to grow (at least 4 pages).
                 */
                if (addr < current->mm->start_stack &&
                    addr > current->mm->start_stack - size - PAGE_SIZE * 5)
                        goto invalid;
        }

        user_addr = do_mmap (file, addr, size, prot, flags, 0);
        *raddr = user_addr;
        err = 0;
        if (IS_ERR_VALUE(user_addr))
                err = (long)user_addr;
invalid:
        up_write(&current->mm->mmap_sem);

        fput(file);

out_nattch:
        down_write(&shm_ids(ns).rw_mutex);
        shp = shm_lock(ns, shmid);
        BUG_ON(IS_ERR(shp));
        shp->shm_nattch--;
        if (shp->shm_nattch == 0 &&
            shp->shm_perm.mode & SHM_DEST)
                shm_destroy(ns, shp);
        else
                shm_unlock(shp);
        up_write(&shm_ids(ns).rw_mutex);

out:
        return err;

out_unlock:
        shm_unlock(shp);
        goto out;

out_free:
        kfree(sfd);
out_put_dentry:
        dput(path.dentry);
        goto out_nattch;
}

1. Validate the parameters and, from them, compute the mapping flags and protection bits (the common shmat() variants are sketched after this list).
2. Check the attach permissions.
3. Increment the attach counter shm_nattch.
4. Allocate the shm file and initialize its private data, a shm_file_data.
5. Map the shm file into the process address space (what do_mmap actually maps is the tmpfs file).
6. Decrement the attach counter shm_nattch again, because shm_mmap->shm_open incremented it while the shm file was being mapped.
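
The flag translation in steps 1 and 2 corresponds directly to the shmat() arguments; a minimal sketch of the common variants (the fixed address is purely illustrative):

#include <sys/shm.h>

void attach_variants(int shmid)
{
        /* Let the kernel choose the address: flags = MAP_SHARED,
         * prot = PROT_READ | PROT_WRITE, f_mode = FMODE_READ | FMODE_WRITE. */
        void *a = shmat(shmid, NULL, 0);

        /* Read-only attach: prot = PROT_READ, acc_mode = S_IRUGO. */
        void *b = shmat(shmid, NULL, SHM_RDONLY);

        /* Fixed address: SHM_RND rounds it down to an SHMLBA boundary;
         * without SHM_RND an unaligned address is rejected with EINVAL. */
        void *c = shmat(shmid, (void *)0x7f0000001000, SHM_RND);

        (void)a; (void)b; (void)c;
}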

 

ii. shm_mmap
do_mmap calls back into the shm file's mmap method, shm_mmap:
do_mmap->do_mmap_pgoff->mmap_region->mmap

static int shm_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct shm_file_data *sfd = shm_file_data(file);
        int ret;

        ret = sfd->file->f_op->mmap(sfd->file, vma);
        if (ret != 0)
                return ret;
        sfd->vm_ops = vma->vm_ops;
#ifdef CONFIG_MMU
        BUG_ON(!sfd->vm_ops->fault);
#endif
        vma->vm_ops = &shm_vm_ops;
        shm_open(vma);

        return ret;
}

1. The vma operations of the tmpfs mapping are swapped for the shm file's shm_vm_ops, so that shm_close runs when a process munmaps the segment; this is what frees the shared memory IPC resources once all processes have detached and IPC_RMID was issued earlier.
2. Note that do_mmap ultimately calls the tmpfs file's mmap method, so it is the tmpfs file that gets mapped into the process address space.

 

iii. shm_fault
After the tmpfs file has been mapped into the process address space, the first access raises a page fault. The fault handler loads the file into memory and installs the corresponding page table entries, so that the memory can then be accessed through virtual addresses.
do_page_fault->handle_mm_fault->handle_pte_fault->do_linear_fault->__do_fault->shm_fault

static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct file *file = vma->vm_file;
        struct shm_file_data *sfd = shm_file_data(file);

        return sfd->vm_ops->fault(vma, vmf);
}

The shm file's fault handler, shm_fault, actually delegates to the tmpfs file's fault handler, which is what brings the tmpfs file contents into memory.

 


IV. Removing Shared Memory from the Process Address Space
i. shmdt
When a process no longer wants to access the shared memory, it removes the segment from its address space.

/*
 * detach and kill segment if marked destroyed.
 * The work is done in shm_close.
 */
SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long addr = (unsigned long)shmaddr;
        int retval = -EINVAL;
#ifdef CONFIG_MMU
        loff_t size = 0;
        struct vm_area_struct *next;
#endif

        if (addr & ~PAGE_MASK)
                return retval;

        down_write(&mm->mmap_sem);

        /*
         * This function tries to be smart and unmap shm segments that
         * were modified by partial mlock or munmap calls:
         * - It first determines the size of the shm segment that should be
         *   unmapped: It searches for a vma that is backed by shm and that
         *   started at address shmaddr. It records it's size and then unmaps
         *   it.
         * - Then it unmaps all shm vmas that started at shmaddr and that
         *   are within the initially determined size.
         * Errors from do_munmap are ignored: the function only fails if
         * it's called with invalid parameters or if it's called to unmap
         * a part of a vma. Both calls in this function are for full vmas,
         * the parameters are directly copied from the vma itself and always
         * valid - therefore do_munmap cannot fail. (famous last words?)
         */
        /*
         * If it had been mremap()'d, the starting address would not
         * match the usual checks anyway. So assume all vma's are
         * above the starting address given.
         */
        vma = find_vma(mm, addr);

#ifdef CONFIG_MMU
        while (vma) {
                next = vma->vm_next;

                /*
                 * Check if the starting address would match, i.e. it's
                 * a fragment created by mprotect() and/or munmap(), or it
                 * otherwise it starts at this address with no hassles.
                 */
                if ((vma->vm_ops == &shm_vm_ops) &&
                        (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) {


                        size = vma->vm_file->f_path.dentry->d_inode->i_size;
                        do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
                        /*
                         * We discovered the size of the shm segment, so
                         * break out of here and fall through to the next
                         * loop that uses the size information to stop
                         * searching for matching vma's.
                         */
                        retval = 0;
                        vma = next;
                        break;
                }
                vma = next;
        }

        /*
         * We need look no further than the maximum address a fragment
         * could possibly have landed at. Also cast things to loff_t to
         * prevent overflows and make comparisions vs. equal-width types.
         */
        size = PAGE_ALIGN(size);
        while (vma && (loff_t)(vma->vm_end - addr) <= size) {
                next = vma->vm_next;

                /* finding a matching vma now does not alter retval */
                if ((vma->vm_ops == &shm_vm_ops) &&
                        (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff)

                        do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
                vma = next;
        }

#else /* CONFIG_MMU */
        /* under NOMMU conditions, the exact address to be destroyed must be
         * given */
        retval = -EINVAL;
        if (vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) {
                do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
                retval = 0;
        }

#endif

        up_write(&mm->mmap_sem);
        return retval;
}

1. Find the vma(s) through which the shared memory is mapped into the process address space.
2. Remove the shared memory mapping from the process address space.

ii. shm_close

/*
 * remove the attach descriptor vma.
 * free memory for segment if it is marked destroyed.
 * The descriptor has already been removed from the current->mm->mmap list
 * and will later be kfree()d.
 */
static void shm_close(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;
        struct shm_file_data *sfd = shm_file_data(file);
        struct shmid_kernel *shp;
        struct ipc_namespace *ns = sfd->ns;

        down_write(&shm_ids(ns).rw_mutex);
        /* remove from the list of attaches of the shm segment */
        shp = shm_lock(ns, sfd->id);
        BUG_ON(IS_ERR(shp));
        shp->shm_lprid = task_tgid_vnr(current);
        shp->shm_dtim = get_seconds();
        shp->shm_nattch--;
        if (shp->shm_nattch == 0 &&
            shp->shm_perm.mode & SHM_DEST)
                shm_destroy(ns, shp);
        else
                shm_unlock(shp);
        up_write(&shm_ids(ns).rw_mutex);
}

1. Decrement the attach counter shm_nattch.
2. If no process is attached to the shared memory any more and some process has already called shmctl(..., IPC_RMID, ...), destroy the shared memory IPC resources.

 


V. Shared Memory Removal
i. do_shm_rmid
When the shared memory is no longer needed it is removed; removal is triggered by calling shmctl with the IPC_RMID command.

/*
 * Called with shm_ids.rw_mutex (writer) and the shp structure locked.
 * Only shm_ids.rw_mutex remains locked on exit.
 */
static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
{
        struct shmid_kernel *shp;
        shp = container_of(ipcp, struct shmid_kernel, shm_perm);

        if (shp->shm_nattch) {
                shp->shm_perm.mode |= SHM_DEST;
                /* Do not find it any more */
                shp->shm_perm.key = IPC_PRIVATE;
                shm_unlock(shp);
        } else
                shm_destroy(ns, shp);
}

1. If processes are still attached to the shared memory, set the SHM_DEST flag on it, so that the IPC resources are destroyed when the last process detaches (see shm_close), and set the key to IPC_PRIVATE so the segment can no longer be looked up by key (a user-space sketch of this behaviour follows this list).
2. If no process is attached, destroy the shared memory IPC resources immediately.
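
A minimal user-space sketch of this deferred destruction: after IPC_RMID the mapping keeps working for processes that are still attached, but the key no longer resolves to the doomed segment, so a fresh shmget() with the same key creates a new one; the old segment is only destroyed on the last detach (the key value is illustrative):

#include <assert.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
        key_t key = 0x1234;                     /* illustrative key */
        int id = shmget(key, 4096, IPC_CREAT | 0666);
        char *p = shmat(id, NULL, 0);
        int id2;

        /* Mark for destruction while still attached: do_shm_rmid() sets
         * SHM_DEST and replaces the key with IPC_PRIVATE. */
        shmctl(id, IPC_RMID, NULL);

        p[0] = 'x';                             /* the existing mapping still works */

        /* The old key no longer finds the doomed segment, so this creates
         * a different one. */
        id2 = shmget(key, 4096, IPC_CREAT | 0666);
        assert(id2 != id);

        shmdt(p);                               /* last detach -> shm_close() -> shm_destroy() */
        shmctl(id2, IPC_RMID, NULL);
        return 0;
}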

 

ii. shm_destroy

/*
 * shm_destroy - free the struct shmid_kernel
 *
 * @ns: namespace
 * @shp: struct to free
 *
 * It has to be called with shp and shm_ids.rw_mutex (writer) locked,
 * but returns with shp unlocked and freed.
 */
static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
{
        ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
        shm_rmid(ns, shp);
        shm_unlock(shp);
        if (!is_file_hugepages(shp->shm_file))
                shmem_lock(shp->shm_file, 0, shp->mlock_user);
        else if (shp->mlock_user)
                user_shm_unlock(shp->shm_file->f_path.dentry->d_inode->i_size,
                                                shp->mlock_user);
        fput(shp->shm_file);
        security_shm_free(shp);
        ipc_rcu_putref(shp);
}

1. Remove the shared memory IPC object from the radix tree of shared memory segments.
2. Drop the tmpfs file, releasing its struct file.
3. Free the shmid_kernel.
