IO复用——epoll内核源代码剖析
来源:互联网 发布:最优化方法第二版答案 编辑:程序博客网 时间:2024/05/16 17:53
*最近拖延症又犯了。。。嗯。。。废话不多说。。。直接上硬货。。。→_→*
比较select系统调用请戳传送门——select内核源代码剖析
了解poll机制请戳传送门——poll机制内核源代码剖析
- epoll_create
以下代码基于 Linux 3.0.12 内核版本。它和之前剖析的 2.4.0 内核版本在系统调用的定义方式上有一些差别(这里通过 SYSCALL_DEFINEn 宏来定义),所以我们直接从 SYSCALL_DEFINE1 开始看。
//为每一个监听的事件都分配一个epitem数据结构struct epitem { /* RB tree node used to link this structure to the eventpoll RB tree */ //每个epitem都存放在eventpoll中以rbr为根的红黑树中 //rbn记录epitem在红黑树中的结点 struct rb_node rbn; /* List header used to link this structure to the eventpoll ready list */ //每个就绪事件所对应的epitem都链入了eventpoll中的rdllink //rdllink记录就绪链表头 struct list_head rdllink; /* * Works together "struct eventpoll"->ovflist in keeping the * single linked chain of items. */ //记录每个epitem在eventpoll数据结构中的ovflist的下一个epitem struct epitem *next; /* The file descriptor information this item refers to */ //epoll_filefd数据结构记录epitem所对应的struct file和fd文件描述符 struct epoll_filefd ffd; /* Number of active wait queue attached to poll operations */ //poll操作上的等待队列个数 int nwait; /* List containing poll wait queues */ //包含等待队列对头的单链表 struct list_head pwqlist; /* The "container" of this item */ //记录epitem所属哪一个eventpoll数据结构 struct eventpoll *ep; /* List header used to link this item to the "struct file" items list */ //记录epitem所对应的struct file的单链表 struct list_head fllink; /* The structure that describe the interested events and the source fd */ //记录epitem对应的epoll_event数据结构,epoll_event是epoll_ctl函数传入的参数 struct epoll_event event;};
struct eventpoll { /* Protect the access to this structure */ spinlock_t lock; /* * This mutex is used to ensure that files are not removed * while epoll is using them. This is held during the event * collection loop, the file cleanup path, the epoll file exit * code and the ctl operations. */ //对事件进行处理时,内核都都会持有这个互斥锁,因此在内核态中epoll的相关操作可以保证是线程安全的 struct mutex mtx; /* Wait queue used by sys_epoll_wait() */ //调用sys_epoll_wait()时,存放当前进程的等待队列 wait_queue_head_t wq; /* Wait queue used by file->poll() */ //此等待队列存放监听事件的poll操作 wait_queue_head_t poll_wait; /* List of ready file descriptors */ //为每个事件都会分配一个epitem,当事件就绪时其所对应的epitem就会链入rdllist双向链表中 //epitem数据类型定义在上面 struct list_head rdllist; /* RB tree root used to store monitored fd structs */ //为每个事件都会分配一个epitem,所有的epitem都会存放在这个红黑树中 struct rb_root rbr; /* * This is a single linked list that chains all the "struct epitem" that * happened while transferring ready events to userspace w/out * holding ->lock. */ //就绪事件在转移到用户空间时,发生了就绪事件,其所对应的epitem被链入ovflist双向链表中 struct epitem *ovflist; /* The user that created the eventpoll descriptor */ //保存用户信息,比如资源的上限值 struct user_struct *user;};
/*
 * epoll_create1() - create a new epoll instance and return its descriptor.
 *
 * @flags: only EPOLL_CLOEXEC is accepted; any other bit yields -EINVAL.
 *
 * Returns the new file descriptor on success or a negative errno.
 */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
	int error;
	struct eventpoll *ep = NULL;

	/* Check the EPOLL_* constant for consistency.  */
	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

	if (flags & ~EPOLL_CLOEXEC)
		return -EINVAL;
	/*
	 * Create the internal data structure ("struct eventpoll").
	 */
	error = ep_alloc(&ep);
	if (error < 0)
		return error;
	/*
	 * Creates all the items needed to setup an eventpoll file. That is,
	 * a file structure and a free file descriptor.
	 *
	 * The epoll descriptor has no on-disk file behind it, so an anonymous
	 * file is created with eventpoll_fops as its operations table, and ep
	 * is passed as the private pointer (epoll_ctl()/epoll_wait() later
	 * read it back from file->private_data).
	 */
	error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
				 O_RDWR | (flags & O_CLOEXEC));
	/* On failure nothing else references ep yet, so release it here. */
	if (error < 0)
		ep_free(ep);

	/* Either the new descriptor or a negative errno. */
	return error;
}

/*
 * epoll_create() - legacy entry point.
 *
 * @size: must be positive; otherwise it is not used at all.
 *
 * Simply forwards to epoll_create1() with no flags.
 */
SYSCALL_DEFINE1(epoll_create, int, size)
{
	if (size <= 0)
		return -EINVAL;

	return sys_epoll_create1(0);
}
static int ep_alloc(struct eventpoll **pep){ int error; struct user_struct *user; struct eventpoll *ep; //获取当前用户信息 user = get_current_user(); error = -ENOMEM; //通过kmalloc为eventpoll数据结构分配内存空间 ep = kzalloc(sizeof(*ep), GFP_KERNEL); if (unlikely(!ep)) goto free_uid; spin_lock_init(&ep->lock); mutex_init(&ep->mtx); //初始化eventpoll中的wq init_waitqueue_head(&ep->wq); //初始化eventpoll中的poll_wait init_waitqueue_head(&ep->poll_wait); //初始化存放就绪事件所对应的epitem的双向链表 INIT_LIST_HEAD(&ep->rdllist); //初始化存放所有事件对应的epiitem的红黑树,初始值为NULL //#define RB_ROOT (struct rb_root) { NULL, } ep->rbr = RB_ROOT; //初始化转移到用户空间之前,存放就绪事件所对应的epitem的双向链表,初始值为-1L //#define EP_UNACTIVE_PTR ((void *) -1L) ep->ovflist = EP_UNACTIVE_PTR; //初始化用户信息 ep->user = user; //为eventpoll数据结构指针赋值 *pep = ep; return 0;free_uid: free_uid(user); return error;}
//由此可见epollfd所对应的的匿名文件只实现了三种操作//release操作为释放epollfd所对应的eventpoll数据结构//ep_eventpoll_release定义在下面//poll操作为事件就绪时,调用poll操作对应的回调函数对当前进程进行一些列操作//ep_eventpoll_poll定义先放一边,在epoll_wait中会详细解释//llseek操作为获取匿名文件的游标偏移//noop_llseek定义在下面static const struct file_operations eventpoll_fops = { .release = ep_eventpoll_release, .poll = ep_eventpoll_poll, .llseek = noop_llseek,};
static int ep_eventpoll_release(struct inode *inode, struct file *file){ //通过struct file中的成员private_data得到epollfd所对应的eventpoll数据结构 struct eventpoll *ep = file->private_data; //释放eventpoll数据结构 if (ep) ep_free(ep); return 0;}
/*
 * noop_llseek() - llseek implementation that never moves the position.
 *
 * @file:   file being "seeked".
 * @offset: ignored.
 * @origin: ignored.
 *
 * Returns the file's current position (file->f_pos) unchanged.
 */
loff_t noop_llseek(struct file *file, loff_t offset, int origin)
{
	return file->f_pos;
}
/*
 * anon_inode_getfd() - create an anonymous file and bind it to a new fd.
 *
 * @name:  name passed on to anon_inode_getfile().
 * @fops:  file operations for the new file.
 * @priv:  private pointer passed on to anon_inode_getfile().
 * @flags: flags used both for the descriptor and for the file.
 *
 * Returns the new file descriptor on success or a negative errno.  If the
 * file cannot be created, the reserved descriptor is given back.
 */
int anon_inode_getfd(const char *name, const struct file_operations *fops,
		     void *priv, int flags)
{
	int error, fd;
	struct file *file;

	/* Reserve a descriptor number first. */
	error = get_unused_fd_flags(flags);
	if (error < 0)
		return error;
	fd = error;

	/* Create the anonymous file itself. */
	file = anon_inode_getfile(name, fops, priv, flags);
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto err_put_unused_fd;
	}
	/* Publish the file under the reserved descriptor. */
	fd_install(fd, file);

	return fd;

err_put_unused_fd:
	put_unused_fd(fd);
	return error;
}
- epoll_ctl
struct epoll_event { __u32 events; //epoll事件类型 __u64 data; //指定所要监听的事件的文件描述符} EPOLL_PACKED;
//参数epfd就是epoll_create中返回的epollfd//参数op指定对事件的操作类型,具体分为三种//#define EPOLL_CTL_ADD 1 添加新的监听事件//#define EPOLL_CTL_DEL 2 删除监听事件//#define EPOLL_CTL_MOD 3 修改监听事件//参数fd就是想要操作的文件描述符//参数event表示监听的是什么事件类型//数据可读事件EPOLLIN、高效工作事件模式EPOLLET、事件只被处理一次EPOLLONESHOT//epoll_event数据结构定义在上面SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event __user *, event){ int error; int did_lock_epmutex = 0; struct file *file, *tfile; struct eventpoll *ep; struct epitem *epi; struct epoll_event epds; error = -EFAULT; //ep_op_has_event中为return op != EPOLL_CTL_DEL;即判断op操作是否为删除监听事件 //从用户拷贝epoll_event数据结构到内核空间 if (ep_op_has_event(op) && copy_from_user(&epds, event, sizeof(struct epoll_event))) goto error_return; /* Get the "struct file *" for the eventpoll file */ error = -EBADF; //获取epollfd所对应的匿名文件struct file数据结构 file = fget(epfd); if (!file) goto error_return; /* Get the "struct file *" for the target file */ //获取所要操作的文件描述符所对应的struct file数据结构 tfile = fget(fd); if (!tfile) goto error_fput; /* The target file descriptor must support poll */ error = -EPERM; //判断所要监听的事件是否支持文件操作或poll操作 if (!tfile->f_op || !tfile->f_op->poll) goto error_tgt_fput; /* * We have to check that the file structure underneath the file descriptor * the user passed to us _is_ an eventpoll file. And also we do not permit * adding an epoll file descriptor inside itself. */ error = -EINVAL; //判断所要监听的事件是否是epollfd本身、判断所要监听的事件是否支持epoll对文件的三种操作 if (file == tfile || !is_file_epoll(file)) goto error_tgt_fput; /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ //从struct file数据结构中获取eventpoll数据结构 ep = file->private_data; /* * When we insert an epoll file descriptor, inside another epoll file * descriptor, there is the change of creating closed loops, which are * better be handled here, than in more critical paths. 
* * We hold epmutex across the loop check and the insert in this case, in * order to prevent two separate inserts from racing and each doing the * insert "at the same time" such that ep_loop_check passes on both * before either one does the insert, thereby creating a cycle. */ //检查监听的事件是否支持epoll对文件的三种操作且为添加事件 //当我们插入一个epoll文件描述符时,在另一个epoll文件描述符中,创建闭环,这在这里更好地处理,而不是更关键的路径。 //在这种情况下,我们保留epmutex的循环检查和插入,以防止两个单独的插入,并且每个插入“同时进行”,使得ep_loop_check在两个插入之前都通过,从而创建一个周期。 if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) { mutex_lock(&epmutex); did_lock_epmutex = 1; error = -ELOOP; if (ep_loop_check(ep, tfile) != 0) goto error_tgt_fput; } mutex_lock_nested(&ep->mtx, 0); /* * Try to lookup the file inside our RB tree, Since we grabbed "mtx" * above, we can be sure to be able to use the item looked up by * ep_find() till we release the mutex. */ //epoll不允许重复添加fd //在eventpoll数据结构中的rbr红黑树里,根据监听事件的struct和fd,与每一个epitem中的epoll_filefd数据结构进行比较 //找到返回监听事件对应的epitem,没有找到返回NULL epi = ep_find(ep, tfile, fd); error = -EINVAL; //根据对事件的操作进行分类操作 switch (op) { //添加新的监听事件 case EPOLL_CTL_ADD: //如果之前不存在此事件才可以添加 if (!epi) { //添加内核关心的事件类型POLLERR和POLLHUP epds.events |= POLLERR | POLLHUP; //真正的添加新的监听事件 //ep_insert定义在下面 error = ep_insert(ep, &epds, tfile, fd); } else error = -EEXIST; break; //删除事件 case EPOLL_CTL_DEL: //如果存在此事件才可以删除 if (epi) //ep_remove就不剖了。。 error = ep_remove(ep, epi); else error = -ENOENT; break; //修改事件 case EPOLL_CTL_MOD: if (epi) { epds.events |= POLLERR | POLLHUP; //ep_modify就不剖了。。。 error = ep_modify(ep, epi, &epds); } else error = -ENOENT; break; } mutex_unlock(&ep->mtx);error_tgt_fput: if (unlikely(did_lock_epmutex)) mutex_unlock(&epmutex); fput(tfile);error_fput: fput(file);error_return: return error;}
typedef struct poll_table_struct { //poll_queue_proc就是当监听事件就绪时,对事件进行具体操作的回调函数 poll_queue_proc qproc; //key记录对监听事件的何种event感兴趣 unsigned long key;} poll_table;
struct ep_pqueue { //poll_table数据结构和poll回调函数机制有关 //poll_table数据结构定义在上面 poll_table pt; //记录对应的epitem数据结构 struct epitem *epi;};
/*
 * init_poll_funcptr() - initialise a poll_table.
 *
 * @pt:    table to initialise.
 * @qproc: callback to store in the table.
 *
 * The event key is set to all ones, i.e. every event is enabled.
 */
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
	pt->qproc = qproc;
	pt->key   = ~0UL; /* all events enabled */
}
//ep参数为epollfd所对应的eventpoll数据结构//event参数为新监听事件的epoll事件类型,即epoll_event数据结构//tfile参数为新监听事件所对应的struct file数据结构//fd参数为新监听事件的文件描述符static int ep_insert(struct eventpoll *ep, struct epoll_event *event, struct file *tfile, int fd){ int error, revents, pwake = 0; unsigned long flags; long user_watches; struct epitem *epi; struct ep_pqueue epq; //将当前用户的监听事件数加1 user_watches = atomic_long_read(&ep->user->epoll_watches); //判断是否超过当前用户的最大监听数 if (unlikely(user_watches >= max_user_watches)) return -ENOSPC; //从slab中分配一个epitem数据结构 if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) return -ENOMEM; /* Item initialization follow here ... */ //初始化各个链表 INIT_LIST_HEAD(&epi->rdllink); INIT_LIST_HEAD(&epi->fllink); INIT_LIST_HEAD(&epi->pwqlist); //记录epitem所对应的eventpoll数据结构 epi->ep = ep; //在epitem中的epoll_filefd数据结构中记录新监听事件所对应的struct file数据结构和文件描述符fd ep_set_ffd(&epi->ffd, tfile, fd); //记录新监听事件,想要监听的事件类型 epi->event = *event; //poll操作上的等待队列个数初始化为0 epi->nwait = 0; //初始化epitem在eventpoll中的ovflist链表的后继为(void *) -1L epi->next = EP_UNACTIVE_PTR; /* Initialize the poll table using the queue callback */ //记录ep_pqueue中的epitem数据结构 //epq数据类型为ep_pqueue,ep_pqueue数据结构定义在上面 epq.epi = epi; //初始化poll_table数据结构 //init_poll_funcptr定义在上面 init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); /* * Attach the item to the poll hooks and get current event bits. * We can safely use the file* here because its usage count has * been increased by the caller of this function. Note that after * this operation completes, the poll callback can start hitting * the new item. */ //对监听事件所对应的struct file中的file operation中的poll操作进行初始化,即对poll回调函数进行初始化,详细的poll机制回调函数在之前已经做了详细说明 //返回值为已经就绪的事件 revents = tfile->f_op->poll(tfile, &epq.pt); /* * We have to check if something went wrong during the poll wait queue * install process. Namely an allocation for a wait queue failed due * high memory pressure. 
*/ error = -ENOMEM; //如果内存不够,有可能导致等待队列分配失败,所以此时需要判断等待队列是否存在 if (epi->nwait < 0) goto error_unregister; /* Add the current item to the list of active epoll hook for this file */ spin_lock(&tfile->f_lock); //将epitem链入监听事件所对应的strcut file中的f_ep_links成员上 list_add_tail(&epi->fllink, &tfile->f_ep_links); spin_unlock(&tfile->f_lock); /* * Add the current item to the RB tree. All RB tree operations are * protected by "mtx", and ep_insert() is called with "mtx" held. */ //将epitem插入到epollfd所对应的eventpolld中的rbr红黑树中 ep_rbtree_insert(ep, epi); /* We have to drop the new item inside our item list to keep track of it */ spin_lock_irqsave(&ep->lock, flags); /* If the file is already "ready" we drop it inside the ready list */ //此时判断一下是不是新的监听事件已经就绪且就绪链表为空 if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) { //将epitem链入就绪链表中 list_add_tail(&epi->rdllink, &ep->rdllist); /* Notify waiting tasks that events are available */ //判断eventpoll中的wq等待队列是否为NULL,如果不为空,就唤醒等待队列上对应的进程 if (waitqueue_active(&ep->wq)) wake_up_locked(&ep->wq); //判断poll_wait等待队列是否为NULL,如果不为NULL,pwake加1 if (waitqueue_active(&ep->poll_wait)) pwake++; } spin_unlock_irqrestore(&ep->lock, flags); atomic_long_inc(&ep->user->epoll_watches); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(&ep->poll_wait); return 0;error_unregister: ep_unregister_pollwait(ep, epi); /* * We need to do this because an event could have been arrived on some * allocated wait queue. Note that we don't care about the ep->ovflist * list, since that is used/cleaned only inside a section bound by "mtx". * And ep_insert() is called with "mtx" held. */ spin_lock_irqsave(&ep->lock, flags); if (ep_is_linked(&epi->rdllink)) list_del_init(&epi->rdllink); spin_unlock_irqrestore(&ep->lock, flags); kmem_cache_free(epi_cache, epi); return error;}
- epoll_wait
//参数epfd就是epollfd//参数events指向一个数组,用来存放最后返回的就绪事件//参数maxevents表示最多监听多少个事件//参数timeout表示阻塞时间SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout){ int error; struct file *file; struct eventpoll *ep; /* The maximum number of event must be greater than zero */ //判断maxevents是否合法 if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) return -EINVAL; /* Verify that the area passed by the user is writeable */ //判断用户传入的events指向的空间是否合法有效 if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) { error = -EFAULT; goto error_return; } /* Get the "struct file *" for the eventpoll file */ error = -EBADF; //通过epollfd获得其所对应的struct file数据结构 file = fget(epfd); if (!file) goto error_return; /* * We have to check that the file structure underneath the fd * the user passed to us _is_ an eventpoll file. */ error = -EINVAL; //判断file文件是否支持epoll对文件的操作 if (!is_file_epoll(file)) goto error_fput; /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ //struct file中的private_data成员存储着epollfd对应的eventpoll数据结构 ep = file->private_data; /* Time to fish for events ... */ //ep_poll的定义在下面 error = ep_poll(ep, events, maxevents, timeout);error_fput: fput(file);error_return: return error;}
//参数ep为epollfd所对应的eventpoll数据结构//其余参数与epoll_wait参数含义相同static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int maxevents, long timeout){ int res = 0, eavail, timed_out = 0; unsigned long flags; long slack = 0; //存放当前进程的等待队列 wait_queue_t wait; ktime_t expires, *to = NULL; //如果阻塞时间大于0,就将timeout转化为计算机内部的时间 if (timeout > 0) { struct timespec end_time = ep_set_mstimeout(timeout); slack = select_estimate_accuracy(&end_time); to = &expires; *to = timespec_to_ktime(end_time); } //如果阻塞时间等于0,即非阻塞模式就直接调转到check_events执行 else if (timeout == 0) { /* * Avoid the unnecessary trip to the wait queue loop, if the * caller specified a non blocking operation. */ timed_out = 1; spin_lock_irqsave(&ep->lock, flags); goto check_events; }fetch_events: spin_lock_irqsave(&ep->lock, flags); //如果eventpoll中的rdllist为空或者ovflist为初始化值EP_UNACTIVE_PTR时,满足条件 if (!ep_events_available(ep)) { /* * We don't have any available event to return to the caller. * We need to sleep here, and we will be wake up by * ep_poll_callback() when events will become available. */ //初始化等待队列wait,参数current是一个宏,代表当前进程 //init_waitqueue_entry定义在下面 init_waitqueue_entry(&wait, current); //将等待队列wait添加到eventpoll中的wq等待队列中 __add_wait_queue_exclusive(&ep->wq, &wait); for (;;) { /* * We don't want to sleep if the ep_poll_callback() sends us * a wakeup in between. That's why we set the task state * to TASK_INTERRUPTIBLE before doing the checks. 
*/ //将当前进程调度后的状态设置为浅睡眠,即可中断睡眠状态 set_current_state(TASK_INTERRUPTIBLE); //如果此时eventpoll中的rdllist就绪链表不为NULL或ovflist不为EP_UNACTIVE_PTR或timed_out为0,那么就不再调度了,直接break跳出循环 if (ep_events_available(ep) || timed_out) break; //如果此时收到了信号,那么也不再调度了,直接break跳出循环 if (signal_pending(current)) { res = -EINTR; break; } spin_unlock_irqrestore(&ep->lock, flags); //当前进程被调度,进入前睡眠状态 //在此期间,若发生事件就绪或收到信号,就执行poll回调机制 if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) timed_out = 1; spin_lock_irqsave(&ep->lock, flags); } //此时已从for循环中跳出 //从eventpoll中的wq等待队列里删除wait等待队列 __remove_wait_queue(&ep->wq, &wait); //设置当前进程下一次调度的状态为运行中状态 set_current_state(TASK_RUNNING); }check_events: /* Is it worth to try to dig for events ? */ //判断此时eventpoll中的rdllist是否为空或者ovflist为初始化值是否为EP_UNACTIVE_PTR eavail = ep_events_available(ep); spin_unlock_irqrestore(&ep->lock, flags); /* * Try to transfer events to user space. In case we get 0 events and * there's still timeout left over, we go trying again in search of * more luck. */ //此时尝试将就绪事件传输到用户空间 //如果我们得到0个就绪事件,还有超时时间,就跳转至fetch_events //ep_send_events定义在下面 if (!res && eavail && !(res = ep_send_events(ep, events, maxevents)) && !timed_out) goto fetch_events; return res;}
/*
 * init_waitqueue_entry() - initialise a wait queue entry for a task.
 *
 * @q: entry to initialise.
 * @p: task to associate with the entry (ep_poll() passes "current").
 */
static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
{
	q->flags = 0;				/* no flags set */
	q->private = p;				/* the task this entry belongs to */
	q->func = default_wake_function;	/* function used to wake the task */
}
//参数含义与ep_poll函数参数相同,不再赘述static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, int maxevents){ //初始化ep_send_events_data数据结构,这个数据结构就只包含maxevents和events struct ep_send_events_data esed; esed.maxevents = maxevents; esed.events = events; //ep_scan_ready_list定义在下面 return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0);}
//参数ep为epollfd所对应的eventpoll//参数sproc为函数指针,调用时赋值为ep_send_events_proc//参数priv指向ep_send_events_data数据结构//参数depth初始化为0static int ep_scan_ready_list(struct eventpoll *ep, int (*sproc)(struct eventpoll *, struct list_head *, void *), void *priv, int depth){ int error, pwake = 0; unsigned long flags; struct epitem *epi, *nepi; LIST_HEAD(txlist); /* * We need to lock this because we could be hit by * eventpoll_release_file() and epoll_ctl(). */ mutex_lock_nested(&ep->mtx, depth); /* * Steal the ready list, and re-init the original one to the * empty list. Also, set ep->ovflist to NULL so that events * happening while looping w/out locks, are not lost. We cannot * have the poll callback to queue directly on ep->rdllist, * because we want the "sproc" callback to be able to do it * in a lockless way. */ spin_lock_irqsave(&ep->lock, flags); //此时所有发生就绪事件的epitem都已经链入了eventpoll中的rdllist就绪链表了 //此时将rdllist就绪链表上的所有元素都转移到txlist中,而rdllist被清空 list_splice_init(&ep->rdllist, &txlist); //将ovlist置NULL,是因为此时不希望再有新的就绪事件对应的epitem加入到rdllist中 ep->ovflist = NULL; spin_unlock_irqrestore(&ep->lock, flags); /* * Now call the callback function. */ //此时调用参数传入的回调函数,即ep_send_events_proc //ep_send_events_proc定义在下面 error = (*sproc)(ep, &txlist, priv); spin_lock_irqsave(&ep->lock, flags); /* * During the time we spent inside the "sproc" callback, some * other events might have been queued by the poll callback. * We re-insert them inside the main ready-list here. */ //当调用ep_send_events_proc函数时,即向用户空间传递数据时 //发生了就绪事件,这些就绪事件对应的epitem都链入了eventpoll中的ovflist //现在遍历ovflist链表,依次处理这些epitem for (nepi = ep->ovflist; (epi = nepi) != NULL; nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { /* * We need to check if the item is already in the list. * During the "sproc" callback execution time, items are * queued into ->ovflist but the "txlist" might already * contain them, and the list_splice() below takes care of them. 
*/ //如果epitem存在,就将epitem尾插进rddlist中 if (!ep_is_linked(&epi->rdllink)) list_add_tail(&epi->rdllink, &ep->rdllist); } /* * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after * releasing the lock, events will be queued in the normal way inside * ep->rdllist. */ //将ovflist置为EP_UNACTIVE_PTR,即((void *) -1L) ep->ovflist = EP_UNACTIVE_PTR; /* * Quickly re-inject items left on "txlist". */ //经过ep_send_events_proc对epitem的处理后,有的epitem还未被处理完,将这些epitem重新链入rdllist中 list_splice(&txlist, &ep->rdllist); //如果rdllist就绪链表不为NULL时 if (!list_empty(&ep->rdllist)) { /* * Wake up (if active) both the eventpoll wait list and * the ->poll() wait list (delayed after we release the lock). */ //当wq等待队列wq不为NULL时 if (waitqueue_active(&ep->wq)) //唤醒等待队列wq上的成员,及当前进程 wake_up_locked(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } spin_unlock_irqrestore(&ep->lock, flags); mutex_unlock(&ep->mtx); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(&ep->poll_wait); return error;}
//参数ep为epollfd所对应的eventpoll//参数head为txlist//参数priv为ep_send_events_data数据结构static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, void *priv){ struct ep_send_events_data *esed = priv; int eventcnt; unsigned int revents; struct epitem *epi; struct epoll_event __user *uevent; /* * We can loop without lock because we are passed a task private list. * Items cannot vanish during the loop because ep_scan_ready_list() is * holding "mtx" during this call. */ //遍历整个txlist链表 for (eventcnt = 0, uevent = esed->events; !list_empty(head) && eventcnt < esed->maxevents;) { //获取txlist链表中的第一个节点 epi = list_first_entry(head, struct epitem, rdllink); //从txlink链表中将epitem删除 list_del_init(&epi->rdllink); //获取此时,最新的epitem的就绪事件类型 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & epi->event.events; /* * If the event mask intersect the caller-requested one, * deliver the event to userspace. Again, ep_scan_ready_list() * is holding "mtx", so no operations coming from userspace * can change the item. */ //再一次判断是否有就绪事件发生 if (revents) { //将当前的就绪事件拷贝到用户空间中 //如果此时epitem还没有处理完,就将epitem再链入txlist链表中 if (__put_user(revents, &uevent->events) || __put_user(epi->event.data, &uevent->data)) { list_add(&epi->rdllink, head); return eventcnt ? eventcnt : -EFAULT; } eventcnt++; uevent++; if (epi->event.events & EPOLLONESHOT) epi->event.events &= EP_PRIVATE_BITS; //判断fd是否为ET模式,如果不是ET模式,就要将自己再一次链入rdllist就绪链表中,这是LT和ET模式本质区别 //以便下次调用epoll_wait()会再次检查事件的可用性 else if (!(epi->event.events & EPOLLET)) { /* * If this file has been added with Level * Trigger mode, we need to insert back inside * the ready list, so that the next call to * epoll_wait() will check again the events * availability. At this point, no one can insert * into ep->rdllist besides us. The epoll_ctl() * callers are locked out by * ep_scan_ready_list() holding "mtx" and the * poll callback will queue them in ep->ovflist. */ list_add_tail(&epi->rdllink, &ep->rdllist); } } } return eventcnt;}
阅读全文
0 0
- IO复用——epoll内核源代码剖析
- epoll内核源代码剖析
- IO复用——select内核源代码剖析
- IO复用——poll机制内核源代码剖析
- linux下epoll内核源代码剖析
- linux下poll和epoll内核源代码剖析
- select,poll,epoll实现分析—结合内核源代码
- select,poll,epoll实现分析—结合内核源代码
- select,poll,epoll实现分析—结合内核源代码
- epoll—IO多路复用
- poll内核源代码剖析
- 内核源码IO多路复用EPOLL
- Linux I/O复用 —— epoll部分源码剖析
- linux内核mount源代码剖析
- linux 内核poll/select/epoll实现剖析
- linux 内核poll/select/epoll实现剖析
- linux 内核poll/select/epoll实现剖析
- EPOLL Linux内核源代码实现原理分析
- 第七篇:静态链表的游标实现
- ubuntu 16.04 安装 opencv master版本
- Zookeeper C API学习总结
- RocketMQ实战(三):分布式事务
- 第八篇:浅谈尾递归
- IO复用——epoll内核源代码剖析
- 第九篇:基本数据结构——队列的链式表示
- 阿里云安装 JDK mysql 环境搭建
- [HDU 6155] Subsequence Count
- zzuli GJJ的日常之暴富梦
- Codeforces_841_C Leha and Function(贪心+构造|规律)
- HDU 6152 Friend-Graph(拉姆齐定理+暴力)
- Qt写c++控制台中文乱码问题
- 第十篇:二叉树递归与非递归遍历(附完整源码)