源码剖析之epoll(2)
来源:互联网 发布:淘宝代理货源要钱吗 编辑:程序博客网 时间:2024/06/06 20:27
1. 源码剖析
本篇主要分析epoll_ctl
以及相关函数
以下源码取自4.10
1.1 epoll_ctl
用于添加/调整/删除我们要监视的事件
fs/eventpoll.c
/* * The following function implements the controller interface for * the eventpoll file that enables the insertion/removal/change of * file descriptors inside the interest set. */SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event __user *, event){ int error; int full_check = 0; struct fd f, tf; struct eventpoll *ep; struct epitem *epi; struct epoll_event epds; struct eventpoll *tep = NULL; error = -EFAULT; /* 调用copy_from_user获得我们关注的事件 */ if (ep_op_has_event(op) && copy_from_user(&epds, event, sizeof(struct epoll_event))) goto error_return; error = -EBADF; f = fdget(epfd); if (!f.file) goto error_return; /* Get the "struct file *" for the target file */ tf = fdget(fd); if (!tf.file) goto error_fput; /* The target file descriptor must support poll */ error = -EPERM; if (!tf.file->f_op->poll) goto error_tgt_fput; /* Check if EPOLLWAKEUP is allowed */ if (ep_op_has_event(op)) ep_take_care_of_epollwakeup(&epds); /* * We have to check that the file structure underneath the file descriptor * the user passed to us _is_ an eventpoll file. And also we do not permit * adding an epoll file descriptor inside itself. */ error = -EINVAL; if (f.file == tf.file || !is_file_epoll(f.file)) goto error_tgt_fput; /* * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only, * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation. * Also, we do not currently supported nested exclusive wakeups. */ if (epds.events & EPOLLEXCLUSIVE) { if (op == EPOLL_CTL_MOD) goto error_tgt_fput; if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) || (epds.events & ~EPOLLEXCLUSIVE_OK_BITS))) goto error_tgt_fput; } /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ ep = f.file->private_data; /* * When we insert an epoll file descriptor, inside another epoll file * descriptor, there is the change of creating closed loops, which are * better be handled here, than in more critical paths. While we are * checking for loops we also determine the list of files reachable * and hang them on the tfile_check_list, so we can check that we * haven't created too many possible wakeup paths. * * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when * the epoll file descriptor is attaching directly to a wakeup source, * unless the epoll file descriptor is nested. The purpose of taking the * 'epmutex' on add is to prevent complex toplogies such as loops and * deep wakeup paths from forming in parallel through multiple * EPOLL_CTL_ADD operations. */ mutex_lock_nested(&ep->mtx, 0); if (op == EPOLL_CTL_ADD) { if (!list_empty(&f.file->f_ep_links) || is_file_epoll(tf.file)) { full_check = 1; mutex_unlock(&ep->mtx); mutex_lock(&epmutex); if (is_file_epoll(tf.file)) { error = -ELOOP; if (ep_loop_check(ep, tf.file) != 0) { clear_tfile_check_list(); goto error_tgt_fput; } } else list_add(&tf.file->f_tfile_llink, &tfile_check_list); mutex_lock_nested(&ep->mtx, 0); if (is_file_epoll(tf.file)) { tep = tf.file->private_data; mutex_lock_nested(&tep->mtx, 1); } } } /* * Try to lookup the file inside our RB tree, Since we grabbed "mtx" * above, we can be sure to be able to use the item looked up by * ep_find() till we release the mutex. */ epi = ep_find(ep, tf.file, fd); error = -EINVAL; switch (op) { case EPOLL_CTL_ADD: if (!epi) { epds.events |= POLLERR | POLLHUP; error = ep_insert(ep, &epds, tf.file, fd, full_check); } else error = -EEXIST; if (full_check) clear_tfile_check_list(); break; case EPOLL_CTL_DEL: if (epi) error = ep_remove(ep, epi); else error = -ENOENT; break; case EPOLL_CTL_MOD: if (epi) { if (!(epi->event.events & EPOLLEXCLUSIVE)) { epds.events |= POLLERR | POLLHUP; error = ep_modify(ep, epi, &epds); } } else error = -ENOENT; break; } if (tep != NULL) mutex_unlock(&tep->mtx); mutex_unlock(&ep->mtx);error_tgt_fput: if (full_check) mutex_unlock(&epmutex); fdput(tf);error_fput: fdput(f);error_return: return error;}
1.2 ep_insert
fs/eventpoll.c
static int ep_insert(struct eventpoll *ep, struct epoll_event *event, struct file *tfile, int fd, int full_check){ int error, revents, pwake = 0; unsigned long flags; long user_watches; struct epitem *epi; struct ep_pqueue epq; // 检测目前监视数量是否超过限制 user_watches = atomic_long_read(&ep->user->epoll_watches); if (unlikely(user_watches >= max_user_watches)) return -ENOSPC; // 从slab中分配空间用于存储epitem if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) return -ENOMEM; /* Item initialization follow here ... */ INIT_LIST_HEAD(&epi->rdllink); INIT_LIST_HEAD(&epi->fllink); INIT_LIST_HEAD(&epi->pwqlist); epi->ep = ep; /* 设置epitem在RB-Tree中的key */ ep_set_ffd(&epi->ffd, tfile, fd); epi->event = *event; epi->nwait = 0; epi->next = EP_UNACTIVE_PTR; if (epi->event.events & EPOLLWAKEUP) { error = ep_create_wakeup_source(epi); if (error) goto error_create_wakeup_source; } else { RCU_INIT_POINTER(epi->ws, NULL); } /* Initialize the poll table using the queue callback */ epq.epi = epi; /* 用ep_ptable_queue_proc作为poll的回调函数 */ /* 回调函数的作用是将该epitem加入的文件的等待队列中去 */ init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); /* * Attach the item to the poll hooks and get current event bits. * We can safely use the file* here because its usage count has * been increased by the caller of this function. Note that after * this operation completes, the poll callback can start hitting * the new item. * 获取文件当前就绪事件的掩码,并执行poll的回调函数ep_ptable_queue_proc */ revents = ep_item_poll(epi, &epq.pt); /* * We have to check if something went wrong during the poll wait queue * install process. Namely an allocation for a wait queue failed due * high memory pressure. */ error = -ENOMEM; if (epi->nwait < 0) goto error_unregister; /** * Add the current item to the list of active epoll hook for this file * 文件会链接所有监听自己的epitem,把当前epitem加入到文件的链表中 */ spin_lock(&tfile->f_lock); list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links); spin_unlock(&tfile->f_lock); /* * Add the current item to the RB tree. All RB tree operations are * protected by "mtx", and ep_insert() is called with "mtx" held. * 将epitem插入到eventpoll的RB-Tree中 */ ep_rbtree_insert(ep, epi); /* now check if we've created too many backpaths */ error = -EINVAL; if (full_check && reverse_path_check()) goto error_remove_epi; /* We have to drop the new item inside our item list to keep track of it */ spin_lock_irqsave(&ep->lock, flags); /* If the file is already "ready" we drop it inside the ready list */ if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) { /* 将epitem加入到对应eventpoll的就绪链表中 */ list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ /* 唤醒所有阻塞与epoll_wait调用的所有进程/线程 */ if (waitqueue_active(&ep->wq)) wake_up_locked(&ep->wq); /* 唤醒阻塞于epoll该eventpoll的所有进程/线程 */ if (waitqueue_active(&ep->poll_wait)) pwake++; } spin_unlock_irqrestore(&ep->lock, flags); atomic_long_inc(&ep->user->epoll_watches); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(&ep->poll_wait); return 0;error_remove_epi: spin_lock(&tfile->f_lock); list_del_rcu(&epi->fllink); spin_unlock(&tfile->f_lock); rb_erase(&epi->rbn, &ep->rbr);error_unregister: ep_unregister_pollwait(ep, epi); /* * We need to do this because an event could have been arrived on some * allocated wait queue. Note that we don't care about the ep->ovflist * list, since that is used/cleaned only inside a section bound by "mtx". * And ep_insert() is called with "mtx" held. */ spin_lock_irqsave(&ep->lock, flags); if (ep_is_linked(&epi->rdllink)) list_del_init(&epi->rdllink); spin_unlock_irqrestore(&ep->lock, flags); wakeup_source_unregister(ep_wakeup_source(epi));error_create_wakeup_source: kmem_cache_free(epi_cache, epi); return error;}/** * ffd.file->f_op->poll的warpper * 调用poll_table中指定的回调函数 * 如果文件当前发生了epitem的关心事件,返回事件掩码 */static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt){ pt->_key = epi->event.events; /** * 调用被监视文件对应的poll函数 * 如果是tcp socket,那么调用tcp_poll * 如果是udp socket,那么调用udp_poll */ return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events;}
1.3 tcp_poll
net/ipv4/tcp.c
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait){ unsigned int mask; struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); int state; sock_rps_record_flow(sk); /* 通过sk_sleep(sk)的得到该文件的等待队列作为第二个参数 */ /* 最终以同样的这三个参数调用poll_wait */ sock_poll_wait(file, sk_sleep(sk), wait); // code omitted return mask;}
1.4 sk_sleep
该函数返回socket
文件的的等待队列
include/net/sock.h
static inline wait_queue_head_t *sk_sleep(struct sock *sk){ BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0); return &rcu_dereference_raw(sk->sk_wq)->wait;}
1.5 poll_wait
该函数会调用poll_table->_qproc
函数
include/linux/poll.h
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p){ /* 如果poll_table中指定函数不为空,那么就调用该函数,否则什么也不做 */ if (p && p->_qproc && wait_address) p->_qproc(filp, wait_address, p);}
1.6 ep_ptable_queue_proc
该函数是的作用是将包含epitem
信息的等待队列项加入文件的等待队列
fs/eventpoll.c
/* * This is the callback that is used to add our wait queue to the * target file wakeup lists. */static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, poll_table *pt){ struct epitem *epi = ep_item_from_epqueue(pt); /* 用于关联epitem/wait_queue_t/wait_queue_head_t三者,以便在回调的时候能找到 */ struct eppoll_entry *pwq; if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) { /* 设置等待队列项的回调函数为ep_poll_callback */ init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); pwq->whead = whead; pwq->base = epi; /* 将配置好的等待队列项加入对应的等待队列中去 */ if (epi->event.events & EPOLLEXCLUSIVE) add_wait_queue_exclusive(whead, &pwq->wait); else add_wait_queue(whead, &pwq->wait); /* 关联epitem和epoll_entry */ list_add_tail(&pwq->llink, &epi->pwqlist); epi->nwait++; } else { /* We have to signal that an error occurred */ epi->nwait = -1; }}
1.7 ep_poll_callback
该函数是当文件有事件发生时的回调函数
fs/eventpoll.c
/* * This is the callback that is passed to the wait queue wakeup * mechanism. It is called by the stored file descriptors when they * have events to report. */static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key){ int pwake = 0; unsigned long flags; struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; int ewake = 0; if ((unsigned long)key & POLLFREE) { ep_pwq_from_wait(wait)->whead = NULL; /* * whead = NULL above can race with ep_remove_wait_queue() * which can do another remove_wait_queue() after us, so we * can't use __remove_wait_queue(). whead->lock is held by * the caller. */ list_del_init(&wait->task_list); } spin_lock_irqsave(&ep->lock, flags); /* * If the event mask does not contain any poll(2) event, we consider the * descriptor to be disabled. This condition is likely the effect of the * EPOLLONESHOT bit that disables the descriptor when an event is received, * until the next EPOLL_CTL_MOD will be issued. */ if (!(epi->event.events & ~EP_PRIVATE_BITS)) goto out_unlock; /* * Check the events coming with the callback. At this stage, not * every device reports the events in the "key" parameter of the * callback. We need to be able to handle both cases here, hence the * test for "key" != NULL before the event match test. */ if (key && !((unsigned long) key & epi->event.events)) goto out_unlock; /* * If we are transferring events to userspace, we can hold no locks * (because we're accessing user memory, and because of linux f_op->poll() * semantics). All the events that happen during that period of time are * chained in ep->ovflist and requeued later on. */ if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) { if (epi->next == EP_UNACTIVE_PTR) { epi->next = ep->ovflist; ep->ovflist = epi; if (epi->ws) { /* * Activate ep->ws since epi->ws may get * deactivated at any time. */ __pm_stay_awake(ep->ws); } } goto out_unlock; } /* If this file is already in the ready list we exit soon */ if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake_rcu(epi); } /* * Wake up ( if active ) both the eventpoll wait list and the ->poll() * wait list. */ if (waitqueue_active(&ep->wq)) { if ((epi->event.events & EPOLLEXCLUSIVE) && !((unsigned long)key & POLLFREE)) { switch ((unsigned long)key & EPOLLINOUT_BITS) { case POLLIN: if (epi->event.events & POLLIN) ewake = 1; break; case POLLOUT: if (epi->event.events & POLLOUT) ewake = 1; break; case 0: ewake = 1; break; } } /* 唤醒阻塞于epoll_wait和epoll_pwait调用的进程/线程 */ wake_up_locked(&ep->wq); } /* 唤醒阻塞于poll该eventpoll的进程/线程 */ if (waitqueue_active(&ep->poll_wait)) pwake++;out_unlock: spin_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(&ep->poll_wait); if (epi->event.events & EPOLLEXCLUSIVE) return ewake; return 1;}
1.8 ep_modify
由于ep_modify
和ep_insert
类似,所以不具体分析了
fs/eventpoll.c
/* * Modify the interest event mask by dropping an event if the new mask * has a match in the current file status. Must be called with "mtx" held. */static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event){ int pwake = 0; unsigned int revents; poll_table pt; init_poll_funcptr(&pt, NULL); /* * Set the new event interest mask before calling f_op->poll(); * otherwise we might miss an event that happens between the * f_op->poll() call and the new event set registering. */ epi->event.events = event->events; /* need barrier below */ epi->event.data = event->data; /* protected by mtx */ if (epi->event.events & EPOLLWAKEUP) { if (!ep_has_wakeup_source(epi)) ep_create_wakeup_source(epi); } else if (ep_has_wakeup_source(epi)) { ep_destroy_wakeup_source(epi); } /* * The following barrier has two effects: * * 1) Flush epi changes above to other CPUs. This ensures * we do not miss events from ep_poll_callback if an * event occurs immediately after we call f_op->poll(). * We need this because we did not take ep->lock while * changing epi above (but ep_poll_callback does take * ep->lock). * * 2) We also need to ensure we do not miss _past_ events * when calling f_op->poll(). This barrier also * pairs with the barrier in wq_has_sleeper (see * comments for wq_has_sleeper). * * This barrier will now guarantee ep_poll_callback or f_op->poll * (or both) will notice the readiness of an item. */ smp_mb(); /* * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. */ revents = ep_item_poll(epi, &pt); /* * If the item is "hot" and it is not registered inside the ready * list, push it inside. */ if (revents & event->events) { spin_lock_irq(&ep->lock); if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) wake_up_locked(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } spin_unlock_irq(&ep->lock); } /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(&ep->poll_wait); return 0;}
阅读全文
0 0
- 源码剖析之epoll(2)
- 源码剖析之epoll(1)
- epoll源码剖析
- epoll源码剖析
- epoll源码剖析
- epoll源码剖析
- epoll源码剖析
- poll与epoll源码剖析
- 【Linux深入】epoll源码剖析
- select、epoll源码剖析与对比
- epoll之ep_insert()源码分析
- epoll源码分析----2
- epoll剖析
- LevelDB源码剖析之Memtable(2)
- fork之源码剖析
- ThreadPoolExecutor 源码剖析之
- 源码剖析之poll
- Linux I/O复用 —— epoll部分源码剖析
- jmeter压测----环境准备(PerfMon Metrics Collector and Server Agent原理和使用)
- ISO-8859 转 UTF-8
- Hive计算引擎切换
- 2017/12/22 jQuery
- iOS开发-关于iOS11适配的一些坑
- 源码剖析之epoll(2)
- 使用MediaCodec播放视频
- 正则表达式
- 153. Find Minimum in Rotated Sorted Array
- python round函数 竟然把5舍去了
- AS3.0 性能分析工具Profier使用总结
- springboot配置中心
- Ubuntu 16.04.1 + NVIDIA driver + cuda 8.0 安装
- solr 中文分词器