Linux kernel [select poll epoll]区别

来源：互联网发布：武清房产网楚天网络编辑：程序博客网时间：2024/05/22 15:17

Linux中的IO多路复用（常被笼统地称为“异步IO等待”）无非就三个系统调用：select、poll和epoll。很多人无法理解这三种调用的区别，或了解得不够深入，今天就结合Linux kernel code详细描述三者的区别！

select:

select 的限制就是最大1024个fd，可以查看kernel中的posix_types.h，里面定义了fdset数据结构，显然select不适合poll大量fd的场景（如webserver）。

include/linux/posix_types.h ：

C代码  
/*
 * Bit-set layout used for select()'s descriptor sets: a fixed array of
 * unsigned longs holding __FD_SETSIZE bits in total.
 */

/* Number of bits stored in one unsigned long word of the set. */
#undef __NFDBITS  
#define __NFDBITS       (8 * sizeof(unsigned long))  
  
/* Total number of bits (one per descriptor) in the set; fixed at 1024. */
#undef __FD_SETSIZE  
#define __FD_SETSIZE    1024  
  
/* Number of unsigned long words needed to hold __FD_SETSIZE bits. */
#undef __FDSET_LONGS  
#define __FDSET_LONGS   (__FD_SETSIZE/__NFDBITS)  
  
/* Index of the word in fds_bits[] that contains bit d. */
#undef __FDELT  
#define __FDELT(d)      ((d) / __NFDBITS)  
  
/* Mask selecting bit d inside its word. */
#undef __FDMASK  
#define __FDMASK(d)     (1UL << ((d) % __NFDBITS))  
  
/* The set itself: __FDSET_LONGS words, i.e. __FD_SETSIZE bits. */
typedef struct {  
        unsigned long fds_bits [__FDSET_LONGS];  
} __kernel_fd_set;  

poll:

poll相对于select改进了fdset size的限制：poll没有再使用fdset数组结构，而是使用了pollfd，这样用户可以自定义非常大的pollfd数组，这个pollfd数组在kernel中的表现形式是poll_list链表，这样就不存在1024的限制了，除此之外poll相比select无太大区别。

C代码  
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,  
                struct timespec *end_time)  
{  
        struct poll_wqueues table;  
        int err = -EFAULT, fdcount, len, size;  
        /* Allocate small arguments on the stack to save memory and be 
           faster - use long to make sure the buffer is aligned properly 
           on 64 bit archs to avoid unaligned access */  
        long stack_pps[POLL_STACK_ALLOC/sizeof(long)];  
        struct poll_list *const head = (struct poll_list *)stack_pps;  
        struct poll_list *walk = head;  
        unsigned long todo = nfds;  
  
        if (nfds > rlimit(RLIMIT_NOFILE))  
                return -EINVAL;  
  
        len = min_t(unsigned int, nfds, N_STACK_PPS);  
        for (;;) {  
                walk->next = NULL;  
                walk->len = len;  
                if (!len)  
                        break;  
  
                if (copy_from_user(walk->entries, ufds + nfds-todo,  
                                        sizeof(struct pollfd) * walk->len))  
                        goto out_fds;  
  
                todo -= walk->len;  
                if (!todo)  
                        break;  
  
                len = min(todo, POLLFD_PER_PAGE);  
                size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;  
                walk = walk->next = kmalloc(size, GFP_KERNEL);  
                if (!walk) {  
                        err = -ENOMEM;  
                        goto out_fds;  
                }  
        }  

epoll：

select与poll的共同点是fd有数据后kernel会遍历所有fd，找到有效fd后初始化相应的revents，用户空间程序须再次遍历整个fdset，以找到有效的fd，这样实际上就遍历了两次fd数组表，对于极大量fd的情况，这样的性能非常不好，请看一下do_poll代码：

C代码  
static int do_poll(unsigned int nfds,  struct poll_list *list,  
                   struct poll_wqueues *wait, struct timespec *end_time)  
{  
        poll_table* pt = &wait->pt;  
        ktime_t expire, *to = NULL;  
        int timed_out = 0, count = 0;  
        unsigned long slack = 0;  
  
        /* Optimise the no-wait case */  
        if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {  
                pt = NULL;  
                timed_out = 1;  
        }  
  
        if (end_time && !timed_out)  
                slack = select_estimate_accuracy(end_time);  
  
        for (;;) {  
                struct poll_list *walk;  
  
                for (walk = list; walk != NULL; walk = walk->next) {  
                        struct pollfd * pfd, * pfd_end;  
  
                        pfd = walk->entries;  
                        pfd_end = pfd + walk->len;  
                        for (; pfd != pfd_end; pfd++) {  
                                /* 
                                 * Fish for events. If we found one, record it 
                                 * and kill the poll_table, so we don't 
                                 * needlessly register any other waiters after 
                                 * this. They'll get immediately deregistered 
                                 * when we break out and return. 
                                 */  
                                if (do_pollfd(pfd, pt)) {  
                                        count++;  
                                        pt = NULL;  
                                }  
                        }  
                }  

epoll的出现解决了这种问题，那么epoll是如何做到的呢？我们知道select, poll和epoll都是使用waitqueue调用callback函数去wakeup你的异步等待线程的，如果设置了timeout的话就起一个hrtimer，select和poll的callback函数并没有做什么事情，但epoll的waitqueue callback函数把当前的有效fd加到ready list，然后唤醒异步等待进程，所以你的epoll函数返回的就是这个ready list， ready list中包含所有有效的fd，这样一来kernel不用去遍历所有的fd，用户空间程序也不用遍历所有的fd，而只是遍历返回有效fd链表，所以epoll自然比select和poll更适合大数量fd的场景。

C代码  
/*
 * Bundle the caller's user-space event buffer and its capacity into an
 * ep_send_events_data and pass it to ep_scan_ready_list(), with
 * ep_send_events_proc as the per-scan callback.
 *
 * @ep:        the eventpoll instance whose ready list is scanned
 * @events:    user-space buffer that receives the events
 * @maxevents: capacity of @events, forwarded unchanged to the callback
 *
 * Returns whatever ep_scan_ready_list() returns.
 */
static int ep_send_events(struct eventpoll *ep,
                          struct epoll_event __user *events, int maxevents)
{
        struct ep_send_events_data args;

        /* Arguments handed to ep_send_events_proc via the scan helper. */
        args.events = events;
        args.maxevents = maxevents;

        return ep_scan_ready_list(ep, ep_send_events_proc, &args);
}

现在大家应该明白select, poll和epoll的区别了吧！有人问既然select和poll有这么明显的缺陷，为什么不改掉kernel中的实现呢？原因很简单：要向后兼容ABI。select和poll的ABI无法返回ready list，只能返回整个fd数组，所以用户只得再次遍历整个fd数组以找到哪些fd是有数据的。

epoll还支持 “Level-Triggered” 和 “Edge-Triggered” 两种触发模式，这两个概念在这里就不多赘述了，因为"man epoll"里面解释得非常详细，还有使用epoll的example。