IO复用之epoll

来源：互联网发布：淘宝站内推广有哪些编辑：程序博客网时间：2024/06/06 12:53

/*epoll 是由一组系统调用组成。

int epoll_create(int size);

int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);

int epoll_wait(int epfd, struct epoll_event *events,

int maxevents, int timeout);*/

select/poll 的缺点在于：

1.每次调用时要重复地从用户态读入参数。

2.每次调用时要重复地扫描文件描述符。

3.每次在调用开始时，要把当前进程放入各个文件描述符的等待队列。在调用结束后，

又把进程从各个等待队列中删除。

epoll 机制是针对 select/poll 的缺陷设计的。通过新引入的 eventpollfs 文件系统，

epoll 把参数拷贝到内核态，在每次轮询时不会重复拷贝。通过把操作拆分为

epoll_create、epoll_ctl、epoll_wait，避免了重复地遍历要监视的文件描述符。此外，

调用 epoll 的进程被唤醒后，只需直接从 eventpoll 的就绪链表（rdllist）中取出已就绪的事件，

因此找出完成事件的复杂度由 O(N) 降到了 O(1)。

但是 epoll 的性能提高是有前提的，那就是监视的文件描述符非常多，而且每次完成

操作的文件非常少。所以，epoll 能否显著提高效率，取决于实际的应用场景。这方面需要

进一步测试。

/*
 * Pairs a "struct file" pointer with an integer file descriptor.
 * Embedded in "struct epitem" as its "ffd" member to describe the file
 * that item refers to.
 */
struct epoll_filefd {
/* File structure of the referenced file. */
struct file *file;
/* Descriptor number stored together with the file pointer. */
int fd;

};

* Node that is linked into the "wake_task_list" member of the "struct poll_safewake".

* It is used to keep track on all tasks that are currently inside the wake_up() code

* to 1) short-circuit the one coming from the same task and same wait queue head

* ( loop ) 2) allow a maximum number of epoll descriptors inclusion nesting

* 3) let go the ones coming from other tasks.

struct wake_task_node {

struct list_head llink;

task_t *task;

wait_queue_head_t *wq;

};

* This is used to implement(实现) the safe poll wake up avoiding to reenter(再进入)

* the poll callback from inside wake_up().

struct poll_safewake {

struct list_head wake_task_list;

spinlock_t lock;

};

* This structure is stored(存储) inside the "private_data" member of the file

* structure and rapresent the main data sructure for the eventpoll

* interface.

* 存储 epoll 文件描述符的扩展信息，它被保存在 file 结构体的

* private_data 中。它与 epoll 文件节点一一对应。通常一个 epoll 文件节点对应多个被监视

* 的文件描述符。所以一个 eventpoll 结构体会对应多个 epitem 结构体。

struct eventpoll {

/* Protect the this structure access */

rwlock_t lock; //读写锁

* This semaphore is used to ensure that files are not removed

* while epoll is using them. This is read-held during the event

* collection loop and it is write-held during the file cleanup

* path, the epoll file exit code and the ctl operations.

struct rw_semaphore sem;//读写信号路径，退出码，控制操作

/* Wait queue used by sys_epoll_wait() */

wait_queue_head_t wq; //等待队列

/* Wait queue used by file->poll() */

wait_queue_head_t poll_wait; //等待队列

/* List of ready file descriptors */

struct list_head rdllist;//就绪文件描述法链表

/* RB-Tree root used to store monitored fd structs */

struct rb_root rbr;//存储epoll监视的文件描述符

};

/* Wait structure used by the poll hooks（监视？） */

struct eppoll_entry {

/* List header used to link this structure to the "struct epitem" */

struct list_head llink;//链表头部连接"struct epitem"

/* The "base" pointer is set to the container "struct epitem" */

void *base; //设置"struct epitem"

* Wait queue item that will be linked to the target file wait

* queue head.

wait_queue_t wait; //等待队列

/* The wait queue head that linked the "wait" wait queue item */

wait_queue_head_t *whead;

};

**********************************************************

* Each file descriptor added to the eventpoll interface will

* have an entry of this type linked to the hash.

* 该结构体用来保存与 epoll 节点关联的多个文件描述符，保存的方式是使用红黑树实

* 现的 hash 表。

struct epitem {

/* RB-Tree node used to link this structure to the eventpoll rb-tree */

struct rb_node rbn;//红黑树保存eventpoll

/* List header used to link this structure to the eventpoll ready list */

struct list_head rdllink;//双向链表，用来保存已经完成的 eventpoll

/* The file descriptor information this item refers to */

struct epoll_filefd ffd;//对应文件描述法信息

/* Number of active wait queue attached to poll operations */

int nwait; //poll 操作中事件的个数

/* List containing poll wait queues */

struct list_head pwqlist; //双向链表，保存着被监视文件的等待队列

/* The "container" of this item */

struct eventpoll *ep;//指向 eventpoll，多个 epitem 对应一个 eventpoll

/* The structure that describe the interested events and the source fd */

struct epoll_event event;//记录发生的事件和对应的 fd

* Used to keep track of the usage count of the structure. This avoids

* that the structure will desappear from underneath our processing.

atomic_t usecnt; //（usage count）引用计数

/* List header used to link this item to the "struct file" items list */

struct list_head fllink;//双向链表，用来链接被监视的文件描述符对应的 struct file

/* List header used to link the item to the transfer list */

struct list_head txlink;//双向链表，用来保存传输队列

* This is used during the collection/transfer of events to userspace

* to pin items empty events set.

unsigned int revents;//文件描述符的状态，在收集和传输时用来锁住空的事件集合

};

/*
 * Wrapper struct used by poll queueing.
 * ep_insert() fills one of these on its stack and hands &pt to the target
 * file's poll() method.
 */

struct ep_pqueue {
/* Poll table; ep_insert() initializes it with init_poll_funcptr(). */
poll_table pt;
/* The epitem being registered. */
struct epitem *epi;

};

// epoll_create 的实现

* It opens an eventpoll file descriptor by suggesting(建议) a storage of "size"

* file descriptors. The size parameter is just an hint about how to size

* data structures. It won't prevent（阻止） the user to store more than "size"

* file descriptors inside the epoll interface. It is the kernel part of

* the userspace epoll_create(2).

asmlinkage long sys_epoll_create(int size)

{

int error, fd;

struct inode *inode;

struct file *file;

DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",

current, size));

/* Sanity check on the size parameter */

error = -EINVAL;

if (size <= 0)

goto eexit_1;

* Creates all the items needed to setup an eventpoll file. That is,

* a file structure, and inode and a free file descriptor.

error = ep_getfd(&fd, &inode, &file); /*后把 file,dentry,inode 三者关联起来*/

if (error)

goto eexit_1;

/* Setup the file internal data structure ( "struct eventpoll" ) */

error = ep_file_init(file);

if (error)

goto eexit_2;

DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",

current, size, fd));

return fd;

eexit_2:

sys_close(fd);

eexit_1:

DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",

current, size, error));

return error;

}

* Creates the file descriptor to be used by the epoll interface.

static int ep_getfd(int *efd,struct inode **einode,struct file **efile)

{

struct qstr this;

char name[32];

struct dentry *dentry;

struct inode *inode;

struct file *file;

int error, fd;

/* Get an ready to use file */

error = -ENFILE;

file = get_empty_filp();

if (!file)

goto eexit_1;

/* Allocates an inode from the eventpoll file system */

inode = ep_eventpoll_inode();

error = PTR_ERR(inode);

if (IS_ERR(inode))

goto eexit_2;

/* Allocates a free descriptor to plug the file onto */

error = get_unused_fd();

if (error < 0)

goto eexit_3;

fd = error;

* Link the inode to a directory entry by creating a unique name

* using the inode number.

error = -ENOMEM;

sprintf(name, "[%lu]", inode->i_ino);

this.name = name;

this.len = strlen(name);

this.hash = inode->i_ino;

dentry = d_alloc(eventpoll_mnt->mnt_sb->s_root, &this);

if (!dentry)

goto eexit_4;

dentry->d_op = &eventpollfs_dentry_operations;

d_add(dentry, inode);

file->f_vfsmnt = mntget(eventpoll_mnt);

file->f_dentry = dentry;

file->f_mapping = inode->i_mapping;

file->f_pos = 0;

file->f_flags = O_RDONLY;

file->f_op = &eventpoll_fops;

file->f_mode = FMODE_READ;

file->f_version = 0;

file->private_data = NULL;

/* Install the new setup file into the allocated fd. */

fd_install(fd, file);

*efd = fd;

*einode = inode;

*efile = file;

return 0;

eexit_4:

put_unused_fd(fd);

eexit_3:

iput(inode);

eexit_2:

put_filp(file);

eexit_1:

return error;

}

/*
 * Allocates a zeroed "struct eventpoll", initializes its locks, wait queues,
 * ready list and rb-tree root, and attaches it to file->private_data.
 *
 * Returns 0 on success or -ENOMEM when the allocation fails.
 */
static int ep_file_init(struct file *file)
{
	struct eventpoll *evp;

	evp = kmalloc(sizeof(struct eventpoll), GFP_KERNEL);
	if (!evp)
		return -ENOMEM;
	memset(evp, 0, sizeof(*evp));

	/* Synchronization primitives */
	rwlock_init(&evp->lock);
	init_rwsem(&evp->sem);

	/* Wait queues for epoll_wait() sleepers and for file->poll() users */
	init_waitqueue_head(&evp->wq);
	init_waitqueue_head(&evp->poll_wait);

	/* Empty ready list and empty tree of monitored items */
	INIT_LIST_HEAD(&evp->rdllist);
	evp->rbr = RB_ROOT;

	/* Tie the eventpoll structure to the file's private data */
	file->private_data = evp;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_file_init() ep=%p\n",
		     current, evp));

	return 0;
}

// epoll_ctl 的实现把文件与 eventpollfs 文件系统的 inode 节点关联起来

* The following function implements the controller interface for

* the eventpoll file that enables the insertion/removal/change of

* file descriptors inside the interest set. It represents

* the kernel part of the user space epoll_ctl(2).

asmlinkage long

sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)

{

int error;

struct file *file, *tfile;

struct eventpoll *ep;

struct epitem *epi;

struct epoll_event epds;

DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",

current, epfd, op, fd, event));

error = -EFAULT;

if (EP_OP_HASH_EVENT(op) &&

copy_from_user(&epds, event, sizeof(struct epoll_event)))

goto eexit_1;

/* Get the "struct file *" for the eventpoll file */

error = -EBADF;

file = fget(epfd);

if (!file)

goto eexit_1;

/* Get the "struct file *" for the target file */

tfile = fget(fd);

if (!tfile)

goto eexit_2;

/* The target file descriptor must support poll */

error = -EPERM;

if (!tfile->f_op || !tfile->f_op->poll)

goto eexit_3;

* We have to check that the file structure underneath the file descriptor

* the user passed to us _is_ an eventpoll file. And also we do not permit

* adding an epoll file descriptor inside itself.

error = -EINVAL;

if (file == tfile || !IS_FILE_EPOLL(file))

goto eexit_3;

* At this point it is safe to assume that the "private_data" contains

* our own data structure.

ep = file->private_data;

down_write(&ep->sem);

* Search the file inside the eventpoll hash. It add usage count to

* the returned item, so the caller must call ep_release_epitem()

* after finished using the "struct epitem".

/* Try to lookup the file inside our hash table */

epi = ep_find(ep, tfile, fd);

error = -EINVAL;

switch (op) {

case EPOLL_CTL_ADD:

if (!epi) {

epds.events |= POLLERR | POLLHUP;

error = ep_insert(ep, &epds, tfile, fd);

}

else

error = -EEXIST;

break;

case EPOLL_CTL_DEL:

if (epi)

error = ep_remove(ep, epi);

else

error = -ENOENT;

break;

case EPOLL_CTL_MOD:

if (epi) {

epds.events |= POLLERR | POLLHUP;

error = ep_modify(ep, epi, &epds);

}

else

error = -ENOENT;

break;

}

* The function ep_find() increments the usage count of the structure

* so, if this is not NULL, we need to release it.

if (epi)

ep_release_epitem(epi);

up_write(&ep->sem);

eexit_3:

fput(tfile);

eexit_2:

fput(file);

eexit_1:

DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",

current, epfd, op, fd, event, error));

return error;

}

/*
 * Allocates a new "struct epitem" for (tfile, fd), hooks it onto the target
 * file's poll wait queues, links it into the file's epoll list and into the
 * eventpoll rb-tree, and queues it on the ready list right away if the file
 * already reports one of the requested events.
 *
 * Returns 0 on success, or -ENOMEM when the item cannot be allocated or the
 * poll wait queue installation failed (epi->nwait < 0).
 */
static int ep_insert(struct eventpoll *ep,struct epoll_event *event,
		     struct file *tfile, int fd)
{
	int error, revents, pwake = 0;
	unsigned long flags;
	struct epitem *epi;
	struct ep_pqueue epq;

	error = -ENOMEM;
	if (!(epi = EPI_MEM_ALLOC()))
		goto eexit_1;

	/* Item initialization follow here ... */
	EP_RB_INITNODE(&epi->rbn);
	INIT_LIST_HEAD(&epi->rdllink);
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->txlink);
	INIT_LIST_HEAD(&epi->pwqlist);
	epi->ep = ep;
	EP_SET_FFD(&epi->ffd, tfile, fd);
	epi->event = *event;
	atomic_set(&epi->usecnt, 1);
	epi->nwait = 0;

	/* Initialize the poll table using the queue callback */
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Attach the item to the poll hooks and get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function.
	 */
	revents = tfile->f_op->poll(tfile, &epq.pt);

	/*
	 * We have to check if something went wrong during the poll wait queue
	 * install process. Namely an allocation for a wait queue failed due
	 * high memory pressure.
	 */
	if (epi->nwait < 0)
		goto eexit_2;

	/* Add the current item to the list of active epoll hook for this file */
	spin_lock(&tfile->f_ep_lock);
	list_add_tail(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_ep_lock);

	/* We have to drop the new item inside our item list to keep track of it */
	write_lock_irqsave(&ep->lock, flags);

	/* Add the current item to the rb-tree */
	ep_rbtree_insert(ep, epi);

	/* If the file is already "ready" we drop it inside the ready list */
	if ((revents & event->events) && !EP_IS_LINKED(&epi->rdllink)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);

		/* Notify waiting tasks that events are available */
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	write_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&psw, &ep->poll_wait);

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
		     current, ep, tfile, fd));

	return 0;

eexit_2:
	ep_unregister_pollwait(ep, epi);

	/*
	 * We need to do this because an event could have been arrived on some
	 * allocated wait queue.
	 */
	write_lock_irqsave(&ep->lock, flags);
	if (EP_IS_LINKED(&epi->rdllink))
		EP_LIST_DEL(&epi->rdllink);
	write_unlock_irqrestore(&ep->lock, flags);

	EPI_MEM_FREE(epi);
eexit_1:
	return error;
}

* Implement the event wait interface for the eventpoll file. It is the kernel

* part of the user space epoll_wait(2).

asmlinkage long sys_epoll_wait(int epfd,struct epoll_event __user *events,

int maxevents, int timeout)

{

int error;

struct file *file;

struct eventpoll *ep;

DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",

current, epfd, events, maxevents, timeout));

/* The maximum number of event must be greater than zero */

if (maxevents <= 0)

return -EINVAL;

/* Verify that the area passed by the user is writeable */

if ((error = verify_area(VERIFY_WRITE, events, maxevents *sizeof(struct epoll_event))))

goto eexit_1;

/* Get the "struct file *" for the eventpoll file */

error = -EBADF;

file = fget(epfd);

if (!file)

goto eexit_1;

* We have to check that the file structure underneath the fd

* the user passed to us _is_ an eventpoll file.

error = -EINVAL;

if (!IS_FILE_EPOLL(file))

goto eexit_2;

* At this point it is safe to assume that the "private_data" contains

* our own data structure.

ep = file->private_data;

/* Time to fish for events ... */

error = ep_poll(ep, events, maxevents, timeout); //****

eexit_2:

fput(file);

eexit_1:

DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",

current, epfd, events, maxevents, timeout, error));

return error;

}

/*
 * Waits until the ready list of "ep" is non-empty, the timeout expires or a
 * signal is pending, then hands the ready events to ep_events_transfer().
 *
 * "timeout" is in milliseconds; -1 means wait without limit.
 * Returns the result of ep_events_transfer(), 0 when nothing became ready
 * within the timeout, or -EINTR when a signal interrupted the wait.
 */
static int ep_poll(struct eventpoll *ep,struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res, eavail;
	unsigned long flags;
	long jtimeout;
	wait_queue_t wait;

	/*
	 * Calculate the timeout by checking for the "infinite" value ( -1 )
	 * and the overflow condition. The passed timeout is in milliseconds,
	 * that why (t * HZ) / 1000.
	 */
	jtimeout = timeout == -1 || timeout > (MAX_SCHEDULE_TIMEOUT - 1000) / HZ ?
		MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;

retry:
	write_lock_irqsave(&ep->lock, flags);

	res = 0;
	if (list_empty(&ep->rdllist)) {
		/*
		 * We don't have any available event to return to the caller.
		 * We need to sleep here, and we will be wake up by
		 * ep_poll_callback() when events will become available.
		 */
		init_waitqueue_entry(&wait, current);
		add_wait_queue(&ep->wq, &wait);

		for (;;) {
			/*
			 * We don't want to sleep if the ep_poll_callback() sends us
			 * a wakeup in between. That's why we set the task state
			 * to TASK_INTERRUPTIBLE before doing the checks.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			if (!list_empty(&ep->rdllist) || !jtimeout)
				break;
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			/* Drop the lock while sleeping; schedule_timeout() returns the time left. */
			write_unlock_irqrestore(&ep->lock, flags);
			jtimeout = schedule_timeout(jtimeout);
			write_lock_irqsave(&ep->lock, flags);
		}
		remove_wait_queue(&ep->wq, &wait);

		set_current_state(TASK_RUNNING);
	}

	/* Is it worth to try to dig for events ? */
	eavail = !list_empty(&ep->rdllist);

	write_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
	 */
	if (!res && eavail &&
	    !(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)
		goto retry;

	return res;
}

Epoll是对select和poll的改进：

1. select 和 poll 各自只提供一个函数（select() 和 poll()），而 epoll 提供三个函数：epoll_create() 创建一个 epoll 句柄，epoll_ctl() 注册要监听的事件类型，epoll_wait() 等待事件的发生。

2. 在 epoll_ctl() 注册时把 fd 及其事件拷贝进内核（见 sys_epoll_ctl 中的 copy_from_user），而不是在每次 epoll_wait() 时重复拷贝；

3. epoll_ctl() 添加 fd 时（ep_insert）会为该 fd 指定一个回调函数；当设备就绪、唤醒等待队列上的等待者时，就会调用该回调函数，由它把就绪的 fd 加入就绪链表。epoll_wait() 只需查看就绪链表中有没有就绪的 fd；

4.支持的文件描述符数量没有限制

1 0