IO复用之epoll
来源:互联网 发布:淘宝站内推广有哪些 编辑:程序博客网 时间:2024/06/06 12:53
/*epoll 是由一组系统调用组成。
int epoll_create(int size);
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
int epoll_wait(int epfd, struct epoll_event *events,
int maxevents, int timeout);*/
/*
select/poll 的缺点在于:
1.每次调用时要重复地从用户态读入参数。
2.每次调用时要重复地扫描文件描述符。
3.每次在调用开始时,要把当前进程放入各个文件描述符的等待队列。在调用结束后,
又把进程从各个等待队列中删除。
*/
/*
epoll 机制是针对 select/poll 的缺陷设计的。通过新引入的 eventpollfs 文件系统,
epoll 把参数拷贝到内核态,在每次轮询时不会重复拷贝。通过把操作拆分为
epoll_create,epoll_ctl,epoll_wait,避免了重复地遍历要监视的文件描述符。此外,由
于调用 epoll 的进程被唤醒后,只要直接从 epitem 的完成队列中找出完成的事件,找出完
成事件的复杂度由 O(N)降到了 O(1)。
但是 epoll 的性能提高是有前提的,那就是监视的文件描述符非常多,而且每次完成
操作的文件非常少。所以,epoll 能否显著提高效率,取决于实际的应用场景。这方面需要
进一步测试。
*/
/*
 * Pairs a "struct file" pointer with a file descriptor number.
 * Embedded in "struct epitem" as its "ffd" member.
 */
struct epoll_filefd {
struct file *file;
int fd;
};
/*
* Node that is linked into the "wake_task_list" member of the "struct poll_safewake".
* It is used to keep track on all tasks that are currently inside the wake_up() code
* to 1) short-circuit the one coming from the same task and same wait queue head
* ( loop ) 2) allow a maximum number of epoll descriptors inclusion nesting
* 3) let go the ones coming from other tasks.
*/
struct wake_task_node {
/* Links this node into the "wake_task_list" of "struct poll_safewake" */
struct list_head llink;
/* Task that is currently inside the wake_up() code */
task_t *task;
/* Wait queue head that task is waking up */
wait_queue_head_t *wq;
};
/*
* This is used to implement(实现) the safe poll wake up avoiding to reenter(再进入)
* the poll callback from inside wake_up().
*/
struct poll_safewake {
/* List of "struct wake_task_node" entries, one per task inside wake_up() */
struct list_head wake_task_list;
/* NOTE(review): presumably guards wake_task_list; its users are not shown here */
spinlock_t lock;
};
/*
* This structure is stored(存储) inside the "private_data" member of the file
* structure and represents the main data structure for the eventpoll
* interface.
* 存储 epoll 文件描述符的扩展信息,它被保存在 file 结构体的
* private_data 中。它与 epoll 文件节点一一对应。通常一个 epoll 文件节点对应多个被监视
* 的文件描述符。所以一个 eventpoll 结构体会对应多个 epitem 结构体。
*/
/*
 * Per-epoll-file state. ep_file_init() allocates one of these and stores
 * it in file->private_data.
 */
struct eventpoll {
/* Protects access to this structure */
rwlock_t lock; // read-write lock
/*
* This semaphore is used to ensure that files are not removed
* while epoll is using them. This is read-held during the event
* collection loop and it is write-held during the file cleanup
* path, the epoll file exit code and the ctl operations.
*/
struct rw_semaphore sem;// read-write semaphore (cleanup path, exit code, ctl operations)
/* Wait queue used by sys_epoll_wait() */
wait_queue_head_t wq; // wait queue
/* Wait queue used by file->poll() */
wait_queue_head_t poll_wait; // wait queue
/* List of ready file descriptors */
struct list_head rdllist;// list of ready file descriptors
/* RB-Tree root used to store monitored fd structs */
struct rb_root rbr;// stores the file descriptors monitored by this epoll
};
/* Wait structure used by the poll hooks(钩子) */
struct eppoll_entry {
/* List header used to link this structure to the "struct epitem" */
struct list_head llink;// list link that attaches this entry to its "struct epitem"
/* The "base" pointer is set to the container "struct epitem" */
void *base; // points to the containing "struct epitem"
/*
* Wait queue item that will be linked to the target file wait
* queue head.
*/
wait_queue_t wait; // wait queue item
/* The wait queue head that linked the "wait" wait queue item */
wait_queue_head_t *whead;
};
/*
**********************************************************
* Each file descriptor added to the eventpoll interface will
* have an entry of this type linked to the hash.
* 该结构体用来保存与 epoll 节点关联的多个文件描述符,保存的方式是使用红黑树实
* 现的 hash 表。
*/
/*
 * One monitored file descriptor inside an eventpoll instance.
 * ep_insert() allocates and initializes these.
 */
struct epitem {
/* RB-Tree node used to link this structure to the eventpoll rb-tree */
struct rb_node rbn;// rb-tree node inside the owning eventpoll's tree
/* List header used to link this structure to the eventpoll ready list */
struct list_head rdllink;// list link on the eventpoll ready list
/* The file descriptor information this item refers to */
struct epoll_filefd ffd;// file pointer and fd this item refers to
/*
 * Number of active wait queue attached to poll operations.
 * ep_insert() treats a negative value as a failed wait queue allocation.
 */
int nwait; // number of wait queues attached by poll operations
/* List containing poll wait queues */
struct list_head pwqlist; // list of wait queue entries for the monitored file
/* The "container" of this item */
struct eventpoll *ep;// owning eventpoll; many epitems map to one eventpoll
/* The structure that describe the interested events and the source fd */
struct epoll_event event;// events of interest and the source fd
/*
* Used to keep track of the usage count of the structure. This avoids
* that the structure will disappear from underneath our processing.
*/
atomic_t usecnt; // usage (reference) count
/* List header used to link this item to the "struct file" items list */
struct list_head fllink;// list link on the target file's f_ep_links list
/* List header used to link the item to the transfer list */
struct list_head txlink;// list link on the transfer list
/*
* This is used during the collection/transfer of events to userspace
* to pin items empty events set.
*/
unsigned int revents;// event bits used while events are collected and transferred to userspace
};
/* Wrapper struct used by poll queueing */
/*
 * Bundles a poll_table with the epitem being registered. ep_insert() fills
 * it in and passes &pt to the target file's f_op->poll().
 */
struct ep_pqueue {
/* Poll table initialized with ep_ptable_queue_proc as its callback */
poll_table pt;
/* Item on whose behalf the poll table is being used */
struct epitem *epi;
};
// epoll_create 的实现
/*
* It opens an eventpoll file descriptor by suggesting(建议) a storage of "size"
* file descriptors. The size parameter is just an hint about how to size
* data structures. It won't prevent(阻止) the user to store more than "size"
* file descriptors inside the epoll interface. It is the kernel part of
* the userspace epoll_create(2).
*/
asmlinkage long sys_epoll_create(int size)
{
int error, fd;
struct inode *inode;
struct file *file;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
current, size));
/* Sanity check on the size parameter */
error = -EINVAL;
if (size <= 0)
goto eexit_1;
/*
* Creates all the items needed to setup an eventpoll file. That is,
* a file structure, and inode and a free file descriptor.
*/
error = ep_getfd(&fd, &inode, &file); /*后把 file,dentry,inode 三者关联起来*/
if (error)
goto eexit_1;
/* Setup the file internal data structure ( "struct eventpoll" ) */
error = ep_file_init(file);
if (error)
goto eexit_2;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
current, size, fd));
return fd;
eexit_2:
sys_close(fd);
eexit_1:
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
current, size, error));
return error;
}
/*
* Creates the file descriptor to be used by the epoll interface.
*/
static int ep_getfd(int *efd,struct inode **einode,struct file **efile)
{
struct qstr this;
char name[32];
struct dentry *dentry;
struct inode *inode;
struct file *file;
int error, fd;
/* Get an ready to use file */
error = -ENFILE;
file = get_empty_filp();
if (!file)
goto eexit_1;
/* Allocates an inode from the eventpoll file system */
inode = ep_eventpoll_inode();
error = PTR_ERR(inode);
if (IS_ERR(inode))
goto eexit_2;
/* Allocates a free descriptor to plug the file onto */
error = get_unused_fd();
if (error < 0)
goto eexit_3;
fd = error;
/*
* Link the inode to a directory entry by creating a unique name
* using the inode number.
*/
error = -ENOMEM;
sprintf(name, "[%lu]", inode->i_ino);
this.name = name;
this.len = strlen(name);
this.hash = inode->i_ino;
dentry = d_alloc(eventpoll_mnt->mnt_sb->s_root, &this);
if (!dentry)
goto eexit_4;
dentry->d_op = &eventpollfs_dentry_operations;
d_add(dentry, inode);
file->f_vfsmnt = mntget(eventpoll_mnt);
file->f_dentry = dentry;
file->f_mapping = inode->i_mapping;
file->f_pos = 0;
file->f_flags = O_RDONLY;
file->f_op = &eventpoll_fops;
file->f_mode = FMODE_READ;
file->f_version = 0;
file->private_data = NULL;
/* Install the new setup file into the allocated fd. */
fd_install(fd, file);
*efd = fd;
*einode = inode;
*efile = file;
return 0;
eexit_4:
put_unused_fd(fd);
eexit_3:
iput(inode);
eexit_2:
put_filp(file);
eexit_1:
return error;
}
/*
 * Allocates a zeroed "struct eventpoll", initializes its lock, semaphore,
 * wait queues, ready list and rb-tree root, and stores it in
 * file->private_data.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int ep_file_init(struct file *file)
{
	struct eventpoll *ep;

	ep = kmalloc(sizeof(struct eventpoll), GFP_KERNEL);
	if (!ep)
		return -ENOMEM;
	memset(ep, 0, sizeof(*ep));

	/* Synchronization primitives. */
	rwlock_init(&ep->lock);
	init_rwsem(&ep->sem);

	/* Wait queues for sys_epoll_wait() and file->poll(). */
	init_waitqueue_head(&ep->wq);
	init_waitqueue_head(&ep->poll_wait);

	/* Empty ready list and empty tree of monitored items. */
	INIT_LIST_HEAD(&ep->rdllist);
	ep->rbr = RB_ROOT;

	/* Tie the eventpoll structure to the file's private data. */
	file->private_data = ep;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_file_init() ep=%p\n",
		     current, ep));
	return 0;
}
// epoll_ctl 的实现:在 epoll 实例的监视集合中添加、删除或修改被监视的文件描述符
/*
 * Kernel part of the userspace epoll_ctl(2).
 *
 * Controller interface of the eventpoll file: inserts, removes or changes
 * a file descriptor inside the interest set of the epoll instance "epfd".
 *
 * Returns the result of the requested operation, or:
 *   -EFAULT  the event structure could not be copied from userspace
 *   -EBADF   epfd or fd is not an open file descriptor
 *   -EPERM   the target file does not support poll
 *   -EINVAL  epfd is not an eventpoll file, fd refers to the same file as
 *            epfd, or op is not a known operation
 *   -EEXIST  EPOLL_CTL_ADD on an fd that is already registered
 *   -ENOENT  EPOLL_CTL_DEL/EPOLL_CTL_MOD on an fd that is not registered
 */
asmlinkage long
sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
{
	int error;
	struct file *epfile, *target;
	struct eventpoll *ep;
	struct epitem *item;
	struct epoll_event uev;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
		     current, epfd, op, fd, event));

	/* Only operations that carry an event need the user copy. */
	if (EP_OP_HASH_EVENT(op) &&
	    copy_from_user(&uev, event, sizeof(struct epoll_event))) {
		error = -EFAULT;
		goto out;
	}

	error = -EBADF;

	/* Get the "struct file *" for the eventpoll file. */
	epfile = fget(epfd);
	if (!epfile)
		goto out;

	/* Get the "struct file *" for the target file. */
	target = fget(fd);
	if (!target)
		goto out_put_epfile;

	/* The target file descriptor must support poll. */
	if (!target->f_op || !target->f_op->poll) {
		error = -EPERM;
		goto out_put_target;
	}

	/*
	 * The descriptor passed as epfd must really be an eventpoll file,
	 * and an epoll file descriptor may not be added inside itself.
	 */
	if (epfile == target || !IS_FILE_EPOLL(epfile)) {
		error = -EINVAL;
		goto out_put_target;
	}

	/* From here on private_data is known to be our own structure. */
	ep = epfile->private_data;

	down_write(&ep->sem);

	/*
	 * Look the target up in the eventpoll set. ep_find() takes a usage
	 * count on the returned item, which is dropped again below.
	 */
	item = ep_find(ep, target, fd);

	/* Result for an unrecognized op. */
	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (item) {
			error = -EEXIST;
			break;
		}
		uev.events |= POLLERR | POLLHUP;
		error = ep_insert(ep, &uev, target, fd);
		break;
	case EPOLL_CTL_DEL:
		if (!item) {
			error = -ENOENT;
			break;
		}
		error = ep_remove(ep, item);
		break;
	case EPOLL_CTL_MOD:
		if (!item) {
			error = -ENOENT;
			break;
		}
		uev.events |= POLLERR | POLLHUP;
		error = ep_modify(ep, item, &uev);
		break;
	}

	/* Drop the usage count taken by ep_find(), if it found anything. */
	if (item)
		ep_release_epitem(item);

	up_write(&ep->sem);

out_put_target:
	fput(target);
out_put_epfile:
	fput(epfile);
out:
	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
		     current, epfd, op, fd, event, error));
	return error;
}
static int ep_insert(struct eventpoll *ep,struct epoll_event *event,
struct file *tfile, int fd)
{
int error, revents, pwake = 0;
unsigned long flags;
struct epitem *epi;
struct ep_pqueue epq;
error = -ENOMEM;
if (!(epi = EPI_MEM_ALLOC()))
goto eexit_1;
/* Item initialization follow here ... */
EP_RB_INITNODE(&epi->rbn);
INIT_LIST_HEAD(&epi->rdllink);
INIT_LIST_HEAD(&epi->fllink);
INIT_LIST_HEAD(&epi->txlink);
INIT_LIST_HEAD(&epi->pwqlist);
epi->ep = ep;
EP_SET_FFD(&epi->ffd, tfile, fd);
epi->event = *event;
atomic_set(&epi->usecnt, 1);
epi->nwait = 0;
/* Initialize the poll table using the queue callback */
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
/*
* Attach the item to the poll hooks and get current event bits.
* We can safely use the file* here because its usage count has
* been increased by the caller of this function.
*/
revents = tfile->f_op->poll(tfile, &epq.pt);
/*
* We have to check if something went wrong during the poll wait queue
* install process. Namely an allocation for a wait queue failed due
* high memory pressure.
*/
if (epi->nwait < 0)
goto eexit_2;
/* Add the current item to the list of active epoll hook for this file */
spin_lock(&tfile->f_ep_lock);
list_add_tail(&epi->fllink, &tfile->f_ep_links);
spin_unlock(&tfile->f_ep_lock);
/* We have to drop the new item inside our item list to keep track of it */
write_lock_irqsave(&ep->lock, flags);
/* Add the current item to the rb-tree */
ep_rbtree_insert(ep, epi);
/* If the file is already "ready" we drop it inside the ready list */
if ((revents & event->events) && !EP_IS_LINKED(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
wake_up(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
write_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(&psw, &ep->poll_wait);
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
current, ep, tfile, fd));
return 0;
eexit_2:
ep_unregister_pollwait(ep, epi);
/*
* We need to do this because an event could have been arrived on some
* allocated wait queue.
*/
write_lock_irqsave(&ep->lock, flags);
if (EP_IS_LINKED(&epi->rdllink))
EP_LIST_DEL(&epi->rdllink);
write_unlock_irqrestore(&ep->lock, flags);
EPI_MEM_FREE(epi);
eexit_1:
return error;
}
/*
 * Implement the event wait interface for the eventpoll file. It is the
 * kernel part of the user space epoll_wait(2).
 *
 * epfd:      descriptor of the eventpoll file
 * events:    userspace array that receives the ready events
 * maxevents: capacity of "events", in entries
 * timeout:   passed through to ep_poll(), in milliseconds
 *
 * Returns whatever ep_poll() returns, or:
 *   -EINVAL  maxevents is not positive, maxevents is so large that the
 *            byte size of the events array would overflow, or epfd is not
 *            an eventpoll file
 *   -EBADF   epfd is not an open file descriptor
 *   or the error reported by verify_area() for an unwritable user buffer.
 */
asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
			       int maxevents, int timeout)
{
	int error;
	struct file *file;
	struct eventpoll *ep;
	/*
	 * Largest entry count whose size in bytes still fits in an int
	 * (~0U >> 1 is the largest positive int value). Without this bound
	 * "maxevents * sizeof(struct epoll_event)" can wrap around, making
	 * verify_area() check a much smaller region than will be written.
	 */
	const unsigned int max_events =
		(~0U >> 1) / sizeof(struct epoll_event);

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
		     current, epfd, events, maxevents, timeout));

	/* The number of events must be positive and must not overflow. */
	if (maxevents <= 0 || (unsigned int)maxevents > max_events)
		return -EINVAL;

	/* Verify that the area passed by the user is writeable. */
	error = verify_area(VERIFY_WRITE, events,
			    maxevents * sizeof(struct epoll_event));
	if (error)
		goto eexit_1;

	/* Get the "struct file *" for the eventpoll file. */
	error = -EBADF;
	file = fget(epfd);
	if (!file)
		goto eexit_1;

	/*
	 * We have to check that the file structure underneath the fd
	 * the user passed to us _is_ an eventpoll file.
	 */
	error = -EINVAL;
	if (!IS_FILE_EPOLL(file))
		goto eexit_2;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = file->private_data;

	/* Time to fish for events ... */
	error = ep_poll(ep, events, maxevents, timeout);

eexit_2:
	fput(file);
eexit_1:
	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
		     current, epfd, events, maxevents, timeout, error));
	return error;
}
/*
 * Waits until the ready list of "ep" is non-empty, the timeout expires or
 * a signal is pending, then hands any available events to
 * ep_events_transfer().
 *
 * "timeout" is in milliseconds; -1 means wait without a time limit, and 0
 * means do not sleep at all.
 * NOTE(review): negative values other than -1 are not special-cased and
 * go through the jiffies conversion as-is; confirm that is intended.
 *
 * Returns -EINTR if a signal was pending while waiting, 0 if no events
 * were available, otherwise the value returned by ep_events_transfer().
 */
static int ep_poll(struct eventpoll *ep,struct epoll_event __user *events,
int maxevents, long timeout)
{
int res, eavail;
unsigned long flags;
long jtimeout;
wait_queue_t wait;
/*
* Calculate the timeout by checking for the "infinite" value ( -1 )
* and the overflow condition. The passed timeout is in milliseconds,
* that why (t * HZ) / 1000.
*/
jtimeout = timeout == -1 || timeout > (MAX_SCHEDULE_TIMEOUT - 1000) / HZ ?
MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;
retry:
write_lock_irqsave(&ep->lock, flags);
res = 0;
if (list_empty(&ep->rdllist)) {
/*
* We don't have any available event to return to the caller.
* We need to sleep here, and we will be wake up by
* ep_poll_callback() when events will become available.
*/
init_waitqueue_entry(&wait, current);
add_wait_queue(&ep->wq, &wait);
for (;;) {
/*
* We don't want to sleep if the ep_poll_callback() sends us
* a wakeup in between. That's why we set the task state
* to TASK_INTERRUPTIBLE before doing the checks.
*/
set_current_state(TASK_INTERRUPTIBLE);
if (!list_empty(&ep->rdllist) || !jtimeout)
break;
if (signal_pending(current)) {
res = -EINTR;
break;
}
/* Drop the lock while sleeping; schedule_timeout() returns the time left */
write_unlock_irqrestore(&ep->lock, flags);
jtimeout = schedule_timeout(jtimeout);
write_lock_irqsave(&ep->lock, flags);
}
remove_wait_queue(&ep->wq, &wait);
set_current_state(TASK_RUNNING);
}
/* Is it worth to try to dig for events ? */
eavail = !list_empty(&ep->rdllist);
write_unlock_irqrestore(&ep->lock, flags);
/*
* Try to transfer events to user space. In case we get 0 events and
* there's still timeout left over, we go trying again in search of
* more luck.
*/
if (!res && eavail &&
!(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)
goto retry;
return res;
}
Epoll是对select和poll的改进:
1. select 和 poll 各自只提供一个函数 select()/poll();而 epoll 提供三个函数:epoll_create() 创建一个 epoll 句柄,epoll_ctl() 注册要监听的事件类型,epoll_wait() 等待事件的发生。
2.epoll_ctl() 注册 fd 时就把它拷贝进内核,而不是在每次 epoll_wait() 时重复拷贝;
3.epoll_ctl() 注册 fd 时为该 fd 指定一个回调函数(ep_poll_callback);当设备就绪、唤醒等待队列上的等待者时,就会调用该回调函数,由它把就绪的 fd 加入就绪链表。epoll_wait() 只需查看就绪链表中有没有就绪的 fd;
4.支持的文件描述符数量没有限制
- IO复用之epoll
- IO复用之epoll
- Linux IO复用之epoll
- IO复用之——epoll
- Unix IO 复用模型之 select & poll & epoll
- 【Linux编程】IO复用之epoll详解
- IO复用之select poll epoll的总结
- IO复用之select poll epoll 函数
- 网络编程---IO复用之epoll模型
- IO多路复用之epoll
- IO多路复用之epoll
- IO多路复用之epoll
- IO多路复用之epoll
- IO多路复用之epoll
- IO多路复用之epoll
- IO多路复用之epoll
- IO多路复用之epoll
- 多路IO复用模型 select epoll
- 什么是真的学习能力
- PLSQL Recursion DB
- iframe获取父、子窗口的方法
- maven+hudson+git持续集成
- javascript-MDN笔记-函数
- IO复用之epoll
- Python(Pygame)字体设置
- Jenkins+Maven+Git搭建持续集成和自动化部署的配置手记
- Jquery和javascript在使用上的区别
- 【GDOI 2014】beyond
- LeetCode 523. Continuous Subarray Sum 解题报告
- visual studio常用快捷键
- Jenkins + Git + Maven + tomcat集成环境(转)
- 曼哈顿距离