IO复用之epoll

来源:互联网 发布:淘宝站内推广有哪些 编辑:程序博客网 时间:2024/06/06 12:53


 

/*epoll 由一组系统调用组成:

int epoll_create(int size);

int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);

int epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout);*/

 

/*

select/poll 的缺点在于:

1.每次调用时要重复地从用户态读入参数。

2.每次调用时要重复地扫描文件描述符。

3.每次在调用开始时,要把当前进程放入各个文件描述符的等待队列。在调用结束后,

又把进程从各个等待队列中删除。

*/

 

/*

epoll 机制是针对 select/poll 的缺陷设计的。通过新引入的 eventpollfs 文件系统,

epoll 把参数拷贝到内核态,在每次轮询时不会重复拷贝。通过把操作拆分为

epoll_create,epoll_ctl,epoll_wait,避免了重复地遍历要监视的文件描述符。此外,由

于调用 epoll 的进程被唤醒后,只需直接从 eventpoll 的就绪链表(rdllist)中取出已就绪的事件,找出就绪

事件的复杂度由 O(N)降到了 O(1)。

但是 epoll 的性能提高是有前提的,那就是监视的文件描述符非常多,而且每次完成

操作的文件非常少。所以,epoll 能否显著提高效率,取决于实际的应用场景。这方面需要

进一步测试。

*/

 

 

/*
 * Pairs a file descriptor number with the "struct file" it refers to.
 * Embedded in "struct epitem" as the "ffd" member.
 */
struct epoll_filefd {
	struct file *file;
	int fd;
};

 

 

/*
 * Node that is linked into the "wake_task_list" member of
 * "struct poll_safewake". It keeps track of all tasks that are currently
 * inside the wake_up() code in order to:
 *   1) short-circuit a call coming from the same task and the same wait
 *      queue head (a loop),
 *   2) allow a maximum nesting depth of epoll descriptors included in one
 *      another,
 *   3) let through the calls coming from other tasks.
 */
struct wake_task_node {
	/* Links this node into poll_safewake.wake_task_list */
	struct list_head llink;
	/* The task being tracked */
	task_t *task;
	/* The wait queue head that task is waking */
	wait_queue_head_t *wq;
};

 

 

 

/*
 * Used to implement the safe poll wake up, which avoids re-entering the
 * poll callback from inside wake_up().
 */
struct poll_safewake {
	/* List of "struct wake_task_node" entries */
	struct list_head wake_task_list;
	/* NOTE(review): presumably protects wake_task_list - confirm against ep_poll_safewake() */
	spinlock_t lock;
};

 

 

 

/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 *
 * It holds the extended state of one epoll file and maps one-to-one to
 * that file. One epoll file usually monitors many file descriptors, so a
 * single "struct eventpoll" is the container of many "struct epitem".
 */

struct eventpoll {
	/* Protect the this structure access */
	rwlock_t lock; // read/write lock

	/*
	 * This semaphore is used to ensure that files are not removed
	 * while epoll is using them. This is read-held during the event
	 * collection loop and it is write-held during the file cleanup
	 * path, the epoll file exit code and the ctl operations.
	 */
	struct rw_semaphore sem;// read/write semaphore (see the comment above for who holds it)

	/* Wait queue used by sys_epoll_wait() */
	wait_queue_head_t wq; // wait queue

	/* Wait queue used by file->poll() */
	wait_queue_head_t poll_wait; // wait queue

	/* List of ready file descriptors */
	struct list_head rdllist;// list of ready file descriptors

	/* RB-Tree root used to store monitored fd structs */
	struct rb_root rbr;// holds the file descriptors monitored by this epoll
};

 

 

 

/* Wait structure used by the poll hooks */
struct eppoll_entry {
	/* List header used to link this structure to the "struct epitem" */
	struct list_head llink;// list header linking this entry to its "struct epitem"

	/* The "base" pointer is set to the container "struct epitem" */
	void *base; // points to the owning "struct epitem"

	/*
	 * Wait queue item that will be linked to the target file wait
	 * queue head.
	 */
	wait_queue_t wait; // wait queue item

	/* The wait queue head that linked the "wait" wait queue item */
	wait_queue_head_t *whead;
};

 

 

 

/*
 * Each file descriptor added to the eventpoll interface will have an
 * entry of this type. The original wording says the entry is "linked to
 * the hash"; in this version of the code the lookup structure is the
 * red-black tree rooted at eventpoll.rbr, and entries are linked into it
 * through the "rbn" member.
 */
struct epitem {
	/* RB-Tree node used to link this structure to the eventpoll rb-tree */
	struct rb_node rbn;// red-black tree node inside the eventpoll tree

	/* List header used to link this structure to the eventpoll ready list */
	struct list_head rdllink;// doubly linked list node for the eventpoll ready list

	/* The file descriptor information this item refers to */
	struct epoll_filefd ffd;// the file / fd pair this item describes

	/* Number of active wait queue attached to poll operations */
	int nwait; // number of wait queues attached by poll operations

	/* List containing poll wait queues */
	struct list_head pwqlist;   // doubly linked list of the monitored file's wait queue entries

	/* The "container" of this item */
	struct eventpoll *ep;// owning eventpoll; many epitems share one eventpoll

	/* The structure that describe the interested events and the source fd */
	struct epoll_event event;// the interested events and the source fd

	/*
	 * Used to keep track of the usage count of the structure. This avoids
	 * that the structure will disappear from underneath our processing.
	 */
	atomic_t usecnt; // usage (reference) count

	/* List header used to link this item to the "struct file" items list */
	struct list_head fllink;// doubly linked list node on the monitored "struct file"

	/* List header used to link the item to the transfer list */
	struct list_head txlink;// doubly linked list node for the transfer list

	/*
	 * This is used during the collection/transfer of events to userspace
	 * to pin items empty events set.
	 */
	unsigned int revents;// event state of the fd, used while collecting and transferring events
};

 

 

 

/*
 * Wrapper struct used by poll queueing: bundles the poll_table handed to
 * the target file's poll method with the epitem being registered.
 */
struct ep_pqueue {
	/* Poll table passed to f_op->poll() */
	poll_table pt;
	/* The item the poll table is being installed for */
	struct epitem *epi;
};

 

// epoll_create 的实现

 

/*

* It opens an eventpoll file descriptor by suggesting(建议) a storage of "size"

* file descriptors. The size parameter is just an hint about how to size

* data structures. It won't prevent(阻止) the user to store more than "size"

* file descriptors inside the epoll interface. It is the kernel part of

* the userspace epoll_create(2).

*/

asmlinkage long sys_epoll_create(int size)

{

int error, fd;

struct inode *inode;

struct file *file;

 

DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",

current, size));

 

/* Sanity check on the size parameter */

error = -EINVAL;

if (size <= 0)

goto eexit_1;

 

/*

* Creates all the items needed to setup an eventpoll file. That is,

* a file structure, and inode and a free file descriptor.

*/

error = ep_getfd(&fd, &inode, &file);  /*后把 file,dentry,inode 三者关联起来*/

if (error)

goto eexit_1;

 

/* Setup the file internal data structure ( "struct eventpoll" ) */

error = ep_file_init(file);  

if (error)

goto eexit_2;

 

 

DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",

current, size, fd));

 

return fd;

 

eexit_2:

sys_close(fd);

eexit_1:

DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",

current, size, error));

return error;

}

 

/*

* Creates the file descriptor to be used by the epoll interface.

*/

static int ep_getfd(int *efd,struct inode **einode,struct file **efile)

{

struct qstr this;

char name[32];

struct dentry *dentry;

struct inode *inode;

struct file *file;

int error, fd;

 

/* Get an ready to use file */

error = -ENFILE;

file = get_empty_filp();

if (!file)

goto eexit_1;

 

/* Allocates an inode from the eventpoll file system */

inode = ep_eventpoll_inode();

error = PTR_ERR(inode);

if (IS_ERR(inode))

goto eexit_2;

 

/* Allocates a free descriptor to plug the file onto */

error = get_unused_fd();

if (error < 0)

goto eexit_3;

fd = error;

 

/*

* Link the inode to a directory entry by creating a unique name

* using the inode number.

*/

error = -ENOMEM;

sprintf(name, "[%lu]", inode->i_ino);

this.name = name;

this.len = strlen(name);

this.hash = inode->i_ino;

dentry = d_alloc(eventpoll_mnt->mnt_sb->s_root, &this);

if (!dentry)

goto eexit_4;

dentry->d_op = &eventpollfs_dentry_operations;

d_add(dentry, inode);

file->f_vfsmnt = mntget(eventpoll_mnt);

file->f_dentry = dentry;

file->f_mapping = inode->i_mapping;

 

file->f_pos = 0;

file->f_flags = O_RDONLY;

file->f_op = &eventpoll_fops;

file->f_mode = FMODE_READ;

file->f_version = 0;

file->private_data = NULL;

 

/* Install the new setup file into the allocated fd. */

fd_install(fd, file);

 

*efd = fd;

*einode = inode;

*efile = file;

return 0;

 

eexit_4:

put_unused_fd(fd);

eexit_3:

iput(inode);

eexit_2:

put_filp(file);

eexit_1:

return error;

}

 

/*
 * Allocates and initializes the "struct eventpoll" for an eventpoll file
 * and stores it in file->private_data.
 *
 * Returns 0 on success or -ENOMEM when the allocation fails.
 */
static int ep_file_init(struct file *file)
{
	struct eventpoll *evp;

	evp = kmalloc(sizeof(*evp), GFP_KERNEL);
	if (!evp)
		return -ENOMEM;

	/* Start from an all-zero structure, then set up each member */
	memset(evp, 0, sizeof(struct eventpoll));
	evp->rbr = RB_ROOT;
	INIT_LIST_HEAD(&evp->rdllist);
	init_waitqueue_head(&evp->wq);
	init_waitqueue_head(&evp->poll_wait);
	rwlock_init(&evp->lock);
	init_rwsem(&evp->sem);

	/* Associate the eventpoll structure with the file's private data */
	file->private_data = evp;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_file_init() ep=%p\n",
		     current, evp));
	return 0;
}

 

 

 

 

 

/*
 * Implementation of epoll_ctl: kernel side of the userspace epoll_ctl(2).
 *
 * Controller interface of the eventpoll file: inserts, removes or
 * modifies a file descriptor inside the interest set.
 *
 * Returns the result of ep_insert() / ep_remove() / ep_modify(), or:
 *   -EFAULT  the event structure could not be copied from user space,
 *   -EBADF   epfd or fd does not refer to an open file,
 *   -EPERM   the target file does not support poll,
 *   -EINVAL  epfd is not an eventpoll file, epfd and fd are the same
 *            file, or op is not a known operation,
 *   -EEXIST  EPOLL_CTL_ADD on an fd that is already registered,
 *   -ENOENT  EPOLL_CTL_DEL / EPOLL_CTL_MOD on an fd that is not registered.
 */
asmlinkage long
sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
{
	int error;
	struct file *epfile, *target;
	struct eventpoll *ep;
	struct epitem *found;
	struct epoll_event uevent;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
		     current, epfd, op, fd, event));

	/* Only operations that carry an event need the user structure */
	error = -EFAULT;
	if (EP_OP_HASH_EVENT(op)) {
		if (copy_from_user(&uevent, event, sizeof(uevent)))
			goto out;
	}

	/* Take a reference on the eventpoll file, then on the target file */
	error = -EBADF;
	epfile = fget(epfd);
	if (!epfile)
		goto out;

	target = fget(fd);
	if (!target)
		goto put_epfile;

	/* The target file descriptor must support poll */
	error = -EPERM;
	if (!(target->f_op && target->f_op->poll))
		goto put_target;

	/*
	 * The descriptor the user gave us must really be an eventpoll file,
	 * and an epoll file may not be added inside itself.
	 */
	error = -EINVAL;
	if (epfile == target || !IS_FILE_EPOLL(epfile))
		goto put_target;

	/* From here on private_data is known to hold our own structure */
	ep = epfile->private_data;

	down_write(&ep->sem);

	/*
	 * Look the target up in the interest set. ep_find() takes a usage
	 * count on the item it returns, released further down.
	 */
	found = ep_find(ep, target, fd);

	if (op == EPOLL_CTL_ADD) {
		if (found) {
			error = -EEXIST;
		} else {
			/* POLLERR and POLLHUP are always reported */
			uevent.events |= POLLERR | POLLHUP;
			error = ep_insert(ep, &uevent, target, fd);
		}
	} else if (op == EPOLL_CTL_DEL) {
		error = found ? ep_remove(ep, found) : -ENOENT;
	} else if (op == EPOLL_CTL_MOD) {
		if (found) {
			uevent.events |= POLLERR | POLLHUP;
			error = ep_modify(ep, found, &uevent);
		} else {
			error = -ENOENT;
		}
	} else {
		/* Unknown operation */
		error = -EINVAL;
	}

	/* Drop the usage count taken by ep_find() */
	if (found)
		ep_release_epitem(found);

	up_write(&ep->sem);

put_target:
	fput(target);
put_epfile:
	fput(epfile);
out:
	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
		     current, epfd, op, fd, event, error));

	return error;
}

 

/*
 * Adds the target file "tfile" / descriptor "fd" to the interest set of
 * "ep" with the events described by "event".
 *
 * Allocates and initializes a new "struct epitem", calls the target
 * file's poll method with a poll table whose queueing callback is
 * ep_ptable_queue_proc, links the item onto tfile->f_ep_links and into
 * the eventpoll rb-tree, and, when the file already reports one of the
 * requested events, appends the item to the ready list and wakes waiters.
 *
 * Returns 0 on success. Returns -ENOMEM when the item cannot be allocated
 * or when epi->nwait is negative after the poll call.
 */
static int ep_insert(struct eventpoll *ep,struct epoll_event *event,
		struct file *tfile, int fd)
{
	int error, revents, pwake = 0;
	unsigned long flags;
	struct epitem *epi;
	struct ep_pqueue epq;

	/* error stays -ENOMEM for both failure exits below */
	error = -ENOMEM;
	if (!(epi = EPI_MEM_ALLOC()))
		goto eexit_1;

	/* Item initialization follow here ... */
	EP_RB_INITNODE(&epi->rbn);
	INIT_LIST_HEAD(&epi->rdllink);
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->txlink);
	INIT_LIST_HEAD(&epi->pwqlist);
	epi->ep = ep;
	EP_SET_FFD(&epi->ffd, tfile, fd);
	epi->event = *event;
	atomic_set(&epi->usecnt, 1);
	epi->nwait = 0;

	/* Initialize the poll table using the queue callback */
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Attach the item to the poll hooks and get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function.
	 */
	revents = tfile->f_op->poll(tfile, &epq.pt);

	/*
	 * We have to check if something went wrong during the poll wait queue
	 * install process. Namely an allocation for a wait queue failed due
	 * high memory pressure.
	 */
	if (epi->nwait < 0)
		goto eexit_2;

	/* Add the current item to the list of active epoll hook for this file */
	spin_lock(&tfile->f_ep_lock);
	list_add_tail(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_ep_lock);

	/* We have to drop the new item inside our item list to keep track of it */
	write_lock_irqsave(&ep->lock, flags);

	/* Add the current item to the rb-tree */
	ep_rbtree_insert(ep, epi);

	/* If the file is already "ready" we drop it inside the ready list */
	if ((revents & event->events) && !EP_IS_LINKED(&epi->rdllink)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);

		/* Notify waiting tasks that events are available */
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);
		/* Only record the poll_wait wakeup; it is issued after unlocking */
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	write_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	/* NOTE(review): "psw" is not declared in this function - presumably a file-scope "struct poll_safewake"; confirm */
	if (pwake)
		ep_poll_safewake(&psw, &ep->poll_wait);

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
		current, ep, tfile, fd));

	return 0;

eexit_2:
	/* Undo the poll hooks installed by the poll call above */
	ep_unregister_pollwait(ep, epi);

	/*
	 * We need to do this because an event could have been arrived on some
	 * allocated wait queue.
	 */
	write_lock_irqsave(&ep->lock, flags);
	if (EP_IS_LINKED(&epi->rdllink))
		EP_LIST_DEL(&epi->rdllink);
	write_unlock_irqrestore(&ep->lock, flags);

	EPI_MEM_FREE(epi);
eexit_1:
	return error;
}

 

/*
 * Implement the event wait interface for the eventpoll file. It is the
 * kernel part of the user space epoll_wait(2).
 *
 * epfd      - descriptor of the eventpoll file
 * events    - user space array that receives the ready events
 * maxevents - capacity of "events"; must be in the range
 *             1 .. INT_MAX / sizeof(struct epoll_event)
 * timeout   - maximum wait, passed through unchanged to ep_poll()
 *
 * Returns the value produced by ep_poll(), or:
 *   -EINVAL  maxevents is out of range, or epfd is not an eventpoll file,
 *   -EBADF   epfd does not refer to an open file,
 *   the error from verify_area() when the user buffer is not writable.
 */
asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
			       int maxevents, int timeout)
{
	int error;
	struct file *file;
	struct eventpoll *ep;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
		     current, epfd, events, maxevents, timeout));

	/*
	 * The maximum number of events must be greater than zero, and small
	 * enough that the byte size of the user buffer computed below cannot
	 * overflow. Without the upper bound a huge maxevents makes the
	 * multiplication wrap, so verify_area() would validate a much smaller
	 * range than the one later written to. INT_MAX comes from
	 * <linux/kernel.h>, the same header that provides KERN_INFO.
	 */
	if (maxevents <= 0 ||
	    maxevents > INT_MAX / sizeof(struct epoll_event))
		return -EINVAL;

	/* Verify that the area passed by the user is writeable */
	error = verify_area(VERIFY_WRITE, events,
			    maxevents * sizeof(struct epoll_event));
	if (error)
		goto eexit_1;

	/* Get the "struct file *" for the eventpoll file */
	error = -EBADF;
	file = fget(epfd);
	if (!file)
		goto eexit_1;

	/*
	 * We have to check that the file structure underneath the fd
	 * the user passed to us _is_ an eventpoll file.
	 */
	error = -EINVAL;
	if (!IS_FILE_EPOLL(file))
		goto eexit_2;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = file->private_data;

	/* Time to fish for events ... */
	error = ep_poll(ep, events, maxevents, timeout);

eexit_2:
	fput(file);
eexit_1:
	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
		     current, epfd, events, maxevents, timeout, error));

	return error;
}

 

 

 

/*
 * Waits until the eventpoll "ep" has ready events, the timeout expires or
 * a signal is pending, then tries to transfer the ready events to user
 * space.
 *
 * ep        - the eventpoll to wait on
 * events    - user space array handed to ep_events_transfer()
 * maxevents - capacity of "events"
 * timeout   - milliseconds to wait; 0 returns immediately, any negative
 *             value waits without a time limit
 *
 * Returns 0 when the wait ended with nothing to report, -EINTR when a
 * signal was pending, otherwise the value returned by
 * ep_events_transfer().
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res, eavail;
	unsigned long flags;
	long jtimeout;
	wait_queue_t wait;

	/*
	 * Convert the timeout from milliseconds to jiffies, rounding up,
	 * that why (t * HZ + 999) / 1000. Values too large to convert
	 * without overflow are clamped to "infinite".
	 *
	 * Every negative value is treated as "infinite", not only -1:
	 * converting e.g. -2 would produce a negative jiffies count, which
	 * is not a valid argument for schedule_timeout().
	 */
	if (timeout < 0 || timeout > (MAX_SCHEDULE_TIMEOUT - 1000) / HZ)
		jtimeout = MAX_SCHEDULE_TIMEOUT;
	else
		jtimeout = (timeout * HZ + 999) / 1000;

retry:
	write_lock_irqsave(&ep->lock, flags);

	res = 0;
	if (list_empty(&ep->rdllist)) {
		/*
		 * We don't have any available event to return to the caller.
		 * We need to sleep here, and we will be woken up by
		 * ep_poll_callback() when events become available.
		 */
		init_waitqueue_entry(&wait, current);
		add_wait_queue(&ep->wq, &wait);

		for (;;) {
			/*
			 * We don't want to sleep if the ep_poll_callback() sends us
			 * a wakeup in between. That's why we set the task state
			 * to TASK_INTERRUPTIBLE before doing the checks.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			if (!list_empty(&ep->rdllist) || !jtimeout)
				break;
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			/* Sleep with the lock released; keep the remaining time */
			write_unlock_irqrestore(&ep->lock, flags);
			jtimeout = schedule_timeout(jtimeout);
			write_lock_irqsave(&ep->lock, flags);
		}
		remove_wait_queue(&ep->wq, &wait);

		set_current_state(TASK_RUNNING);
	}

	/* Is it worth to try to dig for events ? */
	eavail = !list_empty(&ep->rdllist);

	write_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
	 */
	if (!res && eavail &&
	    !(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)
		goto retry;

	return res;
}

 



Epoll是对select和poll的改进:

1. select 和 poll 各自只提供一个函数(select() 与 poll());而 epoll 提供三个函数:epoll_create() 创建一个 epoll 句柄,epoll_ctl() 注册要监听的事件类型,epoll_wait() 等待事件的发生。

2. 调用 epoll_ctl() 注册 fd 时,就把该 fd 及其事件拷贝进内核(见 sys_epoll_ctl 中的 copy_from_user),而不是在每次 epoll_wait() 时重复拷贝;

3. epoll_ctl() 为被注册的 fd 指定一个回调函数;当设备就绪、唤醒等待队列上的等待者时,就会调用该回调函数,由它把就绪的 fd 加入就绪链表。epoll_wait() 只需查看就绪链表中有没有就绪的 fd;

4. 支持的文件描述符数量没有 select 那样的固定上限(FD_SETSIZE),只受系统可打开文件数的限制。


 

1 0
原创粉丝点击