源码剖析之epoll(1)

来源:互联网 发布:淘宝关键词提取器 编辑:程序博客网 时间:2024/06/07 06:11

1. 源码剖析

以下源码取自 Linux 内核 4.10 版本

1.1 核心数据结构

epitem

fs/eventpoll.c

/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 * Avoid increasing the size of this struct, there can be many thousands
 * of these on a server and we do not want this to take another cache line.
 *
 * In other words: every monitored file has exactly one epitem describing it.
 */
struct epitem {
    union {
        /* RB tree node links this structure to the eventpoll RB tree */
        struct rb_node rbn;
        /* Used to free the struct epitem */
        struct rcu_head rcu;
    };

    /* List header used to link this structure to the eventpoll ready list */
    struct list_head rdllink;

    /*
     * Works together with "struct eventpoll"->ovflist in keeping the
     * single linked chain of items.
     */
    struct epitem *next;

    /*
     * The file descriptor information this item refers to.
     * Per the article's note, this value serves as the key of the epitem
     * in the RB tree for insert / remove / modify / lookup operations.
     */
    struct epoll_filefd ffd;

    /* Number of active wait queue attached to poll operations */
    int nwait;

    /*
     * List containing poll wait queues.
     * NOTE(review): presumably the head onto which struct eppoll_entry
     * nodes are chained through their "llink" member - confirm.
     */
    struct list_head pwqlist;

    /* The "container" of this item */
    struct eventpoll *ep;

    /*
     * List header used to link this item to the "struct file" items list.
     * Per the article's note, struct file chains every epitem watching it
     * on its f_ep_links list, and this node is the link into that list.
     */
    struct list_head fllink;

    /* wakeup_source used when EPOLLWAKEUP is set */
    struct wakeup_source __rcu *ws;

    /* The structure that describe the interested events and the source fd */
    struct epoll_event event;
};

eventpoll

该结构是整个eventpoll的核心数据结构

fs/eventpoll.c

/* * This structure is stored inside the "private_data" member of the file * structure and represents the main data structure for the eventpoll * interface. */ struct eventpoll {    /* Protect the access to this structure */    spinlock_t lock;    /*     * This mutex is used to ensure that files are not removed     * while epoll is using them. This is held during the event     * collection loop, the file cleanup path, the epoll file exit     * code and the ctl operations.     */    struct mutex mtx;    /* Wait queue used by sys_epoll_wait() */    wait_queue_head_t wq;    /* Wait queue used by file->poll() */    wait_queue_head_t poll_wait;    /* List of ready file descriptors */    struct list_head rdllist;    /* RB tree root used to store monitored fd structs */    struct rb_root rbr;    /*     * This is a single linked list that chains all the "struct epitem" that     * happened while transferring ready events to userspace w/out     * holding ->lock.     * 该结构用于当rdllist中的event正在传回用户空间时暂存就绪事件 */     */    struct epitem *ovflist;    /* wakeup_source used when ep_scan_ready_list is running */    struct wakeup_source *ws;    /* The user that created the eventpoll descriptor */    struct user_struct *user;    /* eventpoll对应的file */    struct file *file;    /* used to optimize loop detection check */    int visited;    struct list_head visited_list_link;};

eppoll_entry

fs/eventpoll.c

/*
 * Wait structure used by the poll hooks.
 * Per the article's note, this structure is the glue that ties together an
 * epitem, a wait_queue_t and a wait_queue_head_t.
 */
struct eppoll_entry {
    /*
     * List header used to link this structure to the "struct epitem".
     * Per the article's note, it is chained onto the epitem's pwqlist.
     */
    struct list_head llink;

    /* The "base" pointer is set to the container "struct epitem" */
    struct epitem *base;

    /*
     * Wait queue item that will be linked to the target file wait
     * queue head.
     */
    wait_queue_t wait;

    /* The wait queue head that linked the "wait" wait queue item */
    wait_queue_head_t *whead;
};

1.2 eventpoll_init

在系统的初始化阶段,在slab中分配cache,分别用于存储epitem和eppoll_entry

fs/eventpoll.c

/*
 * One-time setup for the eventpoll subsystem: derives the per-user watch
 * limit from the amount of low memory, initializes the three nested-call
 * trackers, and creates the slab caches for "struct epitem" and
 * "struct eppoll_entry". Always returns 0.
 */
static int __init eventpoll_init(void)
{
    struct sysinfo si;

    si_meminfo(&si);
    /*
     * Allows top 4% of low memory to be allocated for epoll watches
     * (per user): (totalram - totalhigh) pages, divided by 25, converted
     * to bytes via PAGE_SHIFT, then divided by the per-item cost.
     */
    max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
        EP_ITEM_COST;
    BUG_ON(max_user_watches < 0);

    /*
     * Initialize the structure used to perform epoll file descriptor
     * inclusion loops checks.
     */
    ep_nested_calls_init(&poll_loop_ncalls);

    /* Initialize the structure used to perform safe poll wait head wake ups */
    ep_nested_calls_init(&poll_safewake_ncalls);

    /* Initialize the structure used to perform file's f_op->poll() calls */
    ep_nested_calls_init(&poll_readywalk_ncalls);

    /*
     * We can have many thousands of epitems, so prevent this from
     * using an extra cache line on 64-bit (and smaller) CPUs
     */
    BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);

    /* Allocates slab cache used to allocate "struct epitem" items */
    epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
            0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);

    /* Allocates slab cache used to allocate "struct eppoll_entry" */
    pwq_cache = kmem_cache_create("eventpoll_pwq",
            sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);

    return 0;
}

/*
 * Quoted from include/linux/init.h: __init places the annotated function in
 * the .init.text section (the data variants use .init.data / .init.rodata).
 * NOTE(review): the article says this is what makes the function run during
 * system initialization; the registration that actually invokes
 * eventpoll_init() at boot is not part of this excerpt - confirm.
 */
#define __init      __section(.init.text) __cold notrace __latent_entropy
#define __initdata  __section(.init.data)
#define __initconst __section(.init.rodata)
#define __exitdata  __section(.exit.data)
#define __exit_call __used __section(.exitcall.exit)

1.3 epoll_create

创建一个eventpoll实例

fs/eventpoll.c

/*
 * Legacy entry point: "size" is only validated (must be > 0, otherwise
 * -EINVAL) and is not used further; the work is delegated to
 * sys_epoll_create1() with no flags.
 */
SYSCALL_DEFINE1(epoll_create, int, size)
{
    if (size <= 0)
        return -EINVAL;

    return sys_epoll_create1(0);
}

/*
 * Open an eventpoll file descriptor.
 *
 * Only EPOLL_CLOEXEC is accepted in "flags"; any other bit yields -EINVAL.
 * Returns the new file descriptor on success or a negative error code.
 */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
    int error, fd;
    struct eventpoll *ep = NULL;
    struct file *file;

    /* Check the EPOLL_* constant for consistency.  */
    BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

    if (flags & ~EPOLL_CLOEXEC)
        return -EINVAL;
    /*
     * Create the internal data structure ("struct eventpoll").
     */
    error = ep_alloc(&ep);
    if (error < 0)
        return error;
    /*
     * Creates all the items needed to setup an eventpoll file. That is,
     * a file structure and a free file descriptor.
     */
    fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
    if (fd < 0) {
        error = fd;
        goto out_free_ep;
    }
    /* "ep" is passed as the private data of the new anonymous file */
    file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
                 O_RDWR | (flags & O_CLOEXEC));
    if (IS_ERR(file)) {
        error = PTR_ERR(file);
        goto out_free_fd;
    }
    /* Record the back pointer before the fd is installed for the caller */
    ep->file = file;
    fd_install(fd, file);
    return fd;

    /* Unwind in reverse order of acquisition: fd first, then the eventpoll */
out_free_fd:
    put_unused_fd(fd);
out_free_ep:
    ep_free(ep);
    return error;
}

/*
 * Allocate a zeroed "struct eventpoll" with kzalloc() and initialize its
 * lock, mutex, wait queues, ready list, RB tree root and overflow list.
 *
 * @pep: [out] receives the new eventpoll on success
 *
 * Returns 0 on success. If the allocation fails, the user reference taken
 * via get_current_user() is released with free_uid() and -ENOMEM is returned.
 */
static int ep_alloc(struct eventpoll **pep)
{
    int error;
    struct user_struct *user;
    struct eventpoll *ep;

    user = get_current_user();
    error = -ENOMEM;
    ep = kzalloc(sizeof(*ep), GFP_KERNEL);
    if (unlikely(!ep))
        goto free_uid;

    spin_lock_init(&ep->lock);
    mutex_init(&ep->mtx);
    init_waitqueue_head(&ep->wq);
    init_waitqueue_head(&ep->poll_wait);
    INIT_LIST_HEAD(&ep->rdllist);
    ep->rbr = RB_ROOT;
    /* The overflow list starts out marked as not active */
    ep->ovflist = EP_UNACTIVE_PTR;
    ep->user = user;

    *pep = ep;

    return 0;

free_uid:
    free_uid(user);
    return error;
}

/**
 * anon_inode_getfile - creates a new file instance by hooking it up to an
 *                      anonymous inode, and a dentry that describe the "class"
 *                      of the file
 *
 * @name:    [in]    name of the "class" of the new file
 * @fops:    [in]    file operations for the new file
 * @priv:    [in]    private data for the new file (will be file's private_data)
 * @flags:   [in]    flags
 *
 * Creates a new file by hooking it on a single inode. This is useful for files
 * that do not need to have a full-fledged inode in order to operate correctly.
 * All the files created with anon_inode_getfile() will share a single inode,
 * hence saving memory and avoiding code duplication for the file/inode/dentry
 * setup.  Returns the newly created file* or an error pointer.
 */
struct file *anon_inode_getfile(const char *name,
                const struct file_operations *fops,
                void *priv, int flags)
{
    struct qstr this;
    struct path path;
    struct file *file;

    if (IS_ERR(anon_inode_inode))
        return ERR_PTR(-ENODEV);

    /* Pin the module that owns fops, when there is one */
    if (fops->owner && !try_module_get(fops->owner))
        return ERR_PTR(-ENOENT);

    /*
     * Build the name for the pseudo dentry from the caller-supplied
     * string. "file" is preset to -ENOMEM so that the err_module path
     * returns it when the dentry allocation fails.
     */
    file = ERR_PTR(-ENOMEM);
    this.name = name;
    this.len = strlen(name);
    this.hash = 0;
    path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this);
    if (!path.dentry)
        goto err_module;

    path.mnt = mntget(anon_inode_mnt);
    /*
     * We know the anon_inode inode count is always greater than zero,
     * so ihold() is safe.
     */
    ihold(anon_inode_inode);

    d_instantiate(path.dentry, anon_inode_inode);

    file = alloc_file(&path, OPEN_FMODE(flags), fops);
    if (IS_ERR(file))
        goto err_dput;
    file->f_mapping = anon_inode_inode->i_mapping;

    /* Only the access-mode bits and O_NONBLOCK are kept in f_flags */
    file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
    file->private_data = priv;

    return file;

err_dput:
    path_put(&path);
err_module:
    module_put(fops->owner);
    return file;
}