Source Code Analysis: poll


1. poll

   From the kernel's point of view, with the help of the VFS, everything is a file.

// File representation: include/linux/fs.h
struct file {
    const struct file_operations    *f_op;
    spinlock_t          f_lock;
    // implementation-private data of the file
    void               *private_data;
#ifdef CONFIG_EPOLL
    /* Used by fs/eventpoll.c to link all the hooks to this file */
    struct list_head    f_ep_links;
    struct list_head    f_tfile_llink;
#endif /* #ifdef CONFIG_EPOLL */
    // other fields ...
};

// File operations: include/linux/fs.h
struct file_operations {
    // Hook exposed to poll/select/epoll: calls the function registered
    // in the poll_table_struct and returns the file's current readiness
    unsigned int (*poll) (struct file *, struct poll_table_struct *);
    // other methods: read/write, etc. ...
};

/**
 * Typical shape of a file's poll method: call the function registered
 * in the poll_table_struct, then return the mask of events currently
 * ready on the file.
 * @param filp pointer to the file
 * @param wait pointer to the poll_table_struct
 * @return mask of the file's currently ready events
 */
unsigned int XXX_poll (struct file *filp, struct poll_table_struct *wait)
{
    unsigned int mask = 0;
    wait_queue_head_t *wait_queue;

    // 1. From the requested event mask wait->_key and the file's
    //    implementation data filp->private_data, find the wait queue
    //    head(s) corresponding to those events
    some_code();

    // 2. Call poll_wait to add a wait queue entry to the wait queue
    //    obtained above
    poll_wait(filp, wait_queue, wait);

    // 3. Compute the mask of events currently ready on the file
    some_code();

    return mask;
}

// Interface structure through which select/poll/epoll register their
// wakeup callbacks with a file
typedef struct poll_table_struct {
    // function that adds a wait queue entry (wait_queue_t) to the
    // given wait queue (wait_queue_head)
    poll_queue_proc _qproc;
    // mask of the events of interest; the file implementation uses it
    // to decide which wait queue(s) to hand to _qproc
    unsigned long   _key;
} poll_table;

// typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);

// Generic poll_wait function; a file's f_op->poll usually calls it
static inline void poll_wait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
{
    if (p && p->_qproc && wait_address) {
        // Call the _qproc registered in the poll_table_struct, which
        // normally adds a wait queue entry to the given wait queue:
        // __pollwait for select/poll, ep_ptable_queue_proc for epoll.
        p->_qproc(filp, wait_address, p);
    }
}
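
For reference, when the caller is select or poll, the _qproc registered in the poll_table is __pollwait from fs/select.c. A lightly abridged copy (from the 4.x kernels this article appears to quote): it takes a poll_table_entry from a per-call cache, pins the file, points the entry's wakeup callback at pollwake, and links the entry onto the wait queue the driver handed to poll_wait.

/* fs/select.c (lightly abridged): the _qproc used by select/poll */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
                       poll_table *p)
{
    struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
    struct poll_table_entry *entry = poll_get_entry(pwq);

    if (!entry)
        return;
    entry->filp = get_file(filp);        /* hold a reference to the file */
    entry->wait_address = wait_address;
    entry->key = p->_key;
    /* a wake-up on this queue calls pollwake(), which wakes the task
     * sleeping inside do_select()/do_poll() */
    init_waitqueue_func_entry(&entry->wait, pollwake);
    entry->wait.private = pwq;
    add_wait_queue(wait_address, &entry->wait);
}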

2. poll for UDP

2.1 poll

net/ipv4/af_inet.c

const struct proto_ops inet_dgram_ops = {
    // other fields ...
    .poll          = udp_poll,
    // other fields ...
};
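
For completeness (this step is implied but not shown above): the VFS does not call udp_poll directly. A socket file's f_op is socket_file_ops, whose .poll hook is sock_poll in net/socket.c, and sock_poll forwards to the protocol-specific sock->ops->poll. Abridged:

/* net/socket.c (abridged): the .poll of socket_file_ops.  Every socket
 * file's f_op->poll lands here first. */
static unsigned int sock_poll(struct file *file, poll_table *wait)
{
    struct socket *sock = file->private_data;

    /* ... busy-poll handling elided ... */

    /* dispatch to the protocol: udp_poll via inet_dgram_ops,
     * tcp_poll via inet_stream_ops */
    return sock->ops->poll(file, sock, wait);
}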

2.2 udp_poll

net/ipv4/udp.c

/**
 *  udp_poll - wait for a UDP event.
 *  @file - file struct
 *  @sock - socket
 *  @wait - poll table
 *
 *  This is same as datagram poll, except for the special case of
 *  blocking sockets. If application is using a blocking fd
 *  and a packet with checksum error is in the queue;
 *  then it could get return from select indicating data available
 *  but then block when reading it. Add special case code
 *  to work around these arguably broken applications.
 */
unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
    unsigned int mask = datagram_poll(file, sock, wait);
    struct sock *sk = sock->sk;

    sock_rps_record_flow(sk);

    /* Check for false positives due to checksum errors */
    if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
        !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
        mask &= ~(POLLIN | POLLRDNORM);

    return mask;
}

2.3 datagram_poll

net/core/datagram.c

/**
 *  datagram_poll - generic datagram poll
 *  @file: file struct
 *  @sock: socket
 *  @wait: poll table
 *
 *  Datagram poll: Again totally generic. This also handles
 *  sequenced packet sockets providing the socket receive queue
 *  is only ever holding data ready to receive.
 *
 *  Note: when you _don't_ use this routine for this protocol,
 *  and you use a different write policy from sock_writeable()
 *  then please supply your own write_space callback.
 */
unsigned int datagram_poll(struct file *file, struct socket *sock,
               poll_table *wait)
{
    struct sock *sk = sock->sk;
    unsigned int mask;

    sock_poll_wait(file, sk_sleep(sk), wait);
    mask = 0;

    /* exceptional events? */
    if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
        mask |= POLLERR |
            (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);

    if (sk->sk_shutdown & RCV_SHUTDOWN)
        mask |= POLLRDHUP | POLLIN | POLLRDNORM;
    if (sk->sk_shutdown == SHUTDOWN_MASK)
        mask |= POLLHUP;

    /* readable? */
    if (!skb_queue_empty(&sk->sk_receive_queue))
        mask |= POLLIN | POLLRDNORM;

    /* Connection-based need to check for termination and startup */
    if (connection_based(sk)) {
        if (sk->sk_state == TCP_CLOSE)
            mask |= POLLHUP;
        /* connection hasn't started yet? */
        if (sk->sk_state == TCP_SYN_SENT)
            return mask;
    }

    /* writable? */
    if (sock_writeable(sk))
        mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
    else
        sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

    return mask;
}
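
What this mask means from user space: a minimal, hypothetical example (port 9000 is arbitrary; error handling omitted). poll(2) on a bound UDP socket travels sock_poll -> udp_poll -> datagram_poll, so POLLIN appears exactly when sk_receive_queue is non-empty and POLLOUT whenever sock_writeable() holds:

/* Hypothetical demo: poll(2) on a UDP socket.  Send a datagram to
 * 127.0.0.1:9000 from another shell to see POLLIN appear. */
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    struct sockaddr_in addr;

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    addr.sin_port = htons(9000);                /* arbitrary test port */
    bind(fd, (struct sockaddr *)&addr, sizeof(addr));

    struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT };
    int n = poll(&pfd, 1, 1000 /* ms */);       /* -> sock_poll -> udp_poll */

    if (n > 0) {
        if (pfd.revents & POLLIN)
            printf("readable: sk_receive_queue is non-empty\n");
        if (pfd.revents & POLLOUT)
            printf("writable: sock_writeable() was true\n");
    }
    close(fd);
    return 0;
}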

2.4 sock_poll_wait

include/net/sock.h

/**
 * sock_poll_wait - place memory barrier behind the poll_wait call.
 * @filp:           file
 * @wait_address:   socket wait queue
 * @p:              poll_table
 *
 * See the comments in the wq_has_sleeper function.
 */
static inline void sock_poll_wait(struct file *filp,
        wait_queue_head_t *wait_address, poll_table *p)
{
    if (!poll_does_not_wait(p) && wait_address) {
        poll_wait(filp, wait_address, p);
        /* We need to be sure we are in sync with the
         * socket flags modification.
         *
         * This memory barrier is paired in the wq_has_sleeper.
         */
        smp_mb();
    }
}
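
The smp_mb() above is one half of a barrier pairing; the other half sits on the wakeup path. poll_does_not_wait lets callers skip the queueing (and the barrier) when no _qproc is registered, e.g. on a second pass that only re-reads the mask. Both helpers, abridged from the same kernel era:

/* include/linux/poll.h: true when the caller only wants the current
 * event mask and registers nothing (no _qproc) */
static inline bool poll_does_not_wait(const poll_table *p)
{
    return p == NULL || p->_qproc == NULL;
}

/* include/linux/wait.h (abridged): the waker's half of the pairing.
 * Producers queue data first, then call this before deciding whether
 * to wake; together with the smp_mb() in sock_poll_wait() the sleeper
 * either sees the new data or is seen by the waker, never neither. */
static inline bool wq_has_sleeper(wait_queue_head_t *wq)
{
    smp_mb();
    return waitqueue_active(wq);
}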

2.5 poll_wait

include/linux/poll.h

static inline void poll_wait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
{
    if (p && p->_qproc && wait_address)
        p->_qproc(filp, wait_address, p);
}

2.6 Summary

poll -> sock_poll -> udp_poll -> datagram_poll -> sock_poll_wait -> poll_wait
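
Registration is only half of the story; the wakeup happens when a datagram is queued. UDP's receive path ends by calling sk->sk_data_ready, whose default implementation is sock_def_readable (net/core/sock.c, abridged here); that is where the wait queue entries added by __pollwait or ep_ptable_queue_proc are finally woken:

/* net/core/sock.c (abridged): default sk->sk_data_ready callback,
 * invoked after a datagram has been queued on sk_receive_queue */
static void sock_def_readable(struct sock *sk)
{
    struct socket_wq *wq;

    rcu_read_lock();
    wq = rcu_dereference(sk->sk_wq);
    /* skwq_has_sleeper() wraps wq_has_sleeper(): barrier, then check;
     * wake only if poll/select/epoll actually queued a waiter */
    if (skwq_has_sleeper(wq))
        wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
                                        POLLRDNORM | POLLRDBAND);
    sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
    rcu_read_unlock();
}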

3. poll for TCP

3.1 poll

net/ipv4/af_inet.c

const struct proto_ops inet_stream_ops = {
    .family        = PF_INET,
    .owner         = THIS_MODULE,
    .release       = inet_release,
    .bind          = inet_bind,
    .connect       = inet_stream_connect,
    .socketpair    = sock_no_socketpair,
    .accept        = inet_accept,
    .getname       = inet_getname,
    .poll          = tcp_poll,
    .ioctl         = inet_ioctl,
    .listen        = inet_listen,
    .shutdown      = inet_shutdown,
    .setsockopt    = sock_common_setsockopt,
    .getsockopt    = sock_common_getsockopt,
    .sendmsg       = inet_sendmsg,
    .recvmsg       = inet_recvmsg,
    .mmap          = sock_no_mmap,
    .sendpage      = inet_sendpage,
    .splice_read       = tcp_splice_read,
    .read_sock     = tcp_read_sock,
    .peek_len      = tcp_peek_len,
#ifdef CONFIG_COMPAT
    .compat_setsockopt = compat_sock_common_setsockopt,
    .compat_getsockopt = compat_sock_common_getsockopt,
    .compat_ioctl      = inet_compat_ioctl,
#endif
};

3.2 tcp_poll

net/ipv4/tcp.c

/*
 *  Wait for a TCP event.
 *
 *  Note that we don't need to lock the socket, as the upper poll layers
 *  take care of normal races (between the test and the event) and we don't
 *  go look at any of the socket buffers directly.
 */
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
    unsigned int mask;
    struct sock *sk = sock->sk;
    const struct tcp_sock *tp = tcp_sk(sk);
    int state;

    sock_rps_record_flow(sk);

    sock_poll_wait(file, sk_sleep(sk), wait);

    state = sk_state_load(sk);
    if (state == TCP_LISTEN)
        return inet_csk_listen_poll(sk);

    /* Socket is not locked. We are protected from async events
     * by poll logic and correct handling of state changes
     * made by other threads is impossible in any case.
     */

    mask = 0;

    /*
     * POLLHUP is certainly not done right. But poll() doesn't
     * have a notion of HUP in just one direction, and for a
     * socket the read side is more interesting.
     *
     * Some poll() documentation says that POLLHUP is incompatible
     * with the POLLOUT/POLLWR flags, so somebody should check this
     * all. But careful, it tends to be safer to return too many
     * bits than too few, and you can easily break real applications
     * if you don't tell them that something has hung up!
     *
     * Check-me.
     *
     * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
     * our fs/select.c). It means that after we received EOF,
     * poll always returns immediately, making impossible poll() on write()
     * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
     * if and only if shutdown has been made in both directions.
     * Actually, it is interesting to look how Solaris and DUX
     * solve this dilemma. I would prefer, if POLLHUP were maskable,
     * then we could set it on SND_SHUTDOWN. BTW examples given
     * in Stevens' books assume exactly this behaviour, it explains
     * why POLLHUP is incompatible with POLLOUT.    --ANK
     *
     * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
     * blocking on fresh not-connected or disconnected socket. --ANK
     */
    if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
        mask |= POLLHUP;
    if (sk->sk_shutdown & RCV_SHUTDOWN)
        mask |= POLLIN | POLLRDNORM | POLLRDHUP;

    /* Connected or passive Fast Open socket? */
    if (state != TCP_SYN_SENT &&
        (state != TCP_SYN_RECV || tp->fastopen_rsk)) {
        int target = sock_rcvlowat(sk, 0, INT_MAX);

        if (tp->urg_seq == tp->copied_seq &&
            !sock_flag(sk, SOCK_URGINLINE) &&
            tp->urg_data)
            target++;

        if (tp->rcv_nxt - tp->copied_seq >= target)
            mask |= POLLIN | POLLRDNORM;

        if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
            if (sk_stream_is_writeable(sk)) {
                mask |= POLLOUT | POLLWRNORM;
            } else {  /* send SIGIO later */
                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

                /* Race breaker. If space is freed after
                 * wspace test but before the flags are set,
                 * IO signal will be lost. Memory barrier
                 * pairs with the input side.
                 */
                smp_mb__after_atomic();
                if (sk_stream_is_writeable(sk))
                    mask |= POLLOUT | POLLWRNORM;
            }
        } else
            mask |= POLLOUT | POLLWRNORM;

        if (tp->urg_data & TCP_URG_VALID)
            mask |= POLLPRI;
    }
    /* This barrier is coupled with smp_wmb() in tcp_reset() */
    smp_rmb();
    if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
        mask |= POLLERR;

    return mask;
}
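
One branch worth spelling out: for a socket in the LISTEN state, tcp_poll returns early through inet_csk_listen_poll, which reports the socket readable as soon as the accept queue holds a completed connection, i.e. exactly when accept() would not block (include/net/inet_connection_sock.h, abridged):

/* include/net/inet_connection_sock.h (abridged): poll for LISTEN sockets.
 * "Readable" here means accept() will not block: at least one fully
 * established connection is waiting in the accept queue. */
static inline unsigned int inet_csk_listen_poll(const struct sock *sk)
{
    return !reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) ?
            (POLLIN | POLLRDNORM) : 0;
}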

3.3 sock_poll_wait

include/net/sock.h

Same as section 2.4: tcp_poll reaches the identical sock_poll_wait shown there.

3.4 poll_wait

include/linux/poll.h

Same as section 2.5: registration ends in the same generic poll_wait shown there.

3.5 Summary

poll -> sock_poll -> tcp_poll -> sock_poll_wait -> poll_wait
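
A hypothetical user-space illustration of the target computed in tcp_poll (error handling kept minimal; a server listening on 127.0.0.1:9000 is assumed): raising SO_RCVLOWAT raises the value sock_rcvlowat returns, so POLLIN is withheld until rcv_nxt - copied_seq reaches the new low-water mark:

/* Hypothetical demo of tcp_poll's low-water mark.
 * Assumes some server is listening on 127.0.0.1:9000. */
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in peer;
    int lowat = 1024;

    /* tcp_poll: target = sock_rcvlowat(sk, 0, INT_MAX); with this set,
     * POLLIN needs rcv_nxt - copied_seq >= 1024, not just one byte */
    setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat));

    memset(&peer, 0, sizeof(peer));
    peer.sin_family = AF_INET;
    peer.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    peer.sin_port = htons(9000);               /* hypothetical server */
    if (connect(fd, (struct sockaddr *)&peer, sizeof(peer)) < 0) {
        perror("connect");
        return 1;
    }

    struct pollfd pfd = { .fd = fd, .events = POLLIN };
    int n = poll(&pfd, 1, 5000 /* ms */);      /* -> sock_poll -> tcp_poll */

    if (n > 0)     /* POLLIN: >= 1024 bytes buffered, EOF, or error */
        printf("revents: 0x%x\n", (unsigned)pfd.revents);
    close(fd);
    return 0;
}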