TCP/IP Source Code Study (52): The TCP Connection Setup Process (1)


http://blog.chinaunix.net/uid-23629988-id-3178006.html


Author: gfree.wind@gmail.com
Blog: blog.focus-linux.net   linuxfocus.blog.chinaunix.net
 
 
The copyleft of this article belongs to gfree.wind@gmail.com. It is released under the GPL and may be freely copied and reposted, but any repost must keep the document intact and credit the original author and original link. Commercial use of any kind is strictly prohibited.
======================================================================================================
In earlier articles we studied how UDP datagrams are received and sent. Starting today, let us look at how TCP segments are received. As with UDP, when an IP packet reaches ip_local_deliver_finish(), the layer-4 protocol number is used to look up the TCP entry, tcp_protocol, in the inet_protos array (a simplified sketch of this dispatch follows the structure definition below).
    static const struct net_protocol tcp_protocol = {
        .handler        = tcp_v4_rcv,
        .err_handler    = tcp_v4_err,
        .gso_send_check = tcp_v4_gso_send_check,
        .gso_segment    = tcp_tso_segment,
        .gro_receive    = tcp4_gro_receive,
        .gro_complete   = tcp4_gro_complete,
        .no_policy      = 1,
        .netns_ok       = 1,
    };
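As a rough illustration of that dispatch, here is a minimal sketch of what ip_local_deliver_finish() does with this table. It is simplified pseudo-kernel code written for this article, not the exact source: the function name, the omitted RCU locking and raw-socket delivery, and the exact indexing of inet_protos are assumptions that vary between kernel versions.

    /* Simplified sketch (not the real kernel source): ip_local_deliver_finish()
     * uses the protocol field of the IP header to find the matching
     * net_protocol entry in inet_protos and calls its handler, which for
     * IPPROTO_TCP is tcp_v4_rcv(). */
    static int ip_local_deliver_finish_sketch(struct sk_buff *skb)
    {
        int protocol = ip_hdr(skb)->protocol;   /* IPPROTO_TCP == 6 for TCP */
        const struct net_protocol *ipprot;

        ipprot = rcu_dereference(inet_protos[protocol & (MAX_INET_PROTOS - 1)]);
        if (ipprot != NULL)
            return ipprot->handler(skb);        /* -> tcp_v4_rcv(skb) */

        /* No layer-4 handler registered: the real code sends an ICMP
         * "protocol unreachable" and frees the skb. */
        kfree_skb(skb);
        return 0;
    }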

The entry point for receiving TCP segments is therefore tcp_v4_rcv():
    int tcp_v4_rcv(struct sk_buff *skb)
    {
        const struct iphdr *iph;
        const struct tcphdr *th;
        struct sock *sk;
        int ret;
        struct net *net = dev_net(skb->dev);

        /* Check that the packet is addressed to this host */
        if (skb->pkt_type != PACKET_HOST)
            goto discard_it;

        /* Count it even if it's bad */
        TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

        /* Make sure the packet is at least as long as a basic TCP header */
        if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
            goto discard_it;

        th = tcp_hdr(skb);

        /* Validate the TCP header */
        if (th->doff < sizeof(struct tcphdr) / 4)
            goto bad_packet;
        if (!pskb_may_pull(skb, th->doff * 4))
            goto discard_it;

        /* An explanation is required here, I think.
         * Packet length and doff are validated by header prediction,
         * provided case of th->doff==0 is eliminated.
         * So, we defer the checks. */
        if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
            goto bad_packet;

        /* Save seq, ack_seq, etc. into the skb's TCP control block */
        th = tcp_hdr(skb);
        iph = ip_hdr(skb);
        TCP_SKB_CB(skb)->seq = ntohl(th->seq);
        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
                        skb->len - th->doff * 4);
        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
        TCP_SKB_CB(skb)->when     = 0;
        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
        TCP_SKB_CB(skb)->sacked     = 0;

        /*
         * Look up the socket by source IP, destination IP, source port,
         * destination port and the receiving interface.
         * Two hash tables are involved: one holds established TCP sessions,
         * the other holds listening TCP sessions. These two hash tables
         * will be analyzed later.
         */
        sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
        if (!sk)
            goto no_tcp_socket;

    process:
        /* TIME_WAIT handling, to be studied later */
        if (sk->sk_state == TCP_TIME_WAIT)
            goto do_time_wait;

        if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
            NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
            goto discard_and_relse;
        }

        /* IPsec policy check */
        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
            goto discard_and_relse;
        nf_reset(skb);

        /* Run the socket filter (never used it myself...) */
        if (sk_filter(sk, skb))
            goto discard_and_relse;

        skb->dev = NULL;

        bh_lock_sock_nested(sk);
        ret = 0;
        /*
         * Check whether the socket is currently locked by a user-space
         * process (sock_owned_by_user). If not, this context can process
         * the skb right away; if it is, the skb is appended to the socket's
         * sk_backlog and processed when the owner releases the lock.
         * This differs from UDP because TCP keeps internal state: if the
         * processing of one TCP segment could be interleaved with the
         * processing of another segment for the same socket, the TCP state
         * might change midway and the interrupted processing would fail.
         * This guarantees that the handling of one TCP segment is never
         * interleaved with another for the same socket.
         */
        if (!sock_owned_by_user(sk)) {
    #ifdef CONFIG_NET_DMA
            struct tcp_sock *tp = tcp_sk(sk);
            if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
                tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
            if (tp->ucopy.dma_chan)
                ret = tcp_v4_do_rcv(sk, skb);
            else
    #endif
            {
                if (!tcp_prequeue(sk, skb))
                    ret = tcp_v4_do_rcv(sk, skb);
            }
        } else if (unlikely(sk_add_backlog(sk, skb))) {
            bh_unlock_sock(sk);
            NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
            goto discard_and_relse;
        }
        ...... ......
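The segments parked on sk_backlog above are not lost; they are replayed through the normal receive path once the process that owns the socket releases the lock. Below is a hedged, simplified sketch of that drain, modeled on release_sock()/__release_sock(); the helper name is made up for this article, and the real loop also re-checks the queue and enforces backlog limits.

    /* Simplified sketch of the backlog drain: when the user context calls
     * release_sock(), every skb that softirq context had to queue on
     * sk->sk_backlog is handed to sk->sk_backlog_rcv(), which for TCP is
     * tcp_v4_do_rcv(). */
    static void release_sock_backlog_sketch(struct sock *sk)
    {
        struct sk_buff *skb = sk->sk_backlog.head;

        sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
        while (skb) {
            struct sk_buff *next = skb->next;

            skb->next = NULL;
            sk->sk_backlog_rcv(sk, skb);    /* -> tcp_v4_do_rcv(sk, skb) */
            skb = next;
        }
    }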

Next, tcp_v4_do_rcv():

    int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
    {
        struct sock *rsk;
    #ifdef CONFIG_TCP_MD5SIG
        /*
         * We really want to reject the packet as early as possible
         * if:
         * o We're expecting an MD5'd packet and this is no MD5 tcp option
         * o There is an MD5 option and we're not expecting one
         */
        if (tcp_v4_inbound_md5_hash(sk, skb))
            goto discard;
    #endif

        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
            /* The connection is already established; to be studied later */
            sock_rps_save_rxhash(sk, skb);
            if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
                rsk = sk;
                goto reset;
            }
            return 0;
        }

        if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
            goto csum_err;

        if (sk->sk_state == TCP_LISTEN) {
            /*
             * Handle a TCP connection request, i.e. a segment asking to
             * connect to a local TCP port, and return the socket that
             * should process this skb.
             * For the first SYN, the returned nsk is sk itself.
             */
            struct sock *nsk = tcp_v4_hnd_req(sk, skb);
            if (!nsk)
                goto discard;

            /*
             * As noted above, for the first SYN nsk equals sk, so
             * execution simply continues below.
             */
            if (nsk != sk) {
                sock_rps_save_rxhash(nsk, skb);
                if (tcp_child_process(sk, nsk, skb)) {
                    rsk = nsk;
                    goto reset;
                }
                return 0;
            }
        } else
            sock_rps_save_rxhash(sk, skb);

        if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
            rsk = sk;
            goto reset;
        }
        return 0;
        ...... ......
    }

Next, tcp_rcv_state_process():
    int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                  const struct tcphdr *th, unsigned int len)
    {
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
        int queued = 0;
        int res;

        tp->rx_opt.saw_tstamp = 0;

        switch (sk->sk_state) {
        case TCP_CLOSE:
            goto discard;

        case TCP_LISTEN:
            /* The focus of this article: the first SYN arrives here */

            /* Invalid segment: in LISTEN state only SYN segments are handled */
            if (th->ack)
                return 1;

            if (th->rst)
                goto discard;

            if (th->syn) {
                /* The first SYN */
                if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
                    return 1;

                /* Now we have several options: In theory there is
                 * nothing else in the frame. KA9Q has an option to
                 * send data with the syn, BSD accepts data with the
                 * syn up to the [to be] advertised window and
                 * Solaris 2.1 gives you a protocol error. For now
                 * we just ignore it, that fits the spec precisely
                 * and avoids incompatibilities. It would be nice in
                 * future to drop through and process the data.
                 *
                 * Now that TTCP is starting to be used we ought to
                 * queue this data.
                 * But, this leaves one open to an easy denial of
                 * service attack, and SYN cookies can't defend
                 * against this problem. So, we drop the data
                 * in the interest of security over speed unless
                 * it's still in use.
                 */
                kfree_skb(skb);
                return 0;
            }
            goto discard;
        ......  ......
        ......  ......
    }

For IPv4 TCP, conn_request points to tcp_v4_conn_request():

    int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
    {
        struct tcp_extend_values tmp_ext;
        struct tcp_options_received tmp_opt;
        const u8 *hash_location;
        struct request_sock *req;
        struct inet_request_sock *ireq;
        struct tcp_sock *tp = tcp_sk(sk);
        struct dst_entry *dst = NULL;
        __be32 saddr = ip_hdr(skb)->saddr;
        __be32 daddr = ip_hdr(skb)->daddr;
        __u32 isn = TCP_SKB_CB(skb)->when;
        int want_cookie = 0;

        /* Never answer to SYNs send to broadcast or multicast */
        if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
            goto drop;

        /* TW buckets are converted to open requests without
         * limitations, they conserve resources and peer is
         * evidently real one.
         */
        /* Check whether the SYN queue (the request queue) is already full */
        if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
            /* Decide whether to fall back to SYN cookies */
            want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
            if (!want_cookie)
                goto drop;
        }

        /* Accept backlog is full. If we have already queued enough
         * of warm entries in syn queue, drop request. It is better than
         * clogging syn queue with openreqs with exponentially increasing
         * timeout.
         */
        /* Check whether the accept queue is already full */
        if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
            goto drop;

        /* Allocate a new request_sock */
        req = inet_reqsk_alloc(&tcp_request_sock_ops);
        if (!req)
            goto drop;

    #ifdef CONFIG_TCP_MD5SIG
        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
    #endif

        /* Parse the TCP options */
        tcp_clear_options(&tmp_opt);
        tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
        tmp_opt.user_mss = tp->rx_opt.user_mss;
        tcp_parse_options(skb, &tmp_opt, &hash_location, 0);

        if (tmp_opt.cookie_plus > 0 &&
            tmp_opt.saw_tstamp &&
            !tp->rx_opt.cookie_out_never &&
            (sysctl_tcp_cookie_size > 0 ||
             (tp->cookie_values != NULL &&
              tp->cookie_values->cookie_desired > 0))) {
            /*
             * Not entirely sure what this block is for; it looks cookie
             * related and appears to validate the cookie option.
             */
            u8 *c;
            u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
            int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;

            if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
                goto drop_and_release;

            /* Secret recipe starts with IP addresses */
            *mess++ ^= (__force u32)daddr;
            *mess++ ^= (__force u32)saddr;

            /* plus variable length Initiator Cookie */
            c = (u8 *)mess;
            while (l-- > 0)
                *c++ ^= *hash_location++;

            want_cookie = 0;    /* not our kind of cookie */
            tmp_ext.cookie_out_never = 0; /* false */
            tmp_ext.cookie_plus = tmp_opt.cookie_plus;
        } else if (!tp->rx_opt.cookie_in_always) {
            /* redundant indications, but ensure initialization. */
            tmp_ext.cookie_out_never = 1; /* true */
            tmp_ext.cookie_plus = 0;
        } else {
            goto drop_and_release;
        }
        tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;

        if (want_cookie && !tmp_opt.saw_tstamp)
            tcp_clear_options(&tmp_opt);

        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
        tcp_openreq_init(req, &tmp_opt, skb);

        ireq = inet_rsk(req);
        ireq->loc_addr = daddr;
        ireq->rmt_addr = saddr;
        ireq->no_srccheck = inet_sk(sk)->transparent;
        ireq->opt = tcp_v4_save_options(sk, skb);

        if (security_inet_conn_request(sk, skb, req))
            goto drop_and_free;

        if (!want_cookie || tmp_opt.tstamp_ok)
            TCP_ECN_create_request(req, tcp_hdr(skb));

        if (want_cookie) {
            /* Generate the initial sequence number used by SYN cookies */
            isn = cookie_v4_init_sequence(sk, skb, &req->mss);
            req->cookie_ts = tmp_opt.tstamp_ok;
        } else if (!isn) {
            struct inet_peer *peer = NULL;
            struct flowi4 fl4;

            /* VJ's idea. We save last timestamp seen
             * from the destination in peer table, when entering
             * state TIME-WAIT, and check against it before
             * accepting new connection request.
             *
             * If "isn" is not zero, this request hit alive
             * timewait bucket, so that all the necessary checks
             * are made in the function processing timewait state.
             */
            /* I still do not understand what this check is for... */
            if (tmp_opt.saw_tstamp &&
                tcp_death_row.sysctl_tw_recycle &&
                (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
                fl4.daddr == saddr &&
                (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
                inet_peer_refcheck(peer);
                if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
                    (s32)(peer->tcp_ts - req->ts_recent) >
                                TCP_PAWS_WINDOW) {
                    NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
                    goto drop_and_release;
                }
            }
            /* Kill the following clause, if you dislike this way. */
            else if (!sysctl_tcp_syncookies &&
                 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                  (sysctl_max_syn_backlog >> 2)) &&
                 (!peer || !peer->tcp_ts_stamp) &&
                 (!dst || !dst_metric(dst, RTAX_RTT))) {
                /* Without syncookies last quarter of
                 * backlog is filled with destinations,
                 * proven to be alive.
                 * It means that we continue to communicate
                 * to destinations, already remembered
                 * to the moment of synflood.
                 */
                LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
                           &saddr, ntohs(tcp_hdr(skb)->source));
                goto drop_and_release;
            }

            /* Generate the initial sequence number */
            isn = tcp_v4_init_sequence(skb);
        }
        tcp_rsk(req)->snt_isn = isn;
        tcp_rsk(req)->snt_synack = tcp_time_stamp;

        /* Send the SYN+ACK reply */
        if (tcp_v4_send_synack(sk, dst, req,
                       (struct request_values *)&tmp_ext) ||
            want_cookie)
            goto drop_and_free;

        /* Add the request_sock to listen_opt in the parent socket's
         * icsk_accept_queue */
        inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
        return 0;

    drop_and_release:
        dst_release(dst);
    drop_and_free:
        reqsk_free(req);
    drop:
        return 0;
    }
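The two queue checks near the top of tcp_v4_conn_request() map directly onto what an application sets up from user space: the SYN queue (inet_csk_reqsk_queue_is_full) holds request_socks for handshakes still in progress, while the accept queue (sk_acceptq_is_full) holds fully established connections waiting for accept(), bounded by the backlog argument of listen() and capped by net.core.somaxconn. Here is a minimal, self-contained server sketch showing that user-space side; the port number and the tiny backlog of 8 are arbitrary values chosen for illustration:

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int main(void)
    {
        struct sockaddr_in addr;
        int fd = socket(AF_INET, SOCK_STREAM, 0);   /* a TCP socket */

        if (fd < 0)
            return 1;

        memset(&addr, 0, sizeof(addr));
        addr.sin_family = AF_INET;
        addr.sin_addr.s_addr = htonl(INADDR_ANY);
        addr.sin_port = htons(5555);                /* arbitrary example port */

        if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
            return 1;

        /* listen() moves the socket to TCP_LISTEN; the backlog argument (8
         * here) bounds the accept queue checked by sk_acceptq_is_full(). */
        if (listen(fd, 8) < 0)
            return 1;

        for (;;) {
            /* accept() dequeues a connection whose three-way handshake has
             * completed, i.e. whose request_sock has been promoted to a
             * full socket on the accept queue. */
            int conn = accept(fd, NULL, NULL);
            if (conn < 0)
                continue;
            close(conn);
        }
        return 0;
    }

Once the handshake for a request completes, the request_sock queued by inet_csk_reqsk_queue_hash_add() is promoted to a full child socket and moved to the accept queue, where the accept() call above picks it up.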

Today we only walked through how TCP handles the first SYN, and that alone turned up quite a few things I do not yet understand. More effort is needed; I hope to work out these TCP details soon.

