TCP/IP源码学习(52)——TCP的连接过程的实现(1)
来源:互联网 发布:熊片数据库 编辑:程序博客网 时间:2024/05/16 18:39
http://blog.chinaunix.net/uid-23629988-id-3178006.html
作者:gfree.wind@gmail.com
博客:blog.focus-linux.net linuxfocus.blog.chinaunix.net
博客:blog.focus-linux.net linuxfocus.blog.chinaunix.net
本文的copyleft归gfree.wind@gmail.com所有,使用GPL发布,可以自由拷贝,转载。但转载请保持文档的完整性,注明原作者及原链接,严禁用于任何商业用途。
======================================================================================================
在以前的文章中,学习了UDP数据包的接收和发送。今天开始研究一下TCP数据包的接收。与UDP数据包类似,当IP数据包到达ip_local_deliver_finish函数时,根据四层协议号从inet_protos数组中得到TCP协议对应的tcp_protocol。
/*
 * Protocol descriptor for TCP over IPv4.
 * .handler is the receive entry point (tcp_v4_rcv) and .err_handler is
 * tcp_v4_err; the gso_ and gro_ members name the TCP segmentation/receive
 * offload callbacks.
 */
static const struct net_protocol tcp_protocol = {
	.handler = tcp_v4_rcv,
	.err_handler = tcp_v4_err,
	.gso_send_check = tcp_v4_gso_send_check,
	.gso_segment = tcp_tso_segment,
	.gro_receive = tcp4_gro_receive,
	.gro_complete = tcp4_gro_complete,
	.no_policy = 1,
	.netns_ok = 1,
};
那么TCP数据包的接收函数入口即为tcp_v4_rcv
/*
 * Receive entry point for IPv4 TCP segments (installed as
 * tcp_protocol.handler).
 *
 * The part shown here validates the header, fills the per-skb TCP control
 * block, looks up the owning socket and then either processes the segment
 * via tcp_v4_do_rcv() or queues it on the socket backlog.
 *
 * NOTE: this is an excerpt; the remainder of the function (including the
 * targets of the goto labels used below) was elided by the article author
 * and is marked with "......".
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;
	int ret;
	struct net *net = dev_net(skb->dev);

	/* Only handle packets addressed to this host. */
	if (skb->pkt_type != PACKET_HOST)
		goto discard_it;

	/* Count it even if it's bad */
	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);

	/* The packet must hold at least a basic TCP header. */
	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
		goto discard_it;

	th = tcp_hdr(skb);

	/* Validate the TCP header: data offset must cover the basic header. */
	if (th->doff < sizeof(struct tcphdr) / 4)
		goto bad_packet;
	if (!pskb_may_pull(skb, th->doff * 4))
		goto discard_it;

	/* An explanation is required here, I think.
	 * Packet length and doff are validated by header prediction,
	 * provided case of th->doff==0 is eliminated.
	 * So, we defer the checks. */
	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
		goto bad_packet;

	/* Record seq, end_seq, ack_seq etc. in the skb's TCP control block
	 * (TCP_SKB_CB). The header pointers are re-read after the pulls above. */
	th = tcp_hdr(skb);
	iph = ip_hdr(skb);
	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
				    skb->len - th->doff * 4);
	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
	TCP_SKB_CB(skb)->when = 0;
	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
	TCP_SKB_CB(skb)->sacked = 0;

	/* Find the socket for this segment.
	 * Article author's note: the lookup uses source IP, destination IP,
	 * source port, destination port and the receiving interface, and
	 * involves two hash tables (established sessions and listening
	 * sessions); those tables are to be analysed in a later article. */
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
	if (!sk)
		goto no_tcp_socket;

process:
	/* TIME_WAIT sockets are handled at do_time_wait (not covered here). */
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	/* IPsec (xfrm) inbound policy check. */
	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;
	nf_reset(skb);

	/* Socket filter: drop the packet when sk_filter() returns non-zero.
	 * (The article author notes not having used this feature.) */
	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock_nested(sk);
	ret = 0;
	/* If the socket is not currently owned by a user
	 * (sock_owned_by_user() is false) the skb is processed right away;
	 * otherwise it is appended to the socket backlog with
	 * sk_add_backlog().
	 * Article author's note: this differs from UDP because TCP keeps
	 * internal state. Handling a second segment in the middle of
	 * handling the first could change that state and make the
	 * interrupted processing fail, so the processing of one TCP segment
	 * must not be interrupted by another. */
	if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
		struct tcp_sock *tp = tcp_sk(sk);
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
		if (tp->ucopy.dma_chan)
			ret = tcp_v4_do_rcv(sk, skb);
		else
#endif
		{
			if (!tcp_prequeue(sk, skb))
				ret = tcp_v4_do_rcv(sk, skb);
		}
	} else if (unlikely(sk_add_backlog(sk, skb))) {
		/* Backlog refused the skb: unlock, count the drop, discard. */
		bh_unlock_sock(sk);
		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
		goto discard_and_relse;
	}
	...... ......
进入tcp_v4_do_rcv
/*
 * Process one TCP segment for socket sk, dispatching on sk->sk_state:
 *   - TCP_ESTABLISHED: fast path through tcp_rcv_established();
 *   - TCP_LISTEN: tcp_v4_hnd_req() picks the socket that should handle the
 *     skb; a different socket (nsk != sk) is served by tcp_child_process();
 *   - everything else (and LISTEN with nsk == sk): tcp_rcv_state_process().
 *
 * Returns 0 on every path shown. When one of the processing helpers returns
 * non-zero, control jumps to the reset label with rsk set.
 *
 * NOTE: excerpt; the reset/discard/csum_err label bodies were elided by the
 * article author ("......").
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * We really want to reject the packet as early as possible
	 * if:
	 *  o We're expecting an MD5'd packet and this is no MD5 tcp option
	 *  o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard;
#endif

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		/* Connection already established; left for a later article. */
		sock_rps_save_rxhash(sk, skb);
		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
			rsk = sk;
			goto reset;
		}
		return 0;
	}

	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		/* Handle a TCP request segment, i.e. one asking to connect to
		 * a local TCP port, and obtain the socket that should process
		 * this skb.
		 * Article author's note: for the first SYN the returned nsk
		 * is sk itself. */
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		/* As noted above, for the first SYN nsk == sk, so this branch
		 * is skipped and execution continues below. */
		if (nsk != sk) {
			sock_rps_save_rxhash(nsk, skb);
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}
	return 0;
	...... ......
}
进入tcp_rcv_state_process
/*
 * State-machine processing of a segment, keyed on sk->sk_state.
 *
 * Only the TCP_CLOSE and TCP_LISTEN cases are shown. In LISTEN:
 *   - ACK set            -> return 1;
 *   - RST set            -> discard;
 *   - SYN set            -> call icsk->icsk_af_ops->conn_request(); return 1
 *                           if it reports a negative value, otherwise free
 *                           the skb and return 0;
 *   - anything else      -> discard.
 * The caller shown in this article (tcp_v4_do_rcv) jumps to its reset label
 * when this function returns non-zero.
 *
 * NOTE: excerpt; the remaining states and the discard label were elided by
 * the article author ("......").
 */
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
			  const struct tcphdr *th, unsigned int len)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int queued = 0;
	int res;

	tp->rx_opt.saw_tstamp = 0;

	switch (sk->sk_state) {
	case TCP_CLOSE:
		goto discard;

	case TCP_LISTEN:
		/* Focus of this article: the first SYN segment arrives here. */
		/* A segment carrying ACK is invalid in LISTEN; only SYN
		 * segments are processed in this state. */
		if (th->ack)
			return 1;

		if (th->rst)
			goto discard;

		if (th->syn) {
			/* The first SYN segment. */
			if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
				return 1;

			/* Now we have several options: In theory there is
			 * nothing else in the frame. KA9Q has an option to
			 * send data with the syn, BSD accepts data with the
			 * syn up to the [to be] advertised window and
			 * Solaris 2.1 gives you a protocol error. For now
			 * we just ignore it, that fits the spec precisely
			 * and avoids incompatibilities. It would be nice in
			 * future to drop through and process the data.
			 *
			 * Now that TTCP is starting to be used we ought to
			 * queue this data.
			 * But, this leaves one open to an easy denial of
			 * service attack, and SYN cookies can't defend
			 * against this problem. So, we drop the data
			 * in the interest of security over speed unless
			 * it's still in use.
			 */
			kfree_skb(skb);
			return 0;
		}
		goto discard;
	...... ...... ...... ......
}
对于IPv4的TCP数据包,conn_request为tcp_v4_conn_request
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct tcp_extend_values tmp_ext; struct tcp_options_received tmp_opt; const u8 *hash_location; struct request_sock *req; struct inet_request_sock *ireq; struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = NULL; __be32 saddr = ip_hdr(skb)->saddr; __be32 daddr = ip_hdr(skb)->daddr; __u32 isn = TCP_SKB_CB(skb)->when; int want_cookie = 0; /* Never answer to SYNs send to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto drop; /* TW buckets are converted to open requests without * limitations, they conserve resources and peer is * evidently real one. */ //检查syn queue是否已满,即request queue是否已满 if (inet_csk_reqsk_queue_is_full(sk) && !isn) { /* 是否使用sync cookie */ want_cookie = tcp_syn_flood_action(sk, skb, "TCP"); if (!want_cookie) goto drop; } /* Accept backlog is full. If we have already queued enough * of warm entries in syn queue, drop request. It is better than * clogging syn queue with openreqs with exponentially increasing * timeout. 
*/ //检查accept queue是否已满 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; //申请一个新的request_sock req = inet_reqsk_alloc(&tcp_request_sock_ops); if (!req) goto drop; #ifdef CONFIG_TCP_MD5SIG tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops; #endif //解析TCP的option tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = TCP_MSS_DEFAULT; tmp_opt.user_mss = tp->rx_opt.user_mss; tcp_parse_options(skb, &tmp_opt, &hash_location, 0); if (tmp_opt.cookie_plus > 0 && tmp_opt.saw_tstamp && !tp->rx_opt.cookie_out_never && (sysctl_tcp_cookie_size > 0 || (tp->cookie_values != NULL && tp->cookie_values->cookie_desired > 0))) { /* 不太确定这部分代码的用途,看上去跟sync cookie相关 貌似是为了检查sync-cookie。 */ u8 *c; u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS]; int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE; if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0) goto drop_and_release; /* Secret recipe starts with IP addresses */ *mess++ ^= (__force u32)daddr; *mess++ ^= (__force u32)saddr; /* plus variable length Initiator Cookie */ c = (u8 *)mess; while (l-- > 0) *c++ ^= *hash_location++; want_cookie = 0; /* not our kind of cookie */ tmp_ext.cookie_out_never = 0; /* false */ tmp_ext.cookie_plus = tmp_opt.cookie_plus; } else if (!tp->rx_opt.cookie_in_always) { /* redundant indications, but ensure initialization. 
*/ tmp_ext.cookie_out_never = 1; /* true */ tmp_ext.cookie_plus = 0; } else { goto drop_and_release; } tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always; if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb); ireq = inet_rsk(req); ireq->loc_addr = daddr; ireq->rmt_addr = saddr; ireq->no_srccheck = inet_sk(sk)->transparent; ireq->opt = tcp_v4_save_options(sk, skb); if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; if (!want_cookie || tmp_opt.tstamp_ok) TCP_ECN_create_request(req, tcp_hdr(skb)); if (want_cookie) { /* 生成sync cookie使用的Initial sequence numnber */ isn = cookie_v4_init_sequence(sk, skb, &req->mss); req->cookie_ts = tmp_opt.tstamp_ok; } else if (!isn) { struct inet_peer *peer = NULL; struct flowi4 fl4; /* VJ's idea. We save last timestamp seen * from the destination in peer table, when entering * state TIME-WAIT, and check against it before * accepting new connection request. * * If "isn" is not zero, this request hit alive * timewait bucket, so that all the necessary checks * are made in the function processing timewait state. */ /* 还是不懂这块的检查是为了什么。。。*/ if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle && (dst = inet_csk_route_req(sk, &fl4, req)) != NULL && fl4.daddr == saddr && (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) { inet_peer_refcheck(peer); if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); goto drop_and_release; } } /* Kill the following clause, if you dislike this way. */ else if (!sysctl_tcp_syncookies && (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < (sysctl_max_syn_backlog >> 2)) && (!peer || !peer->tcp_ts_stamp) && (!dst || !dst_metric(dst, RTAX_RTT))) { /* Without syncookies last quarter of * backlog is filled with destinations, * proven to be alive. 
* It means that we continue to communicate * to destinations, already remembered * to the moment of synflood. */ LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n", &saddr, ntohs(tcp_hdr(skb)->source)); goto drop_and_release; } /* 生成Initial Sequence Number */ isn = tcp_v4_init_sequence(skb); } tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->snt_synack = tcp_time_stamp; /* 回复syn+ack包 */ if (tcp_v4_send_synack(sk, dst, req, (struct request_values *)&tmp_ext) || want_cookie) goto drop_and_free; /* 将该request_sock添加到父socket的icsk_accept_queue中的listen_opt上 */ inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); return 0; drop_and_release: dst_release(dst); drop_and_free: reqsk_free(req); drop: return 0; }
今天仅仅学习了一下TCP处理第一个SYN包的过程,就发现了很多不明白的地方,还需要继续努力啊。争取早日把TCP的这些细节搞懂。
0 0
- TCP/IP源码学习(52)——TCP的连接过程的实现(1)
- TCP/IP源码学习(52)——TCP的连接过程的实现(1)
- TCP/IP源码学习(53)——TCP的连接过程的实现(2)
- TCP/IP源码学习(54)——TCP的连接过程的实现(3)
- TCP/IP 建立连接的过程
- LwIP协议栈源码详解——TCP/IP协议的实现 TCP定时器
- 多连接的tcp/ip程序实现
- TCP/IP详解学习 -- TCP连接的建立与终止
- TCP/IP 建立连接、断开连接的过程
- TCP/IP 建立连接、断开连接的过程
- TCP/IP 建立连接、断开连接的过程
- TCP/IP 建立连接、断开连接的过程
- TCP学习(3)--TCP释放连接的过程(四次挥手)
- TCP/IP (四) TCP连接的关闭
- TCP/IP连接的建立
- TCP/IP连接的建立
- TCP/IP 建立连接的过程(3-way shake)
- TCP/IP 建立连接的过程?(3-way shake)
- 论客户端应用架构
- 相机拍照功能之权限和Android6.0版本问题
- 第五周项目一(建立顺序栈算法库)
- Netty权威指南读书笔记-第一章
- intellij IDEA配置tomcat
- TCP/IP源码学习(52)——TCP的连接过程的实现(1)
- Java异常机制
- WordPress域名问题记录
- jQuery验证控件jquery.validate
- 微服务模式系列之二:微服务架构
- 在Nginx服务器中设置多个站点
- Cookie
- 23种设计模式,UML图
- 《iOS Run Loop 线下分享》的简单总结