3.1 Connect系统调用

来源:互联网 发布:mac的usb接口没反应 编辑:程序博客网 时间:2024/06/09 21:26

  在server端完成listen系统调用之后,就处于可以接受请求的状态,这时client端就可以发送SYN请求给server从而开始“三次握手”。SYN请求的发送是通过connect系统调用实现的:

int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
  其中,sockfd是使用socket系统调用创建的socket的文件描述符,addr为要连接的服务器端地址,addrlen为地址长度。addr的内容与addrlen的大小必须与服务器端bind系统调用的addr和addrlen参数相符,否则连接无法建立。
  connect系统调用对应的内核代码如下:
1658 SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,                                                                    1659         int, addrlen)1660 {1661     struct socket *sock;1662     struct sockaddr_storage address;1663     int err, fput_needed;1664 1665     sock = sockfd_lookup_light(fd, &err, &fput_needed);           //根据描述符查找socket                                                                        1666     if (!sock)1667         goto out;1668     err = move_addr_to_kernel(uservaddr, addrlen, &address);         //将地址信息由用户态copy到内核                                     1669     if (err < 0)1670         goto out_put;1671 1672     err =1673         security_socket_connect(sock, (struct sockaddr *)&address, addrlen);        //安全控制                1674     if (err)1675         goto out_put;1676 1677     err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen,          //connect系统调用核心函数                                    1678                  sock->file->f_flags);1679 out_put:1680     fput_light(sock->file, fput_needed);                                                                                                  1681 out:1682     return err;1683 }
  sock->ops->connect指向的函数为inet_stream_connect,它封装了__inet_stream_connect函数:

 599 int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, 600               int addr_len, int flags) 601 { 602     struct sock *sk = sock->sk; 603     int err; 604     long timeo;... 631         err = sk->sk_prot->connect(sk, uaddr, addr_len); //调用TCP协议指定的connect函数,TCPv4是tcp_v4_connect,TCPv6是tcp_v6_connect 632         if (err < 0)      633             goto out;     634 635         sock->state = SS_CONNECTING; 636 637         /* Just entered SS_CONNECTING state; the only 638          * difference is that return value in non-blocking 639          * case is EINPROGRESS, rather than EALREADY. 640          */ 641         err = -EINPROGRESS; 642         break; 643     } 644 645     timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); //如果socket是非阻塞的,则获取timeo 646 647     if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { 648         int writebias = (sk->sk_protocol == IPPROTO_TCP) && 649                 tcp_sk(sk)->fastopen_req && 650                 tcp_sk(sk)->fastopen_req->data ? 1 : 0; 651 652         /* Error code is set above */ 653         if (!timeo || !inet_wait_for_connect(sk, timeo, writebias)) //如果timeo不为0,则会调用inet_wait_for_connect一直等待到连接成功或超时 654             goto out; 655 656         err = sock_intr_errno(timeo); 657         if (signal_pending(current)) 658             goto out; 659     } 660 661     /* Connection was closed by RST, timeout, ICMP error 662      * or another process disconnected us. 663      */ 664     if (sk->sk_state == TCP_CLOSE) 665         goto sock_error; 666 667     /* sk->sk_err may be not zero now, if RECVERR was ordered by user 668      * and error was received after socket entered established state. 669      * Hence, it is handled normally after connect() return successfully. 670      */ 671 672     sock->state = SS_CONNECTED; 673     err = 0; 674 out: 675     return err; 676 677 sock_error: 678     err = sock_error(sk) ? 
: -ECONNABORTED; 679     sock->state = SS_UNCONNECTED; 680     if (sk->sk_prot->disconnect(sk, flags)) 681         sock->state = SS_DISCONNECTING; 682     goto out; 683 }
  __inet_stream_connect函数除了调用TCP指定的connect函数完成connect系统调用的核心功能外,还会调用sock_sndtimeo获取阻塞时间timeo。如果socket是非阻塞的,则timeo是0;否则为可以通过setsockopt函数的SO_SNDTIMEO选项来设置的值,这个值默认是MAX_SCHEDULE_TIMEOUT。socket默认都是阻塞的。如果timeo非零则connect系统调用会睡眠,直到超时,或被对端的消息(SYN|ACK,ICMP等)唤醒。
  下面来看核心函数tcp_v4_connect

 142 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 143 { 144     struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 145     struct inet_sock *inet = inet_sk(sk); 146     struct tcp_sock *tp = tcp_sk(sk); 147     __be16 orig_sport, orig_dport; 148     __be32 daddr, nexthop;... 160     nexthop = daddr = usin->sin_addr.s_addr; 161     inet_opt = rcu_dereference_protected(inet->inet_opt, 162                          sock_owned_by_user(sk)); 163     if (inet_opt && inet_opt->opt.srr) { 164         if (!daddr) 165             return -EINVAL; 166         nexthop = inet_opt->opt.faddr; 167     } 168  169     orig_sport = inet->inet_sport; 170     orig_dport = usin->sin_port; 171     fl4 = &inet->cork.fl.u.ip4; 172     rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 173                   RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 174                   IPPROTO_TCP, 175                   orig_sport, orig_dport, sk, true);... 191     if (!inet->inet_saddr) //如果源IP地址没有确定,则使用路由查询的结果 192         inet->inet_saddr = fl4->saddr; 193     inet->inet_rcv_saddr = inet->inet_saddr;... 207     inet->inet_dport = usin->sin_port; 208     inet->inet_daddr = daddr;... 221     tcp_set_state(sk, TCP_SYN_SENT); 222     err = inet_hash_connect(&tcp_death_row, sk);  //绑定IP地址和端口,并将socket加入到连接表中 223     if (err) 224         goto failure; 225  226     rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 227                    inet->inet_sport, inet->inet_dport, sk);  //用新的源端口再次做路由查询 228     if (IS_ERR(rt)) { 229         err = PTR_ERR(rt); 230         rt = NULL; 231         goto failure; 232     } 233     /* OK, now commit destination to socket.  
*/ 234     sk->sk_gso_type = SKB_GSO_TCPV4; 235     sk_setup_caps(sk, &rt->dst); 236  237     if (!tp->write_seq && likely(!tp->repair)) 238         tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, 239                                inet->inet_daddr, 240                                inet->inet_sport, 241                                usin->sin_port); //使用加密的方法生成初始序列号... 245     err = tcp_connect(sk);    //发送SYN请求 246  247     rt = NULL; 248     if (err) 249         goto failure; 250  251     return 0;
  此函数小于208行的部分是确定socket的源/目的IP和源/目的端口。目的IP和目的端口是由connect系统调用的入参指定,如果没有调用过bind的话,源IP是由路由查询的结果决定。影响路由查询结果的主要因素是目的IP,端口如何影响查询结果目前尚不清楚。源端口如果是0(没有bind)则会在inet_hash_connect函数中指定,然后用新的端口再次进行路由查询(226-227行)。而新的查询结果不影响源IP的选取,由此看来源端口的值不会影响源IP的选择。

  secure_tcp_sequence_number函数用加密并与时间关联的方法生成初始序列号,以保证一个TCP连接的初始序列号很难被猜到,而且即使先后多次发起源/目的IP和源/目的端口都一样的连接,它们的起始序列号也都不一样。前者是为了避免黑客攻击,后者是为了减小旧的报文混入新连接的概率。

  第245行调用的tcp_connect函数用于构建并发送一个SYN请求。
  下面总结一下connect系统调用的功能:

1、设置socket的源/目的IP和源/目的端口;

2、设置socket的状态为SYN_SENT;

3、将socket加入到连接表中等待后续TCP报文到来时进行匹配;

4、选取初始序列号;

5、构建并发送SYN请求报文。

  至此,还有两个关键函数:inet_hash_connect和tcp_connect函数我们没有分析。抱着喜欢探究秘密的好奇心,我们一起来看看这两个函数是怎样工作的:

589 int inet_hash_connect(struct inet_timewait_death_row *death_row,590               struct sock *sk)591 {592     return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),593             __inet_check_established, __inet_hash_nolisten);594 }
  __inet_hash_connect函数:

477 int __inet_hash_connect(struct inet_timewait_death_row *death_row,478         struct sock *sk, u32 port_offset,479         int (*check_established)(struct inet_timewait_death_row *,480             struct sock *, __u16, struct inet_timewait_sock **),481         int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))482 {483     struct inet_hashinfo *hinfo = death_row->hashinfo;484     const unsigned short snum = inet_sk(sk)->inet_num;485     struct inet_bind_hashbucket *head;486     struct inet_bind_bucket *tb;487     int ret;488     struct net *net = sock_net(sk);489     int twrefcnt = 1;490 491     if (!snum) {  //端口未绑定492         int i, remaining, low, high, port;493         static u32 hint;494         u32 offset = hint + port_offset;495         struct inet_timewait_sock *tw = NULL;496 497         inet_get_local_port_range(&low, &high);498         remaining = (high - low) + 1;499 500         local_bh_disable();501         for (i = 1; i <= remaining; i++) {502             port = low + (i + offset) % remaining;503             if (inet_is_reserved_local_port(port))504                 continue;505             head = &hinfo->bhash[inet_bhashfn(net, port,506                     hinfo->bhash_size)];507             spin_lock(&head->lock);508 509             /* Does not bother with rcv_saddr checks,510              * because the established check is already511              * unique enough.512              */ //绑定到一个port的socket可能是通过bind 系统调用,也可能是调用connect系统调用时__inet_hash_connect函数选取的。513             inet_bind_bucket_for_each(tb, &head->chain) {  //遍历所有绑定到此端口的socket514                 if (net_eq(ib_net(tb), net) &&515                     tb->port == port) {516                     if (tb->fastreuse >= 0 ||517                         tb->fastreuseport >= 0)518                         goto next_port;519                     WARN_ON(hlist_empty(&tb->owners));520                     if (!check_established(death_row, sk,    521                                 port, 
&tw))522                         goto ok; //没有冲突523                     goto next_port;524                 }525             }526      //当前端口没有被使用527             tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,528                     net, head, port);529             if (!tb) {530                 spin_unlock(&head->lock);531                 break;532             }533             tb->fastreuse = -1;534             tb->fastreuseport = -1;535             goto ok;536 537         next_port:538             spin_unlock(&head->lock);539         }540         local_bh_enable();541 542         return -EADDRNOTAVAIL;543 544 ok:545         hint += i;546 547         /* Head lock still held and bh's disabled */548         inet_bind_hash(sk, tb, port);  //将socket加入port对应的tb的socket队列中,即将此socket与port相关联549         if (sk_unhashed(sk)) {  //如果socket没有被加入到“已建立连接”的连接表中550             inet_sk(sk)->inet_sport = htons(port);551             twrefcnt += hash(sk, tw);  //将socket加入到“已建立连接”的连接表中552         }553         if (tw)554             twrefcnt += inet_twsk_bind_unhash(tw, hinfo);555         spin_unlock(&head->lock);556 557         if (tw) {558             inet_twsk_deschedule(tw, death_row);559             while (twrefcnt) {560                 twrefcnt--;561                 inet_twsk_put(tw);562             }563         }564 565         ret = 0;566         goto out;567     }568 569     head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];570     tb  = inet_csk(sk)->icsk_bind_hash;  //将tb加入到bind hash表中 571     spin_lock_bh(&head->lock);572     if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { //有且仅有一个socket绑定到这个端口,无需冲突检查573         hash(sk, NULL); //将socket加入到“已建立连接”的连接表中574         spin_unlock_bh(&head->lock);575         return 0;576     } else { //否则检查是否冲突,577         spin_unlock(&head->lock);578         /* No definite answer... 
Walk to established hash table */579         ret = check_established(death_row, sk, snum, NULL); //这个检查与520行不重合,因为能走到这里说明端口一定是非零的580 out:581         local_bh_enable();582         return ret;583     }584 }
  __inet_hash_connect的主要功能与bind系统调用中的inet_csk_get_port类似,都是:

1、如果没有选取端口则选定一个;

2、将socket与端口绑定;

3、将socket加入到连接表中(这个功能inet_csk_get_port没有)。

  另外一点不同是:inet_csk_get_port进行冲突检查时关注的是绑定冲突,而__inet_hash_connect检查的是当前socket是否与“已建立连接的socket”的冲突。__inet_hash_connect检查冲突的函数是__inet_check_established:

311 static int __inet_check_established(struct inet_timewait_death_row *death_row,312                     struct sock *sk, __u16 lport,313                     struct inet_timewait_sock **twp)314 {315     struct inet_hashinfo *hinfo = death_row->hashinfo;316     struct inet_sock *inet = inet_sk(sk);317     __be32 daddr = inet->inet_rcv_saddr;318     __be32 saddr = inet->inet_daddr;319     int dif = sk->sk_bound_dev_if;320     INET_ADDR_COOKIE(acookie, saddr, daddr)321     const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);322     struct net *net = sock_net(sk);323     unsigned int hash = inet_ehashfn(net, daddr, lport,324                      saddr, inet->inet_dport);325     struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);  //找到连接表中的表项...334     /* Check TIME-WAIT sockets first. */335     sk_nulls_for_each(sk2, node, &head->twchain) {336         if (sk2->sk_hash != hash)337             continue;338 339         if (likely(INET_TW_MATCH(sk2, net, acookie,340                      saddr, daddr, ports, dif))) {341             tw = inet_twsk(sk2);342             if (twsk_unique(sk, sk2, twp))343                 goto unique;344             else345                 goto not_unique;346         }347     }348     tw = NULL;349 350     /* And established part... */351     sk_nulls_for_each(sk2, node, &head->chain) {352         if (sk2->sk_hash != hash)353             continue;354         if (likely(INET_MATCH(sk2, net, acookie,355                       saddr, daddr, ports, dif)))356             goto not_unique;357     }358 359 unique:360     /* Must record num and sport now. Otherwise we will see361      * in hash table socket with a funny identity. */362     inet->inet_num = lport;363     inet->inet_sport = htons(lport);364     sk->sk_hash = hash;...
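  可见__inet_check_established先检查TIME_WAIT表,然后再检查establish表。与establish表中的socket冲突是不允许的;与TIME_WAIT表中的socket冲突时,只有twsk_unique判定该TIME_WAIT socket可以被复用才允许(跳转到unique),否则同样视为冲突(跳转到not_unique)。后面的章节中会介绍TIME_WAIT功能。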

  在listen系统调用中,inet_hash函数会将socket加入到listen连接表中:

426 static void __inet_hash(struct sock *sk)427 {428     struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;429     struct inet_listen_hashbucket *ilb;430 431     if (sk->sk_state != TCP_LISTEN) {432         __inet_hash_nolisten(sk, NULL);433         return;434     }435 436     WARN_ON(!sk_unhashed(sk));437     ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];438 439     spin_lock(&ilb->lock);440     __sk_nulls_add_node_rcu(sk, &ilb->head);441     sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);442     spin_unlock(&ilb->lock);443 }444 445 void inet_hash(struct sock *sk)446 {447     if (sk->sk_state != TCP_CLOSE) {448         local_bh_disable();449         __inet_hash(sk);450         local_bh_enable();451     }452 }

  在__inet_hash_connect函数中,加入socket到连接表的函数是__inet_hash_nolisten:

399 int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)400 {401     struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;402     struct hlist_nulls_head *list;403     spinlock_t *lock;404     struct inet_ehash_bucket *head;405     int twrefcnt = 0;406 407     WARN_ON(!sk_unhashed(sk));408 409     sk->sk_hash = inet_sk_ehashfn(sk);410     head = inet_ehash_bucket(hashinfo, sk->sk_hash);411     list = &head->chain;412     lock = inet_ehash_lockp(hashinfo, sk->sk_hash);413 414     spin_lock(lock);415     __sk_nulls_add_node_rcu(sk, list);416     if (tw) {417         WARN_ON(sk->sk_hash != tw->tw_hash);418         twrefcnt = inet_twsk_unhash(tw);419     }420     spin_unlock(lock);421     sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);422     return twrefcnt;423 }

159 static inline struct inet_ehash_bucket *inet_ehash_bucket(160     struct inet_hashinfo *hashinfo,161     unsigned int hash)162 {   163     return &hashinfo->ehash[hash & hashinfo->ehash_mask];164 }
  可见server端的socket在进行listen系统调用后被加入到sk->sk_prot->h.hashinfo->listening_hash中,client端的socket在进行connect系统调用后被加入到sk->sk_prot->h.hashinfo->ehash中,而对于TCPv4和TCPv6,sk->sk_prot->h.hashinfo指向的都是tcp_hashinfo。

  再看tcp_connect函数:

2925 int tcp_connect(struct sock *sk)2926 {2927     struct tcp_sock *tp = tcp_sk(sk);2928     struct sk_buff *buff;2929     int err;2930 2931     tcp_connect_init(sk);  //初始化TCP连接信息2932 2933     if (unlikely(tp->repair)) {2934         tcp_finish_connect(sk, NULL);2935         return 0;2936     }2937 2938     buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);  //构建SKB用于承载TCP报文2939     if (unlikely(buff == NULL))2940         return -ENOBUFS;2941 2942     /* Reserve space for headers. */2943     skb_reserve(buff, MAX_TCP_HEADER);2944 2945     tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);  //初始化不带数据的TCP SYN报文,SYN包的seq为tp->write_seq2946     tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp;2947     tcp_connect_queue_skb(sk, buff);  //将报文放入发送队列中2948     TCP_ECN_send_syn(sk, buff);2949 2950     /* Send off SYN; include data in Fast Open. */2951     err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :  //如果TFO功能开启,则用tcp_send_syn_data函数发送带数据的SYN2952           tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);      //否则用tcp_transmit_skb发送之前构建好的不带数据的SYN2953     if (err == -ECONNREFUSED)2954         return err;2955 2956     /* We change tp->snd_nxt after the tcp_transmit_skb() call2957      * in order to make this packet get counted in tcpOutSegs.2958      */2959     tp->snd_nxt = tp->write_seq;  //snd_nxt = seq + 12960     tp->pushed_seq = tp->write_seq;2961     TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);2962 2963     /* Timer for repeating the SYN until an answer. */2964     inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,2965                   inet_csk(sk)->icsk_rto, TCP_RTO_MAX);  //设置重传定时器,如果在超时之前没有收到SYN|ACK,则重传SYN2966     return 0;2967 }

  关于TFO的内容后续章节有详细分析。现在看TFO不开启的情况,这时SYN的发送靠tcp_transmit_skb函数完成:

 828 static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, 829                 gfp_t gfp_mask) 830 { 831     const struct inet_connection_sock *icsk = inet_csk(sk); 832     struct inet_sock *inet; 833     struct tcp_sock *tp; 834     struct tcp_skb_cb *tcb; 835     struct tcp_out_options opts; 836     unsigned int tcp_options_size, tcp_header_size; 837     struct tcp_md5sig_key *md5; 838     struct tcphdr *th; 839     int err;...    //clone_it大多数情况下都会为非零,因为TCP的大部分报文都是先放到发送队列中,发送的时候再复制一份 849     if (likely(clone_it)) {    850         const struct sk_buff *fclone = skb + 1; 851  852         if (unlikely(skb->fclone == SKB_FCLONE_ORIG && 853                  fclone->fclone == SKB_FCLONE_CLONE)) 854             NET_INC_STATS_BH(sock_net(sk), 855                      LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); 856  857         if (unlikely(skb_cloned(skb))) 858             skb = pskb_copy(skb, gfp_mask); 859         else 860             skb = skb_clone(skb, gfp_mask); 861         if (unlikely(!skb)) 862             return -ENOBUFS; 863     } 864  865     inet = inet_sk(sk); 866     tp = tcp_sk(sk); 867     tcb = TCP_SKB_CB(skb); 868     memset(&opts, 0, sizeof(opts)); 869  870     if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) 871         tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); //构建SYN报文的选项字段 872     else 873         tcp_options_size = tcp_established_options(sk, skb, &opts, 874                                &md5); 875     tcp_header_size = tcp_options_size + sizeof(struct tcphdr);... 885     skb_push(skb, tcp_header_size);  //将skb数据指针向报文的开始部分移动TCP头的大小 886     skb_reset_transport_header(skb); //设置TCP首部指针 887  888     skb_orphan(skb); 889     skb->sk = sk; 890     skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ? 891               tcp_wfree : sock_wfree; 892     atomic_add(skb->truesize, &sk->sk_wmem_alloc); 893  894     /* Build TCP header and checksum it. 
*/ 895     th = tcp_hdr(skb); 896     th->source      = inet->inet_sport; 897     th->dest        = inet->inet_dport; 898     th->seq         = htonl(tcb->seq); 899     th->ack_seq     = htonl(tp->rcv_nxt); 900     *(((__be16 *)th) + 6)   = htons(((tcp_header_size >> 2) << 12) | 901                     tcb->tcp_flags); 902  //设置窗口大小 903     if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) { 904         /* RFC1323: The window in SYN & SYN/ACK segments 905          * is never scaled. 906          */ 907         th->window  = htons(min(tp->rcv_wnd, 65535U)); 908     } else { 909         th->window  = htons(tcp_select_window(sk)); 910     } 911     th->check       = 0; 912     th->urg_ptr     = 0;...  //将选项信息写入TCP报文段 925     tcp_options_write((__be32 *)(th + 1), tp, &opts);... 938     icsk->icsk_af_ops->send_check(sk, skb);  //调用tcp_v4_send_check函数计算检验和... 950     err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl); //调用ip_queue_xmit函数将报文段向下层传递 951     if (likely(err <= 0)) 952         return err; 953  954     tcp_enter_cwr(sk, 1); 955  956     return net_xmit_eval(err); 957 }
  下层函数ip_queue_xmit的功能是构建IP报头、二层报头,并将报文直接发送到网卡驱动中或放入队列中等待发送。限于本文的目的,TCP下层的数据处理就不做分析。 
  发送完SYN后,connect系统调用会直接返回(非阻塞socket,timeo为0),或等待SYN|ACK的到来(阻塞socket)。server端在收到SYN请求后会如何处理呢?下回分解!

1 0
原创粉丝点击