实现原理-Connect

来源:互联网 发布:山西教师网络研修网 编辑:程序博客网 时间:2024/06/06 02:13

最近遇到一个问题:阻塞式 connect 在三次握手过程中,如果客户端自身 IP 发生变化,connect 系统调用的阻塞时间会过长。除了将 connect 改为非阻塞方式之外,还跟踪学习了 connect 在 socket 层及 TCP 层的实现,在此简单归纳整理。

Socket层

/*
 * connect(2) system call entry point.
 *
 * Resolves @fd to its socket, copies the user-supplied address into a
 * kernel sockaddr_storage, runs the security hook and finally hands the
 * request to the socket's protocol-family connect operation together
 * with the file's f_flags.
 *
 * Returns the error reported by whichever step failed, or the result of
 * sock->ops->connect().
 */
SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
		int, addrlen)
{
	struct sockaddr_storage kaddr;
	struct socket *sock;
	int fput_needed;
	int ret;

	/* On lookup failure the helper has already stored the error in ret. */
	sock = sockfd_lookup_light(fd, &ret, &fput_needed);
	if (!sock)
		return ret;

	ret = move_addr_to_kernel(uservaddr, addrlen, &kaddr);
	if (ret >= 0) {
		ret = security_socket_connect(sock, (struct sockaddr *)&kaddr,
					      addrlen);
		if (!ret)
			ret = sock->ops->connect(sock,
						 (struct sockaddr *)&kaddr,
						 addrlen, sock->file->f_flags);
	}

	/* Drop the file reference taken by sockfd_lookup_light(). */
	fput_light(sock->file, fput_needed);
	return ret;
}

TCP层

/*
 * Locked front end for __inet_stream_connect().
 *
 * Takes the socket lock on sock->sk, forwards all arguments unchanged to
 * __inet_stream_connect() and releases the lock before returning that
 * function's result.
 */
int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			int addr_len, int flags)
{
	int ret;

	lock_sock(sock->sk);
	ret = __inet_stream_connect(sock, uaddr, addr_len, flags);
	release_sock(sock->sk);

	return ret;
}
/*
 * Core of connect() for stream sockets; inet_stream_connect() calls this
 * with the socket lock held.
 *
 * @sock:     socket being connected
 * @uaddr:    destination address (AF_UNSPEC requests a disconnect)
 * @addr_len: length of @uaddr
 * @flags:    file flags; only O_NONBLOCK is examined here
 *
 * Flow:
 *  1. Reject an address too short to hold sa_family (-EINVAL).
 *  2. For AF_UNSPEC, call the protocol's disconnect() and return its result.
 *  3. Dispatch on sock->state: already connected -> -EISCONN, connecting ->
 *     -EALREADY, unconnected -> start the protocol connect and preset
 *     -EINPROGRESS, anything else -> -EINVAL.
 *  4. While the sk is in SYN_SENT/SYN_RECV, wait for completion unless the
 *     send timeout is zero, in which case the preset error is returned.
 *  5. If the sk ended up in TCP_CLOSE, report the socket error (or
 *     -ECONNABORTED when none is pending) and disconnect.
 *
 * Returns 0 once the socket is marked SS_CONNECTED, otherwise a negative
 * errno.
 */
int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			  int addr_len, int flags)
{
	struct sock *sk = sock->sk;
	int err;
	long timeo;

	if (addr_len < sizeof(uaddr->sa_family))
		return -EINVAL;

	/* AF_UNSPEC: tear down the association instead of connecting. */
	if (uaddr->sa_family == AF_UNSPEC) {
		err = sk->sk_prot->disconnect(sk, flags);
		sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
		goto out;
	}

	switch (sock->state) {
	default:
		err = -EINVAL;
		goto out;
	case SS_CONNECTED:
		err = -EISCONN;
		goto out;
	case SS_CONNECTING:
		err = -EALREADY;
		/* Fall out of switch with err, set for this state */
		break;
	case SS_UNCONNECTED:
		err = -EISCONN;
		if (sk->sk_state != TCP_CLOSE)
			goto out;

		/* Protocol-level connect (tcp_v4_connect() for IPv4 TCP). */
		err = sk->sk_prot->connect(sk, uaddr, addr_len);
		if (err < 0)
		{
			goto out;
		}

		sock->state = SS_CONNECTING;

		/* Just entered SS_CONNECTING state; the only
		 * difference is that return value in non-blocking
		 * case is EINPROGRESS, rather than EALREADY.
		 */
		err = -EINPROGRESS;
		break;
	}

	/* Send timeout for this call, derived from the O_NONBLOCK bit of flags. */
	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
		/* writebias is 1 only for a TCP socket with Fast Open data queued. */
		int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
				tcp_sk(sk)->fastopen_req &&
				tcp_sk(sk)->fastopen_req->data ? 1 : 0;

		/* Error code is set above */
		/* Zero timeout, or a wait that returned 0 (presumably the timeout
		 * ran out - confirm in inet_wait_for_connect()): return the
		 * preset err.
		 */
		if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))
		{
			goto out;
		}

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
	}

	/* Connection was closed by RST, timeout, ICMP error
	 * or another process disconnected us.
	 */
	if (sk->sk_state == TCP_CLOSE)
	{
		/* Debug trace added for this investigation. */
		printk("###%s[%d]### sk TCP CLOSE\n", __func__, __LINE__);
		goto sock_error;
	}

	/* sk->sk_err may be not zero now, if RECVERR was ordered by user
	 * and error was received after socket entered established state.
	 * Hence, it is handled normally after connect() return successfully.
	 */
	sock->state = SS_CONNECTED;
	err = 0;
out:
	return err;

sock_error:
	/* Use the pending socket error; fall back to -ECONNABORTED if it is 0. */
	err = sock_error(sk) ? : -ECONNABORTED;
	sock->state = SS_UNCONNECTED;
	if (sk->sk_prot->disconnect(sk, flags))
		sock->state = SS_DISCONNECTING;
	goto out;
}
/*
 * Start an outgoing IPv4 TCP connection on @sk.
 *
 * @sk:       socket to connect
 * @uaddr:    destination, interpreted as struct sockaddr_in
 * @addr_len: length of @uaddr
 *
 * Steps visible here: validate the address, look up a route (via the
 * source-route first hop when the socket carries an SRR option), fill in
 * the source/destination addresses and destination port, move to
 * TCP_SYN_SENT, hash the socket with inet_hash_connect(), re-route with the
 * final ports, choose the initial write sequence and send the SYN through
 * tcp_connect().
 *
 * Returns 0 on success. On failure returns a negative errno; failures after
 * the state change go through the "failure" label, which sets the state
 * back to TCP_CLOSE, drops the route and clears the route caps and
 * destination port.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	/* With a source-route option, route towards its first hop instead. */
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	/* Multicast and broadcast routes are rejected for TCP. */
	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	/* No source address set yet: adopt the one chosen by the route lookup. */
	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent       = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq      = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	/* Redo the route lookup with the ports now stored in the socket. */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		/* Keep the failure path from calling ip_rt_put() on an error pointer. */
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	/* Pick an initial sequence number unless one is already set or the
	 * socket is in repair mode.
	 */
	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	/* Build and transmit the SYN. */
	err = tcp_connect(sk);

	/* rt was handed to sk_setup_caps() above; the failure path must not put it. */
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
/*
 * Build the initial SYN for @sk and transmit it.
 *
 * After tcp_connect_init(), a socket in repair mode is completed
 * immediately through tcp_finish_connect() without sending anything.
 * Otherwise a zero-payload skb is allocated, initialised as a SYN, queued
 * and sent (through the Fast Open path when a fastopen request is
 * attached), and the retransmit timer is armed with the current RTO.
 *
 * Returns 0 on success, -ENOBUFS if the skb cannot be allocated, or
 * -ECONNREFUSED if the transmit path reports it. Other transmit errors are
 * not propagated.
 */
int tcp_connect(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *syn;
	int rc;

	tcp_connect_init(sk);

	if (unlikely(tp->repair)) {
		tcp_finish_connect(sk, NULL);
		return 0;
	}

	syn = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
	if (unlikely(!syn))
		return -ENOBUFS;

	/* The SYN consumes one sequence number, hence the post-increment. */
	tcp_init_nondata_skb(syn, tp->write_seq++, TCPHDR_SYN);

	/* Stamp the skb once and reuse that value for retrans_stamp. */
	TCP_SKB_CB(syn)->when = tcp_time_stamp;
	tp->retrans_stamp = TCP_SKB_CB(syn)->when;

	tcp_connect_queue_skb(sk, syn);
	TCP_ECN_send_syn(sk, syn);

	/* Send off SYN; include data in Fast Open. */
	if (tp->fastopen_req)
		rc = tcp_send_syn_data(sk, syn);
	else
		rc = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
	if (rc == -ECONNREFUSED)
		return rc;

	/* snd_nxt is advanced only after the transmit call so that this
	 * packet is counted in tcpOutSegs.
	 */
	tp->snd_nxt = tp->write_seq;
	tp->pushed_seq = tp->write_seq;
	TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
	printk("###%s[%d]###\n", __func__, __LINE__);

	/* Arm the timer that repeats the SYN until an answer arrives. */
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
	return 0;
}

超时重传

static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,                         unsigned long when,                         const unsigned long max_when){    struct inet_connection_sock *icsk = inet_csk(sk);    if (when > max_when) {#ifdef INET_CSK_DEBUG        pr_debug("reset_xmit_timer: sk=%p %d when=0x%lx, caller=%p\n",             sk, what, when, current_text_addr());#endif        when = max_when;    }    if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 ||        what == ICSK_TIME_EARLY_RETRANS || what ==  ICSK_TIME_LOSS_PROBE) {        icsk->icsk_pending = what;        icsk->icsk_timeout = jiffies + when;        printk("###%s[%d]### icsk->icsk_timeout: %lu\n", __func__, __LINE__, icsk->icsk_timeout);        sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);    } else if (what == ICSK_TIME_DACK) {        icsk->icsk_ack.pending |= ICSK_ACK_TIMER;        icsk->icsk_ack.timeout = jiffies + when;        sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);    }#ifdef INET_CSK_DEBUG    else {        pr_debug("%s", inet_csk_timer_bug_msg);    }#endif}
void tcp_write_timer_handler(struct sock *sk){    struct inet_connection_sock *icsk = inet_csk(sk);    int event;    if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)    {        printk("###%s[%d]### sk_state: %d\n", __func__, __LINE__, sk->sk_state);        goto out;    }    if (time_after(icsk->icsk_timeout, jiffies)) {        sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);        goto out;    }    event = icsk->icsk_pending;    switch (event) {    case ICSK_TIME_EARLY_RETRANS:        tcp_resume_early_retransmit(sk);        break;    case ICSK_TIME_LOSS_PROBE:        tcp_send_loss_probe(sk);        break;    case ICSK_TIME_RETRANS:        icsk->icsk_pending = 0;        tcp_retransmit_timer(sk);        break;    case ICSK_TIME_PROBE0:        icsk->icsk_pending = 0;        tcp_probe_timer(sk);        break;    }out:    sk_mem_reclaim(sk);}
/*
 * Retransmission-timeout handler for @sk.
 *
 * Outline of what the body does:
 *  - A pending Fast Open request socket is handled by
 *    tcp_fastopen_synack_timer() and nothing else is retransmitted.
 *  - With no packets in flight there is nothing to do.
 *  - If the peer's window is zero on a live socket that is past the SYN
 *    states, the head of the write queue is retransmitted as a window
 *    probe; the connection is only aborted if nothing has been received
 *    for longer than TCP_RTO_MAX.
 *  - Otherwise tcp_write_timeout() decides whether to give up; if not, the
 *    head of the write queue is retransmitted and the RTO is backed off
 *    (doubled, or recomputed linearly for thin streams) before the timer
 *    is re-armed.
 *
 * The printk() calls are debug traces added for this investigation.
 */
void tcp_retransmit_timer(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	printk("###%s[%d]###sk->sk_err_soft: %d\n", __func__, __LINE__, sk->sk_err_soft);
	if (tp->fastopen_rsk) {
		WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
			     sk->sk_state != TCP_FIN_WAIT1);
		tcp_fastopen_synack_timer(sk);
		/* Before we receive ACK to our SYN-ACK don't retransmit
		 * anything else (e.g., data or FIN segments).
		 */
		return;
	}
	if (!tp->packets_out)
		goto out;

	WARN_ON(tcp_write_queue_empty(sk));

	tp->tlp_high_seq = 0;

	if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
	    !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
		/* Receiver dastardly shrinks window. Our retransmits
		 * become zero probes, but we should not timeout this
		 * connection. If the socket is an orphan, time it out,
		 * we cannot allow such beasts to hang infinitely.
		 */
		struct inet_sock *inet = inet_sk(sk);
		if (sk->sk_family == AF_INET) {
			LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"),
				       &inet->inet_daddr,
				       ntohs(inet->inet_dport), inet->inet_num,
				       tp->snd_una, tp->snd_nxt);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (sk->sk_family == AF_INET6) {
			LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"),
				       &sk->sk_v6_daddr,
				       ntohs(inet->inet_dport), inet->inet_num,
				       tp->snd_una, tp->snd_nxt);
		}
#endif
		/* Nothing received for more than TCP_RTO_MAX: abort the connection. */
		if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
			printk("###%s[%d]### TCP_RTO_MAX\n", __func__, __LINE__);
			tcp_write_err(sk);
			goto out;
		}
		tcp_enter_loss(sk, 0);
		tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
		__sk_dst_reset(sk);
		printk("###%s[%d]###\n", __func__, __LINE__);
		goto out_reset_timer;
	}

	/* Non-zero means tcp_write_timeout() has dealt with the socket; stop here. */
	if (tcp_write_timeout(sk))
	{
		printk("###%s[%d]###\n", __func__, __LINE__);
		goto out;
	}

	/* First timeout of this episode: account it in the matching MIB counter. */
	if (icsk->icsk_retransmits == 0) {
		int mib_idx;

		if (icsk->icsk_ca_state == TCP_CA_Recovery) {
			if (tcp_is_sack(tp))
				mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
			else
				mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
		} else if (icsk->icsk_ca_state == TCP_CA_Loss) {
			mib_idx = LINUX_MIB_TCPLOSSFAILURES;
		} else if ((icsk->icsk_ca_state == TCP_CA_Disorder) ||
			   tp->sacked_out) {
			if (tcp_is_sack(tp))
				mib_idx = LINUX_MIB_TCPSACKFAILURES;
			else
				mib_idx = LINUX_MIB_TCPRENOFAILURES;
		} else {
			mib_idx = LINUX_MIB_TCPTIMEOUTS;
		}
		NET_INC_STATS_BH(sock_net(sk), mib_idx);
	}

	tcp_enter_loss(sk, 0);
	printk("###%s[%d]###\n", __func__, __LINE__);
	if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
		printk("###%s[%d]###\n", __func__, __LINE__);
		/* Retransmission failed because of local congestion,
		 * do not backoff.
		 */
		if (!icsk->icsk_retransmits)
			icsk->icsk_retransmits = 1;
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
					  TCP_RTO_MAX);
		goto out;
	}

	/* Increase the timeout each time we retransmit.  Note that
	 * we do not increase the rtt estimate.  rto is initialized
	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT.  I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely. We will have to fix
	 * the 120 second clamps though!
	 */
	icsk->icsk_backoff++;
	icsk->icsk_retransmits++;

out_reset_timer:
	/* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
	 * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
	 * might be increased if the stream oscillates between thin and thick,
	 * thus the old value might already be too high compared to the value
	 * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
	 * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
	 * exponential backoff behaviour to avoid continue hammering
	 * linear-timeout retransmissions into a black hole
	 */
	if (sk->sk_state == TCP_ESTABLISHED &&
	    (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) &&
	    tcp_stream_is_thin(tp) &&
	    icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
		printk("###%s[%d]###\n", __func__, __LINE__);
		icsk->icsk_backoff = 0;
		icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
	} else {
		printk("###%s[%d]###sk->sk_err_soft: %d\n", __func__, __LINE__, sk->sk_err_soft);
		/* Use normal (exponential) backoff */
		icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
	}
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
	/* Past tcp_retries1 + 1 retransmissions: drop the cached route. */
	if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
		__sk_dst_reset(sk);

out:;
}
/*
 * Retransmit a single queued segment.
 *
 * @sk:  owning socket
 * @skb: segment from the write queue to send again
 *
 * Before transmitting, the function clears any MTU probe in progress,
 * refuses to exceed the send-buffer budget, trims data that has already
 * been acknowledged, rebuilds the route/header through the address-family
 * ops, and re-segments the skb to the current MSS.
 *
 * Returns 0 on success or a negative errno: -EAGAIN (buffer budget exceeded
 * or segment outside the peer's window), -ENOMEM (trim/fragment/unclone
 * failed), -EHOSTUNREACH (rebuild_header failed), -ENOBUFS (copy failed),
 * or whatever tcp_transmit_skb() returns.
 *
 * The printk() calls are debug traces that follow sk->sk_err_soft through
 * the function.
 */
int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned int cur_mss;
	int err;

	/* Inconclusive MTU probe */
	if (icsk->icsk_mtup.probe_size) {
		icsk->icsk_mtup.probe_size = 0;
	}

	/* Do not send more than we queued. 1/4 is reserved for possible
	 * copying overhead: fragmentation, tunneling, mangling etc.
	 */
	if (atomic_read(&sk->sk_wmem_alloc) >
	    min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
		return -EAGAIN;
	printk("###%s[%d]### sk->sk_err_soft: %d\n", __func__, __LINE__, sk->sk_err_soft);

	/* Part of the skb is already acknowledged: trim the acked head. */
	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
		if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
			BUG();
		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
			return -ENOMEM;
	}
	printk("###%s[%d]### sk->sk_err_soft: %d\n", __func__, __LINE__, sk->sk_err_soft);

	/* For IPv4 sockets this is presumably inet_sk_rebuild_header() - confirm
	 * against the icsk_af_ops table.
	 */
	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
		return -EHOSTUNREACH; /* Routing failure or similar. */
	printk("###%s[%d]### sk->sk_err_soft: %d\n", __func__, __LINE__, sk->sk_err_soft);

	cur_mss = tcp_current_mss(sk);

	/* If receiver has shrunk his window, and skb is out of
	 * new window, do not retransmit it. The exception is the
	 * case, when window is shrunk to zero. In this case
	 * our retransmit serves as a zero window probe.
	 */
	if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
	    TCP_SKB_CB(skb)->seq != tp->snd_una)
		return -EAGAIN;
	printk("###%s[%d]### sk->sk_err_soft: %d\n", __func__, __LINE__, sk->sk_err_soft);

	if (skb->len > cur_mss) {
		if (tcp_fragment(sk, skb, cur_mss, cur_mss))
			return -ENOMEM; /* We'll try again later. */
	} else {
		int oldpcount = tcp_skb_pcount(skb);

		/* Recompute the segment count for the current MSS and fix up
		 * the packet accounting by the difference.
		 */
		if (unlikely(oldpcount > 1)) {
			if (skb_unclone(skb, GFP_ATOMIC))
				return -ENOMEM;
			tcp_init_tso_segs(sk, skb, cur_mss);
			tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
		}
	}
	printk("###%s[%d]### sk->sk_err_soft: %d\n", __func__, __LINE__, sk->sk_err_soft);

	tcp_retrans_try_collapse(sk, skb, cur_mss);

	/* Make a copy, if the first transmission SKB clone we made
	 * is still in somebody's hands, else make a clone.
	 */
	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	printk("###%s[%d]### sk->sk_err_soft: %d\n", __func__, __LINE__, sk->sk_err_soft);

	/* make sure skb->data is aligned on arches that require it
	 * and check if ack-trimming & collapsing extended the headroom
	 * beyond what csum_start can cover.
	 */
	if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
		     skb_headroom(skb) >= 0xFFFF)) {
		struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,
						   GFP_ATOMIC);
		err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
			     -ENOBUFS;
	} else {
		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
	}
	printk("###%s[%d]### sk->sk_err_soft: %d\n", __func__, __LINE__, sk->sk_err_soft);

	/* Remember that this skb has been retransmitted at least once. */
	if (likely(!err))
		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
	return err;
}
int inet_sk_rebuild_header(struct sock *sk){    struct inet_sock *inet = inet_sk(sk);    struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);    __be32 daddr;    struct ip_options_rcu *inet_opt;    struct flowi4 *fl4;    int err;    /* Route is OK, nothing to do. */    if (rt)        return 0;    /* Reroute. */    rcu_read_lock();    inet_opt = rcu_dereference(inet->inet_opt);    daddr = inet->inet_daddr;    if (inet_opt && inet_opt->opt.srr)        daddr = inet_opt->opt.faddr;    rcu_read_unlock();    fl4 = &inet->cork.fl.u.ip4;    printk("###%s[%d]### saddr: %x\n", __func__, __LINE__, inet->inet_saddr);    rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr,                   inet->inet_dport, inet->inet_sport,                   sk->sk_protocol, RT_CONN_FLAGS(sk),                   sk->sk_bound_dev_if);    if (!IS_ERR(rt)) {        err = 0;        sk_setup_caps(sk, &rt->dst);    } else {        err = PTR_ERR(rt);        printk("###%s[%d]### err: %d\n", __func__, __LINE__, err);        /* Routing failed... */        sk->sk_route_caps = 0;        /*         * Other protocols have to map its equivalent state to TCP_SYN_SENT.         * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme         */        if (!sysctl_ip_dynaddr ||            sk->sk_state != TCP_SYN_SENT ||            (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||            (err = inet_sk_reselect_saddr(sk)) != 0)            sk->sk_err_soft = -err;    }    printk("###%s[%d]### sk->sk_err_soft: %d\n", __func__, __LINE__, sk->sk_err_soft);    return err;}