Socket Kernel Source Chapter06 connect

来源:互联网 发布:法兰绒哪个牌子好 知乎 编辑:程序博客网 时间:2024/05/29 14:40

6 connect

客户端调用connect,对应到sys_socketcall中会调用sys_connect

6.1 sys_connect

/*
 * sys_connect - kernel side of connect(), dispatched from sys_socketcall.
 * @fd:        file descriptor of the socket
 * @uservaddr: destination address in user space
 * @addrlen:   length of @uservaddr
 *
 * Looks the socket up from @fd, copies the address into a kernel buffer,
 * runs the security hook and then hands over to the socket's ->connect
 * operation. Returns the error code produced by the first step that fails,
 * or the result of ->connect.
 */
asmlinkage long sys_connect(int fd, struct sockaddr __user *uservaddr,
                            int addrlen)
{
        char address[MAX_SOCK_ADDR];
        struct sockaddr *kaddr = (struct sockaddr *)address;
        struct socket *sock;
        int fput_needed;
        int err;

        /* No socket behind fd: return the value the lookup left in err. */
        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                return err;

        /* Bring the address from user space into the on-stack buffer. */
        err = move_addr_to_kernel(uservaddr, addrlen, address);
        if (err >= 0) {
                err = security_socket_connect(sock, kaddr, addrlen);
                if (!err)
                        err = sock->ops->connect(sock, kaddr, addrlen,
                                                 sock->file->f_flags);
        }

        /* Every path that obtained the socket releases the file here. */
        fput_light(sock->file, fput_needed);
        return err;
}

 

6.2 inet_stream_connect 

sock->ops->connect:由于没有单独的inet_connect函数,对于TCP(流式套接字)这里实际调用的是inet_stream_connect。

/*
 * inet_stream_connect - connect handler reached through sock->ops->connect.
 * @sock:     socket being connected
 * @uaddr:    destination address (kernel copy)
 * @addr_len: length of @uaddr
 * @flags:    file flags; O_NONBLOCK selects non-blocking behaviour, and the
 *            value is also passed on to ->disconnect
 *
 * The whole function runs between lock_sock() and release_sock().
 * Returns 0 once the socket has reached SS_CONNECTED, otherwise a negative
 * errno (for example -EINPROGRESS when the handshake is still running and
 * the caller does not wait).
 */
int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                        int addr_len, int flags)
{
        struct sock *sk = sock->sk;
        int err;
        long timeo;

        lock_sock(sk);

        /* An AF_UNSPEC address is handled as a disconnect request. */

        if (uaddr->sa_family == AF_UNSPEC) {
                err = sk->sk_prot->disconnect(sk, flags);
                sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
                goto out;
        }

        /* Dispatch on the current socket-level state. */

        switch (sock->state) {
        default:
                err = -EINVAL;
                goto out;
        case SS_CONNECTED:
                err = -EISCONN;
                goto out;
        case SS_CONNECTING:
                /* A connect is already in progress; continue below to (maybe) wait for it. */
                err = -EALREADY;
                break;
        case SS_UNCONNECTED:
                err = -EISCONN;
                if (sk->sk_state != TCP_CLOSE)
                        goto out;

                /* The protocol-level connect does the real work (tcp_v4_connect for TCP). */
                err = sk->sk_prot->connect(sk, uaddr, addr_len);
                if (err < 0)
                        goto out;

                sock->state = SS_CONNECTING;

                /* Returned as-is if we do not wait for the handshake below. */
                err = -EINPROGRESS;
                break;
        }

        /* 0 when O_NONBLOCK is set, otherwise sk->sk_sndtimeo (see sock_sndtimeo). */
        timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

        /* Still in SYN_SENT or SYN_RECV: wait for the handshake unless there is no timeout to spend. */
        if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
                /* Non-blocking, or the wait returned 0: leave with err as set in the switch. */
                if (!timeo || !inet_wait_for_connect(sk, timeo))
                        goto out;

                /* Note: timeo here is still the initial timeout, not the remaining time. */
                err = sock_intr_errno(timeo);
                if (signal_pending(current))
                        goto out;
        }

        /* The socket fell back to TCP_CLOSE: report the failure. */
        if (sk->sk_state == TCP_CLOSE)
                goto sock_error;

        sock->state = SS_CONNECTED;
        err = 0;
out:
        release_sock(sk);
        return err;

sock_error:
        /* Use the socket's pending error if there is one, else -ECONNABORTED. */
        err = sock_error(sk) ? : -ECONNABORTED;
        sock->state = SS_UNCONNECTED;
        if (sk->sk_prot->disconnect(sk, flags))
                sock->state = SS_DISCONNECTING;
        goto out;
}

 

6.2.1

/*
 * sock_sndtimeo - timeout to use for a sending-side operation.
 * @sk:      socket whose sk_sndtimeo is consulted
 * @noblock: non-zero when the caller must not block
 *
 * Returns 0 for a non-blocking caller, otherwise sk->sk_sndtimeo.
 */
static inline long sock_sndtimeo(const struct sock *sk, int noblock)
{
        if (noblock)
                return 0;

        return sk->sk_sndtimeo;
}

 

6.2.2

/*
 * inet_wait_for_connect - sleep until the handshake leaves SYN_SENT/SYN_RECV.
 * @sk:    socket being connected; the socket lock is dropped while sleeping
 *         and re-taken afterwards, so the caller is expected to hold it
 * @timeo: maximum time to wait, as passed to schedule_timeout()
 *
 * Returns the time left as reported by the last schedule_timeout() call
 * (0 when the timeout ran out), or @timeo unchanged if no sleep was needed.
 */

static long inet_wait_for_connect(struct sock *sk, long timeo)
{
        DEFINE_WAIT(wait);

        prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);

        while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
                release_sock(sk);
                timeo = schedule_timeout(timeo); /* sleep; timeo becomes the remaining time */
                lock_sock(sk);
                /* Stop on a pending signal or when the timeout is used up. */
                if (signal_pending(current) || !timeo)
                        break;
                /* Re-arm the wait entry before testing the state again. */
                prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
        }
        finish_wait(sk->sk_sleep, &wait);
        return timeo;
}

6.2.3

/*
 * sock_intr_errno - error code to report when a wait was interrupted.
 * @timeo: timeout the wait was started with
 *
 * Returns -ERESTARTSYS when @timeo is MAX_SCHEDULE_TIMEOUT, and -EINTR
 * for any other timeout value.
 */
static inline int sock_intr_errno(long timeo)
{
        if (timeo == MAX_SCHEDULE_TIMEOUT)
                return -ERESTARTSYS;

        return -EINTR;
}

6.3 tcp_v4_connect

参见前面的struct proto tcp_prot 函数指针,调用tcp_v4_connect。
/*
 * tcp_v4_connect - start an active open on an IPv4 TCP socket.
 * @sk:       socket to connect
 * @uaddr:    destination address; read as a struct sockaddr_in
 * @addr_len: length of @uaddr
 *
 * Validates the address, looks up a route, records source/destination
 * addressing in the inet_sock, moves the socket to TCP_SYN_SENT, chooses
 * an initial sequence number if none is set and calls tcp_connect().
 *
 * Returns 0 on success or a negative errno. Failures after the state
 * change go through the "failure" label, which puts the socket back into
 * TCP_CLOSE and clears sk_route_caps and inet->dport.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
        struct inet_sock *inet = inet_sk(sk);    /* inet_sock view of sk */
        struct tcp_sock *tp = tcp_sk(sk);        /* tcp_sock view of sk */
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; /* destination address */
        struct rtable *rt;
        __be32 daddr, nexthop;
        int tmp;
        int err;

        /* Argument checks. */

        if (addr_len < sizeof(struct sockaddr_in))
                return -EINVAL;

        if (usin->sin_family != AF_INET)
                return -EAFNOSUPPORT;

        nexthop = daddr = usin->sin_addr.s_addr;     /* destination; also the next hop unless overridden below */
        /* NOTE(review): opt->srr presumably marks a source-route IP option; the next hop then comes from opt->faddr. */
        if (inet->opt && inet->opt->srr) {
                if (!daddr)
                        return -EINVAL;
                nexthop = inet->opt->faddr;
        }



        /* Route lookup (the routing internals are not covered in this article). */

        tmp = ip_route_connect(&rt, nexthop, inet->saddr,
                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                               IPPROTO_TCP,
                               inet->sport, usin->sin_port, sk, 1);
        if (tmp < 0) {
                if (tmp == -ENETUNREACH)
                        IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
                return tmp;
        }

        /* Refuse routes flagged as multicast or broadcast. */

        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
                ip_rt_put(rt);     /* drop the route reference */
                return -ENETUNREACH;
        }

        /* Without the srr option, take the destination from the route. */
        if (!inet->opt || !inet->opt->srr)
                daddr = rt->rt_dst;

        /* No source address set yet: use the one chosen by the route. */
        if (!inet->saddr)
                inet->saddr = rt->rt_src;
        inet->rcv_saddr = inet->saddr;

        if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
                /* Reset inherited state */
                tp->rx_opt.ts_recent       = 0;
                tp->rx_opt.ts_recent_stamp = 0;
                tp->write_seq              = 0;
        }

        /*
         * With sysctl_tw_recycle set and no timestamp state of our own,
         * take ts_recent/ts_recent_stamp from the peer entry of the route
         * when that entry's stamp is no older than TCP_PAWS_MSL seconds.
         */
        if (tcp_death_row.sysctl_tw_recycle &&
            !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
                struct inet_peer *peer = rt_get_peer(rt);
                if (peer != NULL &&
                    peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
                        tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
                        tp->rx_opt.ts_recent = peer->tcp_ts;
                }
        }

        inet->dport = usin->sin_port;
        inet->daddr = daddr;

        /* Extension header length is the IP option length, or 0 without options. */
        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet->opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;

        /* NOTE(review): 536 is presumably the default MSS assumed before negotiation - confirm. */
        tp->rx_opt.mss_clamp = 536;

        /* The state is changed before inet_hash_connect(); the failure path undoes it. */
        tcp_set_state(sk, TCP_SYN_SENT);
        err = inet_hash_connect(&tcp_death_row, sk);
        if (err)
                goto failure;

        err = ip_route_newports(&rt, IPPROTO_TCP,
                                inet->sport, inet->dport, sk);
        if (err)
                goto failure;

       sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->u.dst);

        /* Pick an initial sequence number only if write_seq is still 0. */
        if (!tp->write_seq)
                tp->write_seq = secure_tcp_sequence_number(inet->saddr,
                                                           inet->daddr,
                                                           inet->sport,
                                                           usin->sin_port);

        inet->id = tp->write_seq ^ jiffies;          /* derived from the sequence number and jiffies */

        err = tcp_connect(sk);     /* next step: build and send the SYN */
        /* rt is cleared before the error check, so the failure path below calls ip_rt_put(NULL). */
        rt = NULL;
        if (err)
                goto failure;

        return 0;

failure:
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->dport = 0;
        return err;
}

 

 

6.4 tcp_connect

分配并填充一个struct sk_buff(SYN报文),然后调用tcp_transmit_skb发送。

/*
 * tcp_connect - build the SYN segment for an active open and send it.
 * @sk: socket being connected
 *
 * Initialises the connection state (tcp_connect_init), allocates an skb,
 * marks it as a SYN carrying sequence number tp->write_seq, queues it on
 * the write queue, transmits it and arms the retransmit timer.
 *
 * Returns 0, or -ENOBUFS when the skb cannot be allocated. The return
 * value of tcp_transmit_skb() is not checked here.
 */
int tcp_connect(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);        /* tcp_sock view of sk */
        struct sk_buff *buff;

        tcp_connect_init(sk);                        /* initialisation, see section 6.5 */

        buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
        if (unlikely(buff == NULL))
                return -ENOBUFS;



        /* Reserve headroom of MAX_TCP_HEADER bytes. */
        skb_reserve(buff, MAX_TCP_HEADER);

        tp->snd_nxt = tp->write_seq; /* sequence number of the SYN */
        tcp_init_nondata_skb(buff, tp->write_seq++, TCPCB_FLAG_SYN); /* fill in the control block; write_seq advances by one */
        TCP_ECN_send_syn(sk, buff); /* ECN = Explicit Congestion Notification; adds ECE|CWR when sysctl_tcp_ecn is set */


        /* Stamp the segment; the same value becomes retrans_stamp. */
        TCP_SKB_CB(buff)->when = tcp_time_stamp;
        tp->retrans_stamp = TCP_SKB_CB(buff)->when;
        skb_header_release(buff); /* sets nohdr and adds a payload reference to dataref */


        __tcp_add_write_queue_tail(sk, buff); /* append to sk_write_queue */
        sk->sk_wmem_queued += buff->truesize;
        sk_mem_charge(sk, buff->truesize);
        tp->packets_out += tcp_skb_pcount(buff);
        tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);

        /* We change tp->snd_nxt after the tcp_transmit_skb() call
         * in order to make this packet get counted in tcpOutSegs.
         */
        tp->snd_nxt = tp->write_seq;
        tp->pushed_seq = tp->write_seq;
        TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);

        /* Timer for repeating the SYN until an answer. */
        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
        return 0;
}

 

/*
 * tcp_sk - view a struct sock as the struct tcp_sock it belongs to.
 *
 * A plain pointer cast: no checking is done and the const qualifier of
 * the argument is dropped.
 */
static inline struct tcp_sock *tcp_sk(const struct sock *sk)
{
        struct tcp_sock *tp = (struct tcp_sock *)sk;

        return tp;
}
/*
 * alloc_skb - allocate an sk_buff of the given size.
 * @size:     size passed through to __alloc_skb()
 * @priority: allocation flags passed through to __alloc_skb()
 *
 * Thin wrapper that fixes the last two __alloc_skb() arguments to 0 and -1.
 * NOTE(review): those presumably mean "no fast clone" and "any NUMA node";
 * confirm against __alloc_skb().
 */
static inline struct sk_buff *alloc_skb(unsigned int size, gfp_t priority)
{
        struct sk_buff *skb = __alloc_skb(size, priority, 0, -1);

        return skb;
}

/*
 * skb_reserve - create headroom in an empty sk_buff.
 * @skb: buffer to adjust
 * @len: number of bytes of headroom to add
 *
 * Both the data and the tail markers move forward by @len, so headroom
 * grows at the expense of tailroom. Only valid while the buffer is empty.
 */
static inline void skb_reserve(struct sk_buff *skb, int len)
{
        skb->data = skb->data + len;
        skb->tail = skb->tail + len;
}

/*
 * tcp_init_nondata_skb - fill in the control bits of an skb without payload.
 * @skb:   buffer to initialise
 * @seq:   sequence number of the segment
 * @flags: TCPCB_FLAG_* bits to store in the control block
 *
 * The end sequence number equals @seq, plus one when SYN or FIN is among
 * @flags.
 */
static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
{
        struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
        u32 end_seq = seq;

        /* SYN and FIN each occupy one sequence number. */
        if (flags & (TCPCB_FLAG_SYN | TCPCB_FLAG_FIN))
                end_seq++;

        skb->csum = 0;

        tcb->flags = flags;
        tcb->sacked = 0;
        tcb->seq = seq;
        tcb->end_seq = end_seq;

        /* A single segment with no GSO size or type. */
        skb_shinfo(skb)->gso_segs = 1;
        skb_shinfo(skb)->gso_size = 0;
        skb_shinfo(skb)->gso_type = 0;
}


static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);

        tp->ecn_flags = 0;

        //如果系统支持显式拥塞通告(ECN),则设置ECN status bits,Explicit Congestion Notification
        if (sysctl_tcp_ecn) {    
                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE | TCPCB_FLAG_CWR;
                tp->ecn_flags = TCP_ECN_OK;
        }
}

/* TCP_SKB_CB - view the start of an skb's cb[] area as a struct tcp_skb_cb. */
#define TCP_SKB_CB(skb_ptr)     ((struct tcp_skb_cb *)((skb_ptr)->cb))

 

/*
 * skb_header_release - give up the reference to the header part of a buffer.
 * @skb: buffer whose header is released
 *
 * Implemented by taking a payload reference: nohdr is set and
 * 1 << SKB_DATAREF_SHIFT is added to the shared dataref counter.
 * The header part of skb->data must not be read afterwards.
 * Triggers BUG_ON() if the header was already released.
 */
static inline void skb_header_release(struct sk_buff *skb)
{
        const int payload_ref = 1 << SKB_DATAREF_SHIFT;

        BUG_ON(skb->nohdr);
        skb->nohdr = 1;
        atomic_add(payload_ref, &skb_shinfo(skb)->dataref);
}

 static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
{
        __skb_queue_tail(&sk->sk_write_queue, skb);
}
// Queue a buffer at the end of a list. This function takes no locks
//and you must therefore hold required locks before calling it.
//A buffer cannot be placed on two lists at the same time.
static inline void __skb_queue_tail(struct sk_buff_head *list,
                                   struct sk_buff *newsk)
{
        struct sk_buff *prev, *next;

        list->qlen++;
        next = (struct sk_buff *)list;
        prev = next->prev;
        newsk->next = next;
        newsk->prev = prev;
        next->prev  = prev->next = newsk;
}

 

6.5初始化

/*
 * tcp_connect_init - initialise per-connection TCP state before the SYN is built.
 * @sk: socket being connected
 *
 * Sets the TCP header length, MSS/MTU related fields, the initial receive
 * window and window scale, the sequence-number bookkeeping and the
 * retransmission state. Called from tcp_connect().
 */
static void tcp_connect_init(struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        __u8 rcv_wscale;



        /* TCP header length: base header plus the aligned timestamp option when sysctl_tcp_timestamps is set. */

        tp->tcp_header_len = sizeof(struct tcphdr) +  (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);

#ifdef CONFIG_TCP_MD5SIG
        /* An MD5 key found for this socket adds the aligned MD5 option length. */
        if (tp->af_specific->md5_lookup(sk, sk) != NULL)
                tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif

        /* If user gave his TCP_MAXSEG, record it to clamp */
        if (tp->rx_opt.user_mss)
                tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
        tp->max_window = 0;   /* window */
        tcp_mtup_init(sk);  /* initialise the MTU (Maximum Transmission Unit) probing fields */
        tcp_sync_mss(sk, dst_mtu(dst));        /* initialise the MSS (Maximum Segment Size) from the route MTU; see the MSS notes at the end of the article */

        /* Fall back to the route's window metric when no clamp is set. */
        if (!tp->window_clamp)
                tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
        tp->advmss = dst_metric(dst, RTAX_ADVMSS);
        tcp_initialize_rcv_mss(sk);

        /*
         * The mss argument is advmss, reduced by the TCP option bytes
         * (tcp_header_len - sizeof(struct tcphdr)) when ts_recent_stamp is set.
         */
        tcp_select_initial_window(tcp_full_space(sk),
                                  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
                                  &tp->rcv_wnd,
                                  &tp->window_clamp,
                                  sysctl_tcp_window_scaling,
                                  &rcv_wscale);

        tp->rx_opt.rcv_wscale = rcv_wscale;
        tp->rcv_ssthresh = tp->rcv_wnd;

        /* Clear any pending error and the SOCK_DONE flag, then reset the sequence state. */
        sk->sk_err = 0;
        sock_reset_flag(sk, SOCK_DONE);
        tp->snd_wnd = 0;
        tcp_init_wl(tp, tp->write_seq, 0);
        tp->snd_una = tp->write_seq;
        tp->snd_sml = tp->write_seq;
        tp->rcv_nxt = 0;
        tp->rcv_wup = 0;
        tp->copied_seq = 0;

        /* Start from the initial RTO with no retransmissions recorded. */
        inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
        inet_csk(sk)->icsk_retransmits = 0;
        tcp_clear_retrans(tp);
}

 

6.5.1

//mtu,窗口初始化 等

void tcp_mtup_init(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);

        icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
        icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
                               icsk->icsk_af_ops->net_header_len;
        icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
        icsk->icsk_mtup.probe_size = 0;
}

/*
 * dst_metric - read one metric of a destination cache entry.
 * @dst:    entry to read from
 * @metric: 1-based metric identifier (for example RTAX_WINDOW)
 *
 * The metrics array is 0-based, hence the "- 1". No range check is done.
 */
static inline u32 dst_metric(const struct dst_entry *dst, int metric)
{
        const int idx = metric - 1;

        return dst->metrics[idx];
}
/* Initialize RCV_MSS value.
 * RCV_MSS is an our guess about MSS used by the peer.
 * We haven't any direct information about the MSS.
 * It's better to underestimate the RCV_MSS rather than overestimate.
 * Overestimations make us ACKing less frequently than needed.
 * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
 */
void tcp_initialize_rcv_mss(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        unsigned int guess;

        /* Start from the smaller of our advertised MSS and the cached sending MSS. */
        guess = min_t(unsigned int, tp->advmss, tp->mss_cache);

        /* Cap at half the receive window and at TCP_MIN_RCVMSS ... */
        guess = min(guess, tp->rcv_wnd / 2);
        guess = min(guess, TCP_MIN_RCVMSS);

        /* ... but never go below TCP_MIN_MSS. */
        guess = max(guess, TCP_MIN_MSS);

        inet_csk(sk)->icsk_ack.rcv_mss = guess;
}


/*
 * sock_reset_flag - clear one flag bit in sk->sk_flags.
 * @sk:   socket to modify
 * @flag: flag to clear (for example SOCK_DONE)
 *
 * Uses __clear_bit().
 * NOTE(review): presumably the non-atomic variant, so the caller has to
 * provide any needed serialisation - confirm.
 */
static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag)
{
        __clear_bit(flag, &sk->sk_flags);
}

/*
 * tcp_init_wl - initialise snd_wl1.
 * @tp:  TCP socket to update
 * @ack: not used by this implementation
 * @seq: value stored in tp->snd_wl1
 */
static inline void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq)
{
        (void)ack;      /* accepted for the interface, deliberately ignored */

        tp->snd_wl1 = seq;
}

6.5.2 决定tcp滑动窗口大小

 /* Determine a window scaling and initial window to offer.
 * Based on the assumption that the given amount of space
 * will be offered. Store the results in the tp structure.
 * NOTE: for smooth operation initial space offering should
 * be a multiple of mss if possible. We assume here that mss >= 1.
 * This MUST be enforced by all callers.
 *
 * @__space:      available receive space; negative values are treated as 0
 * @mss:          segment size used to quantise the window (must be >= 1)
 * @rcv_wnd:      out: initial receive window to offer
 * @window_clamp: in/out: upper bound for the window; 0 means "not set"
 * @wscale_ok:    non-zero when window scaling may be used
 * @rcv_wscale:   out: receive window scale shift, 0..14
 */
void tcp_select_initial_window(int __space, __u32 mss,
                               __u32 *rcv_wnd, __u32 *window_clamp,
                               int wscale_ok, __u8 *rcv_wscale)
{
        unsigned int space = (__space < 0 ? 0 : __space);

        /* If no clamp set the clamp to the max possible scaled window */
        if (*window_clamp == 0)
                (*window_clamp) = (65535 << 14);
        space = min(*window_clamp, space);

        /* Quantize space offering to a multiple of mss if possible. */
        if (space > mss)
                space = (space / mss) * mss;

        /* NOTE: offering an initial window larger than 32767
         * will break some buggy TCP stacks. If the admin tells us
         * it is likely we could be speaking with such a buggy stack
         * we will truncate our initial window offering to 32K-1
         * unless the remote has sent us a window scaling option,
         * which we interpret as a sign the remote TCP is not
         * misinterpreting the window field as a signed quantity.
         */
        if (sysctl_tcp_workaround_signed_windows)
                (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
        else
                (*rcv_wnd) = space;

        (*rcv_wscale) = 0;
        if (wscale_ok) {
                /* Set window scaling on max possible window
                 * See RFC1323 for an explanation of the limit to 14
                 */
                space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
                space = min_t(u32, space, *window_clamp);
                /* Halve until the value fits in 16 bits; each halving adds one to the shift. */
                while (space > 65535 && (*rcv_wscale) < 14) {
                        space >>= 1;
                        (*rcv_wscale)++;
                }
        }

        /* Set initial window to value enough for senders,
         * following RFC2414. Senders, not following this RFC,
         * will be satisfied with 2.
         */
        if (mss > (1 << *rcv_wscale)) {
                /* 4 segments by default, 3 above 1460 bytes, 2 above 3 * 1460 bytes. */
                int init_cwnd = 4;
                if (mss > 1460 * 3)
                        init_cwnd = 2;
                else if (mss > 1460)
                        init_cwnd = 3;
                if (*rcv_wnd > init_cwnd * mss)
                        *rcv_wnd = init_cwnd * mss;
        }

        /* Set the clamp no higher than max representable value */
        (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
}

6.5.3 一些清理,置零

/*
 * tcp_clear_retrans_partial - zero the retransmission and undo counters.
 * @tp: TCP socket to reset
 *
 * Clears retrans_out, lost_out, undo_marker and undo_retrans; the
 * fackets_out and sacked_out counters are left alone.
 */
static void tcp_clear_retrans_partial(struct tcp_sock *tp)
{
        /* Undo bookkeeping. */
        tp->undo_retrans = 0;
        tp->undo_marker = 0;

        /* Loss / retransmission counters. */
        tp->lost_out = 0;
        tp->retrans_out = 0;
}

/*
 * tcp_clear_retrans - zero all retransmission-related counters.
 * @tp: TCP socket to reset
 *
 * Clears sacked_out and fackets_out in addition to everything
 * tcp_clear_retrans_partial() clears.
 */
void tcp_clear_retrans(struct tcp_sock *tp)
{
        tp->sacked_out = 0;
        tp->fackets_out = 0;

        tcp_clear_retrans_partial(tp);
}

 

 

/*
 * Condensed sketch of tcp_connect(), not compilable as written: tp is
 * never declared and no value is returned. It only highlights the steps
 * of the SYN transmission.
 * NOTE(review): the calls differ from the full listing in section 6.4
 * (alloc_skb vs alloc_skb_fclone, sk->allocation vs sk->sk_allocation,
 * two-argument tcp_transmit_skb), so this presumably comes from a
 * different kernel version - confirm.
 */
int tcp_connect(struct sock* sk)

{

     struct sk_buff* buff = alloc_skb(MAX_TCP_HEADER+15, sk->allocation);

     /* ... some initialisation of buff omitted ... */

TCP_SKB_CB(buff)->seq = tp->write_seq++; /* sequence number of the SYN; write_seq then advances by one */

     TCP_SKB_CB(buff)->when = tcp_time_stamp; /* timestamp */

/* See the transmit section: from here on the path is the same as for an ordinary TCP segment. */

     tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL)); 

}

 

 

 

附注:

http://www.yuanma.org/data/2008/0116/article_2946.htm

tcp协议栈中的mss

看tcp_sync_mss前面的注释,应该可以对这三个mss变量有所了解.
user_mss是使用TCP_MAXSEG选项的setsockopt/getsockopt函数设置的。
mss_clamp用于TCP连接建立时的MSS协商,取接收到的SYN包中的mss值和user_mss两者中的较小值。
mss_cache是当前有效的发送mss。

  tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
   for TCP options, but includes only bare TCP header.

   tp->rx_opt.mss_clamp is mss negotiated at connection setup.
   It is minimum of user_mss and mss received with SYN.
   It also does not include TCP options.

   inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.

   tp->mss_cache is current effective sending mss, including
   all tcp options except for SACKs. It is evaluated,
   taking into account current pmtu, but never exceeds
   tp->rx_opt.mss_clamp.

   NOTE1. rfc1122 clearly states that advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
   are READ ONLY outside this function.        --ANK (980731)

参考:
http://oss.sgi.com/archives/netdev/2000-08/msg00039.html
http://topic.csdn.net/t/20031113/09/2455705.html

下面,我们看一下一些代码:
tcp_mtu_to_mss根据pmtu(路径最大传输单元)来计算mss(最大报文段长度):mss = pmtu - IP头部 - TCP头部 - IP选项 - TCP选项(其中IP选项和TCP选项的长度可能都为0)。计算过程中还会用mss_clamp作为上限,并保证扣除IP选项后的值不小于48字节。
/*
 * tcp_mtu_to_mss - derive the sending MSS from a path MTU.
 * @sk:   socket the calculation is for
 * @pmtu: path MTU
 *
 * Returns pmtu minus the network header, the base TCP header, the
 * extension header length and the TCP option bytes, after clamping to
 * mss_clamp and enforcing a floor of 48 before the TCP options are
 * subtracted. SACK option space is not accounted for.
 */
int tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
    struct inet_connection_sock *icsk = inet_csk(sk);
    struct tcp_sock *tp = tcp_sk(sk);
    int mss;

    /*
     * Base MSS without TCP options (MMS_S - sizeof(tcphdr) of rfc1122).
     * The article notes that for IPv4 net_header_len is sizeof(struct iphdr).
     */
    mss = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);

    /* mss_clamp does not include TCP options either, so clamp first. */
    if (mss > tp->rx_opt.mss_clamp)
        mss = tp->rx_opt.mss_clamp;

    /* Optional transport overhead; tcp_v4_connect() sets it to inet->opt->optlen. */
    mss -= icsk->icsk_ext_hdr_len;

    /* Keep room for a full set of TCP options plus 8 bytes of data. */
    if (mss < 48)
        mss = 48;

    /* Finally take off the TCP options in use (SACKs excluded). */
    mss -= tp->tcp_header_len - sizeof(struct tcphdr);

    return mss;
}

同理,将mss转化为mtu的tcp_mss_to_mtu函数,就是在mss+TCP头部+TCP选项+IP头+IP选项。
/*
 * tcp_mss_to_mtu - inverse of tcp_mtu_to_mss(): MTU needed for a given MSS.
 * @sk:  socket the calculation is for
 * @mss: segment size
 *
 * Returns mss plus tcp_header_len (TCP header and TCP options), the
 * extension header length and the network header length.
 */
int tcp_mss_to_mtu(struct sock *sk, int mss)
{
    struct inet_connection_sock *icsk = inet_csk(sk);
    struct tcp_sock *tp = tcp_sk(sk);

    return mss +
           tp->tcp_header_len +
           icsk->icsk_ext_hdr_len +
           icsk->icsk_af_ops->net_header_len;
}

 

原创粉丝点击