实现原理-Connect
来源:互联网 发布:山西教师网络研修网 编辑:程序博客网 时间:2024/06/06 02:13
最近遇到阻塞式connect在三步握手过程中,客户端自身IP发生变化时,connect系统调用阻塞时间过长的问题。除将connect修改为非阻塞的方式外,跟踪学习connect的socket层及TCP层实现,简单归纳整理。
Socket层
SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr, int, addrlen){ struct socket *sock; struct sockaddr_storage address; int err, fput_needed; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; err = move_addr_to_kernel(uservaddr, addrlen, &address); if (err < 0) goto out_put; err = security_socket_connect(sock, (struct sockaddr *)&address, addrlen); if (err) goto out_put; err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen, sock->file->f_flags);out_put: fput_light(sock->file, fput_needed);out: return err;}
TCP层
int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags){ int err; lock_sock(sock->sk); err = __inet_stream_connect(sock, uaddr, addr_len, flags); release_sock(sock->sk); return err;}
int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags){ struct sock *sk = sock->sk; int err; long timeo; if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; if (uaddr->sa_family == AF_UNSPEC) { err = sk->sk_prot->disconnect(sk, flags); sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; goto out; } switch (sock->state) { default: err = -EINVAL; goto out; case SS_CONNECTED: err = -EISCONN; goto out; case SS_CONNECTING: err = -EALREADY; /* Fall out of switch with err, set for this state */ break; case SS_UNCONNECTED: err = -EISCONN; if (sk->sk_state != TCP_CLOSE) goto out; err = sk->sk_prot->connect(sk, uaddr, addr_len); if (err < 0) { goto out; } sock->state = SS_CONNECTING; /* Just entered SS_CONNECTING state; the only * difference is that return value in non-blocking * case is EINPROGRESS, rather than EALREADY. */ err = -EINPROGRESS; break; } timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { int writebias = (sk->sk_protocol == IPPROTO_TCP) && tcp_sk(sk)->fastopen_req && tcp_sk(sk)->fastopen_req->data ? 1 : 0; /* Error code is set above */ if (!timeo || !inet_wait_for_connect(sk, timeo, writebias)) { goto out; } err = sock_intr_errno(timeo); if (signal_pending(current)) goto out; } /* Connection was closed by RST, timeout, ICMP error * or another process disconnected us. */ if (sk->sk_state == TCP_CLOSE) { printk("###%s[%d]### sk TCP CLOSE\n", __func__, __LINE__); goto sock_error; } /* sk->sk_err may be not zero now, if RECVERR was ordered by user * and error was received after socket entered established state. * Hence, it is handled normally after connect() return successfully. */ sock->state = SS_CONNECTED; err = 0;out: return err;sock_error: err = sock_error(sk) ? : -ECONNABORTED; sock->state = SS_UNCONNECTED; if (sk->sk_prot->disconnect(sk, flags)) sock->state = SS_DISCONNECTING; goto out;}
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len){ struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); __be16 orig_sport, orig_dport; __be32 daddr, nexthop; struct flowi4 *fl4; struct rtable *rt; int err; struct ip_options_rcu *inet_opt; if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; if (usin->sin_family != AF_INET) return -EAFNOSUPPORT; nexthop = daddr = usin->sin_addr.s_addr; inet_opt = rcu_dereference_protected(inet->inet_opt, sock_owned_by_user(sk)); if (inet_opt && inet_opt->opt.srr) { if (!daddr) return -EINVAL; nexthop = inet_opt->opt.faddr; } orig_sport = inet->inet_sport; orig_dport = usin->sin_port; fl4 = &inet->cork.fl.u.ip4; rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport, orig_dport, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); if (err == -ENETUNREACH) IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); return err; } if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { ip_rt_put(rt); return -ENETUNREACH; } if (!inet_opt || !inet_opt->opt.srr) daddr = fl4->daddr; if (!inet->inet_saddr) inet->inet_saddr = fl4->saddr; inet->inet_rcv_saddr = inet->inet_saddr; if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { /* Reset inherited state */ tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; if (likely(!tp->repair)) tp->write_seq = 0; } if (tcp_death_row.sysctl_tw_recycle && !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) tcp_fetch_timewait_stamp(sk, &rt->dst); inet->inet_dport = usin->sin_port; inet->inet_daddr = daddr; inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet_opt) inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; /* Socket identity is still unknown (sport may be zero). * However we set state to SYN-SENT and not releasing socket * lock select source port, enter ourselves into the hash tables and * complete initialization after this. */ tcp_set_state(sk, TCP_SYN_SENT); err = inet_hash_connect(&tcp_death_row, sk); if (err) goto failure; rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, inet->inet_sport, inet->inet_dport, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; goto failure; } /* OK, now commit destination to socket. */ sk->sk_gso_type = SKB_GSO_TCPV4; sk_setup_caps(sk, &rt->dst); if (!tp->write_seq && likely(!tp->repair)) tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, inet->inet_daddr, inet->inet_sport, usin->sin_port); inet->inet_id = tp->write_seq ^ jiffies; err = tcp_connect(sk); rt = NULL; if (err) goto failure; return 0;failure: /* * This unhashes the socket and releases the local port, * if necessary. */ tcp_set_state(sk, TCP_CLOSE); ip_rt_put(rt); sk->sk_route_caps = 0; inet->inet_dport = 0; return err;}
/* Build a SYN and send it off. */int tcp_connect(struct sock *sk){ struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; int err; tcp_connect_init(sk); if (unlikely(tp->repair)) { tcp_finish_connect(sk, NULL); return 0; } buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); if (unlikely(!buff)) return -ENOBUFS; tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); tp->retrans_stamp = TCP_SKB_CB(buff)->when = tcp_time_stamp; tcp_connect_queue_skb(sk, buff); TCP_ECN_send_syn(sk, buff); /* Send off SYN; include data in Fast Open. */ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); if (err == -ECONNREFUSED) return err; /* We change tp->snd_nxt after the tcp_transmit_skb() call * in order to make this packet get counted in tcpOutSegs. */ tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); printk("###%s[%d]###\n", __func__, __LINE__); /* Timer for repeating the SYN until an answer. */ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); return 0;}
超时重传
static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, unsigned long when, const unsigned long max_when){ struct inet_connection_sock *icsk = inet_csk(sk); if (when > max_when) {#ifdef INET_CSK_DEBUG pr_debug("reset_xmit_timer: sk=%p %d when=0x%lx, caller=%p\n", sk, what, when, current_text_addr());#endif when = max_when; } if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 || what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE) { icsk->icsk_pending = what; icsk->icsk_timeout = jiffies + when; printk("###%s[%d]### icsk->icsk_timeout: %lu\n", __func__, __LINE__, icsk->icsk_timeout); sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); } else if (what == ICSK_TIME_DACK) { icsk->icsk_ack.pending |= ICSK_ACK_TIMER; icsk->icsk_ack.timeout = jiffies + when; sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); }#ifdef INET_CSK_DEBUG else { pr_debug("%s", inet_csk_timer_bug_msg); }#endif}
void tcp_write_timer_handler(struct sock *sk){ struct inet_connection_sock *icsk = inet_csk(sk); int event; if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) { printk("###%s[%d]### sk_state: %d\n", __func__, __LINE__, sk->sk_state); goto out; } if (time_after(icsk->icsk_timeout, jiffies)) { sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); goto out; } event = icsk->icsk_pending; switch (event) { case ICSK_TIME_EARLY_RETRANS: tcp_resume_early_retransmit(sk); break; case ICSK_TIME_LOSS_PROBE: tcp_send_loss_probe(sk); break; case ICSK_TIME_RETRANS: icsk->icsk_pending = 0; tcp_retransmit_timer(sk); break; case ICSK_TIME_PROBE0: icsk->icsk_pending = 0; tcp_probe_timer(sk); break; }out: sk_mem_reclaim(sk);}
void tcp_retransmit_timer(struct sock *sk){ struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); printk("###%s[%d]###sk->sk_err_soft: %d\n", __func__, __LINE__, sk->sk_err_soft); if (tp->fastopen_rsk) { WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && sk->sk_state != TCP_FIN_WAIT1); tcp_fastopen_synack_timer(sk); /* Before we receive ACK to our SYN-ACK don't retransmit * anything else (e.g., data or FIN segments). */ return; } if (!tp->packets_out) goto out; WARN_ON(tcp_write_queue_empty(sk)); tp->tlp_high_seq = 0; if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { /* Receiver dastardly shrinks window. Our retransmits * become zero probes, but we should not timeout this * connection. If the socket is an orphan, time it out, * we cannot allow such beasts to hang infinitely. */ struct inet_sock *inet = inet_sk(sk); if (sk->sk_family == AF_INET) { LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"), &inet->inet_daddr, ntohs(inet->inet_dport), inet->inet_num, tp->snd_una, tp->snd_nxt); }#if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"), &sk->sk_v6_daddr, ntohs(inet->inet_dport), inet->inet_num, tp->snd_una, tp->snd_nxt); }#endif if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) { printk("###%s[%d]### TCP_RTO_MAX\n", __func__, __LINE__); tcp_write_err(sk); goto out; } tcp_enter_loss(sk, 0); tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); __sk_dst_reset(sk); printk("###%s[%d]###\n", __func__, __LINE__); goto out_reset_timer; } if (tcp_write_timeout(sk)) { printk("###%s[%d]###\n", __func__, __LINE__); goto out; } if (icsk->icsk_retransmits == 0) { int mib_idx; if (icsk->icsk_ca_state == TCP_CA_Recovery) { if (tcp_is_sack(tp)) mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL; else mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL; } else if (icsk->icsk_ca_state == TCP_CA_Loss) { mib_idx = LINUX_MIB_TCPLOSSFAILURES; } else if ((icsk->icsk_ca_state == TCP_CA_Disorder) || tp->sacked_out) { if (tcp_is_sack(tp)) mib_idx = LINUX_MIB_TCPSACKFAILURES; else mib_idx = LINUX_MIB_TCPRENOFAILURES; } else { mib_idx = LINUX_MIB_TCPTIMEOUTS; } NET_INC_STATS_BH(sock_net(sk), mib_idx); } tcp_enter_loss(sk, 0); printk("###%s[%d]###\n", __func__, __LINE__); if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) { printk("###%s[%d]###\n", __func__, __LINE__); /* Retransmission failed because of local congestion, * do not backoff. */ if (!icsk->icsk_retransmits) icsk->icsk_retransmits = 1; inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), TCP_RTO_MAX); goto out; } /* Increase the timeout each time we retransmit. Note that * we do not increase the rtt estimate. rto is initialized * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests * that doubling rto each time is the least we can get away with. * In KA9Q, Karn uses this for the first few times, and then * goes to quadratic. netBSD doubles, but only goes up to *64, * and clamps at 1 to 64 sec afterwards. Note that 120 sec is * defined in the protocol as the maximum possible RTT. I guess * we'll have to use something other than TCP to talk to the * University of Mars. * * PAWS allows us longer timeouts and large windows, so once * implemented ftp to mars will work nicely. We will have to fix * the 120 second clamps though! */ icsk->icsk_backoff++; icsk->icsk_retransmits++;out_reset_timer: /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is * used to reset timer, set to 0. Recalculate 'icsk_rto' as this * might be increased if the stream oscillates between thin and thick, * thus the old value might already be too high compared to the value * set by 'tcp_set_rto' in tcp_input.c which resets the rto without * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating * exponential backoff behaviour to avoid continue hammering * linear-timeout retransmissions into a black hole */ if (sk->sk_state == TCP_ESTABLISHED && (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) && tcp_stream_is_thin(tp) && icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { printk("###%s[%d]###\n", __func__, __LINE__); icsk->icsk_backoff = 0; icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX); } else { printk("###%s[%d]###sk->sk_err_soft: %d\n", __func__, __LINE__, sk->sk_err_soft); /* Use normal (exponential) backoff */ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0)) __sk_dst_reset(sk);out:;}
int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb){ struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); unsigned int cur_mss; int err; /* Inconslusive MTU probe */ if (icsk->icsk_mtup.probe_size) { icsk->icsk_mtup.probe_size = 0; } /* Do not sent more than we queued. 1/4 is reserved for possible * copying overhead: fragmentation, tunneling, mangling etc. */ if (atomic_read(&sk->sk_wmem_alloc) > min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) return -EAGAIN; printk("###%s[%d]### sk->sk_err_soft: %d\n", __func__, __LINE__, sk->sk_err_soft); if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) BUG(); if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) return -ENOMEM; } printk("###%s[%d]### sk->sk_err_soft: %d\n", __func__, __LINE__, sk->sk_err_soft); if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */ printk("###%s[%d]### sk->sk_err_soft: %d\n", __func__, __LINE__, sk->sk_err_soft); cur_mss = tcp_current_mss(sk); /* If receiver has shrunk his window, and skb is out of * new window, do not retransmit it. The exception is the * case, when window is shrunk to zero. In this case * our retransmit serves as a zero window probe. */ if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) && TCP_SKB_CB(skb)->seq != tp->snd_una) return -EAGAIN; printk("###%s[%d]### sk->sk_err_soft: %d\n", __func__, __LINE__, sk->sk_err_soft); if (skb->len > cur_mss) { if (tcp_fragment(sk, skb, cur_mss, cur_mss)) return -ENOMEM; /* We'll try again later. */ } else { int oldpcount = tcp_skb_pcount(skb); if (unlikely(oldpcount > 1)) { if (skb_unclone(skb, GFP_ATOMIC)) return -ENOMEM; tcp_init_tso_segs(sk, skb, cur_mss); tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb)); } } printk("###%s[%d]### sk->sk_err_soft: %d\n", __func__, __LINE__, sk->sk_err_soft); tcp_retrans_try_collapse(sk, skb, cur_mss); /* Make a copy, if the first transmission SKB clone we made * is still in somebody's hands, else make a clone. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; printk("###%s[%d]### sk->sk_err_soft: %d\n", __func__, __LINE__, sk->sk_err_soft); /* make sure skb->data is aligned on arches that require it * and check if ack-trimming & collapsing extended the headroom * beyond what csum_start can cover. */ if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) || skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : -ENOBUFS; } else { err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } printk("###%s[%d]### sk->sk_err_soft: %d\n", __func__, __LINE__, sk->sk_err_soft); if (likely(!err)) TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; return err;}
int inet_sk_rebuild_header(struct sock *sk){ struct inet_sock *inet = inet_sk(sk); struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); __be32 daddr; struct ip_options_rcu *inet_opt; struct flowi4 *fl4; int err; /* Route is OK, nothing to do. */ if (rt) return 0; /* Reroute. */ rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); daddr = inet->inet_daddr; if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; rcu_read_unlock(); fl4 = &inet->cork.fl.u.ip4; printk("###%s[%d]### saddr: %x\n", __func__, __LINE__, inet->inet_saddr); rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, sk->sk_protocol, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); if (!IS_ERR(rt)) { err = 0; sk_setup_caps(sk, &rt->dst); } else { err = PTR_ERR(rt); printk("###%s[%d]### err: %d\n", __func__, __LINE__, err); /* Routing failed... */ sk->sk_route_caps = 0; /* * Other protocols have to map its equivalent state to TCP_SYN_SENT. * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme */ if (!sysctl_ip_dynaddr || sk->sk_state != TCP_SYN_SENT || (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || (err = inet_sk_reselect_saddr(sk)) != 0) sk->sk_err_soft = -err; } printk("###%s[%d]### sk->sk_err_soft: %d\n", __func__, __LINE__, sk->sk_err_soft); return err;}
阅读全文
0 0
- 实现原理-Connect
- 【Qt】connect机制原理
- connect的实现代码
- connect的内核实现/非阻塞connect
- linux下端口扫描的实现(TCP connect、TCP SYN、TCP FIN、UDP四种方式)1 原理篇
- Oracle中CONNECT BY函数的原理
- listen、connect、accept流程及原理
- listen、connect、accept流程及原理
- 基于epoll异步connect实现
- 非阻塞connect的实现
- 非阻塞connect的实现
- Linux下实现connect超时
- 非阻塞connect的实现
- 非阻塞connect的实现
- 非阻塞connect的实现
- 非阻塞connect的实现
- 非阻塞connect的实现
- 非阻塞 connect 的实现
- QT Qstring Qt中文编码和QString类Unicode编码转换
- JS中return false,return,return true的用法及区别
- 关于wamp服务器文件的配置
- 【Java集合源码剖析】ArrayList源码剖析
- vim操作汇总
- 实现原理-Connect
- java类型中的坑
- 冒泡排序算法
- HttpUtil工具类发送post请求
- 数据库切分(2)之垂直切分的粒度
- Android Binder机制原理(史上最强理解,没有之一)
- 练习1-7 编写一个打印打印EOF值的程序
- Java多线程(七)之同步器基础:AQS框架深入分析
- Remote Desktop Connection for mac 报错:证书或相关链无效。