Socket Kernel Source Chapter06 connect
来源:互联网 编辑:程序博客网 时间:2024/05/29 14:40
6 connect
客户端调用connect,对应到sys_socketcall中会调用sys_connect
6.1 sys_connect
/*
 * sys_connect - kernel entry point for the connect(2) system call,
 * dispatched from sys_socketcall.
 * @fd:        descriptor of the socket to connect
 * @uservaddr: user-space pointer to the peer address
 * @addrlen:   length of that address
 * Returns 0 on success or a negative errno.
 */
asmlinkage long sys_connect(int fd, struct sockaddr __user *uservaddr,
int addrlen)
{
struct socket *sock;
char address[MAX_SOCK_ADDR];
int err, fput_needed;
sock = sockfd_lookup_light(fd, &err, &fput_needed); //resolve the fd to its struct socket *
if (!sock)
goto out;
err = move_addr_to_kernel(uservaddr, addrlen, address); //copy the peer address from user space into the kernel buffer
if (err < 0)
goto out_put;
err = security_socket_connect(sock, (struct sockaddr *)address, addrlen); //LSM security hook; normally passes through
if (err)
goto out_put;
err = sock->ops->connect(sock, (struct sockaddr *)address, addrlen,
sock->file->f_flags); //protocol-specific connect via the proto_ops table (inet_stream_connect for TCP)
out_put:
fput_light(sock->file, fput_needed); //drop the temporary file reference taken by sockfd_lookup_light
out:
return err;
}
6.2 inet_stream_connect
sock->ops->connect,由于没有inet_connect,因此调用了TCP的inet_stream_connect
/*
 * inet_stream_connect - connect() for SOCK_STREAM (TCP) sockets.
 * Validates the socket state, kicks off the protocol-level connect
 * (sk->sk_prot->connect, i.e. tcp_v4_connect sends the SYN), and for
 * blocking sockets waits until the three-way handshake completes.
 */
int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
struct sock *sk = sock->sk;
int err;
long timeo;
lock_sock(sk);
//connecting to AF_UNSPEC means "disconnect"
if (uaddr->sa_family == AF_UNSPEC) {
err = sk->sk_prot->disconnect(sk, flags);
sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
goto out;
}
//check the current socket state
switch (sock->state) {
default:
err = -EINVAL;
goto out;
case SS_CONNECTED:
err = -EISCONN; //already connected
goto out;
case SS_CONNECTING:
err = -EALREADY; //a previous non-blocking connect is still in progress
break;
case SS_UNCONNECTED:
err = -EISCONN;
if (sk->sk_state != TCP_CLOSE)
goto out;
err = sk->sk_prot->connect(sk, uaddr, addr_len); //protocol connect: tcp_v4_connect builds and sends the SYN
if (err < 0)
goto out;
sock->state = SS_CONNECTING;
err = -EINPROGRESS; //default result for non-blocking sockets
break;
}
timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); //0 when non-blocking, otherwise sk->sk_sndtimeo (NOT always 0)
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { //handshake still in flight: wait if we may block
if (!timeo || !inet_wait_for_connect(sk, timeo))
goto out; //non-blocking or timed out: report -EINPROGRESS/-EALREADY
err = sock_intr_errno(timeo);
if (signal_pending(current))
goto out; //interrupted by a signal
}
//handshake failed (e.g. RST received) and the sock fell back to CLOSE
if (sk->sk_state == TCP_CLOSE)
goto sock_error;
sock->state = SS_CONNECTED;
err = 0;
out:
release_sock(sk);
return err;
sock_error:
err = sock_error(sk) ? : -ECONNABORTED; //pending socket error, or a generic abort
sock->state = SS_UNCONNECTED;
if (sk->sk_prot->disconnect(sk, flags))
sock->state = SS_DISCONNECTING;
goto out;
}
6.2.1
/*
 * sock_sndtimeo - send-timeout for a blocking operation.
 * Non-blocking callers get 0 (never wait); blocking callers get the
 * socket's configured send timeout.
 */
static inline long sock_sndtimeo(const struct sock *sk, int noblock)
{
	if (noblock)
		return 0;
	return sk->sk_sndtimeo;
}
6.2.2
//Loop-wait for the handshake: sleep until the socket leaves
//SYN_SENT/SYN_RECV, the timeout expires, or a signal arrives.
//Returns the remaining timeout (0 means the wait timed out).
static long inet_wait_for_connect(struct sock *sk, long timeo)
{
DEFINE_WAIT(wait);
prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
release_sock(sk); //drop the socket lock while asleep so the SYN-ACK can be processed
timeo = schedule_timeout(timeo); //sleep; returns the time left
lock_sock(sk);
if (signal_pending(current) || !timeo)
break; //signal delivered or timed out
prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); //re-arm before re-checking the state
}
finish_wait(sk->sk_sleep, &wait);
return timeo;
}
6.2.3
/*
 * sock_intr_errno - errno for a signal-interrupted wait.
 * With an infinite timeout the syscall can be transparently restarted
 * (-ERESTARTSYS); with a finite timeout the caller must see -EINTR.
 */
static inline int sock_intr_errno(long timeo)
{
	if (timeo == MAX_SCHEDULE_TIMEOUT)
		return -ERESTARTSYS;
	return -EINTR;
}
6.3 tcp_v4_connect
参见前面的struct proto tcp_prot 函数指针,调用tcp_v4_connect。
/*
 * tcp_v4_connect - IPv4 protocol-level connect (sk->sk_prot->connect).
 * Resolves the route, fills in source/destination address and port,
 * moves the socket to SYN_SENT, binds a local port, picks the initial
 * sequence number and sends the SYN via tcp_connect().
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct inet_sock *inet = inet_sk(sk); //inet_sock view of sk
struct tcp_sock *tp = tcp_sk(sk); //tcp_sock view of sk
struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; //destination address
struct rtable *rt;
__be32 daddr, nexthop;
int tmp;
int err;
//parameter validation
if (addr_len < sizeof(struct sockaddr_in))
return -EINVAL;
if (usin->sin_family != AF_INET)
return -EAFNOSUPPORT;
nexthop = daddr = usin->sin_addr.s_addr; //destination address
if (inet->opt && inet->opt->srr) {
//IP strict/loose source routing: route to the first hop from the option
if (!daddr)
return -EINVAL;
nexthop = inet->opt->faddr;
}
//look up the IP route (routing internals are out of scope here)
tmp = ip_route_connect(&rt, nexthop, inet->saddr,
RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
IPPROTO_TCP,
inet->sport, usin->sin_port, sk, 1);
if (tmp < 0) {
if (tmp == -ENETUNREACH)
IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
return tmp;
}
//TCP cannot connect to multicast or broadcast destinations
if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
ip_rt_put(rt); //release the route reference
return -ENETUNREACH;
}
if (!inet->opt || !inet->opt->srr)
daddr = rt->rt_dst;
if (!inet->saddr)
inet->saddr = rt->rt_src; //source address chosen by routing
inet->rcv_saddr = inet->saddr;
if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
/* Reset inherited state */
tp->rx_opt.ts_recent = 0;
tp->rx_opt.ts_recent_stamp = 0;
tp->write_seq = 0;
}
//TIME-WAIT recycling: reuse timestamp state cached for this peer, if fresh
if (tcp_death_row.sysctl_tw_recycle &&
!tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
struct inet_peer *peer = rt_get_peer(rt);
if (peer != NULL &&
peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
tp->rx_opt.ts_recent = peer->tcp_ts;
}
}
inet->dport = usin->sin_port;
inet->daddr = daddr;
inet_csk(sk)->icsk_ext_hdr_len = 0;
if (inet->opt)
inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; //account for IP options length
tp->rx_opt.mss_clamp = 536; //conservative default MSS until the peer's value is known
tcp_set_state(sk, TCP_SYN_SENT);
err = inet_hash_connect(&tcp_death_row, sk); //pick a local port and hash the socket
if (err)
goto failure;
err = ip_route_newports(&rt, IPPROTO_TCP,
inet->sport, inet->dport, sk); //re-validate the route with the final ports
if (err)
goto failure;
sk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(sk, &rt->u.dst);
if (!tp->write_seq)
tp->write_seq = secure_tcp_sequence_number(inet->saddr,
inet->daddr,
inet->sport,
usin->sin_port); //initial send sequence number
inet->id = tp->write_seq ^ jiffies; //seed for the IP ID field
err = tcp_connect(sk); //build the SYN frame and transmit it (next step)
rt = NULL; //route reference now owned by the socket; don't drop it below
if (err)
goto failure;
return 0;
failure:
//unwind on error: back to CLOSE, release the route, clear the port
tcp_set_state(sk, TCP_CLOSE);
ip_rt_put(rt);
sk->sk_route_caps = 0;
inet->dport = 0;
return err;
}
6.4 tcp_connect
赋值到struct sk_buff,tcp_transmit_skb
/*
 * tcp_connect - build the SYN sk_buff, queue it, transmit it via
 * tcp_transmit_skb(), and arm the retransmit timer so the SYN is
 * repeated until answered.
 * Returns 0 on success or -ENOBUFS if the skb cannot be allocated.
 */
int tcp_connect(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk); //tcp_sock view of sk
struct sk_buff *buff;
tcp_connect_init(sk); //initialize connection parameters (see 6.5)
buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
if (unlikely(buff == NULL))
return -ENOBUFS;
//reserve headroom for the full protocol header stack
skb_reserve(buff, MAX_TCP_HEADER);
tp->snd_nxt = tp->write_seq; //next sequence number to send
tcp_init_nondata_skb(buff, tp->write_seq++, TCPCB_FLAG_SYN); //the SYN consumes one sequence number
TCP_ECN_send_syn(sk, buff); //Explicit Congestion Notification bits on the SYN
TCP_SKB_CB(buff)->when = tcp_time_stamp;
tp->retrans_stamp = TCP_SKB_CB(buff)->when; //remember the send time for retransmit bookkeeping
skb_header_release(buff); //give up the header reference (payload-only from here)
__tcp_add_write_queue_tail(sk, buff); //queue the SYN on the write queue
sk->sk_wmem_queued += buff->truesize;
sk_mem_charge(sk, buff->truesize);
tp->packets_out += tcp_skb_pcount(buff);
tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);
/* We change tp->snd_nxt after the tcp_transmit_skb() call
 * in order to make this packet get counted in tcpOutSegs.
 */
tp->snd_nxt = tp->write_seq;
tp->pushed_seq = tp->write_seq;
TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
/* Timer for repeating the SYN until an answer. */
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
return 0;
}
/*
 * tcp_sk - reinterpret a struct sock pointer as the tcp_sock that
 * contains it (assumes struct sock sits at the start of tcp_sock).
 */
static inline struct tcp_sock *tcp_sk(const struct sock *sk)
{
	struct tcp_sock *tp = (struct tcp_sock *)sk;

	return tp;
}
/*
 * alloc_skb - allocate a plain (non-fclone) sk_buff with @size bytes
 * of data area; thin wrapper around __alloc_skb.
 */
static inline struct sk_buff *alloc_skb(unsigned int size, gfp_t priority)
{
return __alloc_skb(size, priority, 0, -1);
}
//Grow the headroom of an empty &sk_buff by shrinking its tailroom.
//Only legal on an empty buffer: both the data and tail pointers are
//advanced by the same amount, so no payload bytes may exist yet.
static inline void skb_reserve(struct sk_buff *skb, int len)
{
	skb->tail += len;
	skb->data += len;
}
//Constructs the common control bits of a non-data skb (e.g. the SYN).
//If SYN/FIN is present, auto-increment the end seqno, because each of
//those flags consumes one sequence number.
static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
{
skb->csum = 0; //no payload, nothing to checksum yet
TCP_SKB_CB(skb)->flags = flags;
TCP_SKB_CB(skb)->sacked = 0;
skb_shinfo(skb)->gso_segs = 1; //single segment; no segmentation offload
skb_shinfo(skb)->gso_size = 0;
skb_shinfo(skb)->gso_type = 0;
TCP_SKB_CB(skb)->seq = seq;
if (flags & (TCPCB_FLAG_SYN | TCPCB_FLAG_FIN))
seq++;
TCP_SKB_CB(skb)->end_seq = seq;
}
/* TCP_ECN_send_syn - advertise ECN capability on an outgoing SYN. */
static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
tp->ecn_flags = 0;
//if ECN (Explicit Congestion Notification) is enabled system-wide,
//set the ECE|CWR status bits on the SYN and mark this connection ECN-capable
if (sysctl_tcp_ecn) {
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE | TCPCB_FLAG_CWR;
tp->ecn_flags = TCP_ECN_OK;
}
}
/* TCP_SKB_CB - access the TCP control block stored in skb->cb[]. */
#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
// Drop a reference to the header part of the buffer. This is done
// by acquiring a payload reference. You must not read from the header
// part of skb->data after this.
static inline void skb_header_release(struct sk_buff *skb)
{
BUG_ON(skb->nohdr); //the header must not already have been released
skb->nohdr = 1;
atomic_add(1 << SKB_DATAREF_SHIFT, &skb_shinfo(skb)->dataref); //take a payload-only reference on the data
}
/* Append @skb to the tail of the socket's write (retransmit) queue. */
static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
{
__skb_queue_tail(&sk->sk_write_queue, skb);
}
// Queue a buffer at the end of a list. This function takes no locks
//and you must therefore hold required locks before calling it.
//A buffer cannot be placed on two lists at the same time.
static inline void __skb_queue_tail(struct sk_buff_head *list,
struct sk_buff *newsk)
{
struct sk_buff *prev, *next;
list->qlen++;
next = (struct sk_buff *)list; //the list head doubles as a sentinel node in the circular list
prev = next->prev; //current tail
//splice newsk in between the old tail and the sentinel head
newsk->next = next;
newsk->prev = prev;
next->prev = prev->next = newsk;
}
6.5初始化
/*
 * tcp_connect_init - set up connection-wide parameters before sending
 * the SYN: TCP header length, MSS/MTU probing, the initial receive
 * window and window-scale, sequence-number bookkeeping and the RTO.
 */
static void tcp_connect_init(struct sock *sk)
{
struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
__u8 rcv_wscale;
//TCP header length, including the timestamp option when enabled
tp->tcp_header_len = sizeof(struct tcphdr) + (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
#ifdef CONFIG_TCP_MD5SIG
if (tp->af_specific->md5_lookup(sk, sk) != NULL)
tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
/* If user gave his TCP_MAXSEG, record it to clamp */
if (tp->rx_opt.user_mss)
tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
tp->max_window = 0; //no window advertised by the peer yet
tcp_mtup_init(sk); //MTU (Maximum Transmission Unit) probing initialization
tcp_sync_mss(sk, dst_mtu(dst)); //derive the MSS (Maximum Segment Size) from the route MTU; see the appendix
if (!tp->window_clamp)
tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
tp->advmss = dst_metric(dst, RTAX_ADVMSS);
tcp_initialize_rcv_mss(sk);
//choose the initial receive window and the window-scale factor (6.5.2)
tcp_select_initial_window(tcp_full_space(sk),
tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
&tp->rcv_wnd,
&tp->window_clamp,
sysctl_tcp_window_scaling,
&rcv_wscale);
tp->rx_opt.rcv_wscale = rcv_wscale;
tp->rcv_ssthresh = tp->rcv_wnd;
sk->sk_err = 0;
sock_reset_flag(sk, SOCK_DONE);
tp->snd_wnd = 0; //send window unknown until the SYN-ACK arrives
tcp_init_wl(tp, tp->write_seq, 0);
tp->snd_una = tp->write_seq;
tp->snd_sml = tp->write_seq;
tp->rcv_nxt = 0;
tp->rcv_wup = 0;
tp->copied_seq = 0;
inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; //initial retransmission timeout
inet_csk(sk)->icsk_retransmits = 0;
tcp_clear_retrans(tp); //zero the retransmit counters (6.5.3)
}
6.5.1
//MTU-probing initialization: set the enable flag and the probe search bounds.
void tcp_mtup_init(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1; //probing active only for sysctl values > 1
icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
icsk->icsk_af_ops->net_header_len; //upper bound: the clamped MSS converted back to an MTU
icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss); //lower bound from the base MSS sysctl
icsk->icsk_mtup.probe_size = 0;
}
/*
 * dst_metric - read routing metric @metric from @dst.
 * Metric identifiers start at 1, so the table index is shifted down
 * by one.
 */
static inline u32 dst_metric(const struct dst_entry *dst, int metric)
{
	const int slot = metric - 1;

	return dst->metrics[slot];
}
/* Initialize RCV_MSS value.
* RCV_MSS is an our guess about MSS used by the peer.
* We haven't any direct information about the MSS.
* It's better to underestimate the RCV_MSS rather than overestimate.
* Overestimations make us ACKing less frequently than needed.
* Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
*/
/*
 * tcp_initialize_rcv_mss - initial guess at the MSS used by the peer.
 * We have no direct information yet, so underestimate: overestimating
 * would make us ACK less frequently than needed, while underestimates
 * are easy to detect and fix later by tcp_measure_rcv_mss().
 */
void tcp_initialize_rcv_mss(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
hint = min(hint, tp->rcv_wnd / 2); //never more than half the receive window
hint = min(hint, TCP_MIN_RCVMSS);
hint = max(hint, TCP_MIN_MSS); //but keep a sane lower bound (applied last)
inet_csk(sk)->icsk_ack.rcv_mss = hint;
}
/* Clear socket flag @flag in sk->sk_flags (uses the non-atomic __clear_bit). */
static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag)
{
__clear_bit(flag, &sk->sk_flags);
}
/* Record the sequence number used for the last window update.
 * NOTE: the @ack parameter is accepted but unused in this version. */
static inline void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq)
{
tp->snd_wl1 = seq;
}
6.5.2 决定tcp滑动窗口大小
/* Determine a window scaling and initial window to offer.
 * Based on the assumption that the given amount of space
 * will be offered. Store the results in the tp structure.
 * NOTE: for smooth operation initial space offering should
 * be a multiple of mss if possible. We assume here that mss >= 1.
 * This MUST be enforced by all callers.
 */
void tcp_select_initial_window(int __space, __u32 mss,
__u32 *rcv_wnd, __u32 *window_clamp,
int wscale_ok, __u8 *rcv_wscale)
{
unsigned int space = (__space < 0 ? 0 : __space); //negative space means none
/* If no clamp set the clamp to the max possible scaled window */
if (*window_clamp == 0)
(*window_clamp) = (65535 << 14); //max 16-bit window at max scale factor 14
space = min(*window_clamp, space);
/* Quantize space offering to a multiple of mss if possible. */
if (space > mss)
space = (space / mss) * mss;
/* NOTE: offering an initial window larger than 32767
 * will break some buggy TCP stacks. If the admin tells us
 * it is likely we could be speaking with such a buggy stack
 * we will truncate our initial window offering to 32K-1
 * unless the remote has sent us a window scaling option,
 * which we interpret as a sign the remote TCP is not
 * misinterpreting the window field as a signed quantity.
 */
if (sysctl_tcp_workaround_signed_windows)
(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
else
(*rcv_wnd) = space;
(*rcv_wscale) = 0;
if (wscale_ok) {
/* Set window scaling on max possible window
 * See RFC1323 for an explanation of the limit to 14
 */
space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
space = min_t(u32, space, *window_clamp);
//halve until the window fits in 16 bits; each halving adds one scale step
while (space > 65535 && (*rcv_wscale) < 14) {
space >>= 1;
(*rcv_wscale)++;
}
}
/* Set initial window to value enough for senders,
 * following RFC2414. Senders, not following this RFC,
 * will be satisfied with 2.
 */
if (mss > (1 << *rcv_wscale)) {
int init_cwnd = 4;
if (mss > 1460 * 3)
init_cwnd = 2;
else if (mss > 1460)
init_cwnd = 3;
if (*rcv_wnd > init_cwnd * mss)
*rcv_wnd = init_cwnd * mss;
}
/* Set the clamp no higher than max representable value */
(*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
}
6.5.3 一些清理,置零
/*
 * Zero the retransmit bookkeeping shared by partial and full clears:
 * retransmitted/lost segment counts and the undo markers.
 */
static void tcp_clear_retrans_partial(struct tcp_sock *tp)
{
	tp->undo_retrans = 0;
	tp->undo_marker = 0;
	tp->lost_out = 0;
	tp->retrans_out = 0;
}
/*
 * Full retransmit-state reset: the partial counter set plus the
 * SACK/FACK segment counts.
 */
void tcp_clear_retrans(struct tcp_sock *tp)
{
	tp->sacked_out = 0;
	tp->fackets_out = 0;
	tcp_clear_retrans_partial(tp);
}
/*
 * NOTE(review): simplified, illustrative sketch of tcp_connect() from an
 * older kernel version — not compilable as shown (tp is not declared and
 * no value is returned). Kept only to highlight the sequence-number and
 * timestamp setup before transmission.
 */
int tcp_connect(struct sock* sk)
{
struct sk_buff* buff = alloc_skb(MAX_TCP_HEADER+15, sk->allocation);
//... some buffer initialization omitted ...
TCP_SKB_CB(buff)->seq = tp->write_seq++; //TCP sequence number, post-incremented for the SYN
TCP_SKB_CB(buff)->when = tcp_time_stamp; //timestamp
//from here on, transmission follows the same path as any ordinary TCP packet
tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
}
附注:
http://www.yuanma.org/data/2008/0116/article_2946.htm
tcp协议栈中的mss
user_mss是使用TCP_MAXSEG选项的setsockopt/getsockopt函数设置的。
mss_clamp用在TCP连接建立时协商时候使用的,取接收到的SYN包的mss值和user_mss的最小值。
mss_cache是当前有效的发送mss。
tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
for TCP options, but includes only bare TCP header.
tp->rx_opt.mss_clamp is mss negotiated at connection setup.
It is minimum of user_mss and mss received with SYN.
It also does not include TCP options.
inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
tp->mss_cache is current effective sending mss, including
all tcp options except for SACKs. It is evaluated,
taking into account current pmtu, but never exceeds
tp->rx_opt.mss_clamp.
NOTE1. rfc1122 clearly states that advertised MSS
DOES NOT include either tcp or ip options.
NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
are READ ONLY outside this function. --ANK (980731)
参考:
http://oss.sgi.com/archives/netdev/2000-08/msg00039.html
http://topic.csdn.net/t/20031113/09/2455705.html
下面,我们看一下一些代码:
tcp_mtu_to_mss根据pmtu(路径最大传输单元)来计算mss(最大消息传输段),pmtu-IP头部-TCP头部-IP选项-TCP选项,(其中IP选项和TCP选项的长度可能都为0)。
/* Not accounting for SACKs here. */
/*
 * tcp_mtu_to_mss - convert a path MTU into the effective sending MSS:
 * pmtu minus IP header, TCP header, IP options and TCP options
 * (either option length may be 0).
 */
int tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
int mss_now;
/* Calculate base mss without TCP options:
It is MMS_S - sizeof(tcphdr) of rfc1122
*/
/* ipv4_specific.net_header_len = sizeof(struct iphdr),*/
mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
/* Clamp it (mss_clamp does not include tcp options) */
if (mss_now > tp->rx_opt.mss_clamp)
mss_now = tp->rx_opt.mss_clamp;
/* Now subtract optional transport overhead */
/*In tcp_v4_connect(),inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;*/
mss_now -= icsk->icsk_ext_hdr_len;
/* Then reserve room for full set of TCP options and 8 bytes of data */
if (mss_now < 48)
mss_now = 48;
/* Now subtract TCP options size, not including SACKs */
mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
return mss_now;
}
同理,将mss转化为mtu的tcp_mss_to_mtu函数,就是在mss+TCP头部+TCP选项+IP头+IP选项。
/*
 * tcp_mss_to_mtu - inverse of tcp_mtu_to_mss: add the TCP header
 * (including TCP options), any IP options and the network header back
 * onto the MSS to recover the corresponding MTU.
 */
int tcp_mss_to_mtu(struct sock *sk, int mss)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int mtu = mss;

	mtu += tp->tcp_header_len;                /* TCP header + TCP options */
	mtu += icsk->icsk_ext_hdr_len;            /* IP options */
	mtu += icsk->icsk_af_ops->net_header_len; /* network (IP) header */
	return mtu;
}
- Socket Kernel Source Chapter06 connect
- Socket Kernel Source Chapter03 socket
- Socket Kernel Source Chapter01 Introduction
- Socket Kernel Source Chapter02 sys_socketcall
- Socket Kernel Source Chapter04 bind
- Socket Kernel Source Chapter05 listen
- kernel source
- kernel source
- socket-connect
- socket-connect
- socket-connect
- socket kernel
- ERROR : Unable to connect to foreign data source: Can't create TCP/IP socket (24)
- ERROR : Unable to connect to foreign data source: Can't create TCP/IP socket (24)
- Compile linux kernel source
- linux kernel source code
- Install kernel source
- base-kernel-source dir
- Class.getResourceAsStream 和 ClassLoader.getResourceAsStream
- Socket Kernel Source Chapter04 bind
- 读后感:>
- Mysql常用命令七(MySQL的优化方法)
- Socket Kernel Source Chapter05 listen
- Socket Kernel Source Chapter06 connect
- jQuery - UI/Theming
- Socket Kernel Module 01
- Socket_Kernel_Module 02
- 用IIS预览时,提示“Service Unavailable”
- 实战KDevelop进行Linux软件开发
- Socket_Kernel_Module 03
- 读谁杀了我的牛感想
- Socket_Kernel_Module 04