9.10 TIME_WAIT Timer


9.10.1 Why

        When a socket enters the TIME_WAIT state, the TIME_WAIT timer is started. Until the timer expires, a tw sock stands in for the original socket and handles packets belonging to the old connection, preventing them from harming a new connection that reuses the same address and port. When the timer expires, the tw sock is deleted and the port number it occupied is released.
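
        For reference, the TIME_WAIT interval itself is a compile-time constant; the definition below is from include/net/tcp.h in the same generation of kernel source as the excerpts that follow:

#define TCP_TIMEWAIT_LEN (60*HZ) /* how long to wait to destroy TIME-WAIT
                                  * state, about 60 seconds */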

9.10.2 When

        The TIME_WAIT timer is installed by the tcp_time_wait function, which is called on the following occasions:

(1) The socket is closed in the TCP_FIN_WAIT2 state, tp->linger2 has not been set to a negative value with the TCP_LINGER2 socket option (a usage sketch of TCP_LINGER2 follows this list of cases), and tcp_fin_time is less than or equal to TCP_TIMEWAIT_LEN:

2059 void tcp_close(struct sock *sk, long timeout)
2060 {
...
2183     if (sk->sk_state == TCP_FIN_WAIT2) {
2184         struct tcp_sock *tp = tcp_sk(sk);
2185         if (tp->linger2 < 0) {
...
2190         } else {
2191             const int tmo = tcp_fin_time(sk);
2192 
2193             if (tmo > TCP_TIMEWAIT_LEN) {
2194                 inet_csk_reset_keepalive_timer(sk,
2195                         tmo - TCP_TIMEWAIT_LEN);
2196             } else {
2197                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2198                 goto out;
2199             }
2200         }
...
(2) A FIN is received in the TCP_FIN_WAIT2 state and an ACK is sent:

3783 static void tcp_fin(struct sock *sk)
3784 {
...
3818     case TCP_FIN_WAIT2:
3819         /* Received a FIN -- send ACK and enter TIME_WAIT. */
3820         tcp_send_ack(sk);
3821         tcp_time_wait(sk, TCP_TIME_WAIT, 0);
(3) An orphan socket receives an ACK in the TCP_FIN_WAIT1 state, and all of the following hold:

1) tp->linger2 has not been set to a negative value with the TCP_LINGER2 option;

2) tcp_fin_time is less than or equal to TCP_TIMEWAIT_LEN;

3) the ACK carries no data, or only old data;

4) the ACK carries no FIN flag, and the socket is not locked by the application process:

5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5601               const struct tcphdr *th, unsigned int len)
5602 {
...
5751         case TCP_FIN_WAIT1:
...
5780                 if (!sock_flag(sk, SOCK_DEAD))
5781                     /* Wake up lingering close() */
5782                     sk->sk_state_change(sk);
5783                 else {
5784                     int tmo;
5785 
5786                     if (tp->linger2 < 0 ||
5787                         (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
5788                          after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
5789                         tcp_done(sk);
5790                         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
5791                         return 1;
5792                     }
5793 
5794                     tmo = tcp_fin_time(sk);
5795                     if (tmo > TCP_TIMEWAIT_LEN) {
5796                         inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
5797                     } else if (th->fin || sock_owned_by_user(sk)) {
5798                         /* Bad case. We could lose such FIN otherwise.
5799                          * It is not a big problem, but it looks confusing
5800                          * and not so rare event. We still can lose it now,
5801                          * if it spins in bh_lock_sock(), but it is really
5802                          * marginal case.
5803                          */
5804                         inet_csk_reset_keepalive_timer(sk, tmo);
5805                     } else {
5806                         tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
5807                         goto discard;
5808                     }
...
(4) An ACK is received in the TCP_CLOSING state:
5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5601               const struct tcphdr *th, unsigned int len)
5602 {
...
5813         case TCP_CLOSING:
5814             if (tp->snd_una == tp->write_seq) {
5815                 tcp_time_wait(sk, TCP_TIME_WAIT, 0);
5816                 goto discard;
5817             }
...
(5) The FIN_WAIT2 timer expires, tp->linger2 has not been set to a negative value with the TCP_LINGER2 option, and tcp_fin_time is greater than TCP_TIMEWAIT_LEN:
558 static void tcp_keepalive_timer (unsigned long data)
559 {
...
578     if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
579         if (tp->linger2 >= 0) {
580             const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
581 
582             if (tmo > 0) {
583                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
584                 goto out;
585             }
...
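
        Cases (1), (3) and (5) all hinge on tp->linger2, which the application controls through the TCP_LINGER2 socket option. A minimal user-space sketch of setting it (port and value chosen arbitrarily for illustration):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    int linger2 = 10;   /* bound the FIN_WAIT2 wait to about 10 seconds */

    /* TCP_LINGER2 sets tp->linger2 for this socket; a negative value
     * makes an orphaned socket abandon FIN_WAIT2 instead of waiting,
     * which is exactly the tp->linger2 < 0 test in cases (1), (3), (5). */
    if (setsockopt(fd, IPPROTO_TCP, TCP_LINGER2, &linger2,
                   sizeof(linger2)) < 0)
        perror("setsockopt(TCP_LINGER2)");
    return 0;
}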
        The tcp_time_wait function calls inet_twsk_schedule to install the TIME_WAIT timer:
266 void tcp_time_wait(struct sock *sk, int state, int timeo)
267 {
...
327         __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); // put the tw sock into the ESTABLISHED hash table and the bind hash table, and remove sk from the ESTABLISHED hash table
328 
329         /* Get the TIME_WAIT timeout firing. */
330         if (timeo < rto)
331             timeo = rto;
332 
333         if (recycle_ok) {
334             tw->tw_timeout = rto;
335         } else {
336             tw->tw_timeout = TCP_TIMEWAIT_LEN;
337             if (state == TCP_TIME_WAIT)
338                 timeo = TCP_TIMEWAIT_LEN;
339         }
340 
341         inet_twsk_schedule(tw, &tcp_death_row, timeo,
342                    TCP_TIMEWAIT_LEN);
343         inet_twsk_put(tw);
...
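
        The timeout picked here depends on whether timestamp-based recycling applies (recycle_ok). A condensed, hypothetical restatement of that decision, for illustration only (constants assume HZ = 1000; the helper name is invented):

#include <stdio.h>

#define HZ               1000
#define TCP_TIMEWAIT_LEN (60 * HZ)   /* 60 s in jiffies */
enum { SUBSTATE_FIN_WAIT2, SUBSTATE_TIME_WAIT };

static int tw_effective_timeout(int state, int timeo, int rto, int recycle_ok)
{
    if (timeo < rto)                     /* never wait less than one RTO */
        timeo = rto;
    if (!recycle_ok && state == SUBSTATE_TIME_WAIT)
        timeo = TCP_TIMEWAIT_LEN;        /* the full 60 s for real TIME_WAIT */
    return timeo;
}

int main(void)
{
    printf("%d\n", tw_effective_timeout(SUBSTATE_TIME_WAIT, 1, 200, 0)); /* 60000 */
    printf("%d\n", tw_effective_timeout(SUBSTATE_FIN_WAIT2, 1, 200, 0)); /* 200 */
    return 0;
}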

        The __inet_twsk_hashdance function adds the tw_sock to the bind hash table and the ESTABLISHED hash table, so that until the tw_sock is deleted, the corresponding IP|port pair can neither be bound nor used to establish a new connection:

126 void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
127                struct inet_hashinfo *hashinfo)
128 {
129     const struct inet_sock *inet = inet_sk(sk);
130     const struct inet_connection_sock *icsk = inet_csk(sk);
131     struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
132     spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
133     struct inet_bind_hashbucket *bhead;
134     /* Step 1: Put TW into bind hash. Original socket stays there too.
135        Note, that any socket with inet->num != 0 MUST be bound in
136        binding cache, even if it is closed.
137      */
138     bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
139             hashinfo->bhash_size)];
140     spin_lock(&bhead->lock);
141     tw->tw_tb = icsk->icsk_bind_hash;
142     WARN_ON(!icsk->icsk_bind_hash);
143     inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);    // add to the bind hash table
144     spin_unlock(&bhead->lock);
145 
146     spin_lock(lock);
...
153     inet_twsk_add_node_rcu(tw, &ehead->twchain);  // add to the ESTABLISHED hash table
154 
155     /* Step 3: Remove SK from established hash. */
156     if (__sk_nulls_del_node_init_rcu(sk))
157         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
...
167     atomic_add(1 + 1 + 1, &tw->tw_refcnt);
168 
169     spin_unlock(lock);
170 }
        As a result, when an application process uses the bind system call on the same IP|port pair as a tw_sock, the kernel runs inet_csk_bind_conflict; since the tw_sock is matched in the bind hash table, a conflict is detected and the bind fails (see section 2.2, the bind system call). When establishing a connection, inet_hash_connect calls __inet_check_established to check whether the connection about to be established conflicts with an existing one:
311 static int __inet_check_established(struct inet_timewait_death_row *death_row,
312                     struct sock *sk, __u16 lport,
313                     struct inet_timewait_sock **twp)
314 {
...
335     sk_nulls_for_each(sk2, node, &head->twchain) {
336         if (sk2->sk_hash != hash)
337             continue;
338 
339         if (likely(INET_TW_MATCH(sk2, net, acookie,
340                      saddr, daddr, ports, dif))) {    // address|port match
341             tw = inet_twsk(sk2);
342             if (twsk_unique(sk, sk2, twp))    // call tcp_twsk_unique to decide whether there is a conflict
343                 goto unique;    // no conflict
344             else
345                 goto not_unique; // conflict
346         }
347     }
348     tw = NULL;
...
359 unique:
...
376     if (twp) {
377         *twp = tw;    // hand it to the caller
378     } else if (tw) {
379         /* Silly. Should hash-dance instead... */
380         inet_twsk_deschedule(tw, death_row);
381 
382         inet_twsk_put(tw);
383     }
384     return 0;
385 
386 not_unique:
387     spin_unlock(lock);
388     return -EADDRNOTAVAIL;
389 }
        The tcp_twsk_unique function:
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111     const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112     struct tcp_sock *tp = tcp_sk(sk);
...
125     if (tcptw->tw_ts_recent_stamp &&    // the timestamps option is enabled and a packet was received in the TIME_WAIT state
126         (twp == NULL || (sysctl_tcp_tw_reuse &&
127                  get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
128         tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
129         if (tp->write_seq == 0)
130             tp->write_seq = 1;
131         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
132         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
133         sock_hold(sktw);
134         return 1;
135     }
136 
137     return 0;
138 }
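        The sysctl_tcp_tw_reuse flag tested at line 126 is the net.ipv4.tcp_tw_reuse knob, normally flipped with `sysctl -w net.ipv4.tcp_tw_reuse=1`. A program can do the same through procfs, as in this sketch:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    /* Writing "1" here lets tcp_twsk_unique() approve reuse of a tw_sock
     * for outgoing connections (the timestamps option must also be on). */
    int fd = open("/proc/sys/net/ipv4/tcp_tw_reuse", O_WRONLY);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    if (write(fd, "1", 1) != 1)
        perror("write");
    close(fd);
    return 0;
}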
        As the code shows, provided the timestamps option was in use (tw_ts_recent_stamp is nonzero), tcp_twsk_unique returns 1 (no conflict) when:

(1) the caller of __inet_check_established does not ask for the tw_sock back (i.e. twp == NULL is true), or

(2) the net.ipv4.tcp_tw_reuse sysctl allows tw_sock reuse and more than one second has passed since the tw_sock last recorded a timestamp.

        In the no-conflict case the new connection's initial write_seq is set past the old connection's final sequence number (line 128), so old segments cannot be mistaken for new ones. If it was case (1), __inet_check_established releases the tw_sock itself; otherwise the tw_sock is handed back to the caller, inet_hash_connect. What exactly does case (1) mean when there is no conflict? And what does inet_hash_connect do with the tw_sock when case (1) does not apply? Look at the code:

589 int inet_hash_connect(struct inet_timewait_death_row *death_row,
590               struct sock *sk)
591 {
592     return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
593             __inet_check_established, __inet_hash_nolisten);
594 }
        So the user of __inet_check_established is __inet_hash_connect:
477 int __inet_hash_connect(struct inet_timewait_death_row *death_row,
478         struct sock *sk, u32 port_offset,
479         int (*check_established)(struct inet_timewait_death_row *,
480             struct sock *, __u16, struct inet_timewait_sock **),
481         int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
482 {
...
491     if (!snum) {
...
520                     if (!check_established(death_row, sk,
521                                 port, &tw))
522                         goto ok;
...
544 ok:
545         hint += i;
546 
547         /* Head lock still held and bh's disabled */
548         inet_bind_hash(sk, tb, port);
549         if (sk_unhashed(sk)) {
550             inet_sk(sk)->inet_sport = htons(port);
551             twrefcnt += hash(sk, tw);    // add sk to the ESTABLISHED hash table and take the tw_sock out of it
552         }
553         if (tw)
554             twrefcnt += inet_twsk_bind_unhash(tw, hinfo);  // take the tw_sock out of the bind hash table
555         spin_unlock(&head->lock);
556 
557         if (tw) {
558             inet_twsk_deschedule(tw, death_row);  // release the tw_sock
559             while (twrefcnt) {
560                 twrefcnt--;
561                 inet_twsk_put(tw);
562             }
563         }
564 
565         ret = 0;
566         goto out;
567     }
568 
569     head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
570     tb  = inet_csk(sk)->icsk_bind_hash;
571     spin_lock_bh(&head->lock);
572     if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {    // this socket is the only one bound to this IP|port pair
573         hash(sk, NULL);
574         spin_unlock_bh(&head->lock);
575         return 0;
576     } else {
577         spin_unlock(&head->lock);
578         /* No definite answer... Walk to established hash table */
579         ret = check_established(death_row, sk, snum, NULL);
580 out:
581         local_bh_enable();
582         return ret;
583     }
584 }

        Case (1) only happens when the port to bind is non-zero, which means the application process had already called bind successfully before calling connect; since the bind did not conflict, the tw_sock can simply be released at connect time. When case (1) does not apply (an ephemeral port is being chosen), the tw_sock handed back at line 520 is likewise released and removed from the hash tables (lines 551-562). A user-space sketch of the bind-then-connect pattern that triggers case (1) follows.
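
        For illustration, a minimal client (hypothetical local port and peer address) that binds an explicit local port before connecting; this is the path on which __inet_check_established is called with twp == NULL:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    int on = 1;
    struct sockaddr_in local, peer;

    /* SO_REUSEADDR may let bind() succeed when the port is occupied
     * only by TIME_WAIT sockets; the final check is left to connect(). */
    setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));

    memset(&local, 0, sizeof(local));
    local.sin_family = AF_INET;
    local.sin_port = htons(32000);              /* hypothetical local port */
    local.sin_addr.s_addr = htonl(INADDR_ANY);
    if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0)
        perror("bind");

    memset(&peer, 0, sizeof(peer));
    peer.sin_family = AF_INET;
    peer.sin_port = htons(80);
    inet_pton(AF_INET, "192.0.2.1", &peer.sin_addr);   /* hypothetical peer */
    if (connect(fd, (struct sockaddr *)&peer, sizeof(peer)) < 0)
        perror("connect");         /* EADDRNOTAVAIL if the check conflicts */

    close(fd);
    return 0;
}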

        tcp_death_row is defined as:

 35 struct inet_timewait_death_row tcp_death_row = {
 36     .sysctl_max_tw_buckets = NR_FILE * 2,
 37     .period     = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
 38     .death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
 39     .hashinfo   = &tcp_hashinfo,
 40     .tw_timer   = TIMER_INITIALIZER(inet_twdr_hangman, 0,
 41                         (unsigned long)&tcp_death_row),
 42     .twkill_work    = __WORK_INITIALIZER(tcp_death_row.twkill_work,
 43                          inet_twdr_twkill_work),
 44 /* Short-time timewait calendar */
 45 
 46     .twcal_hand = -1,
 47     .twcal_timer    = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
 48                         (unsigned long)&tcp_death_row),
 49 };
        The inet_twsk_schedule function:
340 void inet_twsk_schedule(struct inet_timewait_sock *tw,
341                struct inet_timewait_death_row *twdr,
342                const int timeo, const int timewait_len)
343 {
344     struct hlist_head *list;
345     int slot;
...
        // compute the slot of the tw sock in the TIME_WAIT timer lists; a larger slot means a longer timeout
371     slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK;
372 
373     spin_lock(&twdr->death_lock);
374 
375     /* Unlink it, if it was scheduled */
376     if (inet_twsk_del_dead_node(tw)) // already on a TIME_WAIT timer list: unlink it first
377         twdr->tw_count--;
378     else
379         atomic_inc(&tw->tw_refcnt);
380 
381     if (slot >= INET_TWDR_RECYCLE_SLOTS) {   // timeout too long: use the slow timer
382         /* Schedule to slow timer */
383         if (timeo >= timewait_len) {
384             slot = INET_TWDR_TWKILL_SLOTS - 1;
385         } else {
386             slot = DIV_ROUND_UP(timeo, twdr->period);
387             if (slot >= INET_TWDR_TWKILL_SLOTS)
388                 slot = INET_TWDR_TWKILL_SLOTS - 1;
389         }
390         tw->tw_ttd = jiffies + timeo;
391         slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
392         list = &twdr->cells[slot];  // add the tw_sock to twdr->cells
393     } else {  // short timeouts all go to the recycle timer
394         tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK);
395 
396         if (twdr->twcal_hand < 0) { // recycle timer not set or already expired
397             twdr->twcal_hand = 0;
398             twdr->twcal_jiffie = jiffies;  // record the time the timer was first set
399             twdr->twcal_timer.expires = twdr->twcal_jiffie +
400                           (slot << INET_TWDR_RECYCLE_TICK);
401             add_timer(&twdr->twcal_timer); // arm the recycle timer
402         } else {
403             if (time_after(twdr->twcal_timer.expires,
404                        jiffies + (slot << INET_TWDR_RECYCLE_TICK)))  // recycle timer has not expired yet
405                 mod_timer(&twdr->twcal_timer,
406                       jiffies + (slot << INET_TWDR_RECYCLE_TICK)); // bring the recycle timer forward
407             slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1);
408         }
409         list = &twdr->twcal_row[slot];  // add the tw_sock to twdr->twcal_row
410     }
411 
412     hlist_add_head(&tw->tw_death_node, list); // insert into the TIME_WAIT timer list
413 
414     if (twdr->tw_count++ == 0) // the TIME_WAIT timer lists were empty before this insertion
415         mod_timer(&twdr->tw_timer, jiffies + twdr->period); // arm the slow timer
416     spin_unlock(&twdr->death_lock);
417 }

        Line 371: slots are assigned by timeout length: 0 jiffies falls in slot 0, 1 to 2^INET_TWDR_RECYCLE_TICK jiffies in slot 1, 2^INET_TWDR_RECYCLE_TICK + 1 to 2^(INET_TWDR_RECYCLE_TICK + 1) jiffies in slot 2, and so on. Each slot covers 2^INET_TWDR_RECYCLE_TICK jiffies.

        Line 386: slots are likewise assigned by timeout length, but each slot covers twdr->period (i.e. 7.5 s).

        So the TIME_WAIT timer consists of two timer structures, twcal_timer and tw_timer. twcal_timer handles the shorter timeouts and is referred to here as the "recycle timer".

        tw_timer fires every TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS (i.e. 7.5 s). The conditions under which it is deleted are:

(1) an application process's connect system call, while binding an IP|port pair, matches a tw_sock but the match is judged non-conflicting (see the analysis around __inet_twsk_hashdance above);

(2) inet_twsk_deschedule is called to delete a tw_sock; if the tw queue then has no members, tw_timer is disabled:

326 void inet_twsk_deschedule(struct inet_timewait_sock *tw,
327               struct inet_timewait_death_row *twdr)
328 {
329     spin_lock(&twdr->death_lock);
330     if (inet_twsk_del_dead_node(tw)) {
331         inet_twsk_put(tw);
332         if (--twdr->tw_count == 0)     // the tw queue is empty
333             del_timer(&twdr->tw_timer);   // delete tw_timer
334     }
335     spin_unlock(&twdr->death_lock);
336     __inet_twsk_kill(tw, twdr->hashinfo);
337 }

        __inet_twsk_kill removes the tw_sock from the bind hash table and the ESTABLISHED hash table:

 70 static void __inet_twsk_kill(struct inet_timewait_sock *tw,
 71                  struct inet_hashinfo *hashinfo)
 72 {
 73     struct inet_bind_hashbucket *bhead;
 74     int refcnt;
 75     /* Unlink from established hashes. */
 76     spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
 77 
 78     spin_lock(lock);
 79     refcnt = inet_twsk_unhash(tw);    // remove from the ESTABLISHED hash table
 80     spin_unlock(lock);
 81 
 82     /* Disassociate with bind bucket. */
 83     bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
 84             hashinfo->bhash_size)];
 85 
 86     spin_lock(&bhead->lock);
 87     refcnt += inet_twsk_bind_unhash(tw, hashinfo);   // remove from the bind hash table
 88     spin_unlock(&bhead->lock);
 89 
 90 #ifdef SOCK_REFCNT_DEBUG
 91     if (atomic_read(&tw->tw_refcnt) != 1) {
 92         pr_debug("%s timewait_sock %p refcnt=%d\n",
 93              tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
 94     }
 95 #endif
 96     while (refcnt) {
 97         inet_twsk_put(tw);
 98         refcnt--;
 99     }
100 }

(3) when twcal_timer expires, inet_twdr_twcal_tick deletes tw_socks; if the tw queue then has no members, tw_timer is disabled.

        The recycle timer itself is never deleted; its timeout is slot * 2^INET_TWDR_RECYCLE_TICK jiffies. INET_TWDR_RECYCLE_TICK is defined as follows:

 41 #if HZ <= 16 || HZ > 4096
 42 # error Unsupported: HZ <= 16 or HZ > 4096
 43 #elif HZ <= 32
 44 # define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
 45 #elif HZ <= 64
 46 # define INET_TWDR_RECYCLE_TICK (6 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
 47 #elif HZ <= 128
 48 # define INET_TWDR_RECYCLE_TICK (7 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
 49 #elif HZ <= 256
 50 # define INET_TWDR_RECYCLE_TICK (8 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
 51 #elif HZ <= 512
 52 # define INET_TWDR_RECYCLE_TICK (9 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
 53 #elif HZ <= 1024
 54 # define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
 55 #elif HZ <= 2048
 56 # define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
 57 #else
 58 # define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
 59 #endif

        With INET_TWDR_RECYCLE_SLOTS_LOG equal to 5 (the kernel's value, giving INET_TWDR_RECYCLE_SLOTS = 32), if jiffies advances every 1 ms (HZ = 1000), then INET_TWDR_RECYCLE_TICK is 7 and each recycle slot covers 128 ms. If timeo is 60 s (usually the maximum), slot computes to 469, which is not less than INET_TWDR_RECYCLE_SLOTS, so such a tw_sock actually goes to the slow timer; the recycle timer only handles timeouts below 32 * 128 ms = 4.096 s. If 1 ms <= timeo <= 128 ms, slot = 1, giving the recycle timer its minimum timeout of 128 ms.
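
        Under the same assumptions (HZ = 1000, INET_TWDR_RECYCLE_SLOTS_LOG = 5), this small stand-alone program reproduces the slot arithmetic of inet_twsk_schedule (lines 371 and 383-388) for a few timeouts:

#include <stdio.h>

/* Constants matching the definitions quoted above, assuming HZ = 1000
 * (one jiffy per millisecond). */
#define INET_TWDR_RECYCLE_SLOTS_LOG 5
#define INET_TWDR_RECYCLE_SLOTS     (1 << INET_TWDR_RECYCLE_SLOTS_LOG)      /* 32 */
#define INET_TWDR_RECYCLE_TICK      (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)  /* 7  */
#define INET_TWDR_TWKILL_SLOTS      8
#define TCP_TIMEWAIT_LEN            60000                   /* 60 s in jiffies */
#define PERIOD                      (TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS)

int main(void)
{
    int timeos[] = { 1, 128, 4000, 60000 };  /* candidate timeouts in jiffies */
    int i;

    for (i = 0; i < 4; i++) {
        int timeo = timeos[i];
        /* line 371: round timeo up to a whole number of 128 ms recycle slots */
        int slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK;

        if (slot >= INET_TWDR_RECYCLE_SLOTS) {
            /* lines 383-388: slow-timer slot, each covering one period (7.5 s) */
            int slow = (timeo >= TCP_TIMEWAIT_LEN) ? INET_TWDR_TWKILL_SLOTS - 1
                     : (timeo + PERIOD - 1) / PERIOD;
            if (slow >= INET_TWDR_TWKILL_SLOTS)
                slow = INET_TWDR_TWKILL_SLOTS - 1;
            printf("timeo = %5d ms -> slow timer, relative slot %d\n", timeo, slow);
        } else {
            printf("timeo = %5d ms -> recycle timer, slot %d\n", timeo, slot);
        }
    }
    return 0;
}

        Running it shows, for example, that 60000 ms lands in the last slow-timer slot (7) while 1 ms and 128 ms both land in recycle slot 1.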

9.10.3 What

        The expiry handler for twcal_timer is inet_twdr_twcal_tick:

420 void inet_twdr_twcal_tick(unsigned long data)
421 {
422     struct inet_timewait_death_row *twdr;
423     int n, slot;
424     unsigned long j;
425     unsigned long now = jiffies;
426     int killed = 0;
427     int adv = 0;
428 
429     twdr = (struct inet_timewait_death_row *)data;
430 
431     spin_lock(&twdr->death_lock);
432     if (twdr->twcal_hand < 0) // recycle timer not set or already expired
433         goto out;
434 
435     slot = twdr->twcal_hand;
436     j = twdr->twcal_jiffie;  // the time the timer was first set
437 
438     for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {  // walk all slots
439         if (time_before_eq(j, now)) { // this slot has expired
440             struct hlist_node *safe;
441             struct inet_timewait_sock *tw;
442 
443             inet_twsk_for_each_inmate_safe(tw, safe,
444                                &twdr->twcal_row[slot]) {   // walk every node in the slot
445                 __inet_twsk_del_dead_node(tw); // delete the timer node
446                 __inet_twsk_kill(tw, twdr->hashinfo); // remove the tw sock from the TCP ESTABLISHED hash table
...
450                 inet_twsk_put(tw);
451                 killed++;  // count the nodes already deleted
452             }
453         } else { // not yet expired
454             if (!adv) {
455                 adv = 1;
456                 twdr->twcal_jiffie = j;     // new time base for the slots that have not expired
457                 twdr->twcal_hand = slot;    // new starting slot for the slots that have not expired
458             }
459 
460             if (!hlist_empty(&twdr->twcal_row[slot])) {
461                 mod_timer(&twdr->twcal_timer, j);
462                 goto out;
463             }
464         }
465         j += 1 << INET_TWDR_RECYCLE_TICK;
466         slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1);  // move to the next slot
467     }
468     twdr->twcal_hand = -1;   // mark the recycle timer as expired
469 
470 out:
471     if ((twdr->tw_count -= killed) == 0)
472         del_timer(&twdr->tw_timer);
473 #ifndef CONFIG_NET_NS
474     NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITKILLED, killed);
475 #endif
476     spin_unlock(&twdr->death_lock);
477 }
        Lines 439-451: the recycle timer treats all nodes that fall into the same slot identically. Its basic action is: if the slot has expired, kill its nodes; otherwise, re-arm the recycle timer.
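
        The expiry test at line 439 relies on the kernel's wrap-safe jiffies comparison macros. A minimal user-space restatement of the idea behind time_before_eq (comparing via a signed difference makes the test immune to counter wrap-around), for illustration:

#include <stdio.h>

/* Wrap-safe "a <= b" for a free-running unsigned counter, in the same
 * spirit as the kernel's time_before_eq(). */
static int time_before_eq_u32(unsigned int a, unsigned int b)
{
    return (int)(a - b) <= 0;
}

int main(void)
{
    unsigned int now = 0xfffffff0u;        /* counter close to wrapping */
    unsigned int deadline = now + 0x20;    /* wraps past zero */

    /* A plain "now <= deadline" is false here; the signed difference
     * still gives the intended answer. */
    printf("naive: %d, wrap-safe: %d\n",
           now <= deadline, time_before_eq_u32(now, deadline));
    return 0;
}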

        The expiry handler for the slow timer tw_timer is inet_twdr_hangman:

262 void inet_twdr_hangman(unsigned long data)
263 {
264     struct inet_timewait_death_row *twdr;
265     unsigned int need_timer;
266 
267     twdr = (struct inet_timewait_death_row *)data;
268     spin_lock(&twdr->death_lock);
269 
270     if (twdr->tw_count == 0)  // no tw_socks at all
271         goto out;
272 
273     need_timer = 0;
274     if (inet_twdr_do_twkill_work(twdr, twdr->slot)) { // delete this slot's nodes and their tw_socks; nonzero means the quota was hit
275         twdr->thread_slots |= (1 << twdr->slot);  // record the current slot
276         schedule_work(&twdr->twkill_work); // too many tw_socks to kill at once: push the unfinished deletion to the workqueue
277         need_timer = 1;
278     } else {  // the quota was not exceeded
279         /* We purged the entire slot, anything left?  */
280         if (twdr->tw_count)  // there are still tw_socks
281             need_timer = 1;  // tw_timer must be re-armed
282         twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));  // move to the next slot
283     }
284     if (need_timer)
285         mod_timer(&twdr->tw_timer, jiffies + twdr->period);
286 out:
287     spin_unlock(&twdr->death_lock);
288 }
        inet_twdr_hangman handles only one slot per expiry, then re-arms tw_timer to fire after another twdr->period and handle the next slot. Since the timeout difference between adjacent slots is exactly one twdr->period, every slot is processed in time.
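
        A toy model of this rotation (assumed constants as above; one tick stands for one twdr->period of 7.5 s) showing that a tw_sock placed n slots ahead of the hand is reaped after roughly n * 7.5 s:

#include <stdio.h>

#define INET_TWDR_TWKILL_SLOTS 8
#define PERIOD_SECS            7.5  /* TCP_TIMEWAIT_LEN / slots, at 60 s */

int main(void)
{
    int hand = 0;     /* twdr->slot: the slot reaped at the next expiry */
    int target = 7;   /* slot of a tw_sock scheduled for the full ~60 s */

    for (int tick = 1; tick <= INET_TWDR_TWKILL_SLOTS; tick++) {
        /* inet_twdr_hangman: reap one slot per tick, then advance the hand */
        if (hand == target) {
            printf("tw_sock reaped after ~%.1f s (tick %d)\n",
                   tick * PERIOD_SECS, tick);
            break;
        }
        hand = (hand + 1) & (INET_TWDR_TWKILL_SLOTS - 1);
    }
    return 0;
}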

        The inet_twdr_do_twkill_work function deletes nodes from the slow-timer lists along with their tw_socks:

215 static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
216                     const int slot)
217 {
218     struct inet_timewait_sock *tw;
219     unsigned int killed;
220     int ret;
221 
222     /* NOTE: compare this to previous version where lock
223      * was released after detaching chain. It was racy,
224      * because tw buckets are scheduled in not serialized context
225      * in 2.3 (with netfilter), and with softnet it is common, because
226      * soft irqs are not sequenced.
227      */
228     killed = 0;
229     ret = 0;
230 rescan:
231     inet_twsk_for_each_inmate(tw, &twdr->cells[slot]) { // walk the slow-timer queue
232         __inet_twsk_del_dead_node(tw);
233         spin_unlock(&twdr->death_lock);
234         __inet_twsk_kill(tw, twdr->hashinfo);
235 #ifdef CONFIG_NET_NS
236         NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
237 #endif
238         inet_twsk_put(tw);
239         killed++;       // count the nodes already deleted
240         spin_lock(&twdr->death_lock);
241         if (killed > INET_TWDR_TWKILL_QUOTA) {  // killed too many in one pass
242             ret = 1;
243             break;
244         }
245 
246         /* While we dropped twdr->death_lock, another cpu may have
247          * killed off the next TW bucket in the list, therefore
248          * do a fresh re-read of the hlist head node with the
249          * lock reacquired.  We still use the hlist traversal
250          * macro in order to get the prefetches.
251          */
252         goto rescan;
253     }
254 
255     twdr->tw_count -= killed;
256 #ifndef CONFIG_NET_NS
257     NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITED, killed);
258 #endif
259     return ret;
260 }
        The inet_twdr_twkill_work function is the handler behind twdr->twkill_work; its job is to carry the killing that inet_twdr_do_twkill_work left unfinished through to the end:
291 void inet_twdr_twkill_work(struct work_struct *work)
292 {
293     struct inet_timewait_death_row *twdr =
294         container_of(work, struct inet_timewait_death_row, twkill_work);
295     int i;
296 
297     BUILD_BUG_ON((INET_TWDR_TWKILL_SLOTS - 1) >
298             (sizeof(twdr->thread_slots) * 8));
299 
300     while (twdr->thread_slots) {
301         spin_lock_bh(&twdr->death_lock);
302         for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) {
303             if (!(twdr->thread_slots & (1 << i))) // slot i does not need processing
304                 continue;
305 
306             while (inet_twdr_do_twkill_work(twdr, i) != 0) {  // loop until everything is killed
307                 if (need_resched()) {
308                     spin_unlock_bh(&twdr->death_lock);
309                     schedule();
310                     spin_lock_bh(&twdr->death_lock);
311                 }
312             }
313 
314             twdr->thread_slots &= ~(1 << i);  // this slot is now completely cleared
315         }
316         spin_unlock_bh(&twdr->death_lock);
317     }
318 }
        Question: when the slow timer expires and the number of tw_socks to be released exceeds the quota, why is the remaining work handed over to a worker thread?

        Answer (my own understanding): Linux timers run in softirq context; if the handler runs too long, other work on the current CPU cannot make progress, which violates fairness. A worker thread runs at a lower priority, so it matters little if it runs a bit longer.
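
        The pattern at work here, doing a bounded batch in interrupt context and deferring the remainder to process context, is generic. A minimal user-space sketch of the same quota-and-defer idea, with hypothetical names:

#include <stdio.h>

#define QUOTA 100   /* mirrors the role of INET_TWDR_TWKILL_QUOTA */

/* Hypothetical batch reaper: frees a bounded batch of items, returns 1
 * if items remain (caller should defer the rest), 0 if the list is empty. */
static int reap_batch(int *pending)
{
    int killed = 0;
    while (*pending > 0 && killed <= QUOTA) {
        (*pending)--;   /* stand-in for __inet_twsk_kill() + put */
        killed++;
    }
    return *pending > 0;
}

int main(void)
{
    int pending = 350;   /* imagine 350 tw_socks queued in one slot */

    /* "Timer context": one bounded batch only, like inet_twdr_hangman(). */
    if (reap_batch(&pending))
        printf("quota hit, %d left: defer to worker\n", pending);

    /* "Worker context": loop until the slot is empty, like
     * inet_twdr_twkill_work() does. */
    while (reap_batch(&pending))
        ;
    printf("slot cleared, %d pending\n", pending);
    return 0;
}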
