Linux内核分析

来源:互联网 发布:png格式打开软件 编辑:程序博客网 时间:2024/06/14 16:39

网络收包流程从网卡驱动开始,一直往上,涉及NAPI、GRO、RPS等特性,但是一般最后都会调用__netif_receive_skb函数:

函数主要有几个处理:

1、vlan报文的处理,主要是循环把vlan头剥掉,如果qinq场景,两个vlan都会被剥掉;

2、交给rx_handler处理,例如OVS、linux bridge等;

3、ptype_all处理,例如抓包程序、raw socket等;

4、ptype_base处理,交给协议栈处理,例如ip、arp、rarp等;


  /*
   * Core receive routine for one skb. In order it: strips an 802.1Q tag,
   * delivers to every matching ptype_all handler, runs the device's
   * rx_handler, performs VLAN receive processing, and finally delivers to
   * the ptype_base handlers whose type matches skb->protocol.
   * Returns NET_RX_DROP if nothing accepted the skb, otherwise the value
   * returned by the last handler that was invoked.
   */
  1. static int __netif_receive_skb(struct sk_buff *skb)
  2. {
  3.     struct packet_type *ptype, *pt_prev;
  4.     rx_handler_func_t *rx_handler;
  5.     struct net_device *orig_dev;
  6.     struct net_device *null_or_dev;
  7.     bool deliver_exact = false;
  8.     int ret = NET_RX_DROP;
  9.     __be16 type;

  10.     if (!netdev_tstamp_prequeue)
  11.         net_timestamp_check(skb);

  12.     trace_netif_receive_skb(skb);

  13.     if (netpoll_receive_skb(skb))
  14.         return NET_RX_DROP;

  15.     if (!skb->skb_iif)
  16.         skb->skb_iif = skb->dev->ifindex;
  17.     orig_dev = skb->dev;

  18.     skb_reset_network_header(skb);  // point both the L3 and L4 header offsets at skb->data; per the article, the L2 header has already been consumed by this point
  19.     skb_reset_transport_header(skb);
  20.     skb_reset_mac_len(skb);

  21.     pt_prev = NULL;

  22.     rcu_read_lock();

  23. another_round:

  24.     __this_cpu_inc(softnet_data.processed);

  25.     if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
  26.         skb = vlan_untag(skb);
  27.         if (unlikely(!skb))
  28.             goto out;
  29.     }

  30. #ifdef CONFIG_NET_CLS_ACT
  31.     if (skb->tc_verd & TC_NCLS) {
  32.         skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
  33.         goto ncls;
  34.     }
  35. #endif

  36.     list_for_each_entry_rcu(ptype, &ptype_all, list) {  // before any protocol-specific handler runs, call the handlers registered on ptype_all
  37.         if (!ptype->dev || ptype->dev == skb->dev) {    // per the article, the most common user is tcpdump, which obtains every received packet here
  38.             if (pt_prev)
  39.                 ret = deliver_skb(skb, pt_prev, orig_dev);
  40.             pt_prev = ptype;  // pt_prev defers delivery: a matched handler is only invoked once the next match is found
  41.         }                     
  42.     }

  43. #ifdef CONFIG_NET_CLS_ACT
  44.     skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
  45.     if (!skb)
  46.         goto out;
  47. ncls:
  48. #endif
  49.     rx_handler = rcu_dereference(skb->dev->rx_handler);  // NOTE(review): article says "decided by the specific driver"; presumably set by whoever registered an rx_handler on this device (e.g. OVS or the Linux bridge) - confirm
  50.     if (rx_handler) {
  51.         if (pt_prev) {
  52.             ret = deliver_skb(skb, pt_prev, orig_dev);
  53.             pt_prev = NULL;
  54.         }
  55.         switch (rx_handler(&skb)) {
  56.         case RX_HANDLER_CONSUMED:
  57.             goto out;
  58.         case RX_HANDLER_ANOTHER:
  59.             goto another_round;
  60.         case RX_HANDLER_EXACT:
  61.             deliver_exact = true;  /* intentionally falls through to RX_HANDLER_PASS */
  62.         case RX_HANDLER_PASS:
  63.             break;
  64.         default:
  65.             BUG();
  66.         }
  67.     }

  68.     if (vlan_tx_tag_present(skb)) {
  69.         if (pt_prev) {
  70.             ret = deliver_skb(skb, pt_prev, orig_dev);
  71.             pt_prev = NULL;
  72.         }
  73.         if (vlan_do_receive(&skb)) {
  74.             ret = __netif_receive_skb(skb);
  75.             goto out;
  76.         } else if (unlikely(!skb))
  77.             goto out;
  78.     }

  79.     /* deliver only exact match when indicated */
  80.     null_or_dev = deliver_exact ? skb->dev : NULL;

  81.     type = skb->protocol;
  82.     list_for_each_entry_rcu(ptype,
  83.             &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
  84.         if (ptype->type == type &&
  85.             (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
  86.              ptype->dev == orig_dev)) {
  87.             if (pt_prev)
  88.                 ret = deliver_skb(skb, pt_prev, orig_dev); // per the article, deliver_skb() takes an extra reference first: atomic_inc(&skb->users)
  89.             pt_prev = ptype;
  90.         }
  91.     }

  92.     if (pt_prev) {
  93.         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);  // the last handler is called directly; per the article no extra reference is taken for this final delivery
  94.     } else {
  95.         atomic_long_inc(&skb->dev->rx_dropped);
  96.         kfree_skb(skb);
  97.         /* Jamal, now you will not able to escape explaining
  98.          * me how you were going to use this. :-)
  99.          */
  100.         ret = NET_RX_DROP;
  101.     }

  102. out:
  103.     rcu_read_unlock();
  104.     return ret;
  105. }
该函数涉及两个全局变量:
  1. static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;  /* per-protocol handler lists, indexed by (type & PTYPE_HASH_MASK) */
  2. static struct list_head ptype_all __read_mostly;  /* handlers that are offered every packet (taps) */
看几个常见的packet_type,这些都在相应的协议初始化的时候调用dev_add_pack加入到特定的链表中:
  /* Packet handler for IPv4: frames of type ETH_P_IP are passed to ip_rcv(); also supplies the GSO/GRO callbacks. */
  1. static struct packet_type ip_packet_type __read_mostly = {
  2.     .type = cpu_to_be16(ETH_P_IP),
  3.     .func = ip_rcv,
  4.     .gso_send_check = inet_gso_send_check,
  5.     .gso_segment = inet_gso_segment,
  6.     .gro_receive = inet_gro_receive,
  7.     .gro_complete = inet_gro_complete,
  8. };

  /* Packet handler for ARP: frames of type ETH_P_ARP are passed to arp_rcv(). */
  9. static struct packet_type arp_packet_type __read_mostly = {
  10.     .type = cpu_to_be16(ETH_P_ARP),
  11.     .func = arp_rcv,
  12. };
在ip_rcv函数中会对L3头做一些有效性检测:
  /*
   * IPv4 receive handler (the .func of ip_packet_type). Validates the IP
   * header (minimum length, version 4, header checksum, total length),
   * trims any padding beyond tot_len, clears the IP control block, and
   * hands the skb to the NF_INET_PRE_ROUTING hook with ip_rcv_finish as
   * the continuation. Every rejection path returns NET_RX_DROP.
   */
  1. int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
  2. {
  3.     const struct iphdr *iph;
  4.     u32 len;

  5.     /* When the interface is in promisc. mode, drop all the crap
  6.      * that it receives, do not try to analyse it.
  7.      */
  8.     if (skb->pkt_type == PACKET_OTHERHOST)  // per the article, pkt_type is set by the driver from the destination MAC; frames not addressed to this host are dropped here
  9.         goto drop;


  10.     IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);

  11.     if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
  12.         IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
  13.         goto out;
  14.     }

  15.     if (!pskb_may_pull(skb, sizeof(struct iphdr)))
  16.         goto inhdr_error;

  17.     iph = ip_hdr(skb);

  18.     /*
  19.      * RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
  20.      *
  21.      * Is the datagram acceptable?
  22.      *
  23.      * 1. Length at least the size of an ip header
  24.      * 2. Version of 4
  25.      * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
  26.      * 4. Doesn't have a bogus length
  27.      */

  28.     if (iph->ihl < 5 || iph->version != 4)
  29.         goto inhdr_error;

  30.     if (!pskb_may_pull(skb, iph->ihl*4))
  31.         goto inhdr_error;

  32.     iph = ip_hdr(skb);

  33.     if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))  // verify the IP header checksum
  34.         goto inhdr_error;
  35.     len = ntohs(iph->tot_len);  // tot_len is the real datagram size; per the article skb->len is set by the driver and can be larger because short frames are padded
  36.     if (skb->len < len) {// article's r8169 example: UDP with 1 byte of payload gives tot_len 20+8+1=29 while skb->len is 46 = 64 (min frame) - 14 - 4. NOTE(review): the article labels the 4 bytes "vlan"; presumably they are the Ethernet FCS - confirm
  37.         IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
  38.         goto drop;
  39.     } else if (len < (iph->ihl*4))
  40.         goto inhdr_error;

  41.     /* Our transport medium may have padded the buffer out. Now we know it
  42.      * is IP we can trim to the true length of the frame.
  43.      * Note this now means skb->len holds ntohs(iph->tot_len).
  44.      */
  45.     if (pskb_trim_rcsum(skb, len)) {  // strip the padding bytes
  46.         IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
  47.         goto drop;
  48.     }

  49.     /* Remove any debris in the socket control block */
  50.     memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));

  51.     /* Must drop socket now because of tproxy. */
  52.     skb_orphan(skb);

  53.     return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,
  54.                ip_rcv_finish);

  55. inhdr_error:
  56.     IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
  57. drop:
  58.     kfree_skb(skb);
  59. out:
  60.     return NET_RX_DROP;
  61. }
然后调用ip_rcv_finish:

  /*
   * Continuation of ip_rcv(). Performs the input route lookup when the skb
   * has no dst yet, updates per-class accounting (CONFIG_IP_ROUTE_CLASSID),
   * processes IP options when the header is longer than 5 words, counts
   * multicast/broadcast input, then passes the skb to dst_input().
   * On a routing or option failure the skb is freed and NET_RX_DROP returned.
   */
  1. static int ip_rcv_finish(struct sk_buff *skb)
  2. {
  3.     const struct iphdr *iph = ip_hdr(skb);
  4.     struct rtable *rt;

  5.     /* 
  6.      * Initialise the virtual path cache for the packet. It describes
  7.      * how the packet travels inside Linux networking.
  8.      */
  9.     if (skb_dst(skb) == NULL) {
  10.         int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,// route lookup; per the article it decides from the destination address between local delivery and forwarding (when forwarding is enabled)
  11.                            iph->tos, skb->dev);
  12.         if (unlikely(err)) {
  13.             if (err == -EHOSTUNREACH)
  14.                 IP_INC_STATS_BH(dev_net(skb->dev),
  15.                         IPSTATS_MIB_INADDRERRORS);
  16.             else if (err == -ENETUNREACH)
  17.                 IP_INC_STATS_BH(dev_net(skb->dev),
  18.                         IPSTATS_MIB_INNOROUTES);
  19.             else if (err == -EXDEV)
  20.                 NET_INC_STATS_BH(dev_net(skb->dev),
  21.                          LINUX_MIB_IPRPFILTER);
  22.             goto drop;
  23.         }
  24.     } 

  25. #ifdef CONFIG_IP_ROUTE_CLASSID
  26.     if (unlikely(skb_dst(skb)->tclassid)) {
  27.         struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
  28.         u32 idx = skb_dst(skb)->tclassid;
  29.         st[idx&0xFF].o_packets++;
  30.         st[idx&0xFF].o_bytes += skb->len;
  31.         st[(idx>>16)&0xFF].i_packets++;
  32.         st[(idx>>16)&0xFF].i_bytes += skb->len;
  33.     } 
  34. #endif

  35.     if (iph->ihl > 5 && ip_rcv_options(skb))
  36.         goto drop;

  37.     rt = skb_rtable(skb);
  38.     if (rt->rt_type == RTN_MULTICAST) {
  39.         IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
  40.                 skb->len);
  41.     } else if (rt->rt_type == RTN_BROADCAST)
  42.         IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
  43.                 skb->len);

  44.     return dst_input(skb);  // per the article this is skb_dst(skb)->input(skb), assigned during the route lookup; for local delivery it is ip_local_deliver

  45. drop:
  46.     kfree_skb(skb);
  47.     return NET_RX_DROP;
  48. }
  /*
   * Local-delivery entry point. If the packet is a fragment it is handed to
   * ip_defrag(); a non-zero result from ip_defrag() ends processing here with
   * return 0. Otherwise the skb goes through the NF_INET_LOCAL_IN hook with
   * ip_local_deliver_finish as the continuation.
   */
  1. int ip_local_deliver(struct sk_buff *skb)
  2. {
  3.     /*
  4.      * Reassemble IP fragments.
  5.      */

  6.     if (ip_is_fragment(ip_hdr(skb))) {
  7.         if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
  8.             return 0;
  9.     }

  10.     return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
  11.                ip_local_deliver_finish);
  12. }
略过ip defrag流程,直接调用ip_local_deliver_finish,该函数根据L3头指定的L4协议,调用特定的函数:
  /*
   * Dispatches a locally destined packet to its L4 protocol. Pulls the IP
   * header, offers the packet to raw_local_deliver(), then looks up the
   * handler in inet_protos[] by protocol number and calls it. A negative
   * handler return value is treated as "resubmit as protocol -ret". With no
   * registered handler and no raw delivery, an ICMP protocol-unreachable is
   * sent (subject to the xfrm policy check). Always returns 0.
   */
  1. static int ip_local_deliver_finish(struct sk_buff *skb)
  2. {
  3.     struct net *net = dev_net(skb->dev);

  4.     __skb_pull(skb, ip_hdrlen(skb)); // advance data past the L3 header so that it points at the L4 header

  5.     /* Point into the IP datagram, just past the header. */
  6.     skb_reset_transport_header(skb);

  7.     rcu_read_lock();
  8.     {
  9.         int protocol = ip_hdr(skb)->protocol;  // L4 protocol number, e.g. TCP or UDP
  10.         int hash, raw;
  11.         const struct net_protocol *ipprot;

  12.     resubmit:
  13.         raw = raw_local_deliver(skb, protocol); // NOTE(review): presumably delivers to matching raw sockets; the result is only used below when no protocol handler exists
  14.         hash = protocol & (MAX_INET_PROTOS - 1);
  15.         ipprot = rcu_dereference(inet_protos[hash]);  // per the article, udp_protocol in the UDP case
  16.         if (ipprot != NULL) {
  17.             int ret;

  18.             if (!net_eq(net, &init_net) && !ipprot->netns_ok) {
  19.                 if (net_ratelimit())
  20.                     printk("%s: proto %d isn't netns-ready\n",
  21.                         __func__, protocol);
  22.                 kfree_skb(skb);
  23.                 goto out;
  24.             }

  25.             if (!ipprot->no_policy) {
  26.                 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
  27.                     kfree_skb(skb);
  28.                     goto out;
  29.                 }
  30.                 nf_reset(skb);
  31.             }
  32.             ret = ipprot->handler(skb);  // per the article, udp_rcv in the UDP case
  33.             if (ret < 0) {
  34.                 protocol = -ret;
  35.                 goto resubmit;
  36.             }
  37.             IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
  38.         } else {
  39.             if (!raw) {
  40.                 if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
  41.                     IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);
  42.                     icmp_send(skb, ICMP_DEST_UNREACH,
  43.                           ICMP_PROT_UNREACH, 0);
  44.                 }
  45.             } else
  46.                 IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
  47.             kfree_skb(skb);
  48.         }
  49.     }
  50.  out:
  51.     rcu_read_unlock();

  52.     return 0;
  53. }
udp调用udp_rcv,最后调用__udp4_lib_rcv:

  /*
   * UDP/UDP-Lite receive routine. Validates the UDP header and length,
   * initialises checksum state, hands broadcast/multicast packets to
   * __udp4_lib_mcast_deliver(), and otherwise looks up the destination
   * socket and queues the skb on it. If no socket matches, a port-unreachable
   * ICMP is sent and the skb is freed.
   * NOTE(review): the labels short_packet, csum_error and drop are referenced
   * below but their bodies are not included in this listing.
   */
  1. int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
  2.            int proto)
  3. {
  4.     struct sock *sk; 
  5.     struct udphdr *uh; 
  6.     unsigned short ulen;
  7.     struct rtable *rt = skb_rtable(skb);
  8.     __be32 saddr, daddr;
  9.     struct net *net = dev_net(skb->dev);

  10.     /* 
  11.      * Validate the packet.
  12.      */
  13.     if (!pskb_may_pull(skb, sizeof(struct udphdr)))
  14.         goto drop; /* No space for header. */

  15.     uh = udp_hdr(skb);
  16.     ulen = ntohs(uh->len);
  17.     saddr = ip_hdr(skb)->saddr;
  18.     daddr = ip_hdr(skb)->daddr;

  19.     if (ulen > skb->len)
  20.         goto short_packet;

  21.     if (proto == IPPROTO_UDP) {
  22.         /* UDP validates ulen. */
  23.         if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
  24.             goto short_packet;
  25.         uh = udp_hdr(skb);
  26.     } 

  27.     if (udp4_csum_init(skb, uh, proto))
  28.         goto csum_error;

  29.     if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
  30.         return __udp4_lib_mcast_deliver(net, skb, uh,
  31.                 saddr, daddr, udptable);

  32.     sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); // look up the matching sock by IP addresses and port numbers
  33.                                                                      // per the article, the receiving process sleeps on the corresponding list
  34.     if (sk != NULL) { // non-NULL: a socket exists that is waiting for this data
  35.         int ret = udp_queue_rcv_skb(sk, skb);
  36.         sock_put(sk);

  37.         /* a return value > 0 means to resubmit the input, but
  38.          * it wants the return to be -protocol, or 0
  39.          */
  40.         if (ret > 0)
  41.             return -ret;
  42.         return 0;
  43.     }

  44.     if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
  45.         goto drop;
  46.     nf_reset(skb);

  47.     /* No socket. Drop packet silently, if checksum is wrong */
  48.     if (udp_lib_checksum_complete(skb))
  49.         goto csum_error;

  50.     UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
  51.     icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

  52.     /*
  53.      * Hmm. We got an UDP packet to a port to which we
  54.      * don't wanna listen.  Ignore it.
  55.      */
  56.     kfree_skb(skb);
  57.     return 0;
  58. }     
首先看一下sock的hash查找函数:__udp4_lib_lookup_skb,该函数涉及hash表的一些查找,主要看一下具体的匹配函数:
  /*
   * Scores how well socket sk matches an incoming packet.
   * Returns -1 when the socket cannot match (different netns, different
   * hashed port, IPv6-only socket, or any set field that disagrees with the
   * packet). Otherwise returns 1 for a PF_INET socket (0 for other families)
   * plus 2 for each of these fields that is set and matches: bound local
   * address, connected remote address, connected remote port, bound device.
   */
  1. static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
  2.              unsigned short hnum,
  3.              __be16 sport, __be32 daddr, __be16 dport, int dif) 
  4. {
  5.     int score = -1;

  6.     if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum &&
  7.             !ipv6_only_sock(sk)) {
  8.         struct inet_sock *inet = inet_sk(sk);

  9.         score = (sk->sk_family == PF_INET ? 1 : 0);  // usually PF_INET
  10.         if (inet->inet_rcv_saddr) {   // per the article: set when bind() specified an address, otherwise INADDR_ANY
  11.             if (inet->inet_rcv_saddr != daddr)
  12.                 return -1;
  13.             score += 2;
  14.         } 
  15.         if (inet->inet_daddr) {       // per the article: usually 0, see inet_bind()
  16.             if (inet->inet_daddr != saddr)
  17.                 return -1;
  18.             score += 2;
  19.         }
  20.         if (inet->inet_dport) {        // per the article: usually 0
  21.             if (inet->inet_dport != sport)
  22.                 return -1;
  23.             score += 2;
  24.         }
  25.         if (sk->sk_bound_dev_if) {    // per the article: usually 0
  26.             if (sk->sk_bound_dev_if != dif) 
  27.                 return -1;
  28.             score += 2;
  29.         }
  30.     } 
  31.     return score;
  32. }
查找时先用端口号定位hash表中的项,然后由compute_score根据各个参数计算score,score大于-1表示该sock与报文匹配。
找到sock后,去掉一些有效性检测,udp_queue_rcv_skb的逻辑如下:
  /*
   * Excerpt of udp_queue_rcv_skb() (validity checks omitted by the article;
   * the drop label is outside this excerpt). If the socket is not owned by a
   * user context the skb is queued directly, otherwise it is appended to the
   * socket backlog.
   */
  1. if (sk_rcvqueues_full(sk, skb))  // receive queues are over their limit (the article points at sk->sk_rmem_alloc)
  2.         goto drop;

  3.     rc = 0;

  4.     bh_lock_sock(sk);
  5.     if (!sock_owned_by_user(sk))
  6.         rc = __udp_queue_rcv_skb(sk, skb);  
  7.     else if (sk_add_backlog(sk, skb)) {
  8.         bh_unlock_sock(sk);
  9.         goto drop;
  10.     } 
  11.     bh_unlock_sock(sk);
分成两种情况:
1)sk没有被人占用,则把skb加入sk_receive_queue,然后唤醒等待的进程。
2)如果sk被人占用,则把skb加入backlog链表,释放sk的时候会处理该链表。
先看第一种情况:

  /*
   * Appends skb to sk->sk_receive_queue and notifies the socket.
   * Returns 0 on success; -ENOMEM when sk_rmem_alloc plus the skb's truesize
   * reaches sk_rcvbuf; the error from sk_filter() if the filter rejects the
   * skb; -ENOBUFS when sk_rmem_schedule() fails. The skb is not freed here
   * on failure.
   */
  1. int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
  2. {
  3.     int err;
  4.     int skb_len;
  5.     unsigned long flags;
  6.     struct sk_buff_head *list = &sk->sk_receive_queue; // head of the socket's receive queue

  7.     /* Cast sk->rcvbuf to unsigned... It is pointless, but reduces
  8.        number of warnings when compiling with ---ANK
  9.      */
  10.     if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
  11.         (unsigned)sk->sk_rcvbuf) {
  12.         atomic_inc(&sk->sk_drops);
  13.         trace_sock_rcvqueue_full(sk, skb);
  14.         return -ENOMEM;
  15.     }

  16.     err = sk_filter(sk, skb);
  17.     if (err)
  18.         return err;

  19.     if (!sk_rmem_schedule(sk, skb->truesize)) {
  20.         atomic_inc(&sk->sk_drops);
  21.         return -ENOBUFS;
  22.     }

  23.     skb->dev = NULL;
  24.     skb_set_owner_r(skb, sk);

  25.     /* Cache the SKB length before we tack it onto the receive
  26.      * queue. Once it is added it no longer belongs to us and
  27.      * may be freed by other threads of control pulling packets
  28.      * from the queue.
  29.      */
  30.     skb_len = skb->len;

  31.     /* we escape from rcu protected region, make sure we dont leak
  32.      * a norefcounted dst
  33.      */
  34.     skb_dst_force(skb);
  35.     spin_lock_irqsave(&list->lock, flags);
  36.     skb->dropcount = atomic_read(&sk->sk_drops);
  37.     __skb_queue_tail(list, skb);
  38.     spin_unlock_irqrestore(&list->lock, flags);

  39.     if (!sock_flag(sk, SOCK_DEAD))
  40.         sk->sk_data_ready(sk, skb_len); // per the article, sock_init_data() sets this callback to sock_def_readable
  41.     return 0;
  42. }
  /*
   * Data-ready callback: under rcu_read_lock(), wakes any sleeper on the
   * socket's wait queue with the readable poll events and then calls
   * sk_wake_async() with POLL_IN. The len argument is not used here.
   */
  1. static void sock_def_readable(struct sock *sk, int len)
  2. {
  3.     struct socket_wq *wq;

  4.     rcu_read_lock();
  5.     wq = rcu_dereference(sk->sk_wq);
  6.     if (wq_has_sleeper(wq))
  7.         wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
  8.                         POLLRDNORM | POLLRDBAND);
  9.     sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
  10.     rcu_read_unlock();
  11. }
再看第二种情况,加入到对应的链表:
  /*
   * Appends skb to the tail of the socket's backlog list (a singly linked
   * list tracked by sk_backlog.head / sk_backlog.tail). An empty list is
   * recognised by a NULL tail, in which case skb also becomes the head.
   */
  1. static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
  2. {
  3.     /* dont let skb dst not refcounted, we are going to leave rcu lock */
  4.     skb_dst_force(skb);

  5.     if (!sk->sk_backlog.tail)
  6.         sk->sk_backlog.head = skb;
  7.     else
  8.         sk->sk_backlog.tail->next = skb;

  9.     sk->sk_backlog.tail = skb;
  10.     skb->next = NULL;
  11. }
释放sock的时候会判断该链表:
  /*
   * Gives up ownership of the socket. With sk_lock.slock held (bottom halves
   * disabled) it first drains the backlog through __release_sock() if
   * sk_backlog.tail is set, then clears sk_lock.owned and wakes anyone
   * waiting on sk_lock.wq.
   */
  1. void release_sock(struct sock *sk)
  2. {
  3.     /*
  4.      * The sk_lock has mutex_unlock() semantics:
  5.      */
  6.     mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);

  7.     spin_lock_bh(&sk->sk_lock.slock);
  8.     if (sk->sk_backlog.tail)
  9.         __release_sock(sk);
  10.     sk->sk_lock.owned = 0;
  11.     if (waitqueue_active(&sk->sk_lock.wq))
  12.         wake_up(&sk->sk_lock.wq);
  13.     spin_unlock_bh(&sk->sk_lock.slock);
  14. }
__release_sock会遍历backlog链表(sk_backlog.head到sk_backlog.tail)上的所有skb,分别调用sk_backlog_rcv函数:

  /*
   * Drains the socket backlog. Each pass detaches the whole list (head and
   * tail set to NULL), drops the lock with bh_unlock_sock(), feeds every
   * detached skb to sk_backlog_rcv(), then re-takes the lock; the outer loop
   * repeats while new skbs were queued in the meantime. sk_backlog.len is
   * reset to 0 at the end. Per the __releases/__acquires annotations the
   * caller holds sk_lock.slock on entry and on return.
   */
  1. static void __release_sock(struct sock *sk)
  2.     __releases(&sk->sk_lock.slock)
  3.     __acquires(&sk->sk_lock.slock)
  4. {
  5.     struct sk_buff *skb = sk->sk_backlog.head;

  6.     do {
  7.         sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
  8.         bh_unlock_sock(sk);

  9.         do {
  10.             struct sk_buff *next = skb->next;

  11.             WARN_ON_ONCE(skb_dst_is_noref(skb));
  12.             skb->next = NULL;
  13.             sk_backlog_rcv(sk, skb); // per the article: sk->sk_backlog_rcv(sk, skb), which is sk->sk_prot->backlog_rcv
  14.     
  15.             /*
  16.              * We are in process context here with softirqs
  17.              * disabled, use cond_resched_softirq() to preempt.
  18.              * This is safe to do because we've taken the backlog
  19.              * queue private:
  20.              */
  21.             cond_resched_softirq();

  22.             skb = next;
  23.         } while (skb != NULL);

  24.         bh_lock_sock(sk);
  25.     } while ((skb = sk->sk_backlog.head) != NULL);
  26.     
  27.     /*
  28.      * Doing the zeroing here guarantee we can not loop forever
  29.      * while a wild producer attempts to flood us.
  30.      */
  31.     sk->sk_backlog.len = 0;
  32. }
对于udp为__udp_queue_rcv_skb:
  /*
   * Queues skb on a UDP socket through ip_queue_rcv_skb().
   * Returns 0 on success. On failure it bumps UDP_MIB_INERRORS (and also
   * UDP_MIB_RCVBUFERRORS for -ENOMEM), frees the skb, emits the failure
   * tracepoint and returns -1.
   */
  1. static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
  2. {
  3.     int rc;

  4.     if (inet_sk(sk)->inet_daddr)
  5.         sock_rps_save_rxhash(sk, skb->rxhash);

  6.     rc = ip_queue_rcv_skb(sk, skb); // per the article this ends up in sock_queue_rcv_skb(), i.e. the same path as the direct-queue case
  7.     if (rc < 0) {
  8.         int is_udplite = IS_UDPLITE(sk);

  9.         /* Note that an ENOMEM error is charged twice */
  10.         if (rc == -ENOMEM)
  11.             UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
  12.                      is_udplite);
  13.         UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
  14.         kfree_skb(skb);
  15.         trace_udp_fail_queue_rcv_skb(rc, sk);
  16.         return -1;
  17.     }

  18.     return 0;
  19.     
  20. }

一句话总结,对于udp而言,__netif_receive_skb把底层传上来的skb放到sock对应的sk_receive_queue链表中,然后唤醒等待数据的进程。


ARP报文处理:

从netif_receive_skb()函数中可以看出,它处理的是ARP、IP这些链路层以上的协议。那么,链路层报头是在哪里去掉的呢?答案是在网卡驱动中,在调用netif_receive_skb()之前会执行:

skb->protocol = eth_type_trans(skb, bp->dev);

该函数处理后,skb->data跳过以太网报头,由mac_header指示以太网报头的位置。

进入netif_receive_skb()函数

list_for_each_entry_rcu(ptype,&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list)

按照协议类型依次由相应的协议模块进行处理,而所有的协议模块都会注册在ptype_base中,它实际是一个按协议类型hash的链表数组。

net/core/dev.c

static struct list_head ptype_base __read_mostly;   /* Taps */

 

而相应的协议模块是通过dev_add_pack()函数加入的:

void dev_add_pack(struct packet_type *pt)

{

     int hash;

 

     spin_lock_bh(&ptype_lock);

     if (pt->type == htons(ETH_P_ALL))

              list_add_rcu(&pt->list, &ptype_all);

     else {

              hash = ntohs(pt->type) & PTYPE_HASH_MASK;

              list_add_rcu(&pt->list, &ptype_base[hash]);

     }

     spin_unlock_bh(&ptype_lock);

}

 

以ARP处理为例:

该模块的定义,它会在arp_init()中注册进ptype_base链表中:

/* Packet handler for ARP: frames of type ETH_P_ARP are passed to arp_rcv(). */
static struct packet_type arp_packet_type __read_mostly = {
    .type = cpu_to_be16(ETH_P_ARP),
    .func = arp_rcv,
};

 

然后,在根据报文的TYPE在ptype_base中查找相应协议模块进行处理时,实际调用的是arp_rcv():

arp_rcv() --> arp_process()

arp = arp_hdr(skb);

……

arp_ptr= (unsigned char *)(arp+1);

sha= arp_ptr;

arp_ptr += dev->addr_len;

memcpy(&sip, arp_ptr, 4);

arp_ptr += 4;

arp_ptr += dev->addr_len;

memcpy(&tip, arp_ptr, 4);

操作后各指针的位置(原文此处为示意图):sha指向发送方硬件地址,sip保存发送方IP地址;arp_ptr再跳过目的硬件地址后指向目的IP地址,tip保存目的IP地址。

然后判断是ARP请求报文,这时先查询路由表ip_route_input()

if (arp->ar_op == htons(ARPOP_REQUEST) &&

         ip_route_input(skb, tip, sip, 0, dev) == 0)

ip_route_input()函数中,先在cache中查询是否存在相应的路由表项:

hash = rt_hash(daddr, saddr, iif, rt_genid(net));

缓存的路由项在内核中组织成hash表的形式,因此在查询时,先算出hash值,再遍历该hash值对应的链表rt_hash_table[hash].chain即可。这里可以看到,计算hash时用到了目的IP地址、源IP地址和入口网卡号。

 

如果在缓存中没有查到匹配项,或指定不查询cache,则查询路由表ip_route_input_slow()

进入ip_route_input_slow()函数,最终调用fib_lookup()得到查询结果fib_result

if ((err = fib_lookup(net, &fl, &res)) != 0)

如果结果fib_result合法,则需要更新路由缓存,将此次查询结果写入缓存

hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));

err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);

 

在查找完路由表后,回到arp_process()函数,如果路由项指向本地,则应由本机接收该报文:

if (addr_type == RTN_LOCAL) {

              ……

              if (!dont_send) {

                       n = neigh_event_ns(&arp_tbl, sha, &sip, dev);

                       if (n) {

                                 arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);

                                 neigh_release(n);

                       }

              }

              goto out;

     }

首先通过neigh_event_ns()更新邻居表,然后通过arp_send()发送ARP响应。

至此,大致的ARP流程完成。由于ARP部分涉及到路由表以及邻居表,这都是很大的概念。