Linux内核分析

来源：互联网发布：png格式打开软件编辑：程序博客网时间：2024/06/14 16:39

网络收包流程从网卡驱动开始，一直往上，涉及NAPI、GRO、RPS等特性，但是一般最后都会调用__netif_receive_skb函数：

函数主要有几个处理：

1、vlan报文的处理，主要是循环把vlan头剥掉，如果qinq场景，两个vlan都会被剥掉；

2、交给rx_handler处理，例如OVS、linux bridge等；

3、ptype_all处理，例如抓包程序、raw socket等；

4、ptype_base处理，交给协议栈处理，例如ip、arp、rarp等；

/*
 * __netif_receive_skb - hand one received skb to every interested consumer.
 *
 * In order, the skb is offered to: the ptype_all handlers, the ingress
 * classifier (CONFIG_NET_CLS_ACT only), the device's rx_handler, VLAN
 * receive processing, and finally the ptype_base handlers whose type
 * matches skb->protocol.
 *
 * Returns the result of the most recent delivery. NET_RX_DROP is returned
 * when netpoll_receive_skb() reports non-zero, and when no handler matched
 * at all (in that case the skb is freed here).
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
    struct packet_type *ptype, *pt_prev;
    rx_handler_func_t *rx_handler;
    struct net_device *orig_dev;
    struct net_device *null_or_dev;
    bool deliver_exact = false;
    int ret = NET_RX_DROP;
    __be16 type;

    if (!netdev_tstamp_prequeue)
        net_timestamp_check(skb);

    trace_netif_receive_skb(skb);

    if (netpoll_receive_skb(skb))
        return NET_RX_DROP;

    /* Record the ingress interface only once; keep the original device. */
    if (!skb->skb_iif)
        skb->skb_iif = skb->dev->ifindex;
    orig_dev = skb->dev;

    /*
     * Point both the L3 and L4 header offsets at skb->data; by the time we
     * get here the L2 header has already been processed.
     */
    skb_reset_network_header(skb);
    skb_reset_transport_header(skb);
    skb_reset_mac_len(skb);

    pt_prev = NULL;

    rcu_read_lock();

    /* Re-entered when the rx_handler returns RX_HANDLER_ANOTHER. */
another_round:

    __this_cpu_inc(softnet_data.processed);

    /* 802.1Q frame: untag it; a NULL result means nothing is left to deliver. */
    if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
        skb = vlan_untag(skb);
        if (unlikely(!skb))
            goto out;
    }

#ifdef CONFIG_NET_CLS_ACT
    /* Flagged TC_NCLS: clear the flag and skip the taps and the classifier. */
    if (skb->tc_verd & TC_NCLS) {
        skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
        goto ncls;
    }
#endif

    /*
     * Before handing the packet to protocol-specific handlers, call the
     * handlers registered on ptype_all. The most common user is tcpdump,
     * which obtains every received packet from here.
     *
     * pt_prev is an optimisation: the callback of a match is only invoked
     * once the next match has been found, so the last match stays pending.
     */
    list_for_each_entry_rcu(ptype, &ptype_all, list) {
        if (!ptype->dev || ptype->dev == skb->dev) {
            if (pt_prev)
                ret = deliver_skb(skb, pt_prev, orig_dev);
            pt_prev = ptype;
        }
    }

#ifdef CONFIG_NET_CLS_ACT
    skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
    if (!skb)
        goto out;
ncls:
#endif
    /* Per-device hook; which handler (if any) runs depends on the device. */
    rx_handler = rcu_dereference(skb->dev->rx_handler);
    if (rx_handler) {
        /* Flush the pending handler before the rx_handler may take the skb. */
        if (pt_prev) {
            ret = deliver_skb(skb, pt_prev, orig_dev);
            pt_prev = NULL;
        }
        switch (rx_handler(&skb)) {
        case RX_HANDLER_CONSUMED:
            goto out;
        case RX_HANDLER_ANOTHER:
            goto another_round;
        case RX_HANDLER_EXACT:
            deliver_exact = true;
            /* fall through */
        case RX_HANDLER_PASS:
            break;
        default:
            BUG();
        }
    }

    if (vlan_tx_tag_present(skb)) {
        if (pt_prev) {
            ret = deliver_skb(skb, pt_prev, orig_dev);
            pt_prev = NULL;
        }
        /*
         * A true return restarts the whole receive path on the skb via
         * recursion; otherwise a NULL skb means there is nothing to deliver.
         */
        if (vlan_do_receive(&skb)) {
            ret = __netif_receive_skb(skb);
            goto out;
        } else if (unlikely(!skb))
            goto out;
    }

    /* deliver only exact match when indicated */
    null_or_dev = deliver_exact ? skb->dev : NULL;

    /* Walk the hash bucket for this protocol type. */
    type = skb->protocol;
    list_for_each_entry_rcu(ptype,
            &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
        if (ptype->type == type &&
            (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
             ptype->dev == orig_dev)) {
            /*
             * Author's note: deliver_skb() takes an extra reference,
             * i.e. atomic_inc(&skb->users).
             */
            if (pt_prev)
                ret = deliver_skb(skb, pt_prev, orig_dev);
            pt_prev = ptype;
        }
    }

    if (pt_prev) {
        /*
         * The last pending handler is called directly, normally without
         * the reference-count increment done for the earlier ones.
         */
        ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
    } else {
        atomic_long_inc(&skb->dev->rx_dropped);
        kfree_skb(skb);
        /* Jamal, now you will not able to escape explaining
         * me how you were going to use this. :-)
         */
        ret = NET_RX_DROP;
    }

out:
    rcu_read_unlock();
    return ret;
}

该函数涉及两个全局变量：

/* Protocol handlers, bucketed by ntohs(type) & PTYPE_HASH_MASK. */
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
/* Handlers that are offered every packet, walked before ptype_base. */
static struct list_head ptype_all __read_mostly;

看几个常见的packet_type，这些都在相应的协议初始化的时候调用dev_add_pack加入到特定的链表中：

/* IPv4 packet handler: receive entry point plus its GSO and GRO callbacks. */
static struct packet_type ip_packet_type __read_mostly = {
    /* receive path */
    .func = ip_rcv,
    .type = cpu_to_be16(ETH_P_IP),

    /* segmentation offload callbacks */
    .gso_segment = inet_gso_segment,
    .gso_send_check = inet_gso_send_check,

    /* receive offload callbacks */
    .gro_complete = inet_gro_complete,
    .gro_receive = inet_gro_receive,
};

/* ARP packet handler: frames of type ETH_P_ARP are passed to arp_rcv(). */
static struct packet_type arp_packet_type __read_mostly = {
    .type = cpu_to_be16(ETH_P_ARP),
    .func = arp_rcv,
};

在ip_rcv函数中会对L3头做一些有效性检测:

/*
 * ip_rcv - IPv4 entry point called from the packet_type dispatch.
 *
 * Validates the IPv4 header (length, version, checksum, total length),
 * trims link-layer padding, then passes the skb through the
 * NF_INET_PRE_ROUTING hook with ip_rcv_finish() as the continuation.
 *
 * Returns the NF_HOOK() result on success, NET_RX_DROP on every error path.
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
    const struct iphdr *iph;
    u32 len;

    /* When the interface is in promisc. mode, drop all the crap
     * that it receives, do not try to analyse it.
     */
    /*
     * pkt_type is set by the driver from the destination MAC address;
     * frames not addressed to this host are dropped here.
     */
    if (skb->pkt_type == PACKET_OTHERHOST)
        goto drop;


    IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);

    /* On failure skb is NULL, so jump past kfree_skb() straight to "out". */
    if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
        IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
        goto out;
    }

    if (!pskb_may_pull(skb, sizeof(struct iphdr)))
        goto inhdr_error;

    iph = ip_hdr(skb);

    /*
     * RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
     *
     * Is the datagram acceptable?
     *
     * 1. Length at least the size of an ip header
     * 2. Version of 4
     * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
     * 4. Doesn't have a bogus length
     */

    if (iph->ihl < 5 || iph->version != 4)
        goto inhdr_error;

    if (!pskb_may_pull(skb, iph->ihl*4))
        goto inhdr_error;

    /* Re-read the header pointer after the second pull. */
    iph = ip_hdr(skb);

    /* Verify the IP header checksum. */
    if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
        goto inhdr_error;
    /*
     * tot_len in the IP header is the real datagram size. skb->len is set
     * by the driver and includes padding for very small frames, so it can
     * be larger than tot_len.
     *
     * Author's example (r8169): a UDP packet with a 1-byte payload has
     * tot_len = 20 + 8 + 1 = 29, while skb->len = 46 = 64(min) - 14 - 4.
     * NOTE(review): the author labels the 4 bytes "vlan"; they look more
     * like the Ethernet FCS - confirm.
     */
    len = ntohs(iph->tot_len);
    if (skb->len < len) {
        IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
        goto drop;
    } else if (len < (iph->ihl*4))
        goto inhdr_error;

    /* Our transport medium may have padded the buffer out. Now we know it
     * is IP we can trim to the true length of the frame.
     * Note this now means skb->len holds ntohs(iph->tot_len).
     */
    /* Remove the padding bytes. */
    if (pskb_trim_rcsum(skb, len)) {
        IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
        goto drop;
    }

    /* Remove any debris in the socket control block */
    memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));

    /* Must drop socket now because of tproxy. */
    skb_orphan(skb);

    return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,
               ip_rcv_finish);

inhdr_error:
    IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
drop:
    kfree_skb(skb);
out:
    return NET_RX_DROP;
}

然后调用ip_rcv_finish：

/*
 * ip_rcv_finish - continuation of ip_rcv() after the PRE_ROUTING hook.
 *
 * Attaches a route to the skb if it has none, processes IP options when
 * the header is longer than 20 bytes, updates multicast/broadcast
 * counters and hands the skb to dst_input().
 *
 * Returns the dst_input() result, or NET_RX_DROP after freeing the skb.
 */
static int ip_rcv_finish(struct sk_buff *skb)
{
    const struct iphdr *iph = ip_hdr(skb);
    struct rtable *rt;

    /* 
     * Initialise the virtual path cache for the packet. It describes
     * how the packet travels inside Linux networking.
     */
    if (skb_dst(skb) == NULL) {
        /*
         * Route lookup: based on the destination address, decide between
         * local delivery and forwarding (when forwarding is enabled).
         */
        int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
                           iph->tos, skb->dev);
        if (unlikely(err)) {
            /* Account the failure by cause before dropping. */
            if (err == -EHOSTUNREACH)
                IP_INC_STATS_BH(dev_net(skb->dev),
                        IPSTATS_MIB_INADDRERRORS);
            else if (err == -ENETUNREACH)
                IP_INC_STATS_BH(dev_net(skb->dev),
                        IPSTATS_MIB_INNOROUTES);
            else if (err == -EXDEV)
                NET_INC_STATS_BH(dev_net(skb->dev),
                         LINUX_MIB_IPRPFILTER);
            goto drop;
        }
    } 

#ifdef CONFIG_IP_ROUTE_CLASSID
    /* Per-class accounting: low byte and bits 16-23 of tclassid index the table. */
    if (unlikely(skb_dst(skb)->tclassid)) {
        struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
        u32 idx = skb_dst(skb)->tclassid;
        st[idx&0xFF].o_packets++;
        st[idx&0xFF].o_bytes += skb->len;
        st[(idx>>16)&0xFF].i_packets++;
        st[(idx>>16)&0xFF].i_bytes += skb->len;
    } 
#endif

    /* ihl > 5 means the header carries options. */
    if (iph->ihl > 5 && ip_rcv_options(skb))
        goto drop;

    rt = skb_rtable(skb);
    if (rt->rt_type == RTN_MULTICAST) {
        IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
                skb->len);
    } else if (rt->rt_type == RTN_BROADCAST)
        IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
                skb->len);

    /*
     * Author's note: dst_input() is skb_dst(skb)->input(skb), which is
     * assigned during the route lookup; for local delivery it is
     * ip_local_deliver.
     */
    return dst_input(skb);

drop:
    kfree_skb(skb);
    return NET_RX_DROP;
}

int ip_local_deliver(struct sk_buff *skb)
{
    /*
     * Reassemble IP fragments.
     */

    if (ip_is_fragment(ip_hdr(skb))) {
        if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
            return 0;
    }

    return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
               ip_local_deliver_finish);
}

略过ip defrag流程，直接调用ip_local_deliver_finish,该函数根据L3头指定的L4协议，调用特定的函数：

/*
 * ip_local_deliver_finish - dispatch a locally delivered IPv4 packet to
 * the handler registered for its L4 protocol.
 *
 * Always returns 0. The skb is either passed to the protocol handler or
 * freed here.
 */
static int ip_local_deliver_finish(struct sk_buff *skb)
{
    struct net *net = dev_net(skb->dev);

    /* Advance data past the L3 header; data now points at the L4 header. */
    __skb_pull(skb, ip_hdrlen(skb));

    /* Point into the IP datagram, just past the header. */
    skb_reset_transport_header(skb);

    rcu_read_lock();
    {
        /* L4 protocol number from the IP header, e.g. TCP or UDP. */
        int protocol = ip_hdr(skb)->protocol;
        int hash, raw;
        const struct net_protocol *ipprot;

    resubmit:
        /* "raw" is only consulted below when no protocol handler exists. */
        raw = raw_local_deliver(skb, protocol);

        hash = protocol & (MAX_INET_PROTOS - 1);
        /* For UDP this entry is udp_protocol (author's note). */
        ipprot = rcu_dereference(inet_protos[hash]);
        if (ipprot != NULL) {
            int ret;

            /* Outside init_net, only handlers marked netns_ok may run. */
            if (!net_eq(net, &init_net) && !ipprot->netns_ok) {
                if (net_ratelimit())
                    printk("%s: proto %d isn't netns-ready\n",
                        __func__, protocol);
                kfree_skb(skb);
                goto out;
            }

            if (!ipprot->no_policy) {
                if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                    kfree_skb(skb);
                    goto out;
                }
                nf_reset(skb);
            }
            /* For UDP the handler is udp_rcv (author's note). */
            ret = ipprot->handler(skb);
            /* A negative return asks for resubmission as protocol -ret. */
            if (ret < 0) {
                protocol = -ret;
                goto resubmit;
            }
            IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
        } else {
            if (!raw) {
                /* Nobody took it: report "protocol unreachable". */
                if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                    IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);
                    icmp_send(skb, ICMP_DEST_UNREACH,
                          ICMP_PROT_UNREACH, 0);
                }
            } else
                IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
            kfree_skb(skb);
        }
    }
 out:
    rcu_read_unlock();

    return 0;
}

udp调用udp_rcv，最后调用__udp4_lib_rcv：

int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
           int proto)
{
    struct sock *sk; 
    struct udphdr *uh; 
    unsigned short ulen;
    struct rtable *rt = skb_rtable(skb);
    __be32 saddr, daddr;
    struct net *net = dev_net(skb->dev);

    /* 
     * Validate the packet.
     */
    if (!pskb_may_pull(skb, sizeof(struct udphdr)))
        goto drop; /* No space for header. */

    uh = udp_hdr(skb);
    ulen = ntohs(uh->len);
    saddr = ip_hdr(skb)->saddr;
    daddr = ip_hdr(skb)->daddr;

    if (ulen > skb->len)
        goto short_packet;

    if (proto == IPPROTO_UDP) {
        /* UDP validates ulen. */
        if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
            goto short_packet;
        uh = udp_hdr(skb);
    } 

    if (udp4_csum_init(skb, uh, proto))
        goto csum_error;

    if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
        return __udp4_lib_mcast_deliver(net, skb, uh,
                saddr, daddr, udptable);

    sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); //根据ip地址以及端口号查找对应的sock数据结构
                                                                     //接收进程在对应的链表中睡眠
    if (sk != NULL) { //不为空说明有对应的进程在等待这数据
        int ret = udp_queue_rcv_skb(sk, skb);
        sock_put(sk);

        /* a return value > 0 means to resubmit the input, but
         * it wants the return to be -protocol, or 0
         */
        if (ret > 0)
            return -ret;
        return 0;
    }

    if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
        goto drop;
    nf_reset(skb);

    /* No socket. Drop packet silently, if checksum is wrong */
    if (udp_lib_checksum_complete(skb))
        goto csum_error;

    UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
    icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

    /*
     * Hmm. We got an UDP packet to a port to which we
     * don't wanna listen.  Ignore it.
     */
    kfree_skb(skb);
    return 0;
}     

首先看一下sock的hash查找函数：__udp4_lib_lookup_skb，该函数涉及hash表的一些查找，主要看一下具体的匹配函数：

/*
 * compute_score - rate how well a UDP socket matches an incoming datagram.
 *
 * Returns -1 when the socket cannot receive the datagram: wrong netns,
 * wrong hashed port, IPv6-only socket, or any bound attribute that does
 * not match. Otherwise returns a non-negative score: 1 for a PF_INET
 * socket (0 otherwise), plus 2 for each bound attribute that matches
 * (receive address, peer address, peer port, bound device).
 */
static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
             unsigned short hnum,
             __be16 sport, __be32 daddr, __be16 dport, int dif)
{
    struct inet_sock *inet;
    int score;

    /* Basic eligibility: same netns, same hashed port, not IPv6-only. */
    if (!net_eq(sock_net(sk), net) ||
        udp_sk(sk)->udp_port_hash != hnum ||
        ipv6_only_sock(sk))
        return -1;

    inet = inet_sk(sk);

    /* Usually PF_INET (author's note). */
    score = (sk->sk_family == PF_INET) ? 1 : 0;

    /* Set when bind() named an address; INADDR_ANY (0) otherwise. */
    if (inet->inet_rcv_saddr) {
        if (inet->inet_rcv_saddr != daddr)
            return -1;
        score += 2;
    }

    /* Usually 0; see inet_bind (author's note). */
    if (inet->inet_daddr) {
        if (inet->inet_daddr != saddr)
            return -1;
        score += 2;
    }

    /* Usually 0 (author's note). */
    if (inet->inet_dport) {
        if (inet->inet_dport != sport)
            return -1;
        score += 2;
    }

    /* Usually 0 (author's note). */
    if (sk->sk_bound_dev_if) {
        if (sk->sk_bound_dev_if != dif)
            return -1;
        score += 2;
    }

    return score;
}

该函数使用端口号寻找hash表中项，然后根据各个参数决定score，score大于-1表示找到对应的sock
找到sock后，去掉一些有效性检测，udp_queue_rcv_skb的逻辑如下：

/*
 * Excerpt from udp_queue_rcv_skb(): queue the skb directly when no user
 * context owns the socket, otherwise park it on the backlog.
 */
if (sk_rcvqueues_full(sk, skb))  /* over the limit, see sk->sk_rmem_alloc */
    goto drop;

rc = 0;

bh_lock_sock(sk);
if (!sock_owned_by_user(sk))
    rc = __udp_queue_rcv_skb(sk, skb);
else if (sk_add_backlog(sk, skb)) {
    /* Backlog refused the skb: release the lock before dropping. */
    bh_unlock_sock(sk);
    goto drop;
}
bh_unlock_sock(sk);

分成两种情况：
1）sk没有被人占用，则把skb加入sk_receive_queue，然后唤醒等待的进程。
2）如果sk被人占用，则把skb加入backlog链表，释放sk的时候会处理这种流程
先看第一种情况：

/*
 * sock_queue_rcv_skb - append an skb to a socket's receive queue and
 * notify the socket that data is ready.
 *
 * Returns 0 on success, -ENOMEM when the receive buffer limit would be
 * reached, -ENOBUFS when sk_rmem_schedule() fails, or the error returned
 * by sk_filter(). The skb is not freed here on failure.
 */
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
    int err;
    int skb_len;
    unsigned long flags;
    /* Head of the socket's receive queue. */
    struct sk_buff_head *list = &sk->sk_receive_queue;

    /* Cast sk->rcvbuf to unsigned... It is pointless, but reduces
       number of warnings when compiling with -W --ANK
     */
    if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
        (unsigned)sk->sk_rcvbuf) {
        atomic_inc(&sk->sk_drops);
        trace_sock_rcvqueue_full(sk, skb);
        return -ENOMEM;
    }

    err = sk_filter(sk, skb);
    if (err)
        return err;

    if (!sk_rmem_schedule(sk, skb->truesize)) {
        atomic_inc(&sk->sk_drops);
        return -ENOBUFS;
    }

    /* The skb now belongs to the socket rather than to a device. */
    skb->dev = NULL;
    skb_set_owner_r(skb, sk);

    /* Cache the SKB length before we tack it onto the receive
     * queue. Once it is added it no longer belongs to us and
     * may be freed by other threads of control pulling packets
     * from the queue.
     */
    skb_len = skb->len;

    /* we escape from rcu protected region, make sure we dont leak
     * a norefcounted dst
     */
    skb_dst_force(skb);
    spin_lock_irqsave(&list->lock, flags);
    /* Snapshot the drop counter into the skb while holding the queue lock. */
    skb->dropcount = atomic_read(&sk->sk_drops);
    __skb_queue_tail(list, skb);
    spin_unlock_irqrestore(&list->lock, flags);

    /*
     * Author's note: sk_data_ready is set to sock_def_readable by
     * sock_init_data() during initialisation.
     */
    if (!sock_flag(sk, SOCK_DEAD))
        sk->sk_data_ready(sk, skb_len);
    return 0;
}

/*
 * sock_def_readable - "data ready" callback: wake sleepers on the
 * socket's wait queue and send the async notification.
 *
 * The len argument is not used in this body.
 */
static void sock_def_readable(struct sock *sk, int len)
{
    struct socket_wq *wq;

    /* sk->sk_wq is dereferenced under RCU. */
    rcu_read_lock();
    wq = rcu_dereference(sk->sk_wq);
    /* Only issue the wake-up when someone is actually waiting. */
    if (wq_has_sleeper(wq))
        wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
                        POLLRDNORM | POLLRDBAND);
    sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
    rcu_read_unlock();
}

再看第二种情况，加入到对应的链表：

/*
 * __sk_add_backlog - append an skb to the tail of the socket's backlog,
 * a singly linked list tracked by sk->sk_backlog.head and .tail.
 */
static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
{
    /* dont let skb dst not refcounted, we are going to leave rcu lock */
    skb_dst_force(skb);

    /* The new element is always the last one. */
    skb->next = NULL;

    if (sk->sk_backlog.tail)
        sk->sk_backlog.tail->next = skb;    /* link after the current tail */
    else
        sk->sk_backlog.head = skb;          /* empty list: also the head */

    sk->sk_backlog.tail = skb;
}

释放sock的时候会判断该链表：

/*
 * release_sock - give up user-context ownership of a socket.
 *
 * With the socket spinlock held, processes any skbs that were put on the
 * backlog while the socket was owned, clears the owned flag and wakes
 * tasks waiting on the socket lock.
 */
void release_sock(struct sock *sk)
{
    /*
     * The sk_lock has mutex_unlock() semantics:
     */
    mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);

    spin_lock_bh(&sk->sk_lock.slock);
    /* A non-NULL tail means the backlog is not empty. */
    if (sk->sk_backlog.tail)
        __release_sock(sk);
    sk->sk_lock.owned = 0;
    if (waitqueue_active(&sk->sk_lock.wq))
        wake_up(&sk->sk_lock.wq);
    spin_unlock_bh(&sk->sk_lock.slock);
}

__release_sock会遍历tail对应链表上的所有skb，分别调用sk_backlog_rcv函数：

/*
 * __release_sock - drain the socket backlog.
 *
 * Called with the socket spinlock held (see the __releases/__acquires
 * annotations). Each pass detaches the whole backlog, drops the lock,
 * feeds every skb to sk_backlog_rcv() and retakes the lock. The outer
 * loop repeats while new skbs were queued in the meantime.
 */
static void __release_sock(struct sock *sk)
    __releases(&sk->sk_lock.slock)
    __acquires(&sk->sk_lock.slock)
{
    struct sk_buff *skb = sk->sk_backlog.head;

    do {
        /* Take the current list private before dropping the lock. */
        sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
        bh_unlock_sock(sk);

        do {
            /* Save the successor first: skb->next is cleared below. */
            struct sk_buff *next = skb->next;

            WARN_ON_ONCE(skb_dst_is_noref(skb));
            skb->next = NULL;
            /*
             * Author's note: sk_backlog_rcv() is
             * sk->sk_backlog_rcv(sk, skb), i.e. sk->sk_prot->backlog_rcv.
             */
            sk_backlog_rcv(sk, skb);
    
            /*
             * We are in process context here with softirqs
             * disabled, use cond_resched_softirq() to preempt.
             * This is safe to do because we've taken the backlog
             * queue private:
             */
            cond_resched_softirq();

            skb = next;
        } while (skb != NULL);

        bh_lock_sock(sk);
    } while ((skb = sk->sk_backlog.head) != NULL);
    
    /*
     * Doing the zeroing here guarantee we can not loop forever
     * while a wild producer attempts to flood us.
     */
    sk->sk_backlog.len = 0;
}

对于udp为__udp_queue_rcv_skb：

static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
    int rc;

    if (inet_sk(sk)->inet_daddr)
        sock_rps_save_rxhash(sk, skb->rxhash);

    rc = ip_queue_rcv_skb(sk, skb); //调用sock_queue_rcv_skb，回到第一种处理情况
    if (rc < 0) {
        int is_udplite = IS_UDPLITE(sk);

        /* Note that an ENOMEM error is charged twice */
        if (rc == -ENOMEM)
            UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
                     is_udplite);
        UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
        kfree_skb(skb);
        trace_udp_fail_queue_rcv_skb(rc, sk);
        return -1;
    }

    return 0;
    
}

一句话总结，对于udp而言，__netif_receive_skb把底层传上来的skb放到sock对应的sk_receive_queue链表中，然后唤醒等待数据的进程。

ARP报文处理：

在netif_receive_skb()函数中，可以看出处理的是像ARP、IP这些链路层以上的协议，那么，链路层报头是在哪里去掉的呢？答案是网卡驱动中，在调用netif_receive_skb()前，

skb->protocol = eth_type_trans(skb, bp->dev);

该函数处理后，skb->data跳过以太网报头，以太网报头的位置由mac_header指示：

进入netif_receive_skb()函数

list_for_each_entry_rcu(ptype,&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list)

按照协议类型依次由相应的协议模块进行处理，而所有的协议模块都会注册在ptype_base中，ptype_base实际是以协议类型做哈希的链表数组。

net/core/dev.c

static struct list_head ptype_base __read_mostly; /* Taps */

而相应的协议模块是通过dev_add_pack()函数加入的：

/*
 * dev_add_pack - register a packet handler.
 *
 * Handlers of type ETH_P_ALL go onto the ptype_all list. Every other
 * handler goes into the ptype_base bucket selected by its type. The
 * insertion is done under ptype_lock.
 */
void dev_add_pack(struct packet_type *pt)
{
    struct list_head *head;

    spin_lock_bh(&ptype_lock);

    if (pt->type == htons(ETH_P_ALL))
        head = &ptype_all;
    else
        head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];

    list_add_rcu(&pt->list, head);

    spin_unlock_bh(&ptype_lock);
}

以ARP处理为例

该模块的定义，它会在arp_init()中注册进ptype_base链表中：

/* ARP packet handler: frames of type ETH_P_ARP are passed to arp_rcv(). */
static struct packet_type arp_packet_type __read_mostly = {
    .func = arp_rcv,
    .type = cpu_to_be16(ETH_P_ARP),
};

之后，在根据报文的TYPE到ptype_base中查找相应协议模块进行处理时，实际调用的是arp_rcv()：

arp_rcv() --> arp_process()

arp = arp_hdr(skb);

……

arp_ptr= (unsigned char *)(arp+1);

sha= arp_ptr;

arp_ptr += dev->addr_len;

memcpy(&sip, arp_ptr, 4);

arp_ptr += 4;

arp_ptr += dev->addr_len;

memcpy(&tip, arp_ptr, 4);

操作后这指针位置：

然后判断是ARP请求报文，这时先查询路由表ip_route_input()

if (arp->ar_op == htons(ARPOP_REQUEST) &&

ip_route_input(skb, tip, sip, 0, dev) == 0)

在ip_route_input()函数中，先在cache中查询是否存在相应的路由表项：

hash = rt_hash(daddr, saddr, iif, rt_genid(net));

缓存的路由项在内核中组织成hash表的形式，因此在查询时，先算出hash值，再遍历该值对应的链表rt_hash_table[hash].chain即可。这里可以看到，缓存路由项的查找键包括了源IP地址、目的IP地址、网卡号。

如果在缓存中没有查到匹配项，或指定不查询cache，则查询路由表ip_route_input_slow()；

进入ip_route_input_slow()函数，最终调用fib_lookup()得到查询结果fib_result

if ((err = fib_lookup(net, &fl, &res)) != 0)

如果结果fib_result合法，则需要更新路由缓存，将此次查询结果写入缓存

hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));

err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);

在查找完路由表后，回到arp_process()函数，如果路由项指向本地，则应由本机接收该报文：

/*
 * Excerpt from arp_process(): an ARP request whose target address is
 * local. The neighbour entry for the sender is updated and, when found,
 * an ARP reply is sent back.
 */
if (addr_type == RTN_LOCAL) {

    /* …… (code elided in the article) */

    if (!dont_send) {
        n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
        if (n) {
            arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
            neigh_release(n);
        }
    }

    /* Local target handled, whether or not a reply was sent. */
    goto out;
}

首先更新邻居表neigh_event_ns()，然后发送ARP响应 – arp_send。

至此，大致的ARP流程完成。由于ARP部分涉及到路由表以及邻居表，这都是很大的概念。

阅读全文

0 0