ip_route_output_slow() ip_route_input()与linux的tunnel技术实现 ipip_tunnel_lookup

来源：互联网发布：淘宝店铺的运营方案编辑：程序博客网时间：2024/06/05 19:04

Linux-2.6.21.1 网络函数调用流程

接收以太帧:
netif_rx
-> queue
-> netif_receive_skb
-> bond
-> packet_type_all: deliver_skb
-> bridge
-> packet_type(IPV4)->func == ip_rcv

接收IPv4包:

ip_rcv
-> NF_HOOK(PREROUTING)
    ->ip_rcv_finish
      -> ip_route_input
        -> ip_route_input_cached
          -> ip_route_input_slow
            -> ip_mkroute_input
              -> __mkroute_input
                dst->input = ip_forward
                dst->output = ip_output
      -> dst_input
        -> LOCAL_IN: dst->input == ip_local_deliver
          -> NF_HOOK(NF_INPUT)
            -> ip_local_deliver_finish
              -> ipprot->handler(tcp, udp, icmp ...)
        -> FORWARD: dst->input == ip_forward

转发:

ip_forward
-> xfrm4_route_forward (net/xfrm.h, get xfrm_dst)
    -> xfrm_route_forward
      -> __xfrm_route_forward
        -> xfrm_lookup
          -> xfrm_find_bundle
            -> afinfo->find_bundle == __xfrm4_find_bundle
          -> xfrm_bundle_create
            -> afinfo->bundle_create == __xfrm4_bundle_create
              tunnel mode
              -> xfrm_dst_lookup
                -> afinfo->dst_lookup == xfrm4_dst_lookup
                  -> __ip_route_output_key
          -> dst_list: dst->list=policy_bundles, policy->bundles = dst
-> NF_HOOK(NF_FORWARD)
-> ip_forward_finish
-> dst_output

输出:

icmp:
icmp_send
-> ip_route_output_key
    -> ip_route_output_flow
-> icmp_push_reply
    -> ip_append_data
-> skb_queue_walk
    -> ip_push_pending_frames

tcp:
tcp_connect
-> ip_route_connect
    -> ip_route_output_flow
tcp_sendmsg
-> __tcp_push_pending_frames
    -> tcp_write_xmit
      -> tcp_transmit_skb
        -> net_xmit_eval
          -> icsk->icsk_af_ops->queue_xmit == ipv4_specific->queue_xmit == ip_queue_xmit
-> tcp_push_one
    -> tcp_transmit_skb
      -> net_xmit_eval
        -> icsk->icsk_af_ops->queue_xmit == ipv4_specific->queue_xmit == ip_queue_xmit

tcp_protocol->handler == tcp_v4_rcv
-> __inet_lookup
-> xfrm_policy_check
-> tcp_v4_do_rcv
    -> tcp_rcv_state_process
      -> icsk->icsk_af_ops->conn_request == tcp_v4_conn_request
        -> tcp_v4_send_synack
          -> ip_build_and_send_pkt
            -> NF_HOOK( NF_OUTPUT )
              -> dst_output

udp:
udp_sendmsg
-> ip_route_output_flow
-> ip_append_data
-> __skb_queue_tail( sk_write_queue )
-> udp_push_pending_frames
-> ip_push_pending_frames

raw:
raw_sendmsg
-> ip_route_output_flow
-> ip_append_data
-> __skb_queue_tail( sk_write_queue )
-> ip_push_pending_frames

ip_push_pending_frames
-> __skb_dequeue(sk_write_queue)
-> NF_HOOK(NF_OUTPUT)
-> dst_output

ip_queue_xmit
-> ip_route_output_flow
    -> xfrm_lookup
      -> xfrm_find_bundle
        -> bundle_create
          -> afinfo->bundle_create == __xfrm4_bundle_create
            -> xfrm_dst_lookup
              -> afinfo->dst_lookup == xfrm4_dst_lookup
                -> __ip_route_output_key
        -> dst_list
        -> dst->list=policy_bundles, policy->bundles = dst

-> NF_HOOK(NF_OUTPUT)
-> dst_output
-> dst->output

dst_output: dst_list循环
-> dst->output == xfrm_dst->output == xfrm4_output == xfrm4_state_afinfo->output
    -> NF_HOOK(POSTROUTING)
      -> xfrm4_output_finish
        -> gso ?
        -> xfrm4_output_finish2
          -> xfrm4_output_one
            -> mode->output
            -> type->output
            -> skb->dst=dst_pop(skb->dst)
          -> nf_hook(NF_OUTPUT)
            -> !dst->xfrm
              -> dst_output
          -> nf_hook(POSTROUTING)
-> dst->output == ip_output
    -> NF_HOOK(POSTROUTING)
      -> ip_finish_output
        -> ip_finish_output2
          -> hh_output == dev_queue_xmit

有段日子没写了，今天继续，想了一下，觉得先温习一下tunnel技术。

（一）tunnel即隧道，被用于在公网内传输私网数据，也就是VPN。实现类似于我们学习的数据结构中的栈，把数据报文封装在新的报文中，通过第三方协议(比如IP协议)传输到对端，对端进行解封，重新路由。

Linux内核支持IPIP/GRE隧道协议（不考虑IPv6）。tunnel4.c是一个框架程序，相当于容器，ipip是它肚子里的实体。我觉得没有必要这么写，因为ip_gre.c的实现就不是这样的。

IPIP是最简单的实现隧道功能的协议，只支持承载IP报文，所以在应用上也就有了局限性，比如无法实现ARP代理，但对于分析设计思路还是非常好的，简单的东西往往更具有代表性，复杂的东西简单化么，我不是博士，所以没有能力把这么简单的东西说的谁也看不懂。

（二）在ipip中，首先要理解的是初始化过程

/*
 * ipip_init - module initialisation for the IPIP tunnel driver.
 *
 * Prints the banner, registers ipip_handler through
 * xfrm4_tunnel_register(), then allocates and registers the "tunl0"
 * net device.
 *
 * Returns -EAGAIN when the handler cannot be registered, -ENOMEM when
 * the device cannot be allocated, and otherwise whatever
 * register_netdev() returned.
 */
static int __init ipip_init(void)
{
	int err;

	printk(banner);

	/* Add the receive handler to the tunnel framework. */
	if (xfrm4_tunnel_register(&ipip_handler, AF_INET)) {
		printk(KERN_INFO "ipip init: can't register tunnel\n");
		return -EAGAIN;
	}

	/*
	 * Create the virtual interface.  This matters: once the tunnel is
	 * configured, a packet whose destination is a private address is
	 * routed to this virtual interface, so it gets encapsulated by
	 * ipip_tunnel_xmit(), is routed again to find the physical
	 * interface for the public address, and is sent through that
	 * physical interface to the peer gateway, i.e. the other end of
	 * the tunnel.
	 */
	ipip_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
					  "tunl0",
					  ipip_tunnel_setup);
	if (!ipip_fb_tunnel_dev) {
		err = -ENOMEM;
		goto err1;
	}

	ipip_fb_tunnel_dev->init = ipip_fb_tunnel_init;

	if ((err = register_netdev(ipip_fb_tunnel_dev)))
		goto err2;
 out:
	return err;
 err2:
	/* Registration failed: release the device allocated above. */
	free_netdev(ipip_fb_tunnel_dev);
 err1:
	/* Undo the handler registration before reporting the error. */
	xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
	goto out;
}

（三）调用顺序上，

报文从以太接口接收，交给二层处理netif_receive_skb；

二层发现是IP报文，交给三层处理ip_rcv；

三层根据IP协议中的协议号，发现是IPPROTO_IPIP报文，交给tunnel4_rcv->ipip_rcv，这才是核心接收处理流程。

staticint ipip_rcv(struct sk_buff*skb) { struct iphdr *iph; struct ip_tunnel *tunnel; iph = skb->nh.iph; read_lock(&ipip_lock);

//由于可能有多个tunnel虚拟接口，先查找到对应的tunnel接口 if ((tunnel= ipip_tunnel_lookup(iph->saddr, iph->daddr))!=NULL){ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)){ read_unlock(&ipip_lock); kfree_skb(skb); return 0; } secpath_reset(skb); skb->mac.raw= skb->nh.raw;//把二层地址指向三层数据的起始地址 skb->nh.raw= skb->data;//三层地址指向数据，也就是其封装的IP报文地址，其实还是三层报文地址，相当于做了还原。 skb->protocol= htons(ETH_P_IP);//修改协议，不然下一次报文仍然会进入这个函数 skb->pkt_type= PACKET_HOST;//本机报文 tunnel->stat.rx_packets++; tunnel->stat.rx_bytes += skb->len; skb->dev = tunnel->dev;//把tunnel接口指向真正的物理接口 dst_release(skb->dst); skb->dst= NULL; nf_reset(skb); ipip_ecn_decapsulate(iph, skb); netif_rx(skb);//把这个报文重新发给二层缓存，重新分发。 read_unlock(&ipip_lock); return 0; } read_unlock(&ipip_lock); return -1; }

如果能读懂上面的程序，也就理解了tunnel技术。而后面的衍生技术GRE/SIT都只不过是其v2.0/v3.0。再往后就可以理解多种VPN实现，L2TP、PPTP、MPLS VPN等技术。

下面的函数是发送给tunnel接口报文的处理，就是recv函数的逆实现。

staticint ipip_tunnel_xmit(struct sk_buff*skb,struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct net_device_stats *stats = &tunnel->stat; struct iphdr *tiph = &tunnel->parms.iph; u8 tos = tunnel->parms.iph.tos; __be16 df = tiph->frag_off; struct rtable *rt; /* Route to the other host */ struct net_device *tdev; /* Device to other host */ struct iphdr *old_iph = skb->nh.iph; struct iphdr *iph; /* Our new IP header */ int max_headroom; /* The extra header space needed */ __be32 dst = tiph->daddr; int mtu; if (tunnel->recursion++){ tunnel->stat.collisions++; goto tx_error; } if (skb->protocol!=htons(ETH_P_IP)) goto tx_error; if (tos&1) tos = old_iph->tos; if (!dst){ /* NBMA tunnel */ if ((rt = (struct rtable*)skb->dst)==NULL){ tunnel->stat.tx_fifo_errors++; goto tx_error; } if ((dst = rt->rt_gateway)== 0) goto tx_error_icmp; } { struct flowi fl = { .oif = tunnel->parms.link, .nl_u = { .ip4_u = { .daddr = dst, .saddr = tiph->saddr, .tos = RT_TOS(tos)} }, .proto = IPPROTO_IPIP }; if (ip_route_output_key(&rt, &fl)) { tunnel->stat.tx_carrier_errors++; goto tx_error_icmp; } } tdev = rt->u.dst.dev; if (tdev== dev){ ip_rt_put(rt); tunnel->stat.collisions++; goto tx_error; } if (tiph->frag_off) mtu = dst_mtu(&rt->u.dst)- sizeof(struct iphdr); else mtu = skb->dst? dst_mtu(skb->dst): dev->mtu; if (mtu< 68){ tunnel->stat.collisions++; ip_rt_put(rt); goto tx_error; } if (skb->dst) skb->dst->ops->update_pmtu(skb->dst, mtu); df |=(old_iph->frag_off&htons(IP_DF)); if ((old_iph->frag_off&htons(IP_DF))&& mtu< ntohs(old_iph->tot_len)){ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,htonl(mtu)); ip_rt_put(rt); goto tx_error; } if (tunnel->err_count> 0){ if (jiffies - tunnel->err_time< IPTUNNEL_ERR_TIMEO){ tunnel->err_count--; dst_link_failure(skb); } else tunnel->err_count= 0; } /* * Okay, now see if we can stuff it in the buffer as-is. 
*/ max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr)); if (skb_headroom(skb)< max_headroom || skb_cloned(skb)|| skb_shared(skb)){ struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb){ ip_rt_put(rt); stats->tx_dropped++; dev_kfree_skb(skb); tunnel->recursion--; return 0; } if (skb->sk) skb_set_owner_w(new_skb, skb->sk); dev_kfree_skb(skb); skb = new_skb; old_iph = skb->nh.iph; } skb->h.raw= skb->nh.raw; skb->nh.raw= skb_push(skb,sizeof(struct iphdr)); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags&=~(IPSKB_XFRM_TUNNEL_SIZE| IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); dst_release(skb->dst); skb->dst= &rt->u.dst; /* * Push down and install the IPIP header. */ iph = skb->nh.iph; iph->version = 4; iph->ihl = sizeof(struct iphdr)>>2; iph->frag_off = df; iph->protocol = IPPROTO_IPIP; iph->tos = INET_ECN_encapsulate(tos, old_iph->tos); iph->daddr = rt->rt_dst; iph->saddr = rt->rt_src; if ((iph->ttl= tiph->ttl)== 0) iph->ttl = old_iph->ttl; nf_reset(skb); IPTUNNEL_XMIT(); tunnel->recursion--; return 0; tx_error_icmp: dst_link_failure(skb); tx_error: stats->tx_errors++; dev_kfree_skb(skb); tunnel->recursion--; return 0; }

0 0