LVS源码分析

来源:互联网 发布:声优赚钱软件 编辑:程序博客网 时间:2024/05/24 05:34

     转载自:http://blog.csdn.net/majieyue/article/details/8574580

      由于要做一个类似LVS的包转发模块,研究了LVS的架构和代码,下面这个系列会做一个总结。首先推荐下这个blog http://yfydz.cublog.cn 里面对LVS, IPSec的讲解非常不错

几个重要的数据结构如下:

ip_vs_conn:一个连接由N元组构成,包括 caddr (客户端地址cip), vaddr (服务虚拟地址vip), daddr (目的realserver地址dip), cport (客户端连接端口), vport (服务虚拟端口), dport (目的realserver端口), protocol (协议)

ip_vs_service:代表一个虚拟服务。LVS中虚拟服务代表一个虚拟IP和端口,作为服务的入口,后面跟着一些realserver,在这些realserver之间做负载平衡。ip_vs_service中包括了protocol, addr, port。struct list_head destinations, __u32 num_dests则代表了后面realserver的链表和个数

ip_vs_dest:代表一个realserver。addr, port, weight分别代表了realserver的ip, port, 权重。struct dst_entry *dst_cache代表了从LVS到realserver的路由缓存项,在我看来这个应该只对NAT, tunnel模式有效。vport, vaddr, protocol代表了虚拟服务地址,端口和协议

ip_vs_scheduler:所有调度器的基类,对ip_vs_service进行调度,其最重要的方法是 struct ip_vs_dest* (*schedule)(struct ip_vs_service *svc, const struct sk_buff* skb),从ip_vs_service下的ip_vs_dest数组中选取一个出来返回


static int __init ip_vs_init(void)用来初始化ipvs.ko,也就是LVS的核心模块:

ip_vs_control_init调用nf_register_sockopt注册struct nf_sockopt_ops结构,ip_vs_genl_register注册struct genl_ops ip_vs_genl_ops[]数组,这是通过netlink进行控制的命令结构。

ip_vs_protocol_init依次注册了ip_vs_protocol_tcp, ip_vs_protocol_udp, ip_vs_protocol_ah, ip_vs_protocol_esp四个协议

ip_vs_conn_init首先调用vmalloc分配一块大的内存(64k)区域用于存放连接的哈希表的key数组,也就是说有4096和list_head。

LVS最后调用nf_register_hooks,向netfilter注册自己的钩子结构。LVS一共有4个钩子(不算IPV6),

static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
    /* After packet filtering, forward packet through VS/DR, VS/TUN,
     * or VS/NAT(change destination), so that filtering rules can be
     * applied to IPVS. */
    {   
        .hook       = ip_vs_in,
        .owner      = THIS_MODULE,
        .pf     = PF_INET,
        .hooknum        = NF_INET_LOCAL_IN,
        .priority       = 100,
    },
    /* After packet filtering, change source only for VS/NAT */
    {
        .hook       = ip_vs_out,
        .owner      = THIS_MODULE,
        .pf     = PF_INET,
        .hooknum        = NF_INET_FORWARD,
        .priority       = 100,
    },
    /* After packet filtering (but before ip_vs_out_icmp), catch icmp
     * destined for 0.0.0.0/0, which is for incoming IPVS connections */
    {
        .hook       = ip_vs_forward_icmp,
        .owner      = THIS_MODULE,
        .pf     = PF_INET,
        .hooknum        = NF_INET_FORWARD,
        .priority       = 99,
    },

    /* Before the netfilter connection tracking, exit from POST_ROUTING */
    {
        .hook       = ip_vs_post_routing,
        .owner      = THIS_MODULE,
        .pf     = PF_INET,
        .hooknum        = NF_INET_POST_ROUTING,
        .priority       = NF_IP_PRI_NAT_SRC-1,
    },

};


LVS无论是VS/DR, VS/TUN, VS/NAT哪种模式,由于vip配置在LVS上,因此访问vip的流量首先会走到NF_INET_LOCAL_IN,从而调用ip_vs_in

static unsigned int
ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
     const struct net_device *in, const struct net_device *out,
     int (*okfn)(struct sk_buff *))
{

    ...

    // LVS ip_vs_in只处理发给本机的报文

    if (unlikely(skb->pkt_type != PACKET_HOST)) {
        IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
                  skb->pkt_type,
                  iph.protocol,
                  IP_VS_DBG_ADDR(af, &iph.daddr));
        return NF_ACCEPT;
    }

    ...

    /*
     * Check if the packet belongs to an existing connection entry
     */

    // conn_in_get由协议本身实现,对于TCP而言,调用tcp_conn_in_get得到一个ip_vs_conn
    cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);

    if (unlikely(!cp)) {
        int v;
    
        /* For local client packets, it could be a response */
        cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);  // 查看是否是一个出去的连接
        if (cp)
            return handle_response(af, skb, pp, cp, iph.len); // 主要是执行snat
    
        if (!pp->conn_schedule(af, skb, pp, &v, &cp))    // 执行tcp_conn_schedule,TCP协议的调度就是为client找一个realserver,然后把这个conn保存下来,下次就直接基于这个ip_vs_conn转发了
            return v;   
    }

    ...

    /* Check the server status */
    if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
        /* the destination server is not available */
        if (sysctl_ip_vs_expire_nodest_conn) {
            /* try to expire the connection immediately */
            ip_vs_conn_expire_now(cp);
        }
        /* don't restart its timer, and silently
           drop the packet. */
        __ip_vs_conn_put(cp);  // 如果后面的realserver失效,那么drop这个ip_vs_conn
        return NF_DROP;
    }

    ip_vs_in_stats(cp, skb);
    restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);  // 调用tcp_state_transition,改变连接的自动机状态
    if (cp->packet_xmit)
        ret = cp->packet_xmit(skb, cp, pp);  // 根据模式不同,调用不同的发送方法 e.g. NAT调用ip_vs_nat_xmit, DR调用ip_vs_dr_xmit
        /* do not touch skb anymore */
    else {
        IP_VS_DBG_RL("warning: packet_xmit is null");
        ret = NF_ACCEPT;
    }

    ...

}


LVS的VS/DR, VS/TUN都是单臂模式,只有VS/NAT是双臂模式。在VS/NAT模式下,LVS会作为realserver回包的next hop,因此在NF_IP_FORWARD上注册ip_vs_out,用来处理NAT模式下的回包

static unsigned int
ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
      const struct net_device *in, const struct net_device *out,
      int (*okfn)(struct sk_buff *))
{
    struct ip_vs_iphdr iph;
    struct ip_vs_protocol *pp;
    struct ip_vs_conn *cp;
    int af;

    ....

    ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);  // 填充ip_vs_iphdr的IP头

    if (unlikely(iph.protocol == IPPROTO_ICMP)) {  // 这部分代码用来处理icmp报文,主要逻辑在ip_vs_out_icmp上,该函数用来处理outgoing方向的icmp
        int related, verdict = ip_vs_out_icmp(skb, &related);

        if (related)
            return verdict;
        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
    }

    ....

    if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) && !pp->dont_defrag)) {   // 如果是IP分片的包,那么调用ip_vs_gather_frags先尝试整合成一个完整包,具体请参考内核IP层的frag/defrag的相关代码
        if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
            return NF_STOLEN;

        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
    }

    /*
     * Check if the packet belongs to an existing entry
     */
    cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);  // 查找是否有已有连接

    if (unlikely(!cp)) {
        if (sysctl_ip_vs_nat_icmp_send &&
            (pp->protocol == IPPROTO_TCP ||
             pp->protocol == IPPROTO_UDP)) {
            __be16 _ports[2], *pptr;

            pptr = skb_header_pointer(skb, iph.len,
                          sizeof(_ports), _ports);
            if (pptr == NULL)
                return NF_ACCEPT;   /* Not for me */
            if (ip_vs_lookup_real_service(af, iph.protocol,
                              &iph.saddr,
                              pptr[0])) {   // 查看这个realserver是否在LVS的hash表中,如果是真实的realserver,返回一个ICMP不可达

                /*
                 * Notify the real server: there is no
                 * existing entry if it is not RST
                 * packet or not TCP packet.
                 */
                if (iph.protocol != IPPROTO_TCP
                    || !is_tcp_reset(skb, iph.len)) {
                        icmp_send(skb,
                              ICMP_DEST_UNREACH,
                              ICMP_PORT_UNREACH, 0);
                    return NF_DROP;
                }
            }
        }
        IP_VS_DBG_PKT(12, pp, skb, 0,
                  "packet continues traversal as normal");
        return NF_ACCEPT;
    }

    return handle_response(af, skb, pp, cp, iph.len);  // handle_response真正去做SNAT
}


static unsigned int 
handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
        struct ip_vs_conn *cp, int ihl)
{

    if (!skb_make_writable(skb, ihl))  /* 如果要修改skb的话,当前内核版本需要先判断skb_make_writable */
        goto drop;

    /* mangle the packet */
    if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))  /* 对TCP而言,这里是调用tcp_snat_handler, 主要功能是修改了tcp头之后再做下checksum */
        goto drop;

    ip_hdr(skb)->saddr = cp->vaddr.ip;  /* SNAT, 把包的源IP替换为virtual IP */
    ip_send_check(ip_hdr(skb));  /* 对IP头做checksum */

    /* For policy routing, packets originating from this

     * machine itself may be routed differently to packets
     * passing through.  We want this packet to be routed as
     * if it came from this machine itself.  So re-compute
     * the routing information.
     */

        if (ip_route_me_harder(skb, RTN_LOCAL) != 0)  /* 由于源IP变成了本地IP,而不是之前的转发包,需要重新计算路由 */
            goto drop;

    ip_vs_out_stats(cp, skb);
    ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); /* 对TCP而言,调用tcp_state_transition */
    ip_vs_conn_put(cp);

    skb->ipvs_property = 1;  /* 标记这个skb已经被LVS处理过 */

    LeaveFunction(11);
    return NF_ACCEPT;

drop:
    ip_vs_conn_put(cp);
    kfree_skb(skb);
    return NF_STOLEN;
}


LVS在NF_INET_POST_ROUTING chain上还注册了一个优先级为NF_IP_PRI_NAT_SRC - 1的hook函数ip_vs_post_routing。该函数在iptables SNAT之前执行,检查LVS是否处理过该skb,如果处理过则跳过下面的netfilter hook点

/*
 *      It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
 *      chain, and is used for VS/NAT.
 *      It detects packets for VS/NAT connections and sends the packets
 *      immediately. This can avoid that iptable_nat mangles the packets
 *      for VS/NAT.
 */     
static unsigned int ip_vs_post_routing(unsigned int hooknum,
                       struct sk_buff *skb,
                       const struct net_device *in,
                       const struct net_device *out,
                       int (*okfn)(struct sk_buff *))
{
    if (!skb->ipvs_property)
        return NF_ACCEPT;
    /* The packet was sent from IPVS, exit this chain */
    return NF_STOP;  
}

netfilter框架下,NF_HOOK宏会调用到nf_hook_slow,进而调用nf_iterate,即对于特定PF下的特定HOOKNUM,按优先级遍历上面注册的所有hook函数,只有当所有函数都返回NF_ACCEPT,或者有任意函数返回NF_STOP,整个nf_iterate才会返回NF_ACCEPT。与NF_ACCEPT不同的是,NF_STOP的语义会忽略该挂载点下其他优先级的函数。


0 0
原创粉丝点击