kube-proxy细节分析

来源:互联网 发布:yum 安装ant 编辑:程序博客网 时间:2024/06/03 11:36

其实kube-proxy的代码本身并不复杂,只是有个细节容易被大家忽略。大家可能都知道它有轮询的负载均衡策略,是通过iptables实现的,那它是怎样控制平均转发的呢?iptables的statistic模块支持random模式,那又是怎样控制权重的呢?
看代码,一步一步分析

    {        tablesNeedServicesChain := []utiliptables.Table{utiliptables.TableFilter, utiliptables.TableNAT}        for _, table := range tablesNeedServicesChain {            if _, err := proxier.iptables.EnsureChain(table, kubeServicesChain); err != nil {                glog.Errorf("Failed to ensure that %s chain %s exists: %v", table, kubeServicesChain, err)                return            }        }        tableChainsNeedJumpServices := []struct {            table utiliptables.Table            chain utiliptables.Chain        }{            {utiliptables.TableFilter, utiliptables.ChainInput},            {utiliptables.TableFilter, utiliptables.ChainOutput},            {utiliptables.TableNAT, utiliptables.ChainOutput},            {utiliptables.TableNAT, utiliptables.ChainPrerouting},        }        comment := "kubernetes service portals"        args := []string{"-m", "comment", "--comment", comment, "-j", string(kubeServicesChain)}        for _, tc := range tableChainsNeedJumpServices {            if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, tc.table, tc.chain, args...); err != nil {                glog.Errorf("Failed to ensure that %s chain %s jumps to %s: %v", tc.table, tc.chain, kubeServicesChain, err)                return            }        }    }

首先是让filter表的INPUT/OUTPUT链和nat表的OUTPUT/PREROUTING链全部跳转到KUBE-SERVICES链
效果如下:

-A OUTPUT -m comment --comment "kubernetes service portals" -j KUBE-SERVICES
-A PREROUTING -m comment --comment "kubernetes service portals" -j KUBE-SERVICES
-A OUTPUT -m comment --comment "kubernetes service portals" -j KUBE-SERVICES

这样经过上述链的流量(包括本机发出的和从外部进入的)都会被KUBE-SERVICES链截获了

当然,有些流量需要通过SNAT出去,因此还要在nat表的POSTROUTING链上加一条跳转到KUBE-POSTROUTING链的规则:

    {        if _, err := proxier.iptables.EnsureChain(utiliptables.TableNAT, kubePostroutingChain); err != nil {            glog.Errorf("Failed to ensure that %s chain %s exists: %v", utiliptables.TableNAT, kubePostroutingChain, err)            return        }        comment := "kubernetes postrouting rules"        args := []string{"-m", "comment", "--comment", comment, "-j", string(kubePostroutingChain)}        if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, utiliptables.TableNAT, utiliptables.ChainPostrouting, args...); err != nil {            glog.Errorf("Failed to ensure that %s chain %s jumps to %s: %v", utiliptables.TableNAT, utiliptables.ChainPostrouting, kubePostroutingChain, err)            return        }    }

效果如下:

-A POSTROUTING -m comment --comment "kubernetes postrouting rules" -j KUBE-POSTROUTING
-A KUBE-POSTROUTING -m comment --comment "kubernetes service traffic requiring SNAT" -m mark --mark 0x4000/0x4000 -j MASQUERADE

现在开始建立kubernetes proxy的各个链

    writeLine(proxier.filterChains, "*filter")    writeLine(proxier.natChains, "*nat")    // Make sure we keep stats for the top-level chains, if they existed    // (which most should have because we created them above).    if chain, ok := existingFilterChains[kubeServicesChain]; ok {        writeLine(proxier.filterChains, chain)    } else {        writeLine(proxier.filterChains, utiliptables.MakeChainLine(kubeServicesChain))    }    if chain, ok := existingNATChains[kubeServicesChain]; ok {        writeLine(proxier.natChains, chain)    } else {        writeLine(proxier.natChains, utiliptables.MakeChainLine(kubeServicesChain))    }    if chain, ok := existingNATChains[kubeNodePortsChain]; ok {        writeLine(proxier.natChains, chain)    } else {        writeLine(proxier.natChains, utiliptables.MakeChainLine(kubeNodePortsChain))    }    if chain, ok := existingNATChains[kubePostroutingChain]; ok {        writeLine(proxier.natChains, chain)    } else {        writeLine(proxier.natChains, utiliptables.MakeChainLine(kubePostroutingChain))    }    if chain, ok := existingNATChains[KubeMarkMasqChain]; ok {        writeLine(proxier.natChains, chain)    } else {        writeLine(proxier.natChains, utiliptables.MakeChainLine(KubeMarkMasqChain))    }

这里面创建了KUBE-SERVICES、KUBE-NODEPORTS、KUBE-POSTROUTING、KUBE-MARK-MASQ这几条链;如果链已经存在,则沿用原有的链定义行,以保留其统计计数

通过kubernetes创建的service会分配一个clusterIP,这些clusterIP是在iptables上面实现的

        args := []string{            "-A", string(kubeServicesChain),            "-m", "comment", "--comment", fmt.Sprintf(`"%s cluster IP"`, svcNameString),            "-m", protocol, "-p", protocol,            "-d", fmt.Sprintf("%s/32", svcInfo.clusterIP.String()),            "--dport", fmt.Sprintf("%d", svcInfo.port),        }        if proxier.masqueradeAll {            writeLine(proxier.natRules, append(args, "-j", string(KubeMarkMasqChain))...)        }        if len(proxier.clusterCIDR) > 0 {            writeLine(proxier.natRules, append(args, "! -s", proxier.clusterCIDR, "-j", string(KubeMarkMasqChain))...)        }        writeLine(proxier.natRules, append(args, "-j", string(svcChain))...)

上面就是截获发往clusterIP的流量,并跳转到该服务对应的链,最终在endpoint链里做DNAT。这里需要补充的是,如果一个服务后面有多个endpoint,就会按下面的方式为每个endpoint生成规则:

for i, endpointChain := range endpointChains {            // Balancing rules in the per-service chain.            args := []string{                "-A", string(svcChain),                "-m", "comment", "--comment", svcNameString,            }            if i < (n - 1) {                // Each rule is a probabilistic match.                args = append(args,                    "-m", "statistic",                    "--mode", "random",                    "--probability", fmt.Sprintf("%0.5f", 1.0/float64(n-i)))            }            // The final (or only if n == 1) rule is a guaranteed match.            args = append(args, "-j", string(endpointChain))            writeLine(proxier.natRules, args...)            // Rules in the per-endpoint chain.            args = []string{                "-A", string(endpointChain),                "-m", "comment", "--comment", svcNameString,            }            // Handle traffic that loops back to the originator with SNAT.            writeLine(proxier.natRules, append(args,                "-s", fmt.Sprintf("%s/32", strings.Split(endpoints[i].endpoint, ":")[0]),                "-j", string(KubeMarkMasqChain))...)            // Update client-affinity lists.            if svcInfo.sessionAffinityType == api.ServiceAffinityClientIP {                args = append(args, "-m", "recent", "--name", string(endpointChain), "--set")            }            // DNAT to final destination.            args = append(args, "-m", protocol, "-p", protocol, "-j", "DNAT", "--to-destination", endpoints[i].endpoint)            writeLine(proxier.natRules, args...)        }

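上面通过循环的方式创建后端endpoint的转发规则,每条规则的概率是通过--probability后的1.0/float64(n-i)计算出来的(n为endpoint链的个数,i为从0开始的下标),最后一条规则不带概率,必定匹配。譬如有两个endpoint的场景,概率依次是0.5和1,也就是第一个是50%概率,第二个是100%概率;如果是三个的话类似,依次是33%、50%、100%。由于规则是按顺序匹配的,后一条规则只有在前面的规则都没有命中时才会被评估,所以第二条规则实际命中的概率是(1-1/n)×1/(n-1)=1/n,依此类推,每个endpoint最终分到的流量都是1/n,这样就实现了平均转发。下面是10个endpoint的例子。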

kubectl get svc --all-namespaces
NAMESPACE      NAME                    CLUSTER-IP      EXTERNAL-IP   PORT(S)                      AGE
admin          docker2048              10.13.52.135    11.11.1.1     80/TCP                       1d
[root@master-62 ~]# 
[root@master-62 ~]# iptables-save |grep 10.13.52.135
-A KUBE-SERVICES -d 10.13.52.135/32 -p tcp -m comment --comment "admin/docker2048:docker2048-1 cluster IP" -m tcp --dport 80 -j KUBE-SVC-MHWEDWK6NM5OGU2T
[root@master-62 ~]# 
[root@master-62 ~]# 
[root@master-62 ~]# iptables-save |grep KUBE-SVC-MHWEDWK6NM5OGU2T
:KUBE-SVC-MHWEDWK6NM5OGU2T - [0:0]
-A KUBE-SERVICES -d 10.13.52.135/32 -p tcp -m comment --comment "admin/docker2048:docker2048-1 cluster IP" -m tcp --dport 80 -j KUBE-SVC-MHWEDWK6NM5OGU2T
-A KUBE-SERVICES -d 11.11.1.1/32 -p tcp -m comment --comment "admin/docker2048:docker2048-1 external IP" -m tcp --dport 80 -m physdev ! --physdev-is-in -m addrtype ! --src-type LOCAL -j KUBE-SVC-MHWEDWK6NM5OGU2T
-A KUBE-SERVICES -d 11.11.1.1/32 -p tcp -m comment --comment "admin/docker2048:docker2048-1 external IP" -m tcp --dport 80 -m addrtype --dst-type LOCAL -j KUBE-SVC-MHWEDWK6NM5OGU2T
-A KUBE-SVC-MHWEDWK6NM5OGU2T -m comment --comment "admin/docker2048:docker2048-1" -m statistic --mode random --probability 0.10000000009 -j KUBE-SEP-VC767CJYOTCBCN3B
-A KUBE-SVC-MHWEDWK6NM5OGU2T -m comment --comment "admin/docker2048:docker2048-1" -m statistic --mode random --probability 0.11110999994 -j KUBE-SEP-HQELSIUR5HSCB2VN
-A KUBE-SVC-MHWEDWK6NM5OGU2T -m comment --comment "admin/docker2048:docker2048-1" -m statistic --mode random --probability 0.12500000000 -j KUBE-SEP-X2UDSU7Q4UA4IKY7
-A KUBE-SVC-MHWEDWK6NM5OGU2T -m comment --comment "admin/docker2048:docker2048-1" -m statistic --mode random --probability 0.14286000002 -j KUBE-SEP-DQ3TZIZIDTXU77P7
-A KUBE-SVC-MHWEDWK6NM5OGU2T -m comment --comment "admin/docker2048:docker2048-1" -m statistic --mode random --probability 0.16667000018 -j KUBE-SEP-A3JWOZYQIIDDEKNM
-A KUBE-SVC-MHWEDWK6NM5OGU2T -m comment --comment "admin/docker2048:docker2048-1" -m statistic --mode random --probability 0.20000000019 -j KUBE-SEP-6EZ2MUBOPU2WH44E
-A KUBE-SVC-MHWEDWK6NM5OGU2T -m comment --comment "admin/docker2048:docker2048-1" -m statistic --mode random --probability 0.25000000000 -j KUBE-SEP-4KG3GD3BQ5TCAUPR
-A KUBE-SVC-MHWEDWK6NM5OGU2T -m comment --comment "admin/docker2048:docker2048-1" -m statistic --mode random --probability 0.33332999982 -j KUBE-SEP-6EXLETYC4LYB5NLM
-A KUBE-SVC-MHWEDWK6NM5OGU2T -m comment --comment "admin/docker2048:docker2048-1" -m statistic --mode random --probability 0.50000000000 -j KUBE-SEP-VLQQMEFA6Y5RZLE7
-A KUBE-SVC-MHWEDWK6NM5OGU2T -m comment --comment "admin/docker2048:docker2048-1" -j KUBE-SEP-CXDZACZ7ESWWLYJM
原创粉丝点击