Linux 连线跟踪流程整理(linux-2.6.31)

来源:互联网 发布:下载天猫淘宝商城 编辑:程序博客网 时间:2024/05/17 04:45

author: jonathan

本文档的CopyRight归jonathan所有,可自由转载,转载时请保持文档的完整性。
/*----------------------------------------------------------------------------------------------------------------------------*/

Linux 连线跟踪流程整理(linux-2.6.31)

0 主要数据结构

0.1
/* linux内核中抽象出一个net结构,代表了网络协议.其内部是协议族相关的数据结构.对于PF_INET来说,这里重点就是连线跟踪表 */

struct net {
atomic_t   count;   /* To decided when the network
       * namespace should be freed.
       */
        ...
       
       /* 从下面宏可以看出, 连线跟踪是建立在NETFILTER基础上的 */
#ifdef CONFIG_NETFILTER
struct netns_xt   xt;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
struct netns_ct   ct; /* 连线跟踪表 */
#endif
#endif

#ifdef CONFIG_XFRM
struct netns_xfrm xfrm; /* ipsec,gre等tunnel相关内容 */
#endif
struct net_generic *gen;
};

struct netns_ct {
atomic_t   count;
unsigned int   expect_count;
struct hlist_nulls_head *hash; /* 主要的hash表 */
struct hlist_head *expect_hash;
struct hlist_nulls_head unconfirmed;
struct hlist_nulls_head dying;
struct ip_conntrack_stat *stat;
        ....
int    hash_vmalloc;
int    expect_vmalloc;
};

/* hlist_nulls_node 又是什么链表呢? */
/* 一般的hlist表是以null指针来结尾的,而hlist_nulls_node却是以标志位来表示.之所以可以这么做,是因为内核中对象都是4或者8字节对齐的,那么指针的后两位一定是0.因此这里可以复用这个位置来表示链表的结束:1表示链表结束;0表示未结束.一般链表最后一个数字都是有特别含义的.见如下实例:*/

struct hlist_nulls_node {
struct hlist_nulls_node *next, **pprev;
};
#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
((ptr)->first = (struct hlist_nulls_node *) (1UL | (((long)nulls) << 1)))

/* for example:

#define UNCONFIRMED_NULLS_VAL ((1<<30)+0)
#define DYING_NULLS_VAL   ((1<<30)+1)

INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL);
*/

/* 接上回书,net 是通过nf_conntrack_init来初始化相关连线跟踪的资源 */
static struct pernet_operations nf_conntrack_net_ops = {
.init = nf_conntrack_net_init, /* 其调用了nf_conntrack_init接口 ,对于每一个 net都要调用这个接口的*/
.exit = nf_conntrack_net_exit,
};

static int __init nf_conntrack_standalone_init(void)
{
return register_pernet_subsys(&nf_conntrack_net_ops);
}

0.2
/* 记录数据报的特征值,对于tcp/ip协议,即是记录五元组 */
struct nf_conntrack_tuple
{
struct nf_conntrack_man src; /* 记录源方向的协议族\地址\端口等 */

/* These are the parts of the tuple which are fixed. */
struct {
   union nf_inet_addr u3; /* 目标地址 */

   /* 目标端口 */

   /* The protocol. */
   u_int8_t protonum; /* 四层协议类型 tcp , udp, gre等*/

   /* The direction (for tuplehash) */
   u_int8_t dir; /* 方向: IP_CT_DIR_ORIGINAL; IP_CT_DIR_REPLY */
} dst;
};

0.3

struct nf_conntrack_tuple_hash {
struct hlist_nulls_node hnnode;
struct nf_conntrack_tuple tuple;
};

0.4
struct sk_buff {
    ....
    struct nf_conntrack *nfct; /* 数据包中连线跟踪数据,其实就是 nf_conn 数据头*/
    ....
};

struct nf_conntrack {
atomic_t use;
};

struct nf_conn {
/* Usage count in here is 1 for hash table/destruct timer, 1 per skb,
           plus 1 for any connection(s) we are `master' for */
struct nf_conntrack ct_general;

spinlock_t lock;

/* XXX should I move this to the tail ? - Y.K */
/* These are my tuples; original and reply */
struct nf_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX];

/* Have we seen traffic both ways yet? (bitset) */
unsigned long status;

/* If we were expected by an expectation, this will be it */
struct nf_conn *master;

/* Timer function; drops refcnt when it goes off. */
struct timer_list timeout;

    ...

/* Storage reserved for other modules: */
union nf_conntrack_proto proto;

/* Extensions */
struct nf_ct_ext *ext;
#ifdef CONFIG_NET_NS
struct net *ct_net;
#endif
};

0.5 数据结构关系总结

    sk_buff              ->
                          |
                          V
  net.ct.hash[hash] -> nf_conn -> nf_conntrack_tuple_hash[orig/reply] -> nf_conntrack_tuple
                         
   
1 初始化

static int __init nf_conntrack_l3proto_ipv4_init(void)
{
int ret = 0;

need_conntrack();
nf_defrag_ipv4_enable();

ret = nf_register_sockopt(&so_getorigdst);
if (ret < 0) {
   printk(KERN_ERR "Unable to register netfilter socket option\n");
   return ret;
}

        /* 注册四层协议 */
        /* 所有协议相关的流程都通过这个接口来注册,与协议无关的流程通过流程来控制 */
        /* 四层协议都保存在nf_ct_protos这个2维数组的全局变量中,其初始化为nf_conntrack_l4proto_generic处理流程 */
        /* struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly =
         {
.l3proto   = PF_INET,
.l4proto    = IPPROTO_TCP,
.name     = "tcp",
        ....
        } */
        /* 最后通过nf_conntrack_l4proto_register: rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto],l4proto);注册完毕 */
        
ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4);
if (ret < 0) {
   printk("nf_conntrack_ipv4: can't register tcp.\n");
   goto cleanup_sockopt;
}

ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4);
if (ret < 0) {
   printk("nf_conntrack_ipv4: can't register udp.\n");
   goto cleanup_tcp;
}

ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp);
if (ret < 0) {
   printk("nf_conntrack_ipv4: can't register icmp.\n");
   goto cleanup_udp;
}

/* 注册三层协议 */
        /* 所有协议相关的流程都通过这个接口来注册,与协议无关的流程通过流程来控制 */
        /* 三层协议都保存在nf_ct_l3protos全局变量中,其初始化为nf_conntrack_l3proto_generic处理流程 */
        /* extern struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX]; */
        /* struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = {
.l3proto = PF_UNSPEC,
.name   = "unknown",
.pkt_to_tuple = generic_pkt_to_tuple,              /* 此处处理流程基本都是无操作过程 */
.invert_tuple = generic_invert_tuple,
.print_tuple = generic_print_tuple,
.get_l4proto = generic_get_l4proto,
        }; */
        /* 最后通过nf_conntrack_l3proto_register: rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto);注册完毕 */
ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4);
if (ret < 0) {
   printk("nf_conntrack_ipv4: can't register ipv4\n");
   goto cleanup_icmp;
}

        /* 注册通用的网络报处理流程, 网上很多这方面介绍,这里不多讲了 */
ret = nf_register_hooks(ipv4_conntrack_ops,
     ARRAY_SIZE(ipv4_conntrack_ops));
...

return ret;
... /*错误处理 */
return ret;
}

通过这个函数, 简单的说就是初始化了三个数组: 四层协议数组,三层协议数组,网络流控制数组.

以后的处理流程就是: 网络流控制数组中函数 -> 三层协议数组 -> 四层协议数组

那么就让我们来跟踪流程.

2 ipv4_conntrack_in

static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
{
   .hook   = ipv4_conntrack_in,
   .owner   = THIS_MODULE,
   .pf   = PF_INET,
   .hooknum = NF_INET_PRE_ROUTING,
   .priority = NF_IP_PRI_CONNTRACK,
},
{
   .hook   = ipv4_conntrack_local,
   .owner   = THIS_MODULE,
   .pf   = PF_INET,
   .hooknum = NF_INET_LOCAL_OUT,
   .priority = NF_IP_PRI_CONNTRACK,
},
{
   .hook   = ipv4_confirm,
   .owner   = THIS_MODULE,
   .pf   = PF_INET,
   .hooknum = NF_INET_POST_ROUTING,
   .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
},
{
   .hook   = ipv4_confirm,
   .owner   = THIS_MODULE,
   .pf   = PF_INET,
   .hooknum = NF_INET_LOCAL_IN,
   .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
},
};

这个数组可以看出, 函数分为两类: 创建新连接 和 确认新连接. 由于连线跟踪模块是很多业务处理的基础,所以 创建新连接 优先级很高, 而 确认新连接 优先级很低.

ipv4_conntrack_in接口很简单,就是调用nf_conntrack_in:

unsigned int
nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
   struct sk_buff *skb)
{
...
/* Previously seen (loopback or untracked)? Ignore. */
if (skb->nfct) {
   NF_CT_STAT_INC_ATOMIC(net, ignore);
   return NF_ACCEPT;
}

/* rcu_read_lock()ed by nf_hook_slow */
        /* 获取协议相关的三层协议结构 */
l3proto = __nf_ct_l3proto_find(pf);
        /* 获取协议相关的四层协议号 */
ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
       &dataoff, &protonum);
if (ret <= 0) {
   ....
   return -ret;
}

        /* 获取协议相关的四层协议结构 */
l4proto = __nf_ct_l4proto_find(pf, protonum);

....

        /* 连线主接口 */
ct = resolve_normal_ct(net, skb, dataoff, pf, protonum,
          l3proto, l4proto, &set_reply, &ctinfo);
...

        /* 协议相关处理 ,基本就是协议相关的状态检测*/
ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum);
...

if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
   nf_conntrack_event_cache(IPCT_STATUS, ct);

return ret;
}

static inline struct nf_conn *
resolve_normal_ct(struct net *net,
    struct sk_buff *skb,
    unsigned int dataoff,
    u_int16_t l3num,
    u_int8_t protonum,
    struct nf_conntrack_l3proto *l3proto,
    struct nf_conntrack_l4proto *l4proto,
    int *set_reply,
    enum ip_conntrack_info *ctinfo)
{
struct nf_conntrack_tuple tuple;
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;

        /* 调用三层和四层的pkb_to_tuple获取数据包五元组 */
        /* 对于pf_INET: 三层协议仅是获取源和目的地址 */
        /*              四层协议(tcp来说)仅是获取源和目的端口 */
if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
        dataoff, l3num, protonum, &tuple, l3proto,
        l4proto)) {
   pr_debug("resolve_normal_ct: Can't get tuple\n");
   return NULL;
}

/* look for tuple match */
        /* 其是根据jhash2来生成tuple hash值 */
h = nf_conntrack_find_get(net, &tuple);
if (!h) {/* 一个新的连接到来了,并添加到net未确认连表中 */
   h = init_conntrack(net, &tuple, l3proto, l4proto, skb, dataoff);
   if (!h)
    return NULL;
   if (IS_ERR(h))
    return (void *)h;
}
        /* 获取真正的连线跟踪数据 */
ct = nf_ct_tuplehash_to_ctrack(h);

        /* 更新连线跟踪数据状态 */
/* It exists; we have (non-exclusive) reference. */
if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
   *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
   /* Please set reply bit if this packet OK */
   *set_reply = 1;
} else {
   /* Once we've had two way comms, always ESTABLISHED. */
   if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
    pr_debug("nf_conntrack_in: normal packet for %p\n", ct);
    *ctinfo = IP_CT_ESTABLISHED;
   } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
    pr_debug("nf_conntrack_in: related packet for %p\n",
     ct);
    *ctinfo = IP_CT_RELATED;
   } else {
    pr_debug("nf_conntrack_in: new packet for %p\n", ct);
    *ctinfo = IP_CT_NEW;
   }
   *set_reply = 0;
}
skb->nfct = &ct->ct_general;
skb->nfctinfo = *ctinfo;
return ct;
}

/* 这个函数很简单,但是有句话很牛,看看您能明白吗?*/
struct nf_conn *nf_conntrack_alloc(struct net *net,
       const struct nf_conntrack_tuple *orig,
       const struct nf_conntrack_tuple *repl,
       gfp_t gfp)
{
      ...
    memset(&ct->tuplehash[IP_CT_DIR_MAX], 0,
        sizeof(*ct) - offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX]));

    ...
}

3 ipv4_confirm

static unsigned int ipv4_confirm(unsigned int hooknum,
     struct sk_buff *skb,
     const struct net_device *in,
     const struct net_device *out,
     int (*okfn)(struct sk_buff *))
{
...
return nf_conntrack_confirm(skb);
}

int
__nf_conntrack_confirm(struct sk_buff *skb)
{
unsigned int hash, repl_hash;
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
struct nf_conn_help *help;
struct hlist_nulls_node *n;
enum ip_conntrack_info ctinfo;
struct net *net;

ct = nf_ct_get(skb, &ctinfo);
net = nf_ct_net(ct);

/* ipt_REJECT uses nf_conntrack_attach to attach related
    ICMP/TCP RST packets in other direction. Actual packet
    which created connection will be IP_CT_NEW or for an
    expected connection, IP_CT_RELATED. */
if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
   return NF_ACCEPT;

hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

/* We're not in hash table, and we refuse to set up related
    connections for unconfirmed conns. But packet copies and
    REJECT will give spurious warnings here. */
/* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

/* No external references means noone else could have
    confirmed us. */
NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
pr_debug("Confirming conntrack %p\n", ct);

spin_lock_bh(&nf_conntrack_lock);

/* See if there's one in the list already, including reverse:
    NAT could have grabbed it without realizing, since we're
    not in the hash. If there is, we lost race. */
hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
   if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
          &h->tuple))
    goto out;
hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode)
   if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
          &h->tuple))
    goto out;

/* Remove from unconfirmed list */
hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);

/* Timer relative to confirmation time, not original
    setting time, otherwise we'd get timer wrap in
    weird delay cases. */
ct->timeout.expires += jiffies;
add_timer(&ct->timeout);
atomic_inc(&ct->ct_general.use);
set_bit(IPS_CONFIRMED_BIT, &ct->status);

/* Since the lookup is lockless, hash insertion must be done after
* starting the timer and setting the CONFIRMED bit. The RCU barriers
* guarantee that no other CPU can find the conntrack before the above
* stores are visible.
*/
__nf_conntrack_hash_insert(ct, hash, repl_hash);
NF_CT_STAT_INC(net, insert);
spin_unlock_bh(&nf_conntrack_lock);

help = nfct_help(ct);
if (help && help->helper)
   nf_conntrack_event_cache(IPCT_HELPER, ct);

nf_conntrack_event_cache(master_ct(ct) ?
     IPCT_RELATED : IPCT_NEW, ct);
return NF_ACCEPT;

out:
NF_CT_STAT_INC(net, insert_failed);
spin_unlock_bh(&nf_conntrack_lock);
return NF_DROP;
}

转自:http://hi.baidu.com/jonathan2004/item/d8b1573e092df1647d034b5c