【OVS2.5源码解读】 内核中的flow table流表操作
来源:互联网 发布:关闭windows自动更新 编辑:程序博客网 时间:2024/06/08 13:59
当一个数据包到达网卡的时候,首先要经过内核Openvswitch.ko,流表Flow Table在内核中有一份,通过key查找内核中的flow table,即可以得到action,然后执行action之后,直接发送这个包,只有在内核无法查找到流表项的时候,才会到用户态查找用户态的流表。仅仅查找内核中flow table的情况被称为fast path;需要查找用户态中flow table的情况被称为slow path.
第一步:从数据包中提取出key
当一个OVS端口接收到一个数据包,不是将整个数据包在内核层的流表中匹配查找,这样效率低下,而是需要对此数据包头字段进行解析,将解析出来的各个匹配字段值和端口号一起构造成查询key,然后用key在流表中进行匹配查找。
实现函数为int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key)
在这个函数中,首先提取的是物理层的信息,主要是从哪个网口进入的
key->phy.priority = skb->priority;key->phy.in_port = OVS_CB(skb)->input_vport->port_no;key->phy.skb_mark = skb->mark;ovs_ct_fill_key(skb, key);key->ovs_flow_hash = 0;key->recirc_id = 0;
然后调用函数static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
提取MAC层的key:
/* Link layer. We are guaranteed to have at least the 14 byte Ethernet * header in the linear data area. */eth = eth_hdr(skb);ether_addr_copy(key->eth.src, eth->h_source);ether_addr_copy(key->eth.dst, eth->h_dest);__skb_pull(skb, 2 * ETH_ALEN);/* We are going to push all headers that we pull, so no need to * update skb->csum here. */key->eth.tci = 0;if (skb_vlan_tag_present(skb)) key->eth.tci = htons(vlan_get_tci(skb));else if (eth->h_proto == htons(ETH_P_8021Q)) if (unlikely(parse_vlan(skb, key))) return -ENOMEM;key->eth.type = parse_ethertype(skb);
提取网络层的key:
struct iphdr *nh;__be16 offset;error = check_iphdr(skb);if (unlikely(error)) { memset(&key->ip, 0, sizeof(key->ip)); memset(&key->ipv4, 0, sizeof(key->ipv4)); if (error == -EINVAL) { skb->transport_header = skb->network_header; error = 0; } return error;}nh = ip_hdr(skb);key->ipv4.addr.src = nh->saddr;key->ipv4.addr.dst = nh->daddr;key->ip.proto = nh->protocol;key->ip.tos = nh->tos;key->ip.ttl = nh->ttl;offset = nh->frag_off & htons(IP_OFFSET);if (offset) { key->ip.frag = OVS_FRAG_TYPE_LATER; return 0;}if (nh->frag_off & htons(IP_MF) || skb_shinfo(skb)->gso_type & SKB_GSO_UDP) key->ip.frag = OVS_FRAG_TYPE_FIRST;else key->ip.frag = OVS_FRAG_TYPE_NONE;
提取传输层的key:
/* Transport layer. */if (key->ip.proto == IPPROTO_TCP) { if (tcphdr_ok(skb)) { struct tcphdr *tcp = tcp_hdr(skb); key->tp.src = tcp->source; key->tp.dst = tcp->dest; key->tp.flags = TCP_FLAGS_BE16(tcp); } else { memset(&key->tp, 0, sizeof(key->tp)); }} else if (key->ip.proto == IPPROTO_UDP) { if (udphdr_ok(skb)) { struct udphdr *udp = udp_hdr(skb); key->tp.src = udp->source; key->tp.dst = udp->dest; } else { memset(&key->tp, 0, sizeof(key->tp)); }} else if (key->ip.proto == IPPROTO_SCTP) { if (sctphdr_ok(skb)) { struct sctphdr *sctp = sctp_hdr(skb); key->tp.src = sctp->source; key->tp.dst = sctp->dest; } else { memset(&key->tp, 0, sizeof(key->tp)); }} else if (key->ip.proto == IPPROTO_ICMP) { if (icmphdr_ok(skb)) { struct icmphdr *icmp = icmp_hdr(skb); /* The ICMP type and code fields use the 16-bit * transport port fields, so we need to store * them in 16-bit network byte order. */ key->tp.src = htons(icmp->type); key->tp.dst = htons(icmp->code); } else { memset(&key->tp, 0, sizeof(key->tp)); }}
第二步:根据key查找flow table
在内核中,flow table的数据结构如下图所示。
每个虚拟交换机对应一个datapath,每个datapath有一个flow table,每个flow table分成N个桶,根据key进行哈希,不同的key分布在不同的桶里面。
每个桶的大小是一个内存页的大小,在内存页的头部保存了元素个数和每个元素的大小。每个元素都是sw_flow,里面有key,也有action。
查找过程主要通过调用struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, const struct sw_flow_key *key, u32 skb_hash, u32 *n_mask_hit)
实现。
1、ovs_flow_tbl_lookup_stats函数
struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, const struct sw_flow_key *key, //由ovs_flow_key_extract函数根据skb生成 u32 skb_hash,<span style="white-space:pre"> </span> //skb中携带的信息 u32 *n_mask_hit) { struct mask_array *ma = rcu_dereference(tbl->mask_array); struct table_instance *ti = rcu_dereference(tbl->ti); //得到table实例 struct mask_cache_entry *entries, *ce; struct sw_flow *flow; u32 hash; int seg; *n_mask_hit = 0; if (unlikely(!skb_hash)) { //如果报文没有hash值,则mask_index为0,全遍历所有的mask。 u32 mask_index = 0; return flow_lookup(tbl, ti, ma, key, n_mask_hit, &mask_index); } /* Pre and post recirulation flows usually have the same skb_hash * value. To avoid hash collisions, rehash the 'skb_hash' with * 'recirc_id'. */ if (key->recirc_id) skb_hash = jhash_1word(skb_hash, key->recirc_id); ce = NULL; hash = skb_hash; entries = this_cpu_ptr(tbl->mask_cache); /* Find the cache entry 'ce' to operate on. */ for (seg = 0; seg < MC_HASH_SEGS; seg++) { //32位的hash值被分成4段,每段8字节,作为cache的索引 int index = hash & (MC_HASH_ENTRIES - 1); struct mask_cache_entry *e; e = &entries[index]; //entry最大为256项 if (e->skb_hash == skb_hash) { //如果在cache entry找到报文hash相同项,则根据该entry指定的mask查表 flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &e->mask_index); if (!flow) e->skb_hash = 0; return flow; } if (!ce || e->skb_hash < ce->skb_hash) ce = e; /* A better replacement cache candidate. */ hash >>= MC_HASH_SHIFT; } /* Cache miss, do full lookup. */ flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &ce->mask_index); //没有命中,ce作为新的cache项,将被刷新,下一次可以直接命中 if (flow) ce->skb_hash = skb_hash; return flow; }
ovs_flow_tbl_lookup_stats
会调用static struct sw_flow *flow_lookup(struct flow_table *tbl, struct table_instance *ti, const struct mask_array *ma, const struct sw_flow_key *key, u32 *n_mask_hit, u32 *index)
2、flow_lookup函数
/*
 * flow_lookup - search 'ti' for a flow matching 'key', trying the mask at
 * '*index' first and then every other mask in 'ma'.
 *
 * @tbl:        owning flow table (not referenced by this body).
 * @ti:         table instance whose buckets are searched.
 * @ma:         array of masks to apply to the key.
 * @key:        unmasked lookup key.
 * @n_mask_hit: counter forwarded to masked_flow_lookup().
 * @index:      in/out hint: on entry the mask to try first; on a hit found
 *              by the full scan it is updated to the matching mask's index.
 *
 * Returns the matching flow, or NULL when no mask yields a match.
 */
static struct sw_flow *flow_lookup(struct flow_table *tbl,
				   struct table_instance *ti,
				   const struct mask_array *ma,
				   const struct sw_flow_key *key,
				   u32 *n_mask_hit,
				   u32 *index)
{
	struct sw_flow_mask *mask;
	struct sw_flow *flow;
	int i;

	/* A hint below ma->max is a usable slot number: try that mask first. */
	if (*index < ma->max) {
		mask = rcu_dereference_ovsl(ma->masks[*index]);
		if (mask) {
			flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
			if (flow)
				return flow;
		}
	}

	/* The hint missed (or was unusable): scan all remaining masks. */
	for (i = 0; i < ma->max; i++) {

		if (i == *index)	/* already probed above, skip it */
			continue;

		mask = rcu_dereference_ovsl(ma->masks[i]);
		if (!mask)
			continue;

		flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
		if (flow) { /* Found */
			/* Remember the winning mask so the caller's next
			 * lookup can start from it.
			 */
			*index = i;
			return flow;
		}
	}

	return NULL;
}
会调用masked_flow_lookup如下
3、masked_flow_lookup函数
/*
 * masked_flow_lookup - look up 'unmasked' in 'ti' under one specific mask.
 *
 * @ti:         table instance whose hash buckets are searched.
 * @unmasked:   the packet's key before masking.
 * @mask:       mask to apply; only flows installed with this same mask
 *              object can match.
 * @n_mask_hit: incremented once per call, i.e. once per mask probed.
 *
 * Returns the matching flow, or NULL if the bucket holds no match.
 */
static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
					  const struct sw_flow_key *unmasked,
					  const struct sw_flow_mask *mask,
					  u32 *n_mask_hit)
{
	struct sw_flow *flow;
	struct hlist_head *head;
	u32 hash;
	struct sw_flow_key masked_key;

	/* Apply the mask to the key; per the article this is what provides
	 * wildcard matching.
	 */
	ovs_flow_mask_key(&masked_key, unmasked, false, mask);
	/* Hash the masked key over mask->range. */
	hash = flow_hash(&masked_key, &mask->range);
	/* Pick the bucket (list head of sw_flow entries) for that hash. */
	head = find_bucket(ti, hash);
	(*n_mask_hit)++;

	/* Walk the bucket's list. */
	hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) {
		/* A match needs the same mask object, the same stored hash,
		 * and a successful flow_cmp_masked_key() over mask->range.
		 */
		if (flow->mask == mask && flow->flow_table.hash == hash &&
		    flow_cmp_masked_key(flow, &masked_key, &mask->range))
			return flow;
	}
	return NULL;
}
其中flow_hash计算哈希值,find_bucket根据哈希值查找桶,然后就是一个循环,逐个比较key是否相等,相等则返回flow。
到此流表查找过程已经比较清晰了:tbl->mask_cache是用来加速报文处理的,相同流的skb其hash值也相同,因此可以通过缓存快速找到mask对象,再通过hash计算找到bucket进行匹配。下图用来阐述skb在流表查询中依赖了哪些数据,以及通过哪些数据完成了flow的查找。
PS:如果有多个箭头输入,表示要获取该框的内容需要依赖多个信息。
第三步:执行action
调用 int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_actions *acts,struct sw_flow_key *key)
调用 static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, int len)
在这个函数中,通过case语句,不同的action进行不同的操作。
/*
 * do_execute_actions - run a list of netlink-encoded actions on 'skb'.
 *
 * @dp:   datapath the packet belongs to.
 * @skb:  packet to act on.
 * @key:  flow key for the packet; passed to every action helper.
 * @attr: first action attribute.
 * @len:  total length in bytes of the action attribute list.
 *
 * Returns 0 once all actions have run, or the error reported by a failing
 * action.  -EINPROGRESS from the conntrack action is reported as 0.
 */
static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
			      struct sw_flow_key *key,
			      const struct nlattr *attr, int len)
{
	/* Every output action needs a separate clone of 'skb', but the common
	 * case is just a single output action, so that doing a clone and
	 * then freeing the original skbuff is wasteful. So the following code
	 * is slightly obscure just to avoid that.
	 */
	int prev_port = -1;	/* port of a deferred output; -1 means none pending */
	const struct nlattr *a;
	int rem;

	for (a = attr, rem = len; rem > 0;
	     a = nla_next(a, &rem)) {
		int err = 0;

		/* An output was deferred and more actions follow: send a
		 * clone now so the original skb stays available.  If the
		 * clone fails, that output is silently skipped.
		 */
		if (unlikely(prev_port != -1)) {
			struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC);

			if (out_skb)
				do_output(dp, out_skb, prev_port, key);

			prev_port = -1;
		}

		switch (nla_type(a)) {
		case OVS_ACTION_ATTR_OUTPUT:
			/* Only record the port; the send happens on the next
			 * iteration or after the loop.
			 */
			prev_port = nla_get_u32(a);
			break;

		case OVS_ACTION_ATTR_USERSPACE:
			output_userspace(dp, skb, key, a, attr, len);
			break;

		case OVS_ACTION_ATTR_HASH:
			execute_hash(skb, key, a);
			break;

		case OVS_ACTION_ATTR_PUSH_MPLS:
			err = push_mpls(skb, key, nla_data(a));
			break;

		case OVS_ACTION_ATTR_POP_MPLS:
			err = pop_mpls(skb, key, nla_get_be16(a));
			break;

		case OVS_ACTION_ATTR_PUSH_VLAN:
			err = push_vlan(skb, key, nla_data(a));
			break;

		case OVS_ACTION_ATTR_POP_VLAN:
			err = pop_vlan(skb, key);
			break;

		case OVS_ACTION_ATTR_RECIRC:
			err = execute_recirc(dp, skb, key, a, rem);
			if (nla_is_last(a, rem)) {
				/* If this is the last action, the skb has
				 * been consumed or freed.
				 * Return immediately.
				 */
				return err;
			}
			break;

		case OVS_ACTION_ATTR_SET:
			err = execute_set_action(skb, key, nla_data(a));
			break;

		/* Both masked-set attribute types share one handler. */
		case OVS_ACTION_ATTR_SET_MASKED:
		case OVS_ACTION_ATTR_SET_TO_MASKED:
			err = execute_masked_set_action(skb, key, nla_data(a));
			break;

		case OVS_ACTION_ATTR_SAMPLE:
			err = sample(dp, skb, key, a, attr, len);
			break;

		case OVS_ACTION_ATTR_CT:
			/* Refresh the key first if it is no longer valid. */
			if (!is_flow_key_valid(key)) {
				err = ovs_flow_key_update(skb, key);
				if (err)
					return err;
			}

			err = ovs_ct_execute(ovs_dp_get_net(dp), skb, key,
					     nla_data(a));

			/* Hide stolen IP fragments from user space. */
			/* Note: these returns bypass the kfree_skb() below. */
			if (err)
				return err == -EINPROGRESS ? 0 : err;
			break;
		}

		/* Any other failing action: free the packet and abort. */
		if (unlikely(err)) {
			kfree_skb(skb);
			return err;
		}
	}

	/* A still-pending output gets the original skb (no clone needed);
	 * otherwise nothing is left to do with the packet, so release it.
	 */
	if (prev_port != -1)
		do_output(dp, skb, prev_port, key);
	else
		consume_skb(skb);

	return 0;
}
如果可以直接输出,则调用 static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, struct sw_flow_key *key)
,它调用void ovs_vport_send(struct vport *vport, struct sk_buff *skb)
进行发送。
当内核无法查找到流表项的时候,则会通过upcall来调用用户态ovs-vswitchd中的flow table。详见:
http://blog.csdn.net/qq_15437629/article/details/78690386
- 【OVS2.5源码解读】 内核中的flow table流表操作
- 【OVS2.5源码解读】 用户态的flow table流表操作
- Openvswitch原理与代码分析(5): 内核中的流表flow table操作
- 【OVS2.5源码解读】datapath的netlink机制
- 【OVS2.5源码解读】datapath主流程分析
- bootstrap-table源码解读
- 内核源码解读基础
- 【OVS2.5.0源码分析】enqueue action精确流表生成过程分析
- 【OVS2.5.0源码分析】normal action精确流表生成和刷新过程分析
- 【OVS2.5.0源码分析】openflow连接实现分析(5)
- 【OVS2.5.0源码分析】datapath之action分析(5)
- 【OVS2.5.0源码分析】datapath之流表查询
- 【OVS2.5.0源码分析】datapath之流表创建过程
- Grab Cut 源码解读(最大流-最小割, min-cut\max-flow)
- AMPS:字符串操作源码解读
- VxWorks内核解读-5
- 解读linux内核源码的入门方法
- 解读linux内核源码的入门方法
- 机器学习实战-KNN
- JQuery学习之路(样式篇_选择器)
- awk 的內建函数
- 莫烦tensorflow教程笔记(六)
- 本地数据库和hibernate生成的映射文件不一致
- 【OVS2.5源码解读】 内核中的flow table流表操作
- Git 基础
- Linux下 laravel5 访问控制器 404
- 计算页码总数的高效方法
- 客户端 消息处理
- eclipse编译出现错误: 找不到或无法加载主类的解决
- 分配问题
- deepmind_lab msys2-------------------安装好了下载
- 微信小程序环境搭建-项目案例 (一)启动页