【OVS2.5源码解读】内核中的flow table流表操作

来源：互联网发布：关闭windows自动更新编辑：程序博客网时间：2024/06/08 13:59

当一个数据包到达网卡时，首先要经过内核模块openvswitch.ko。内核中保存了一份流表（flow table）：用从数据包中提取出的key查找内核中的flow table，即可得到对应的action，执行action之后直接把这个包发送出去；只有在内核中查不到匹配的流表项时，才会到用户态查找用户态的流表。仅查找内核中flow table的情况被称为fast path；需要查找用户态中flow table的情况被称为slow path。
这里写图片描述

第一步：从数据包中提取出key

当一个OVS端口接收到一个数据包时，并不是拿整个数据包去内核层的流表中匹配查找（这样效率低下），而是先对数据包的头部字段进行解析，将解析出来的各个匹配字段值和入端口号一起构造成查询key，然后用key在流表中进行匹配查找。

实现函数为int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key)

在这个函数中，首先提取的是物理层的信息，主要是从哪个网口进入的

/* Excerpt from ovs_flow_key_extract(): fill in the packet metadata part of
 * the key before any header parsing is done.
 */
key->phy.priority = skb->priority;                     /* copied from the skb */
key->phy.in_port = OVS_CB(skb)->input_vport->port_no;  /* port number of the ingress vport */
key->phy.skb_mark = skb->mark;                         /* copied from the skb */
ovs_ct_fill_key(skb, key);   /* helper not shown; presumably fills the conntrack fields of the key */
key->ovs_flow_hash = 0;      /* hash and recirculation id start out as 0 */
key->recirc_id = 0;

然后调用函数static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
提取MAC层的key：

/* Link layer. We are guaranteed to have at least the 14 byte Ethernet
 * header in the linear data area.
 */
eth = eth_hdr(skb);
ether_addr_copy(key->eth.src, eth->h_source);
ether_addr_copy(key->eth.dst, eth->h_dest);

/* Step over the two MAC addresses (2 * ETH_ALEN bytes). */
__skb_pull(skb, 2 * ETH_ALEN);
/* We are going to push all headers that we pull, so no need to
 * update skb->csum here.
 */

/* VLAN TCI: defaults to 0; taken from the skb's VLAN tag when one is
 * present, otherwise parsed from the packet when the Ethernet protocol
 * field is 802.1Q. A parse_vlan() failure aborts with -ENOMEM.
 */
key->eth.tci = 0;
if (skb_vlan_tag_present(skb))
    key->eth.tci = htons(vlan_get_tci(skb));
else if (eth->h_proto == htons(ETH_P_8021Q))
    if (unlikely(parse_vlan(skb, key)))
        return -ENOMEM;

key->eth.type = parse_ethertype(skb);

提取网络层的key：

/* Excerpt from key_extract(): network layer (IPv4) part of the key. */
struct iphdr *nh;
__be16 offset;

error = check_iphdr(skb);
if (unlikely(error)) {
    /* Header check failed: clear the IP fields of the key. */
    memset(&key->ip, 0, sizeof(key->ip));
    memset(&key->ipv4, 0, sizeof(key->ipv4));
    if (error == -EINVAL) {
        /* -EINVAL is not reported to the caller: the transport header
         * is pointed at the network header and 0 is returned.
         * NOTE(review): check_iphdr() is not shown; presumably -EINVAL
         * means a malformed/short header -- confirm.
         */
        skb->transport_header = skb->network_header;
        error = 0;
    }
    return error;
}

nh = ip_hdr(skb);
key->ipv4.addr.src = nh->saddr;
key->ipv4.addr.dst = nh->daddr;

key->ip.proto = nh->protocol;
key->ip.tos = nh->tos;
key->ip.ttl = nh->ttl;

/* Fragment classification. A non-zero fragment offset marks a later
 * fragment, and extraction stops here (return 0) without looking at the
 * transport header.
 */
offset = nh->frag_off & htons(IP_OFFSET);
if (offset) {
    key->ip.frag = OVS_FRAG_TYPE_LATER;
    return 0;
}
/* Offset 0: "first fragment" if the MF bit is set or the skb is marked
 * SKB_GSO_UDP, otherwise not a fragment.
 */
if (nh->frag_off & htons(IP_MF) ||
    skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
    key->ip.frag = OVS_FRAG_TYPE_FIRST;
else
    key->ip.frag = OVS_FRAG_TYPE_NONE;

提取传输层的key：

/* Transport layer.
 *
 * Each branch copies the protocol's fields into key->tp only when the
 * matching *hdr_ok(skb) check passes; otherwise key->tp is zeroed.
 * Protocols other than TCP/UDP/SCTP/ICMP are not handled in this excerpt.
 */
if (key->ip.proto == IPPROTO_TCP) {
    if (tcphdr_ok(skb)) {
        struct tcphdr *tcp = tcp_hdr(skb);
        key->tp.src = tcp->source;
        key->tp.dst = tcp->dest;
        key->tp.flags = TCP_FLAGS_BE16(tcp);  /* only TCP fills tp.flags */
    } else {
        memset(&key->tp, 0, sizeof(key->tp));
    }
} else if (key->ip.proto == IPPROTO_UDP) {
    if (udphdr_ok(skb)) {
        struct udphdr *udp = udp_hdr(skb);
        key->tp.src = udp->source;
        key->tp.dst = udp->dest;
    } else {
        memset(&key->tp, 0, sizeof(key->tp));
    }
} else if (key->ip.proto == IPPROTO_SCTP) {
    if (sctphdr_ok(skb)) {
        struct sctphdr *sctp = sctp_hdr(skb);
        key->tp.src = sctp->source;
        key->tp.dst = sctp->dest;
    } else {
        memset(&key->tp, 0, sizeof(key->tp));
    }
} else if (key->ip.proto == IPPROTO_ICMP) {
    if (icmphdr_ok(skb)) {
        struct icmphdr *icmp = icmp_hdr(skb);
        /* The ICMP type and code fields use the 16-bit
         * transport port fields, so we need to store
         * them in 16-bit network byte order.
         */
        key->tp.src = htons(icmp->type);
        key->tp.dst = htons(icmp->code);
    } else {
        memset(&key->tp, 0, sizeof(key->tp));
    }
}

第二步：根据key查找flow table

在内核中，flow table的数据结构如下图所示。

这里写图片描述

每个虚拟交换机对应一个datapath，每个datapath有一个flow table，每个flow table分成N个桶，根据key进行哈希，不同的key分布在不同的桶里面。

每个桶的大小是一个内存页的大小，在内存页的头部保存了元素个数和每个元素的大小。每个元素都是sw_flow，里面有key，也有action。

查找过程主要通过调用struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, const struct sw_flow_key *key, u32 skb_hash, u32 *n_mask_hit)实现。

1、ovs_flow_tbl_lookup_stats函数

struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,                        const struct sw_flow_key *key,            //由ovs_flow_key_extract函数根据skb生成                        u32 skb_hash,<span style="white-space:pre">             </span>    //skb中携带的信息                        u32 *n_mask_hit)  {      struct mask_array *ma = rcu_dereference(tbl->mask_array);      struct table_instance *ti = rcu_dereference(tbl->ti);     //得到table实例      struct mask_cache_entry *entries, *ce;      struct sw_flow *flow;      u32 hash;      int seg;      *n_mask_hit = 0;      if (unlikely(!skb_hash)) {  //如果报文没有hash值，则mask_index为0，全遍历所有的mask。          u32 mask_index = 0;          return flow_lookup(tbl, ti, ma, key, n_mask_hit, &mask_index);      }      /* Pre and post recirulation flows usually have the same skb_hash      * value. To avoid hash collisions, rehash the 'skb_hash' with      * 'recirc_id'.  */      if (key->recirc_id)          skb_hash = jhash_1word(skb_hash, key->recirc_id);      ce = NULL;      hash = skb_hash;      entries = this_cpu_ptr(tbl->mask_cache);      /* Find the cache entry 'ce' to operate on. */      for (seg = 0; seg < MC_HASH_SEGS; seg++) {       //32位的hash值被分成4段，每段8字节，作为cache的索引          int index = hash & (MC_HASH_ENTRIES - 1);          struct mask_cache_entry *e;          e = &entries[index];                    //entry最大为256项          if (e->skb_hash == skb_hash) {                  //如果在cache entry找到报文hash相同项，则根据该entry指定的mask查表              flow = flow_lookup(tbl, ti, ma, key, n_mask_hit,                         &e->mask_index);              if (!flow)                  e->skb_hash = 0;              return flow;          }          if (!ce || e->skb_hash < ce->skb_hash)              ce = e;  /* A better replacement cache candidate. */          hash >>= MC_HASH_SHIFT;      }      /* Cache miss, do full lookup. 
*/      flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &ce->mask_index);     //没有命中，ce作为新的cache项，将被刷新，下一次可以直接命中      if (flow)          ce->skb_hash = skb_hash;      return flow;  }

ovs_flow_tbl_lookup_stats会调用static struct sw_flow *flow_lookup(struct flow_table *tbl, struct table_instance *ti, const struct mask_array *ma, const struct sw_flow_key *key, u32 *n_mask_hit, u32 *index)

2、flow_lookup函数

/*
 * flow_lookup - try every mask in 'ma' until one yields a flow for 'key'.
 *
 * @tbl:        flow table (not used by the code in this function).
 * @ti:         table instance to search.
 * @ma:         array of masks to try.
 * @key:        unmasked lookup key.
 * @n_mask_hit: counter passed through to masked_flow_lookup().
 * @index:      in/out mask hint. On entry, the mask index to try first;
 *              when a different mask matches, updated to that mask's index.
 *
 * Returns the matching flow, or NULL when no mask produces a match.
 */
static struct sw_flow *flow_lookup(struct flow_table *tbl,
                                   struct table_instance *ti,
                                   const struct mask_array *ma,
                                   const struct sw_flow_key *key,
                                   u32 *n_mask_hit,
                                   u32 *index)
{
    struct sw_flow_mask *mask;
    struct sw_flow *flow;
    int i;

    /* Fast path: if the hinted index is within the mask array, try that
     * mask first.
     */
    if (*index < ma->max) {
        mask = rcu_dereference_ovsl(ma->masks[*index]);
        if (mask) {
            flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
            if (flow)
                return flow;
        }
    }

    /* Hint missed: walk all masks. */
    for (i = 0; i < ma->max; i++)  {

        if (i == *index)    /* already tried above, skip it */
            continue;

        mask = rcu_dereference_ovsl(ma->masks[i]);
        if (!mask)
            continue;

        flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
        if (flow) { /* Found */
            /* Remember the matching mask so the next lookup through
             * this hint can hit directly.
             */
            *index = i;
            return flow;
        }
    }

    return NULL;
}

会调用masked_flow_lookup如下

3、masked_flow_lookup函数

/*
 * masked_flow_lookup - look up 'unmasked' in 'ti' under one specific mask.
 *
 * @ti:         table instance to search.
 * @unmasked:   lookup key before masking.
 * @mask:       mask to apply; only flows that use this same mask can match.
 * @n_mask_hit: incremented once for every call.
 *
 * Returns the matching flow, or NULL if the bucket holds no match.
 */
static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
                                          const struct sw_flow_key *unmasked,
                                          const struct sw_flow_mask *mask,
                                          u32 *n_mask_hit)
{
    struct sw_flow *flow;
    struct hlist_head *head;
    u32 hash;
    struct sw_flow_key masked_key;

    /* Apply the mask to the key; the masked key is what gets hashed and
     * compared, which is how wildcarded matches are supported.
     */
    ovs_flow_mask_key(&masked_key, unmasked, false, mask);
    /* Hash the masked key over the mask's range. */
    hash = flow_hash(&masked_key, &mask->range);
    /* The hash selects the bucket (list head of sw_flow entries). */
    head = find_bucket(ti, hash);
    (*n_mask_hit)++;

    /* Walk the bucket: a flow matches when it uses the same mask, has the
     * same stored hash, and its masked key compares equal.
     */
    hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) {
        if (flow->mask == mask && flow->flow_table.hash == hash &&
            flow_cmp_masked_key(flow, &masked_key, &mask->range))
            return flow;
    }
    return NULL;
}

其中flow_hash计算哈希值，find_bucket根据哈希值查找桶，然后就是一个循环，逐个比较key是否相等，相等则返回flow。

到此流表查找过程已经比较清晰了：tbl->mask_cache是用来加速报文处理的，同一条流的skb其hash值也相同，因此可以通过cache快速找到对应的mask对象，然后通过hash计算找到bucket进行匹配。下图用来说明skb在流表查询中依赖了哪些数据，以及通过哪些数据完成了flow的查找。
这里写图片描述

PS：如果有多个箭头输入，表示要获取该框的内容需要依赖多个信息。

第三步：执行action

调用
int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_actions *acts,struct sw_flow_key *key)

调用
static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, int len)

在这个函数中，通过switch-case语句，对不同的action执行不同的操作。

/*
 * do_execute_actions - run the action list 'attr' (of 'len' bytes) on 'skb'.
 *
 * Walks the netlink attributes one by one and dispatches on the action
 * type. Returns 0 on success or the error of the failing action; on the
 * generic error path the skb is freed with kfree_skb() before returning.
 */
static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
                              struct sw_flow_key *key,
                              const struct nlattr *attr, int len)
{
    /* Every output action needs a separate clone of 'skb', but the common
     * case is just a single output action, so that doing a clone and
     * then freeing the original skbuff is wasteful. So the following code
     * is slightly obscure just to avoid that.
     */
    int prev_port = -1;   /* pending output port, -1 when none */
    const struct nlattr *a;
    int rem;

    for (a = attr, rem = len; rem > 0;
         a = nla_next(a, &rem)) {
        int err = 0;

        /* An output is pending and more actions follow: send a clone
         * and keep the original skb for the remaining actions. A failed
         * clone is silently skipped.
         */
        if (unlikely(prev_port != -1)) {
            struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC);

            if (out_skb)
                do_output(dp, out_skb, prev_port, key);

            prev_port = -1;
        }

        switch (nla_type(a)) {
        case OVS_ACTION_ATTR_OUTPUT:
            /* Only record the port; the actual send is deferred. */
            prev_port = nla_get_u32(a);
            break;

        case OVS_ACTION_ATTR_USERSPACE:
            output_userspace(dp, skb, key, a, attr, len);
            break;

        case OVS_ACTION_ATTR_HASH:
            execute_hash(skb, key, a);
            break;

        case OVS_ACTION_ATTR_PUSH_MPLS:
            err = push_mpls(skb, key, nla_data(a));
            break;

        case OVS_ACTION_ATTR_POP_MPLS:
            err = pop_mpls(skb, key, nla_get_be16(a));
            break;

        case OVS_ACTION_ATTR_PUSH_VLAN:
            err = push_vlan(skb, key, nla_data(a));
            break;

        case OVS_ACTION_ATTR_POP_VLAN:
            err = pop_vlan(skb, key);
            break;

        case OVS_ACTION_ATTR_RECIRC:
            err = execute_recirc(dp, skb, key, a, rem);
            if (nla_is_last(a, rem)) {
                /* If this is the last action, the skb has
                 * been consumed or freed.
                 * Return immediately.
                 */
                return err;
            }
            break;

        case OVS_ACTION_ATTR_SET:
            err = execute_set_action(skb, key, nla_data(a));
            break;

        case OVS_ACTION_ATTR_SET_MASKED:
        case OVS_ACTION_ATTR_SET_TO_MASKED:
            err = execute_masked_set_action(skb, key, nla_data(a));
            break;

        case OVS_ACTION_ATTR_SAMPLE:
            err = sample(dp, skb, key, a, attr, len);
            break;

        case OVS_ACTION_ATTR_CT:
            /* Refresh the key first if it is no longer valid. */
            if (!is_flow_key_valid(key)) {
                err = ovs_flow_key_update(skb, key);
                if (err)
                    return err;
            }

            err = ovs_ct_execute(ovs_dp_get_net(dp), skb, key,
                                 nla_data(a));

            /* Hide stolen IP fragments from user space. */
            if (err)
                return err == -EINPROGRESS ? 0 : err;
            break;
        }

        /* Generic error path: drop the packet and report the error. */
        if (unlikely(err)) {
            kfree_skb(skb);
            return err;
        }
    }

    /* End of the list: a still-pending output sends the original skb
     * itself (no clone needed); otherwise the skb is released.
     */
    if (prev_port != -1)
        do_output(dp, skb, prev_port, key);
    else
        consume_skb(skb);

    return 0;
}

如果可以直接输出，则调用
static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, struct sw_flow_key *key)，它调用void ovs_vport_send(struct vport *vport, struct sk_buff *skb)进行发送。

当内核无法查找到流表项的时候，则会通过upcall把数据包上送给用户态的ovs-vswitchd，由它在用户态的flow table中查找。详见：
http://blog.csdn.net/qq_15437629/article/details/78690386

阅读全文

0 0

【OVS2.5源码解读】 内核中的flow table流表操作

第一步：从数据包中提取出key

第二步：根据key查找flow table

第三步：执行action

【OVS2.5源码解读】内核中的flow table流表操作