Open vSwitch internals: the datapath


struct vport is OVS's port device structure; personally I find it very similar to the kernel's struct net_device.

/**
 * struct vport - one port within a datapath
 * @rcu: RCU callback head for deferred destruction.
 * @port_no: Index into @dp's @ports array.
 * @dp: Datapath to which this port belongs.
 * @kobj: Represents /sys/class/net/<devname>/brport.
 * @linkname: The name of the link from /sys/class/net/<datapath>/brif to this
 * &struct vport.  (We keep this around so that we can delete it if the
 * device gets renamed.)  Set to the null string when no link exists.
 * @node: Element in @dp's @port_list.
 * @upcall_pid: The Netlink port to use for packets received on this port that
 * miss the flow table.
 * @hash_node: Element in @dev_table hash table in vport.c.
 * @dp_hash_node: Element in @datapath->ports hash table in datapath.c.
 * @ops: Class structure.
 * @percpu_stats: Points to per-CPU statistics used and maintained by vport
 * @stats_lock: Protects @err_stats and @offset_stats.
 * @err_stats: Points to error statistics used and maintained by vport
 * @offset_stats: Added to actual statistics as a sop to compatibility with
 * XAPI for Citrix XenServer.  Deprecated.
 */ 
struct vport {
    struct rcu_head rcu;
    u16 port_no;
    struct datapath *dp;
    struct kobject kobj;
    char linkname[IFNAMSIZ];
    struct list_head node;
    u32 upcall_pid;

    struct hlist_node hash_node;
    struct hlist_node dp_hash_node;
    const struct vport_ops *ops;

    struct vport_percpu_stats __percpu *percpu_stats;

    spinlock_t stats_lock;
    struct vport_err_stats err_stats;
    struct ovs_vport_stats offset_stats;
};


Closely related to vport is struct datapath:

/**
 * struct datapath - datapath for flow-based packet switching
 * @rcu: RCU callback head for deferred destruction.
 * @list_node: Element in global 'dps' list.
 * @ifobj: Represents /sys/class/net/<devname>/brif.  Protected by RTNL.
 * @table: Current flow table.  Protected by genl_lock and RCU.
 * @ports: Hash table for ports.  %OVSP_LOCAL port always exists.  Protected by
 * RTNL and RCU.
 * @stats_percpu: Per-CPU datapath statistics.
 * @net: Reference to net namespace.
 *
 * Context: See the comment on locking at the top of datapath.c for additional
 * locking information.
 */
struct datapath {
    struct rcu_head rcu;
    struct list_head list_node;
    struct kobject ifobj;

    /* Flow table. */
    struct flow_table __rcu *table;

    /* Switch ports. */
    struct hlist_head *ports;

    /* Stats. */
    struct dp_stats_percpu __percpu *stats_percpu;

#ifdef CONFIG_NET_NS
    /* Network namespace ref. */
    struct net *net;
#endif
};


My understanding is that both vport and datapath are virtual devices used by OVS. A datapath contains multiple vports, linked together through the datapath->ports hash table and each vport's dp_hash_node entry, while vport->dp points back to the datapath the vport belongs to.

A datapath also owns a flow table (dp->table).
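As an example of how the port hash table is used, here is a minimal sketch of a port-number lookup, close to ovs_lookup_vport() in datapath.c. It assumes the 4-argument hlist_for_each_entry_rcu of older kernels and the DP_VPORT_HASH_BUCKETS constant from datapath.h; the real code picks the bucket with a hash helper:

/* Sketch: look up a vport in dp->ports by port number.
 * Must be called with rcu_read_lock or RTNL held. */
static struct vport *lookup_vport_sketch(const struct datapath *dp, u16 port_no)
{
    struct vport *vport;
    struct hlist_node *n;
    struct hlist_head *head;

    head = &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)];
    hlist_for_each_entry_rcu(vport, n, head, dp_hash_node) {
        if (vport->port_no == port_no)
            return vport;
    }
    return NULL;
}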


struct ovs_skb_cb is OVS's private per-packet data, stored in the sk_buff control block (skb->cb):

/**
 * struct ovs_skb_cb - OVS data in skb CB
 * @flow: The flow associated with this packet.  May be %NULL if no flow.
 * @tun_id: ID of the tunnel that encapsulated this packet.  It is 0 if the
 * packet was not received on a tunnel.
 * @ip_summed: Consistently stores L4 checksumming status across different
 * kernel versions.
 * @csum_start: Stores the offset from which to start checksumming independent
 * of the transport header on all kernel versions.
 * @vlan_tci: Provides a substitute for the skb->vlan_tci field on kernels
 * before 2.6.27.
 */
struct ovs_skb_cb {
    struct sw_flow      *flow;
    __be64          tun_id;
#ifdef NEED_CSUM_NORMALIZE
    enum csum_type      ip_summed;
    u16         csum_start;
#endif
#ifdef NEED_VLAN_FIELD
    u16         vlan_tci;
#endif
};
#define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)

vlan_tci holds the 802.1Q Tag Control Information (TCI): 3 bits of Priority Code Point (aka CoS), a 1-bit Drop Eligible Indicator (historically CFI), and a 12-bit VLAN ID.
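As a concrete illustration, a hypothetical helper (not in the OVS tree) that unpacks a TCI using the standard masks from <linux/if_vlan.h>:

#include <linux/if_vlan.h>

/* Hypothetical helper: split a TCI into its three 802.1Q fields.
 * VLAN_PRIO_MASK = 0xe000, VLAN_CFI_MASK = 0x1000, VLAN_VID_MASK = 0x0fff. */
static inline void tci_unpack(u16 tci, u8 *pcp, u8 *dei, u16 *vid)
{
    *pcp = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
    *dei = (tci & VLAN_CFI_MASK) ? 1 : 0;
    *vid = tci & VLAN_VID_MASK;
}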

tun_id is the ID of the tunnel, within OVS, that this skb arrived on.

flow is the flow this skb belongs to, an OpenFlow concept: sw_flow->key (a struct sw_flow_key) uniquely identifies a flow, and sw_flow->sf_acts (a struct sw_flow_actions) records the actions to execute once the flow matches.
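For reference, a trimmed sketch of struct sw_flow from the same era's datapath/flow.h (several bookkeeping fields elided; the exact layout varies by version):

struct sw_flow {
    struct rcu_head rcu;
    struct hlist_node hash_node[2];
    u32 hash;

    struct sw_flow_key key;                /* Uniquely identifies the flow. */
    struct sw_flow_actions __rcu *sf_acts; /* Actions to run on a match. */

    spinlock_t lock;       /* Protects the statistics below. */
    unsigned long used;    /* Last-used time, in jiffies. */
    u64 packet_count;
    u64 byte_count;
    u8 tcp_flags;
};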


struct vport_ops defines the behavior of a vport:

/**
 * struct vport_ops - definition of a type of virtual port
 *
 * @type: %OVS_VPORT_TYPE_* value for this type of virtual port.
 * @flags: Flags of type VPORT_F_* that influence how the generic vport layer
 * handles this vport.
 * @init: Called at module initialization.  If VPORT_F_REQUIRED is set then the
 * failure of this function will cause the module to not load.  If the flag is
 * not set and initialization fails then no vports of this type can be created.
 * @exit: Called at module unload.
 * @create: Create a new vport configured as specified.  On success returns
 * a new vport allocated with ovs_vport_alloc(), otherwise an ERR_PTR() value.
 * @destroy: Destroys a vport.  Must call vport_free() on the vport but not
 * before an RCU grace period has elapsed.
 * @set_options: Modify the configuration of an existing vport.  May be %NULL
 * if modification is not supported.
 * @get_options: Appends vport-specific attributes for the configuration of an
 * existing vport to a &struct sk_buff.  May be %NULL for a vport that does not
 * have any configuration.
 * @set_addr: Set the device's MAC address.  May be null if not supported.
 * @get_name: Get the device's name.
 * @get_addr: Get the device's MAC address.
 * @get_config: Get the device's configuration.
 * @get_kobj: Get the kobj associated with the device (may return null).
 * @get_dev_flags: Get the device's flags.
 * @is_running: Checks whether the device is running.
 * @get_operstate: Get the device's operating state.
 * @get_ifindex: Get the system interface index associated with the device.
 * May be null if the device does not have an ifindex.
 * @get_mtu: Get the device's MTU.  May be %NULL if the device does not have an
 * MTU (as e.g. some tunnels do not).  Must be implemented if @get_ifindex is
 * implemented.
 * @send: Send a packet on the device.  Returns the length of the packet sent.
 */               

struct vport_ops {
    enum ovs_vport_type type;
    u32 flags;

    /* Called at module init and exit respectively. */
    int (*init)(void);
    void (*exit)(void);

    /* Called with RTNL lock. */
    struct vport *(*create)(const struct vport_parms *);
    void (*destroy)(struct vport *);

    int (*set_options)(struct vport *, struct nlattr *);
    int (*get_options)(const struct vport *, struct sk_buff *);

    int (*set_addr)(struct vport *, const unsigned char *);

    /* Called with rcu_read_lock or RTNL lock. */
    const char *(*get_name)(const struct vport *);
    const unsigned char *(*get_addr)(const struct vport *);
    void (*get_config)(const struct vport *, void *);
    struct kobject *(*get_kobj)(const struct vport *);

    unsigned (*get_dev_flags)(const struct vport *);
    int (*is_running)(const struct vport *);
    unsigned char (*get_operstate)(const struct vport *);

    int (*get_ifindex)(const struct vport *);

    int (*get_mtu)(const struct vport *);

    int (*send)(struct vport *, struct sk_buff *);
};


struct vport carries a private data area: a linear region that immediately follows the vport itself in one allocation, accessed through vport_priv() and vport_from_priv().
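These accessors are essentially pointer arithmetic past the (aligned) end of struct vport; a sketch matching vport.h, where VPORT_ALIGN is a small alignment constant defined there:

/* Private data lives right after the vport in the same allocation. */
static inline void *vport_priv(const struct vport *vport)
{
    return (u8 *)vport + ALIGN(sizeof(struct vport), VPORT_ALIGN);
}

static inline struct vport *vport_from_priv(const void *priv)
{
    return (struct vport *)(priv - ALIGN(sizeof(struct vport), VPORT_ALIGN));
}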

struct vport is really a base class: in practice there are netdev_vport, internal_vport, patch_vport, gre_vport, and so on, with the corresponding vport_ops being ovs_netdev_vport_ops, ovs_internal_vport_ops, ovs_patch_vport_ops, and ovs_gre_vport_ops; vport.c registers them in a table, sketched below.
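A trimmed sketch of that registration table (the exact contents depend on kernel version and build options):

static const struct vport_ops *base_vport_ops_list[] = {
    &ovs_netdev_vport_ops,
    &ovs_internal_vport_ops,
    &ovs_patch_vport_ops,
    &ovs_gre_vport_ops,
};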


Next let's follow the netdev vport. For an arbitrary net_device to become an OVS vport, OVS must hook its receive function into the net_device's packet-receive path, so that incoming packets bypass the regular kernel protocol stack and are handed to OVS instead.
The netdev_vport structure is just a thin wrapper around a struct net_device pointer:

struct netdev_vport {
    struct net_device *dev;
};

Let's look at the netdev_create function:

static struct vport *netdev_create(const struct vport_parms *parms)
{   
    struct vport *vport;
    struct netdev_vport *netdev_vport;
    int err;
    
    vport = ovs_vport_alloc(sizeof(struct netdev_vport),
                &ovs_netdev_vport_ops, parms);
    if (IS_ERR(vport)) {
        err = PTR_ERR(vport);
        goto error;
    }
    
    netdev_vport = netdev_vport_priv(vport);
        
    netdev_vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name);
    if (!netdev_vport->dev) {
        err = -ENODEV;
        goto error_free_vport;
    }

    if (netdev_vport->dev->flags & IFF_LOOPBACK ||
        netdev_vport->dev->type != ARPHRD_ETHER ||
        ovs_is_internal_dev(netdev_vport->dev)) {
        err = -EINVAL;
        goto error_put;
    }

    err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook,
                     vport);
    if (err)
        goto error_put;

    dev_set_promiscuity(netdev_vport->dev, 1);
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
    dev_disable_lro(netdev_vport->dev);
#endif
    netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH;

    return vport;

error_put:
    dev_put(netdev_vport->dev);
error_free_vport:
    ovs_vport_free(vport);
error:
    return ERR_PTR(err);
}

First, ovs_vport_alloc(sizeof(struct netdev_vport), &ovs_netdev_vport_ops, parms) allocates a vport followed by its private data, here a netdev_vport; both structures live in one contiguous chunk of memory.

Next it calls the kernel function dev_get_by_name(), which looks up the corresponding net_device by name via net_device->name_hlist (all net_devices are chained into a global name-keyed hash table).

If the net_device is a loopback device, is not an Ethernet interface, or is itself an OVS internal vport (an internal vport hands packets straight to the kernel and can be thought of as the kernel's virtual port into OVS), the function bails out with an error.

netdev_rx_handler_register() then attaches the vport to the device (on this kernel, by pointing net_device->br_port at it), which effectively plugs the net_device into the bridge. Finally, dev_set_promiscuity() puts the device into promiscuous mode and the function returns.


Note that kernels from 2.6.36 on provide a dedicated rx_handler function pointer, replacing the br_handle_frame_hook that earlier kernels used to intercept packets arriving on a bridge port. I am running Red Hat's 2.6.32 kernel, so the rx_handler = netdev_frame_hook assignment inside netdev_rx_handler_register is compiled out; instead, br_handle_frame_hook is initialized in netdev_init:

static int netdev_init(void)
{   
    /* Hook into callback used by the bridge to intercept packets.
     * Parasites we are. */
    br_handle_frame_hook = netdev_frame_hook;
    
    return 0;
}
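For comparison, on newer kernels the hook is the per-device rx_handler registered in netdev_create above; the rx_handler flavor of netdev_frame_hook looks roughly like this (the source carries several #if variants for different kernel versions):

/* Called with rcu_read_lock and bottom halves disabled. */
static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb)
{
    struct sk_buff *skb = *pskb;
    struct vport *vport;

    if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
        return RX_HANDLER_PASS;

    vport = ovs_netdev_get_vport(skb->dev);
    netdev_port_receive(vport, skb);

    return RX_HANDLER_CONSUMED;
}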


Either way, netdev_frame_hook ends up calling netdev_port_receive:

/* Must be called with rcu_read_lock. */
static void netdev_port_receive(struct vport *vport, struct sk_buff *skb)
{
    if (unlikely(!vport)) {
        kfree_skb(skb);
        return;
    }                

    /* Make our own copy of the packet.  Otherwise we will mangle the
     * packet for anyone who came before us (e.g. tcpdump via AF_PACKET).
     * (No one comes after us, since we tell handle_bridge() that we took
     * the packet.) */
    skb = skb_share_check(skb, GFP_ATOMIC);
    if (unlikely(!skb))
        return;

    skb_push(skb, ETH_HLEN);

    if (unlikely(compute_ip_summed(skb, false))) {
        kfree_skb(skb);
        return;
    }
    vlan_copy_skb_tci(skb);

    ovs_vport_receive(vport, skb);
}

As you can see, netdev_port_receive in turn calls ovs_vport_receive:

void ovs_vport_receive(struct vport *vport, struct sk_buff *skb)
{
    struct vport_percpu_stats *stats;

    stats = per_cpu_ptr(vport->percpu_stats, smp_processor_id());

    u64_stats_update_begin(&stats->sync);
    stats->rx_packets++;
    stats->rx_bytes += skb->len;
    u64_stats_update_end(&stats->sync);
        
    if (!(vport->ops->flags & VPORT_F_FLOW))
        OVS_CB(skb)->flow = NULL;

    if (!(vport->ops->flags & VPORT_F_TUN_ID))
        OVS_CB(skb)->tun_id = 0;

    ovs_dp_process_received_packet(vport, skb);
}

ovs_vport_receive updates the per-CPU receive statistics, resets OVS_CB(skb)->flow and OVS_CB(skb)->tun_id for vport classes that do not manage them, and then calls ovs_dp_process_received_packet:


/* Must be called with rcu_read_lock. */
void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
{   
    struct datapath *dp = p->dp;
    struct sw_flow *flow;
    struct dp_stats_percpu *stats;
    u64 *stats_counter;
    int error;

    stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());

    if (!OVS_CB(skb)->flow) {
        struct sw_flow_key key;
        int key_len;

        /* Extract flow from 'skb' into 'key'. */
        error = ovs_flow_extract(skb, p->port_no, &key, &key_len);
        if (unlikely(error)) {
            kfree_skb(skb);
            return;
        }

        /* Look up flow. */
        flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table),
                       &key, key_len);
        if (unlikely(!flow)) {
            struct dp_upcall_info upcall;

            upcall.cmd = OVS_PACKET_CMD_MISS;
            upcall.key = &key;
            upcall.userdata = NULL;
            upcall.pid = p->upcall_pid;
            ovs_dp_upcall(dp, skb, &upcall);
            consume_skb(skb);
            stats_counter = &stats->n_missed;
            goto out;
        }

        OVS_CB(skb)->flow = flow;
    }

    stats_counter = &stats->n_hit;

    ovs_flow_used(OVS_CB(skb)->flow, skb);
    ovs_execute_actions(dp, skb);

out:
    /* Update datapath statistics. */
    u64_stats_update_begin(&stats->sync);
    (*stats_counter)++;
    u64_stats_update_end(&stats->sync);
}

A large part of this function deals with the flow the skb belongs to. OVS implements the OpenFlow model, so unlike the plain Linux bridge it processes every skb in terms of a flow. The function first calls ovs_flow_extract to compute a key from the skb, then ovs_flow_tbl_lookup to look the flow up in the datapath's flow table, and finally ovs_execute_actions to apply the flow's actions.
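On a miss, the packet is handed to userspace over Netlink via ovs_dp_upcall(); judging by the fields filled in above, the dp_upcall_info descriptor is simply (a sketch consistent with that usage; see datapath.h):

struct dp_upcall_info {
    u8 cmd;                        /* e.g. OVS_PACKET_CMD_MISS. */
    const struct sw_flow_key *key; /* Extracted flow key. */
    const struct nlattr *userdata; /* Optional action userdata. */
    u32 pid;                       /* Netlink port to deliver to. */
};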

ovs_execute_actions calls do_execute_actions which, in the common case, determines an out_port and calls do_output (sketched below) to push the skb out through that port; the actual transmit function is ovs_vport_send.
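A sketch of do_output() consistent with the hash-table dp->ports layout shown earlier; the lookup helper (assumed here to be ovs_vport_rcu()) and the error handling differ slightly across versions:

static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
{
    struct vport *vport;

    if (unlikely(!skb))
        return -ENOMEM;

    vport = ovs_vport_rcu(dp, out_port);  /* RCU lookup in dp->ports. */
    if (unlikely(!vport)) {
        kfree_skb(skb);
        return -ENODEV;
    }

    ovs_vport_send(vport, skb);
    return 0;
}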


ovs_vport_send:

/** 
 *  ovs_vport_send - send a packet on a device
 *      
 * @vport: vport on which to send the packet
 * @skb: skb to send
 *  
 * Sends the given packet and returns the length of data sent.  Either RTNL
 * lock or rcu_read_lock must be held.
 */ 
int ovs_vport_send(struct vport *vport, struct sk_buff *skb)
{   
    int sent = vport->ops->send(vport, skb);

    if (likely(sent)) {
        struct vport_percpu_stats *stats;
                
        stats = per_cpu_ptr(vport->percpu_stats, smp_processor_id());
    
        u64_stats_update_begin(&stats->sync);
        stats->tx_packets++;
        stats->tx_bytes += sent;
        u64_stats_update_end(&stats->sync);
    }
    return sent;
}
Functions such as ovs_vport_init, ovs_vport_exit, ovs_vport_destroy, and the various ovs_vport_set_xxxx / ovs_vport_get_xxxx accessors are not covered one by one here; see the source in datapath/vport.c.


Finally, let's look at the netdev-specific implementation. As noted, vport is only a base class and each device type specializes it; the netdev vport is probably the most common case. We walked through the receive path above; on the transmit side, vport->ops->send resolves to netdev_send:

static int netdev_send(struct vport *vport, struct sk_buff *skb)
{           
    struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
    int mtu = netdev_vport->dev->mtu;
    int len;
            
    if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
        if (net_ratelimit())
            pr_warn("%s: dropped over-mtu packet: %d > %d\n",
                ovs_dp_name(vport->dp), packet_length(skb), mtu);
        goto error;
    }  

This first check drops the packet outright if packet_length(skb) exceeds the device MTU and the skb is not a GSO skb (see the error label at the end).
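For reference, the packet_length() helper used here excludes the Ethernet header (and an 802.1Q tag, if present) from the comparison; roughly:

static unsigned packet_length(const struct sk_buff *skb)
{
    unsigned length = skb->len - ETH_HLEN;

    if (skb->protocol == htons(ETH_P_8021Q))
        length -= VLAN_HLEN;

    return length;
}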

The next large chunk deals with VLAN tagging; if no VLAN tag is involved, the skb is simply handed to dev_queue_xmit:

    skb->dev = netdev_vport->dev;
    forward_ip_summed(skb, true);
                
    if (vlan_tx_tag_present(skb) && !dev_supports_vlan_tx(skb->dev)) {
        int features; 
            
        features = netif_skb_features(skb);
            
        if (!vlan_tso)
            features &= ~(NETIF_F_TSO | NETIF_F_TSO6 |
                      NETIF_F_UFO | NETIF_F_FSO);
        
        if (netif_needs_gso(skb, features)) {
            struct sk_buff *nskb;
        
            nskb = skb_gso_segment(skb, features);
            if (!nskb) {
                if (unlikely(skb_cloned(skb) &&
                    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) {
                    kfree_skb(skb);
                    return 0;
                }

                skb_shinfo(skb)->gso_type &= ~SKB_GSO_DODGY;
                goto tag;
            }

            if (IS_ERR(nskb)) {
                kfree_skb(skb);
                return 0;
            }
            consume_skb(skb);
            skb = nskb;

            len = 0;

            do {
                nskb = skb->next;
                skb->next = NULL;

                skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
                if (likely(skb)) {
                    len += skb->len;
                    vlan_set_tci(skb, 0);
                    dev_queue_xmit(skb);
                }

                skb = nskb;
            } while (skb);

            return len;
        }

If the device cannot transmit the tagged skb with TSO/GSO offload, skb_gso_segment performs the segmentation in software. On success it returns a list of skbs; each segment is then tagged with __vlan_put_tag and sent out via dev_queue_xmit.

tag:
        skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
        if (unlikely(!skb))
            return 0;
        vlan_set_tci(skb, 0);
    }

The final part is VLAN-independent: transmit the skb via dev_queue_xmit:

    len = skb->len;
    dev_queue_xmit(skb);

    return len;

error:
    kfree_skb(skb);
    ovs_vport_record_error(vport, VPORT_E_TX_DROPPED);
    return 0;
}
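To tie the pieces together, vport-netdev.c exports an ovs_vport_ops instance that wires these callbacks in; a trimmed sketch (some callbacks omitted):

const struct vport_ops ovs_netdev_vport_ops = {
    .type        = OVS_VPORT_TYPE_NETDEV,
    .flags       = VPORT_F_REQUIRED,
    .init        = netdev_init,
    .create      = netdev_create,
    .destroy     = netdev_destroy,
    .get_name    = ovs_netdev_get_name,
    .get_addr    = ovs_netdev_get_addr,
    .get_ifindex = ovs_netdev_get_ifindex,
    .get_mtu     = ovs_netdev_get_mtu,
    .send        = netdev_send,
};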


For more detail, developers can consult the datapath/README document.