Open vSwitch internals: the datapath


struct vport is OVS's port device structure; personally I find it very similar to the kernel's struct net_device.

/**
 * struct vport - one port within a datapath
 * @rcu: RCU callback head for deferred destruction.
 * @port_no: Index into @dp's @ports array.
 * @dp: Datapath to which this port belongs.
 * @kobj: Represents /sys/class/net/<devname>/brport.
 * @linkname: The name of the link from /sys/class/net/<datapath>/brif to this
 * &struct vport.  (We keep this around so that we can delete it if the
 * device gets renamed.)  Set to the null string when no link exists.
 * @node: Element in @dp's @port_list.
 * @upcall_pid: The Netlink port to use for packets received on this port that
 * miss the flow table.
 * @hash_node: Element in @dev_table hash table in vport.c.
 * @dp_hash_node: Element in @datapath->ports hash table in datapath.c.
 * @ops: Class structure.
 * @percpu_stats: Points to per-CPU statistics used and maintained by vport
 * @stats_lock: Protects @err_stats and @offset_stats.
 * @err_stats: Points to error statistics used and maintained by vport
 * @offset_stats: Added to actual statistics as a sop to compatibility with
 * XAPI for Citrix XenServer.  Deprecated.
 */ 
struct vport {
    struct rcu_head rcu;
    u16 port_no;
    struct datapath *dp;
    struct kobject kobj;
    char linkname[IFNAMSIZ];
    struct list_head node;
    u32 upcall_pid;

    struct hlist_node hash_node;
    struct hlist_node dp_hash_node;
    const struct vport_ops *ops;

    struct vport_percpu_stats __percpu *percpu_stats;

    spinlock_t stats_lock;
    struct vport_err_stats err_stats;
    struct ovs_vport_stats offset_stats;
};


Closely related to vport is struct datapath:

/**
 * struct datapath - datapath for flow-based packet switching
 * @rcu: RCU callback head for deferred destruction.
 * @list_node: Element in global 'dps' list.
 * @ifobj: Represents /sys/class/net/<devname>/brif.  Protected by RTNL.
 * @table: Current flow table.  Protected by genl_lock and RCU.
 * @ports: Hash table for ports.  %OVSP_LOCAL port always exists.  Protected by
 * RTNL and RCU.
 * @stats_percpu: Per-CPU datapath statistics.
 * @net: Reference to net namespace.
 *
 * Context: See the comment on locking at the top of datapath.c for additional
 * locking information.
 */
struct datapath {
    struct rcu_head rcu;
    struct list_head list_node;
    struct kobject ifobj;

    /* Flow table. */
    struct flow_table __rcu *table;

    /* Switch ports. */
    struct hlist_head *ports;

    /* Stats. */
    struct dp_stats_percpu __percpu *stats_percpu;

#ifdef CONFIG_NET_NS
    /* Network namespace ref. */
    struct net *net;
#endif
};


My understanding is that both vport and datapath are virtual devices used by OVS. A datapath contains multiple vports, linked together through the datapath->ports hash table and each vport's dp_hash_node entry, while vport->dp points back to the datapath the vport belongs to.

A datapath also owns a flow table (dp->table).
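As an example of how the port hash table is used, here is a minimal sketch of a port-number lookup, close to ovs_lookup_vport() in datapath.c. It assumes the 4-argument hlist_for_each_entry_rcu of older kernels and the DP_VPORT_HASH_BUCKETS constant from datapath.h; the real code picks the bucket with a hash helper:

/* Sketch: look up a vport in dp->ports by port number.
 * Must be called with rcu_read_lock or RTNL held. */
static struct vport *lookup_vport_sketch(const struct datapath *dp, u16 port_no)
{
    struct vport *vport;
    struct hlist_node *n;
    struct hlist_head *head;

    head = &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)];
    hlist_for_each_entry_rcu(vport, n, head, dp_hash_node) {
        if (vport->port_no == port_no)
            return vport;
    }
    return NULL;
}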


struct ovs_skb_cb is OVS's private per-packet data, stored in the sk_buff control block (skb->cb):

/**
 * struct ovs_skb_cb - OVS data in skb CB
 * @flow: The flow associated with this packet.  May be %NULL if no flow.
 * @tun_id: ID of the tunnel that encapsulated this packet.  It is 0 if the
 * packet was not received on a tunnel.
 * @ip_summed: Consistently stores L4 checksumming status across different
 * kernel versions.
 * @csum_start: Stores the offset from which to start checksumming independent
 * of the transport header on all kernel versions.
 * @vlan_tci: Provides a substitute for the skb->vlan_tci field on kernels
 * before 2.6.27.
 */
struct ovs_skb_cb {
    struct sw_flow      *flow;
    __be64          tun_id;
#ifdef NEED_CSUM_NORMALIZE
    enum csum_type      ip_summed;
    u16         csum_start;
#endif
#ifdef NEED_VLAN_FIELD
    u16         vlan_tci;
#endif
};
#define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)

vlan_tci holds the 802.1Q Tag Control Information (TCI): 3 bits of Priority Code Point (aka CoS), a 1-bit Drop Eligible Indicator (historically CFI), and a 12-bit VLAN ID.
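As a concrete illustration, a hypothetical helper (not in the OVS tree) that unpacks a TCI using the standard masks from <linux/if_vlan.h>:

#include <linux/if_vlan.h>

/* Hypothetical helper: split a TCI into its three 802.1Q fields.
 * VLAN_PRIO_MASK = 0xe000, VLAN_CFI_MASK = 0x1000, VLAN_VID_MASK = 0x0fff. */
static inline void tci_unpack(u16 tci, u8 *pcp, u8 *dei, u16 *vid)
{
    *pcp = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
    *dei = (tci & VLAN_CFI_MASK) ? 1 : 0;
    *vid = tci & VLAN_VID_MASK;
}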

tun_id is the ID of the tunnel, within OVS, that this skb arrived on.

flow is the flow this skb belongs to, an OpenFlow concept: sw_flow->key (a struct sw_flow_key) uniquely identifies a flow, and sw_flow->sf_acts (a struct sw_flow_actions) records the actions to execute once the flow matches.
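For reference, a trimmed sketch of struct sw_flow from the same era's datapath/flow.h (several bookkeeping fields elided; the exact layout varies by version):

struct sw_flow {
    struct rcu_head rcu;
    struct hlist_node hash_node[2];
    u32 hash;

    struct sw_flow_key key;                /* Uniquely identifies the flow. */
    struct sw_flow_actions __rcu *sf_acts; /* Actions to run on a match. */

    spinlock_t lock;       /* Protects the statistics below. */
    unsigned long used;    /* Last-used time, in jiffies. */
    u64 packet_count;
    u64 byte_count;
    u8 tcp_flags;
};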


struct vport_ops defines the behavior of a vport:

/**
 * struct vport_ops - definition of a type of virtual port
 *
 * @type: %OVS_VPORT_TYPE_* value for this type of virtual port.
 * @flags: Flags of type VPORT_F_* that influence how the generic vport layer
 * handles this vport.
 * @init: Called at module initialization.  If VPORT_F_REQUIRED is set then the
 * failure of this function will cause the module to not load.  If the flag is
 * not set and initialization fails then no vports of this type can be created.
 * @exit: Called at module unload.
 * @create: Create a new vport configured as specified.  On success returns
 * a new vport allocated with ovs_vport_alloc(), otherwise an ERR_PTR() value.
 * @destroy: Destroys a vport.  Must call vport_free() on the vport but not
 * before an RCU grace period has elapsed.
 * @set_options: Modify the configuration of an existing vport.  May be %NULL
 * if modification is not supported.
 * @get_options: Appends vport-specific attributes for the configuration of an
 * existing vport to a &struct sk_buff.  May be %NULL for a vport that does not
 * have any configuration.
 * @set_addr: Set the device's MAC address.  May be null if not supported.
 * @get_name: Get the device's name.
 * @get_addr: Get the device's MAC address.
 * @get_config: Get the device's configuration.
 * @get_kobj: Get the kobj associated with the device (may return null).
 * @get_dev_flags: Get the device's flags.
 * @is_running: Checks whether the device is running.
 * @get_operstate: Get the device's operating state.
 * @get_ifindex: Get the system interface index associated with the device.
 * May be null if the device does not have an ifindex.
 * @get_mtu: Get the device's MTU.  May be %NULL if the device does not have an
 * MTU (as e.g. some tunnels do not).  Must be implemented if @get_ifindex is
 * implemented.
 * @send: Send a packet on the device.  Returns the length of the packet sent.
 */               

struct vport_ops {
    enum ovs_vport_type type;
    u32 flags;

    /* Called at module init and exit respectively. */
    int (*init)(void);
    void (*exit)(void);

    /* Called with RTNL lock. */
    struct vport *(*create)(const struct vport_parms *);
    void (*destroy)(struct vport *);

    int (*set_options)(struct vport *, struct nlattr *);
    int (*get_options)(const struct vport *, struct sk_buff *);

    int (*set_addr)(struct vport *, const unsigned char *);

    /* Called with rcu_read_lock or RTNL lock. */
    const char *(*get_name)(const struct vport *);
    const unsigned char *(*get_addr)(const struct vport *);
    void (*get_config)(const struct vport *, void *);
    struct kobject *(*get_kobj)(const struct vport *);

    unsigned (*get_dev_flags)(const struct vport *);
    int (*is_running)(const struct vport *);
    unsigned char (*get_operstate)(const struct vport *);

    int (*get_ifindex)(const struct vport *);

    int (*get_mtu)(const struct vport *);

    int (*send)(struct vport *, struct sk_buff *);
};


struct vport carries a private data area: a linear region that immediately follows the vport itself in one allocation, accessed through vport_priv() and vport_from_priv().
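These accessors are essentially pointer arithmetic past the (aligned) end of struct vport; a sketch matching vport.h, where VPORT_ALIGN is a small alignment constant defined there:

/* Private data lives right after the vport in the same allocation. */
static inline void *vport_priv(const struct vport *vport)
{
    return (u8 *)vport + ALIGN(sizeof(struct vport), VPORT_ALIGN);
}

static inline struct vport *vport_from_priv(const void *priv)
{
    return (struct vport *)(priv - ALIGN(sizeof(struct vport), VPORT_ALIGN));
}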

struct vport is really a base class: in practice there are netdev_vport, internal_vport, patch_vport, gre_vport, and so on, with the corresponding vport_ops being ovs_netdev_vport_ops, ovs_internal_vport_ops, ovs_patch_vport_ops, and ovs_gre_vport_ops; vport.c registers them in a table, sketched below.
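A trimmed sketch of that registration table (the exact contents depend on kernel version and build options):

static const struct vport_ops *base_vport_ops_list[] = {
    &ovs_netdev_vport_ops,
    &ovs_internal_vport_ops,
    &ovs_patch_vport_ops,
    &ovs_gre_vport_ops,
};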


Next let's follow the netdev vport. For an arbitrary net_device to become an OVS vport, OVS must hook its receive function into the net_device's packet-receive path, so that incoming packets bypass the regular kernel protocol stack and are handed to OVS instead.
The netdev_vport structure is just a thin wrapper around a struct net_device pointer:

struct netdev_vport {
    struct net_device *dev;
};

Let's look at the netdev_create function:

static struct vport *netdev_create(const struct vport_parms *parms)
{   
    struct vport *vport;
    struct netdev_vport *netdev_vport;
    int err;
    
    vport = ovs_vport_alloc(sizeof(struct netdev_vport),
                &ovs_netdev_vport_ops, parms);
    if (IS_ERR(vport)) {
        err = PTR_ERR(vport);
        goto error;
    }
    
    netdev_vport = netdev_vport_priv(vport);
        
    netdev_vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name);
    if (!netdev_vport->dev) {
        err = -ENODEV;
        goto error_free_vport;
    }

    if (netdev_vport->dev->flags & IFF_LOOPBACK ||
        netdev_vport->dev->type != ARPHRD_ETHER ||
        ovs_is_internal_dev(netdev_vport->dev)) {
        err = -EINVAL;
        goto error_put;
    }

    err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook,
                     vport);
    if (err)
        goto error_put;

    dev_set_promiscuity(netdev_vport->dev, 1);
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
    dev_disable_lro(netdev_vport->dev);
#endif
    netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH;

    return vport;

error_put:
    dev_put(netdev_vport->dev);
error_free_vport:
    ovs_vport_free(vport);
error:
    return ERR_PTR(err);
}

First, ovs_vport_alloc(sizeof(struct netdev_vport), &ovs_netdev_vport_ops, parms) allocates a vport followed by its private data, here a netdev_vport; both structures live in one contiguous chunk of memory.

Next it calls the kernel function dev_get_by_name(), which looks up the corresponding net_device by name via net_device->name_hlist (all net_devices are chained into a global name-keyed hash table).

If the net_device is a loopback device, is not an Ethernet interface, or is itself an OVS internal vport (an internal vport hands packets straight to the kernel and can be thought of as the kernel's virtual port into OVS), the function bails out with an error.

netdev_rx_handler_register() then attaches the vport to the device (on this kernel, by pointing net_device->br_port at it), which effectively plugs the net_device into the bridge. Finally, dev_set_promiscuity() puts the device into promiscuous mode and the function returns.


Note that kernels from 2.6.36 on provide a dedicated rx_handler function pointer, replacing the br_handle_frame_hook that earlier kernels used to intercept packets arriving on a bridge port. I am running Red Hat's 2.6.32 kernel, so the rx_handler = netdev_frame_hook assignment inside netdev_rx_handler_register is compiled out; instead, br_handle_frame_hook is initialized in netdev_init:

static int netdev_init(void)
{   
    /* Hook into callback used by the bridge to intercept packets.
     * Parasites we are. */
    br_handle_frame_hook = netdev_frame_hook;
    
    return 0;
}
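For comparison, on newer kernels the hook is the per-device rx_handler registered in netdev_create above; the rx_handler flavor of netdev_frame_hook looks roughly like this (the source carries several #if variants for different kernel versions):

/* Called with rcu_read_lock and bottom halves disabled. */
static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb)
{
    struct sk_buff *skb = *pskb;
    struct vport *vport;

    if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
        return RX_HANDLER_PASS;

    vport = ovs_netdev_get_vport(skb->dev);
    netdev_port_receive(vport, skb);

    return RX_HANDLER_CONSUMED;
}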


Either way, netdev_frame_hook ends up calling netdev_port_receive:

/* Must be called with rcu_read_lock. */
static void netdev_port_receive(struct vport *vport, struct sk_buff *skb)
{
    if (unlikely(!vport)) {
        kfree_skb(skb);
        return;
    }                

    /* Make our own copy of the packet.  Otherwise we will mangle the
     * packet for anyone who came before us (e.g. tcpdump via AF_PACKET).
     * (No one comes after us, since we tell handle_bridge() that we took
     * the packet.) */
    skb = skb_share_check(skb, GFP_ATOMIC);
    if (unlikely(!skb))
        return;

    skb_push(skb, ETH_HLEN);

    if (unlikely(compute_ip_summed(skb, false))) {
        kfree_skb(skb);
        return;
    }
    vlan_copy_skb_tci(skb);

    ovs_vport_receive(vport, skb);
}

As you can see, netdev_port_receive in turn calls ovs_vport_receive:

void ovs_vport_receive(struct vport *vport, struct sk_buff *skb)
{
    struct vport_percpu_stats *stats;

    stats = per_cpu_ptr(vport->percpu_stats, smp_processor_id());

    u64_stats_update_begin(&stats->sync);
    stats->rx_packets++;
    stats->rx_bytes += skb->len;
    u64_stats_update_end(&stats->sync);
        
    if (!(vport->ops->flags & VPORT_F_FLOW))
        OVS_CB(skb)->flow = NULL;

    if (!(vport->ops->flags & VPORT_F_TUN_ID))
        OVS_CB(skb)->tun_id = 0;

    ovs_dp_process_received_packet(vport, skb);
}

ovs_vport_receive updates the per-CPU receive statistics, resets OVS_CB(skb)->flow and OVS_CB(skb)->tun_id for vport classes that do not manage them, and then calls ovs_dp_process_received_packet:


/* Must be called with rcu_read_lock. */
void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
{   
    struct datapath *dp = p->dp;
    struct sw_flow *flow;
    struct dp_stats_percpu *stats;
    u64 *stats_counter;
    int error;

    stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());

    if (!OVS_CB(skb)->flow) {
        struct sw_flow_key key;
        int key_len;

        /* Extract flow from 'skb' into 'key'. */
        error = ovs_flow_extract(skb, p->port_no, &key, &key_len);
        if (unlikely(error)) {
            kfree_skb(skb);
            return;
        }

        /* Look up flow. */
        flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table),
                       &key, key_len);
        if (unlikely(!flow)) {
            struct dp_upcall_info upcall;

            upcall.cmd = OVS_PACKET_CMD_MISS;
            upcall.key = &key;
            upcall.userdata = NULL;
            upcall.pid = p->upcall_pid;
            ovs_dp_upcall(dp, skb, &upcall);
            consume_skb(skb);
            stats_counter = &stats->n_missed;
            goto out;
        }

        OVS_CB(skb)->flow = flow;
    }

    stats_counter = &stats->n_hit;

    ovs_flow_used(OVS_CB(skb)->flow, skb);
    ovs_execute_actions(dp, skb);

out:
    /* Update datapath statistics. */
    u64_stats_update_begin(&stats->sync);
    (*stats_counter)++;
    u64_stats_update_end(&stats->sync);
}

A large part of this function deals with the flow the skb belongs to. OVS implements the OpenFlow model, so unlike the plain Linux bridge it processes every skb in terms of a flow. The function first calls ovs_flow_extract to compute a key from the skb, then ovs_flow_tbl_lookup to look the flow up in the datapath's flow table, and finally ovs_execute_actions to apply the flow's actions.
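On a miss, the packet is handed to userspace over Netlink via ovs_dp_upcall(); judging by the fields filled in above, the dp_upcall_info descriptor is simply (a sketch consistent with that usage; see datapath.h):

struct dp_upcall_info {
    u8 cmd;                        /* e.g. OVS_PACKET_CMD_MISS. */
    const struct sw_flow_key *key; /* Extracted flow key. */
    const struct nlattr *userdata; /* Optional action userdata. */
    u32 pid;                       /* Netlink port to deliver to. */
};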

ovs_execute_actions calls do_execute_actions which, in the common case, determines an out_port and calls do_output (sketched below) to push the skb out through that port; the actual transmit function is ovs_vport_send.
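A sketch of do_output() consistent with the hash-table dp->ports layout shown earlier; the lookup helper (assumed here to be ovs_vport_rcu()) and the error handling differ slightly across versions:

static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
{
    struct vport *vport;

    if (unlikely(!skb))
        return -ENOMEM;

    vport = ovs_vport_rcu(dp, out_port);  /* RCU lookup in dp->ports. */
    if (unlikely(!vport)) {
        kfree_skb(skb);
        return -ENODEV;
    }

    ovs_vport_send(vport, skb);
    return 0;
}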


ovs_vport_send:

/** 
 *  ovs_vport_send - send a packet on a device
 *      
 * @vport: vport on which to send the packet
 * @skb: skb to send
 *  
 * Sends the given packet and returns the length of data sent.  Either RTNL
 * lock or rcu_read_lock must be held.
 */ 
int ovs_vport_send(struct vport *vport, struct sk_buff *skb)
{   
    int sent = vport->ops->send(vport, skb);

    if (likely(sent)) {
        struct vport_percpu_stats *stats;
                
        stats = per_cpu_ptr(vport->percpu_stats, smp_processor_id());
    
        u64_stats_update_begin(&stats->sync);
        stats->tx_packets++;
        stats->tx_bytes += sent;
        u64_stats_update_end(&stats->sync);
    }
    return sent;
}
Functions such as ovs_vport_init, ovs_vport_exit, ovs_vport_destroy, and the various ovs_vport_set_xxxx / ovs_vport_get_xxxx accessors are not covered one by one here; see the source in datapath/vport.c.


Finally, let's look at the netdev-specific implementation. As noted, vport is only a base class and each device type specializes it; the netdev vport is probably the most common case. We walked through the receive path above; on the transmit side, vport->ops->send resolves to netdev_send:

static int netdev_send(struct vport *vport, struct sk_buff *skb)
{           
    struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
    int mtu = netdev_vport->dev->mtu;
    int len;
            
    if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
        if (net_ratelimit())
            pr_warn("%s: dropped over-mtu packet: %d > %d\n",
                ovs_dp_name(vport->dp), packet_length(skb), mtu);
        goto error;
    }  

This first check drops the packet outright if packet_length(skb) exceeds the device MTU and the skb is not a GSO skb (see the error label at the end).
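For reference, the packet_length() helper used here excludes the Ethernet header (and an 802.1Q tag, if present) from the comparison; roughly:

static unsigned packet_length(const struct sk_buff *skb)
{
    unsigned length = skb->len - ETH_HLEN;

    if (skb->protocol == htons(ETH_P_8021Q))
        length -= VLAN_HLEN;

    return length;
}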

The next large chunk deals with VLAN tagging; if no VLAN tag is involved, the skb is simply handed to dev_queue_xmit:

    skb->dev = netdev_vport->dev;
    forward_ip_summed(skb, true);
                
    if (vlan_tx_tag_present(skb) && !dev_supports_vlan_tx(skb->dev)) {
        int features; 
            
        features = netif_skb_features(skb);
            
        if (!vlan_tso)
            features &= ~(NETIF_F_TSO | NETIF_F_TSO6 |
                      NETIF_F_UFO | NETIF_F_FSO);
        
        if (netif_needs_gso(skb, features)) {
            struct sk_buff *nskb;
        
            nskb = skb_gso_segment(skb, features);
            if (!nskb) {
                if (unlikely(skb_cloned(skb) &&
                    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) {
                    kfree_skb(skb);
                    return 0;
                }

                skb_shinfo(skb)->gso_type &= ~SKB_GSO_DODGY;
                goto tag;
            }

            if (IS_ERR(nskb)) {
                kfree_skb(skb);
                return 0;
            }
            consume_skb(skb);
            skb = nskb;

            len = 0;

            do {
                nskb = skb->next;
                skb->next = NULL;

                skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
                if (likely(skb)) {
                    len += skb->len;
                    vlan_set_tci(skb, 0);
                    dev_queue_xmit(skb);
                }

                skb = nskb;
            } while (skb);

            return len;
        }

If the device cannot transmit the tagged skb with TSO/GSO offload, skb_gso_segment performs the segmentation in software. On success it returns a list of skbs; each segment is then tagged with __vlan_put_tag and sent out via dev_queue_xmit.

tag:
        skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
        if (unlikely(!skb))
            return 0;
        vlan_set_tci(skb, 0);
    }

The final part is VLAN-independent: transmit the skb via dev_queue_xmit:

    len = skb->len;
    dev_queue_xmit(skb);

    return len;

error:
    kfree_skb(skb);
    ovs_vport_record_error(vport, VPORT_E_TX_DROPPED);
    return 0;
}
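To tie the pieces together, vport-netdev.c exports an ovs_vport_ops instance that wires these callbacks in; a trimmed sketch (some callbacks omitted):

const struct vport_ops ovs_netdev_vport_ops = {
    .type        = OVS_VPORT_TYPE_NETDEV,
    .flags       = VPORT_F_REQUIRED,
    .init        = netdev_init,
    .create      = netdev_create,
    .destroy     = netdev_destroy,
    .get_name    = ovs_netdev_get_name,
    .get_addr    = ovs_netdev_get_addr,
    .get_ifindex = ovs_netdev_get_ifindex,
    .get_mtu     = ovs_netdev_get_mtu,
    .send        = netdev_send,
};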


For more detail, developers can consult the datapath/README document.