OVS datapath模块分析:基本架构
来源:互联网 发布:青岛广电的网络电视 编辑:程序博客网 时间:2024/06/15 00:19
err = genl_exec_init();
err = ovs_workqueues_init();
err = ovs_tnl_init();
err = ovs_flow_init();
err = ovs_vport_init();
err = register_pernet_device(&ovs_net_ops);
err = register_netdevice_notifier(&ovs_dp_device_notifier);
err = dp_register_genl();
schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL);
return 0;
}
module_init(dp_init);    /* dp_init runs at module load: initializes genl_exec, workqueues, tunnels, flows, vports, then registers pernet ops, the netdev notifier and the genl families */
module_exit(dp_cleanup); /* dp_cleanup runs at module unload to tear the above down */
----------------------------------
genlmsg_skb = genlmsg_new(0, GFP_KERNEL); //发送generic netlink message需要三个步骤:申请内存(底层都是内核通过get_free_pages得到的内存空间),创建消息,发送消息。genlmsg_new这个方法会自动为netlink , generic netlink 消息头增加空间,而且如果不知道需要多大空间的话就使用宏NLMSG_GOODSIZE, 跟进到 linux/genetlink.h;
.id = GENL_ID_GENERATE, // channel number: assigned by the genl controller
.name = "ovs_genl_exec",
.version = 1,
};
static struct genl_ops genl_exec_ops[] = {
{
.cmd = GENL_EXEC_RUN, //refer the operation
.doit = genl_exec_cmd, // callback function
.flags = CAP_NET_ADMIN,
},
};
struct genl_family {
	unsigned int id;               // 域族ID
	unsigned int hdrsize;          // 用户自定义header的长度
	char name[GENL_NAMSIZ];        // family name
	unsigned int version;          // 协议版本
	unsigned int maxattr;          // 支持的最大的attribute数
	struct nlattr **attrbuf;       /* private 缓存解析的attributes */
	struct list_head ops_list;     /* private 分配的操作链表 */
	struct list_head family_list;  /* private 所有的family构成链表 */
	struct list_head mcast_groups; /* private 多播组链表 */
};
{
genl_exec_function_ret = genl_exec_function(genl_exec_data);
complete(&done); // struct completion 有待理解??
return 0;
}
u32 snd_seq; // 发送序列号
u32 snd_portid; //发送者的netlink portid
struct nlmsghdr * nlhdr;
struct genlmsghdr * genlhdr;
void * userhdr; //用户自定义消息头
struct nlattr ** attrs; //
#ifdef CONFIG_NET_NS
struct net * _net; //网络命名空间
#endif
void * user_ptr[2];
};
{
spin_lock_init(&wq_lock);
INIT_LIST_HEAD(&workq); //初始化一个链表头workq(其实用作队列,前取后插)
init_waitqueue_head(&more_work); //more_work代表等待队列,这是Linux内核常见的处理方式。
workq_thread = kthread_create(worker_thread, NULL, "ovs_workq"); //创建内核线程,handler是worker_thread
wake_up_process(workq_thread); //让这个内核线程跑起来
return 0;
}
{
for (;;) {
wait_event_interruptible(more_work, (kthread_should_stop() || !list_empty(&workq)));
//当 kthread_should_stop() 返回假且 workq 为空时,线程在 more_work 等待队列上休眠;一旦应当停止或 workq 非空即被唤醒:若是停止请求则退出循环,否则调用 run_workqueue() 处理排队的任务。
if (kthread_should_stop())
break;
run_workqueue();
}
return 0;
}
{
while (!list_empty(&workq)) {
struct work_struct *work = list_entry(workq.next, struct work_struct, entry);
//通过workq.next指针找到容器结构体的指针,也就是非空的话从队列头取出一个work_struct 实体。
work_func_t f = work->func; //以struct work_struct *为参数的函数指针。
list_del_init(workq.next); //删除work.next后面的那个节点,初始化的目的让取出的那个节点 list_head 域不要悬空。
current_work = work; //这样就得到了当前应该投入调度的 work_struct
work_clear_pending(work); //没懂TODO
f(work); //核心(还没找到关于work的实例化代码)
BUG_ON(in_interrupt());
current_work = NULL;
}
}
#define WORK_STRUCT_PENDING 0 /* T if work item pending execution */
atomic_long_t data;
struct list_head entry;
work_func_t func;
#ifdef CONFIG_LOCKDEP
struct lockdep_map lockdep_map;
#endif
};
{
port_table = kmalloc(PORT_TABLE_SIZE * sizeof(struct hlist_head *), GFP_KERNEL);
for (i = 0; i < PORT_TABLE_SIZE; i++)
INIT_HLIST_HEAD(&port_table[i]);
return 0;
}
几个参数的含义分别为 name:用于/proc/slabinfo文件中确认此高速缓冲的字符串,size:要创建的cache所对应对象的大小,align:对象对齐偏移量,flags:对应slab的标志,ctor:构建对象构造函数。
struct rcu_head rcu; //锁机制,read copy update
struct hlist_node hash_node[2]; //为何2个??
u32 hash;
struct sw_flow_key key;
struct sw_flow_actions __rcu *sf_acts;
spinlock_t lock; /* Lock for values below. */
unsigned long used; /* Last used time (in jiffies). */
u64 packet_count; /* Number of packets matched. */
u64 byte_count; /* Number of bytes matched. */
u8 tcp_flags; /* Union of seen TCP flags. */
};
{
dev_table = kzalloc(VPORT_HASH_BUCKETS * sizeof(struct hlist_head), GFP_KERNEL);
//kzalloc相比于kmalloc多了一个memset的功能。申请容量为1024个桶的哈希表dev_table 。
//ARRAY_SIZE(base_vport_ops_list)得到有多少中vport type,接下来遍历容器中的vport_ops 如果有初始化函数的话就执行下,而后在把这些 //vport type 倒腾到全局变量vport_ops_list中,n_vport_types记录 vport type 的数量。
const struct vport_ops *new_ops = base_vport_ops_list[i];
if (new_ops->init)
err = new_ops->init(); //在2.6.36以前是直接替换掉内核的模块br_handle_frame_hook = netdev_frame_hook;
else //现在的话在netdev_create的时候注册handler
err = 0;
if (!err)
vport_ops_list[n_vport_types++] = new_ops;
else if (new_ops->flags & VPORT_F_REQUIRED) {
ovs_vport_exit();
goto error;
}
}
return 0;
}
&ovs_netdev_vport_ops,
&ovs_internal_vport_ops,
&ovs_patch_vport_ops,
&ovs_gre_vport_ops,
&ovs_gre_ft_vport_ops,
&ovs_gre64_vport_ops,
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
&ovs_capwap_vport_ops,
#endif
};
.init = ovs_init_net,
.exit = ovs_exit_net,
.id = &ovs_net_id,
.size = sizeof(struct ovs_net),
};
{
struct ovs_net *ovs_net = net_generic(net, ovs_net_id); //通过id过得相应的地址,return __ovs_net_data[id];
INIT_LIST_HEAD(&ovs_net->dps); //初始化与这个ovs网络命名空间相关的 datapath list
return 0;
}
struct list_head dps;
};
struct capwap_net capwap;
};
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
struct socket *capwap_rcv_socket; //
struct netns_frags frag_state;
int n_tunnels;
#endif
};
.notifier_call = dp_device_event
};
{
struct net_device *dev = ptr;
struct vport *vport;
//因为我们用的Ubuntu12.04内核版本是3.2.0 > 3.1.0,所以ovs_is_internal_dev 返回真,同时会更新我们的dev: netdev->netdev_ops = &internal_dev_netdev_ops;同时设置的vport为NULL,直接返回。
if (ovs_is_internal_dev(dev))
vport = ovs_internal_dev_get_vport(dev);
else
vport = ovs_netdev_get_vport(dev);
if (!vport)
return NOTIFY_DONE;
switch (event) {
case NETDEV_UNREGISTER:
if (!ovs_is_internal_dev(dev)) {
struct sk_buff *notify;
struct datapath *dp = vport->dp;
notify = ovs_vport_cmd_build_info(vport, 0, 0, OVS_VPORT_CMD_DEL);
ovs_dp_detach_port(vport);
if (IS_ERR(notify)) {
netlink_set_err(GENL_SOCK(ovs_dp_get_net(dp)), 0, ovs_dp_vport_multicast_group.id, PTR_ERR(notify));
break;
}
genlmsg_multicast_netns(ovs_dp_get_net(dp), notify, 0, ovs_dp_vport_multicast_group.id, GFP_KERNEL);
}
break;
case NETDEV_CHANGENAME:
if (vport->port_no != OVSP_LOCAL) {
ovs_dp_sysfs_del_if(vport);
ovs_dp_sysfs_add_if(vport);
}
break;
}
return NOTIFY_DONE;
}
/* net_device_ops vtable installed on OVS internal devices
 * (netdev->netdev_ops = &internal_dev_netdev_ops). */
static const struct net_device_ops internal_dev_netdev_ops = {
.ndo_open = internal_dev_open, //called when the device transitions to the up state
.ndo_stop = internal_dev_stop, //called when the device transitions to the down state
.ndo_start_xmit = internal_dev_xmit, //called when packets need transmitting; must return NETDEV_TX_OK or NETDEV_TX_BUSY, must not be NULL
.ndo_do_ioctl = internal_dev_do_ioctl,
.ndo_change_mtu = internal_dev_change_mtu,
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)
.ndo_get_stats64 = internal_dev_get_stats, /* 64-bit stats API, available since 2.6.36 */
#else
.ndo_get_stats = internal_dev_sys_stats,
#endif
};
#endif
for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) {
const struct genl_family_and_ops *f = &dp_genl_families[i];
err = genl_register_family_with_ops(f->family, f->ops, f->n_ops);
if (f->group) {
err = genl_register_mc_group(f->family, f->group);
}
}
return 0;
}
{ &dp_datapath_genl_family, dp_datapath_genl_ops, ARRAY_SIZE(dp_datapath_genl_ops), &ovs_dp_datapath_multicast_group },
{ &dp_vport_genl_family, dp_vport_genl_ops, ARRAY_SIZE(dp_vport_genl_ops), &ovs_dp_vport_multicast_group },
{ &dp_flow_genl_family, dp_flow_genl_ops, ARRAY_SIZE(dp_flow_genl_ops), &ovs_dp_flow_multicast_group },
{ &dp_packet_genl_family, dp_packet_genl_ops, ARRAY_SIZE(dp_packet_genl_ops), NULL },
};
.id = GENL_ID_GENERATE,
.hdrsize = sizeof(struct ovs_header),
.name = OVS_DATAPATH_FAMILY,
.version = OVS_DATAPATH_VERSION,
.maxattr = OVS_DP_ATTR_MAX,
SET_NETNSOK
};
{ .cmd = OVS_DP_CMD_NEW,
.flags = GENL_ADMIN_PERM, /* 需要权能 CAP_NET_ADMIN */
.policy = datapath_policy,
.doit = ovs_dp_cmd_new
},
{ .cmd = OVS_DP_CMD_DEL,
.flags = GENL_ADMIN_PERM,
.policy = datapath_policy,
.doit = ovs_dp_cmd_del
},
{ .cmd = OVS_DP_CMD_GET,
.flags = 0, /* OK for unprivileged users. */
.policy = datapath_policy,
.doit = ovs_dp_cmd_get,
.dumpit = ovs_dp_cmd_dump
},
{ .cmd = OVS_DP_CMD_SET,
.flags = GENL_ADMIN_PERM,
.policy = datapath_policy,
.doit = ovs_dp_cmd_set,
},
};
#ifdef HAVE_NLA_NUL_STRING
[OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
#endif
[OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 },
};
[OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED },
[OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED },
[OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG },
};
/* Generic netlink family carrying the flow-table commands
 * (OVS_FLOW_CMD_NEW / DEL / GET / SET). */
static struct genl_family dp_flow_genl_family = {
	.id      = GENL_ID_GENERATE,          /* let the genl controller assign the ID */
	.hdrsize = sizeof(struct ovs_header), /* user header: struct ovs_header (dp_ifindex) */
	.name    = OVS_FLOW_FAMILY,           /* family name userspace looks up */
	.version = OVS_FLOW_VERSION,          /* protocol version */
	.maxattr = OVS_FLOW_ATTR_MAX,         /* highest attribute number accepted */
	SET_NETNSOK                           /* mark the family as netns-aware */
};
{ .cmd = OVS_FLOW_CMD_NEW,.flags =GENL_ADMIN_PERM, .policy = flow_policy, .doit =ovs_flow_cmd_new_or_set},
{ .cmd = OVS_FLOW_CMD_DEL, .flags = GENL_ADMIN_PERM, .policy = flow_policy, .doit =ovs_flow_cmd_del},
{ .cmd = OVS_FLOW_CMD_GET, .flags = 0, .policy = flow_policy, .doit = ovs_flow_cmd_get, .dumpit =ovs_flow_cmd_dump},
{ .cmd = OVS_FLOW_CMD_SET, .flags = GENL_ADMIN_PERM, .policy = flow_policy, .doit = ovs_flow_cmd_new_or_set,},
};
static struct genl_ops dp_packet_genl_ops[] = {
{ .cmd = OVS_PACKET_CMD_EXECUTE, .flags = GENL_ADMIN_PERM, .policy = packet_policy,
};
{
struct ovs_header *ovs_header = info->userhdr;
//OVS genl msg 的头结构,里面只有一个域ifindex(datapath关联的本地端口的索引,include/linux/openvswitch.h)
struct nlattr **a = info->attrs;struct sw_flow_actions *acts;
struct sk_buff *packet;
struct sw_flow *flow;
struct datapath *dp;
struct ethhdr *eth;
int len;
int err;
int key_len;
err = -EINVAL; //没有我们需要的属性或者packet不完整
if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] ||
!a[OVS_PACKET_ATTR_ACTIONS] || nla_len(a[OVS_PACKET_ATTR_PACKET]) < ETH_HLEN)
goto err;
len = nla_len(a[OVS_PACKET_ATTR_PACKET]);
packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL);
//调用__netdev_alloc_skb(NULL,NET_IP_ALIGH+len,GFP_KERNEL),其中参数 struct net_device *为NULL,
//在特定的设备上分配一个sk_buff来接收packet
err = -ENOMEM;if (!packet)
goto err;
skb_reserve(packet, NET_IP_ALIGN);
//NET_IP_ALIGN=2 宏定义在net/skbuff.h中,以太网头是14B 为了对齐,带来的利弊是什么??
memcpy(__skb_put(packet, len), nla_data(a[OVS_PACKET_ATTR_PACKET]), len);
//skb_put使得skbuff增加len字节的数据,只是返回了可以装入数据的位置指针(tail pointer)而后copy;
//这样从用户空间传来的包构造到了skbuff中。
skb_reset_mac_header(packet); //将链路层的mac_header 指向包头
eth = eth_hdr(packet); //linux/if_ether.h ,获得以太网头结构
//根据ethhdr中的协议域来设置skb中的protocol字段,不懂。
if (ntohs(eth->h_proto) >= 1536)packet->protocol = eth->h_proto;
else
packet->protocol = htons(ETH_P_802_2);
//为这个发送包构建一个 sw_flow
flow = ovs_flow_alloc(); // -->datapath/flow.c
err = PTR_ERR(flow); //转换成长整型 linux/err.h
if (IS_ERR(flow))
goto err_kfree_skb;
err = ovs_flow_extract(packet, -1, &flow->key, &key_len);
//从skbuff packet的以太网帧中获得数据,构建flow的 sw_flow_key;
err = ovs_flow_metadata_from_nlattrs(flow, key_len, a[OVS_PACKET_ATTR_KEY]);
//从接收到的nlattr中继续填充flow->key 的相关字段;
acts = ovs_flow_actions_alloc(nla_len(a[OVS_PACKET_ATTR_ACTIONS]));
//动态分配一个struct sw_flow_actions 实例,下面会填充;
err = PTR_ERR(acts);if (IS_ERR(acts))
goto err_flow_free;
err = validate_and_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0, &acts);
rcu_assign_pointer(flow->sf_acts, acts);
if (err)
goto err_flow_free;
OVS_CB(packet)->flow = flow;
//((struct ovs_skb_cb *)(packet)->cb ) -> flow = flow; ovs data in skb(datapath/datapath.h )
packet->priority = flow->key.phy.priority;skb_set_mark(packet, flow->key.phy.skb_mark);
rcu_read_lock();
dp =get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);//datapath/datapath.c
//sock_net(skb->sk)=NULL,通过dp_ifindex得到net_device ,然后找到vport ,返回vport->dp;
err = -ENODEV;if (!dp)
goto err_unlock;
local_bh_disable();
err =ovs_execute_actions(dp, packet); //datapath/actions.c
//呼叫do_execute_actions(dp, skb, acts->actions, acts->actions_len, NULL, false);对skb执行一系列action;
local_bh_enable();rcu_read_unlock();
ovs_flow_free(flow);
return err;
}
/*
 * Execute a netlink-encoded list of actions ('attr', 'len' bytes) on 'skb'.
 * 'tun_key' is passed through to set/sample actions; if 'keep_skb' is true
 * the caller retains ownership of the original skb, otherwise it is
 * consumed here.  Returns 0 on success or a negative errno (in which case
 * the skb has been freed).
 */
static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
const struct nlattr *attr, int len,
struct ovs_key_ipv4_tunnel *tun_key, bool keep_skb)
{
/* Every output action needs a separate clone of 'skb', but the common
* case is just a single output action, so that doing a clone and
* then freeing the original skbuff is wasteful. So the following code
* is slightly obscure just to avoid that. */
int prev_port = -1;
const struct nlattr *a;
int rem;
for (a = attr, rem = len; rem > 0;
a = nla_next(a, &rem)) {
int err = 0;
if (prev_port != -1) { /* a previous iteration recorded an output action */
do_output(dp, skb_clone(skb, GFP_ATOMIC), prev_port);
/* do_output() sends the clone out via ovs_vport_send() (datapath/vport.c): vport->ops->send(vport, skb); */
prev_port = -1;
}
switch (nla_type(a)) {
case OVS_ACTION_ATTR_OUTPUT:
/* Defer the send: output happens on the NEXT iteration (or after the
 * loop) so the last output can use the original skb instead of a clone. */
prev_port = nla_get_u32(a);
break;
case OVS_ACTION_ATTR_USERSPACE:
output_userspace(dp, skb, a); /* deliver to userspace via ovs_dp_upcall() (datapath.c) */
break; /* path: ovs_dp_upcall -> queue_userspace_packet -> genlmsg_unicast */
case OVS_ACTION_ATTR_PUSH_VLAN:
err = push_vlan(skb, nla_data(a));
if (unlikely(err)) /* skb already freed. */
return err;
break;
case OVS_ACTION_ATTR_POP_VLAN:
err = pop_vlan(skb);
break;
case OVS_ACTION_ATTR_SET:
err = execute_set_action(skb, nla_data(a), tun_key);
break;
case OVS_ACTION_ATTR_SAMPLE:
err = sample(dp, skb, a, tun_key);
break;
}
if (unlikely(err)) {
kfree_skb(skb);
return err;
}
}
/* Flush the deferred output: clone only if the caller keeps the skb. */
if (prev_port != -1) {
if (keep_skb)
skb = skb_clone(skb, GFP_ATOMIC);
do_output(dp, skb, prev_port);
} else if (!keep_skb)
consume_skb(skb);
return 0;
}
- OVS datapath模块分析:基本架构
- OVS datapath模块分析:packet处理流程
- OVS datapath模块分析:packet处理流程
- OVS中的datapath学习
- OVS vswitchd 模块分析(2)
- OVS源码研究 Datapath进行Packet处理
- OVS vport, datapath, flow_table, flex_array, sw_flow
- OVS vswitchd 模块分析(1)
- note: Actions entry points function of OVS datapath
- OVS datapath流表结构及匹配过程
- ovs 内置模块说明
- ovs常用命令(基本)
- OVS转发面分析
- OVS VxLAN Flow 分析
- OVS VxLAN Flow 分析
- OVS VxLAN Flow 分析
- OVS流表分析
- OVS 源码分析
- 如何在DevExpress ASPxGridView中进行编辑
- Java里的字符串, String类简单介绍.
- 关于.so 文件的认识
- ARM内容简介
- 新工具:AllJoyn C++ 代码生成器
- OVS datapath模块分析:基本架构
- JS 异步队列的实现
- 软件设计是怎样炼成的(1)——什么是优秀的设计?
- 也许爱情只是因为寂寞,需要找一个人来爱
- vs2012智能提示突然不能用了
- 多线程初步
- SGMLParser (二)
- java 面试题锦集
- Uva 11991