IP层实现1--初始化

来源:互联网 发布:python游戏开发百度云 编辑:程序博客网 时间:2024/06/16 01:18

IP头部:

[ include/uapi/linux/ip.h  ]

struct iphdr {#if defined(__LITTLE_ENDIAN_BITFIELD)__u8ihl:4,// 头部长度(单位为32位)version:4;// IP版本4或6#elif defined (__BIG_ENDIAN_BITFIELD)__u8version:4,  ihl:4;#else#error"Please fix <asm/byteorder.h>"#endif__u8tos;// Type of Service,不太常用__be16tot_len;// 包的长度(包含头部),单位为字节__be16id;// ID,在分片中起核心作用__be16frag_off;// DF (Don’t Fragment);MF (More Fragments);Fragment Offset__u8ttl;// 生存时间, 默认64__u8protocol;// 上一层协议__sum16check;// IP头部校验和__be32saddr;// 源地址__be32daddr;// 目的地址/*The options start here.  * IP选项 */};
在IP层之上的协议(TCP,UDP),都要支持socket接口的调用。socket提供了一个结构,用来提供各接口:
[ include/net/sock.h ]
/*
 * Per-protocol operations table offered to the socket layer.
 *
 * Abridged excerpt: only the first two callbacks are reproduced here.
 */
struct proto {
	void	(*close)(struct sock *sk, long timeout);
	int	(*connect)(struct sock *sk, struct sockaddr *uaddr,
			   int addr_len);
	/* ... remaining members omitted in this excerpt ... */
};
然后又提供了一个全局列表,所有支持socket的协议都注册到此列表上:
[ net/core/sock.c ]
/* File-local list head; proto_register() links each struct proto onto it. */
static LIST_HEAD(proto_list);
注册函数为:
[ net/core/sock.c ]
int proto_register(struct proto *prot, int alloc_slab){if (alloc_slab) {/* 分配缓冲,名称为协议的名字(如"TCP") * 大小为对应结构的大小(如,对TCP, .obj_size = sizeof(struct tcp_sock) ) */prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,SLAB_HWCACHE_ALIGN | prot->slab_flags,NULL);if (prot->slab == NULL) {pr_crit("%s: Can't create sock SLAB cache!\n",prot->name);goto out;}/* socket中各种响应(ack)的操作 */if (prot->rsk_prot != NULL) {prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);// 缓冲名称if (prot->rsk_prot->slab_name == NULL)goto out_free_sock_slab;/* 分配缓冲,名称(如"request_sock_TCP") * 大小为对应结构的大小(如,对TCP, .obj_size = sizeof(struct tcp_request_sock) ) */prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name, prot->rsk_prot->obj_size, 0, SLAB_HWCACHE_ALIGN, NULL);if (prot->rsk_prot->slab == NULL) {pr_crit("%s: Can't create request sock SLAB cache!\n",prot->name);goto out_free_request_sock_slab_name;}}/* socket处于TIMEWAIT状态时的操作 */if (prot->twsk_prot != NULL) {prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);// 缓冲名称if (prot->twsk_prot->twsk_slab_name == NULL)goto out_free_request_sock_slab;/* 分配缓冲,名称(如"tw_sock_TCP") * 大小为对应结构的大小(如,对TCP, .obj_size = sizeof(struct tcp_timewait_sock) ) */prot->twsk_prot->twsk_slab =kmem_cache_create(prot->twsk_prot->twsk_slab_name,  prot->twsk_prot->twsk_obj_size,  0,  SLAB_HWCACHE_ALIGN |prot->slab_flags,  NULL);if (prot->twsk_prot->twsk_slab == NULL)goto out_free_timewait_sock_slab_name;}}mutex_lock(&proto_list_mutex);list_add(&prot->node, &proto_list);// 将协议加入到全局列表中/* 为快速查询协议是否在socket中有效,申明了一个bitmap,其中的每一位都表示一个协议 * bitmap中位的索引保存在 prot->inuse_idx中 * 使用时可根据些索引设置和查询对应bitmap中位的值 * 别外还申明了一个PRE_CPU数组变量,通过prot->inuse_idx可设置和查询当前使用prot的数量 */assign_proto_idx(prot);mutex_unlock(&proto_list_mutex);return 0;out_free_timewait_sock_slab_name:kfree(prot->twsk_prot->twsk_slab_name);out_free_request_sock_slab:if (prot->rsk_prot && prot->rsk_prot->slab) 
{kmem_cache_destroy(prot->rsk_prot->slab);prot->rsk_prot->slab = NULL;}out_free_request_sock_slab_name:if (prot->rsk_prot)kfree(prot->rsk_prot->slab_name);out_free_sock_slab:kmem_cache_destroy(prot->slab);prot->slab = NULL;out:return -ENOBUFS;}EXPORT_SYMBOL(proto_register);
对于IP来说,socket对应的family为PF_INET,与之对应的结构为:

[ net/ipv4/af_inet.c ]

static const struct net_proto_family inet_family_ops = {.family = PF_INET,.create = inet_create,// 建立socket时被调用.owner= THIS_MODULE,};
对于socket支持的每个family,都有一个对应的net_proto_family结构。同样,内核提供了一个全局数组来保存它们:

[ net/socket.c ]

/*
 * Table of address-family descriptors with NPROTO slots. Entries are
 * __rcu-annotated pointers.
 */
static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;
NPROTO为socket支持的family的总数。用下面的函数进行注册:

[ net/socket.c ]

int sock_register(const struct net_proto_family *ops){int err;if (ops->family >= NPROTO) {printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family,       NPROTO);return -ENOBUFS;}spin_lock(&net_family_lock);if (rcu_dereference_protected(net_families[ops->family],      lockdep_is_held(&net_family_lock)))err = -EEXIST;else {rcu_assign_pointer(net_families[ops->family], ops);// 将ops设置到全局数组中err = 0;}spin_unlock(&net_family_lock);printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);return err;}EXPORT_SYMBOL(sock_register);

socket的同一个family中有很多类型(如:SOCK_STREAM,SOCK_DGRAM,SOCK_RAW)。对IP层来说,所有这些类型都保存在全局的列表数组inetsw中(每种类型对应一个列表):

[ net/ipv4/af_inet.c ]

/*
 * The inetsw table contains everything that inet_create needs to
 * build a new socket. It is an array of SOCK_MAX list heads.
 */
static struct list_head inetsw[SOCK_MAX];
内核提供了一个初始化列表:
[ net/ipv4/af_inet.c ]
/* Upon startup we insert all the elements in inetsw_array[] into * the linked list inetsw. */static struct inet_protosw inetsw_array[] ={{.type =       SOCK_STREAM,.protocol =   IPPROTO_TCP,.prot =       &tcp_prot,.ops =        &inet_stream_ops,.no_check =   0,.flags =      INET_PROTOSW_PERMANENT |      INET_PROTOSW_ICSK,},{.type =       SOCK_DGRAM,.protocol =   IPPROTO_UDP,.prot =       &udp_prot,.ops =        &inet_dgram_ops,.no_check =   UDP_CSUM_DEFAULT,.flags =      INET_PROTOSW_PERMANENT,       },       {.type =       SOCK_DGRAM,.protocol =   IPPROTO_ICMP,.prot =       &ping_prot,.ops =        &inet_dgram_ops,.no_check =   UDP_CSUM_DEFAULT,.flags =      INET_PROTOSW_REUSE,       },       {       .type =       SOCK_RAW,       .protocol =   IPPROTO_IP,/* wild card */       .prot =       &raw_prot,       .ops =        &inet_sockraw_ops,       .no_check =   UDP_CSUM_DEFAULT,       .flags =      INET_PROTOSW_REUSE,       }};
将inetsw_array中的各个socket类型注册到全局列表数组inetsw时,调用下面的函数:

[ net/ipv4/af_inet.c ]

void inet_register_protosw(struct inet_protosw *p){struct list_head *lh;struct inet_protosw *answer;/* 协议类型(TCP,UDP...) * 这里要和 p->type( SOCK_STREAM,SOCK_DGRAM... )区分 */int protocol = p->protocol;struct list_head *last_perm;spin_lock_bh(&inetsw_lock);if (p->type >= SOCK_MAX)goto out_illegal;/* If we are trying to override a permanent protocol, bail. */answer = NULL;last_perm = &inetsw[p->type];// 协议对应的位置list_for_each(lh, &inetsw[p->type]) {// 协议列表answer = list_entry(lh, struct inet_protosw, list);/* Check only the non-wild match. */if (INET_PROTOSW_PERMANENT & answer->flags) { /* Permanent protocols are unremovable. */if (protocol == answer->protocol)// 协议类型相同break;last_perm = lh;}answer = NULL;}if (answer)// 协议己经存在goto out_permanent;/* Add the new entry after the last permanent entry if any, so that * the new entry does not override a permanent entry when matched with * a wild-card protocol. But it is allowed to override any existing * non-permanent entry.  This means that when we remove this entry, the * system automatically returns to the old behavior. * 加到列表的最后 */list_add_rcu(&p->list, last_perm);out:spin_unlock_bh(&inetsw_lock);return;out_permanent:pr_err("Attempt to override permanent protocol %d\n", protocol);goto out;out_illegal:pr_err("Ignoring attempt to register invalid socket type %d\n",       p->type);goto out;}

内核支持不同的协议,如UDP,TCP。当数据到达IP层后,要根据上层协议的类型调用不同的接收函数,内核通过下面的方式处理这种情况:

  1. 定义一个结构封装各函数:
    [ include/net/protocol.h ]
    /*
     * This is used to register protocols: it bundles the callbacks and
     * flags of one upper-layer protocol.
     */
    struct net_protocol {
    	void			(*early_demux)(struct sk_buff *skb);
    	int			(*handler)(struct sk_buff *skb);
    	void			(*err_handler)(struct sk_buff *skb, u32 info);
    	unsigned int		no_policy:1,
    				netns_ok:1,
    				/* does the protocol do more stringent
    				 * icmp tag validation than simple
    				 * socket lookup?
    				 */
    				icmp_strict_tag_validation:1;
    };
  2. 定义一个全局数组(以协议号为下标),所有协议都注册到此数组中
    [ net/ipv4/protocol.c ]
    /* Global table of net_protocol pointers with MAX_INET_PROTOS slots. */
    const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;

    [ include/net/protocol.h ]
    /* This is one larger than the largest protocol value that can be
     * found in an ipv4 or ipv6 header.  Since in both cases the protocol
     * value is presented in a __u8, this is defined to be 256.
     */
    #define MAX_INET_PROTOS		256

以下函数用来向全局数组inet_protos注册net_protocol类型:

[ net/ipv4/protocol.c ]

/*
 * inet_add_protocol - claim slot @protocol of inet_protos[] for @prot.
 *
 * Returns -EINVAL when @prot is not flagged netns_ok, -1 when the slot
 * was already non-NULL, and 0 when @prot was stored.
 */
int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
	const struct net_protocol **slot;

	if (!prot->netns_ok) {
		pr_err("Protocol %u is not namespace aware, cannot register.\n",
			protocol);
		return -EINVAL;
	}

	slot = (const struct net_protocol **)&inet_protos[protocol];

	/* cmpxchg() hands back the previous value; non-NULL means taken. */
	if (cmpxchg(slot, NULL, prot))
		return -1;

	return 0;
}
EXPORT_SYMBOL(inet_add_protocol);

所有接收的包都有不同的类型,如IP,802.3,ARP,IPv6等,当接收到不同类型的包后,要调用不同的处理函数,内核通过下面的方式处理这种情况。

  1. 定义一个结构,用来将类型和函数对应起来
    [ include/linux/netdevice.h ]
    struct packet_type {__be16type;/* This is really htons(ether_type).包的类型 */struct net_device*dev;/* NULL is wildcarded here.对应的网络设备 */int(*func) (struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *);bool(*id_match)(struct packet_type *ptype,    struct sock *sk);void*af_packet_priv;struct list_headlist;};
  2. 定义一个全局列表,所有packet_type类型为ETH_P_ALL(接收所有类型的包)的都挂在此列表上
    [ net/core/dev.c ]
    /* List head for packet_type entries that take every packet type. */
    struct list_head ptype_all __read_mostly;    /* Taps */
  3. 定义一个哈希表,其中的key为包的类型
    [ net/core/dev.c ]
    /* Array of PTYPE_HASH_SIZE list heads for packet_type entries. */
    struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;

    [ include/linux/netdevice.h ]
    /*
     *	The list of packet types we will receive (as opposed to discard)
     *	and the routines to invoke.
     *
     *	Why 16. Because with 16 the only overlap we get on a hash of the
     *	low nibble of the protocol value is RARP/SNAP/X.25.
     *
     *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
     *             sure which should go first, but I bet it won't make much
     *             difference if we are running VLANs.  The good news is that
     *             this protocol won't be in the list unless compiled in, so
     *             the average user (w/out VLANs) will not be adversely affected.
     *             --BLG
     *
     *		0800	IP
     *		8100    802.1Q VLAN
     *		0001	802.3
     *		0002	AX.25
     *		0004	802.2
     *		8035	RARP
     *		0005	SNAP
     *		0805	X.25
     *		0806	ARP
     *		8137	IPX
     *		0009	Localtalk
     *		86DD	IPv6
     */
    #define PTYPE_HASH_SIZE	(16)
    #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

以下函数用来向全局列表注册packet_type类型:

[ net/core/dev.c ]

/* *Add a protocol ID to the list. Now that the input handler is *smarter we can dispense with all the messy stuff that used to be *here. * *BEWARE!!! Protocol handlers, mangling input packets, *MUST BE last in hash buckets and checking protocol handlers *MUST start from promiscuous ptype_all chain in net_bh. *It is true now, do not change it. *Explanation follows: if protocol handler, mangling packet, will *be the first on list, it is not able to sense, that packet *is cloned and should be copied-on-write, so that it will *change it and subsequent readers will get broken packet. *--ANK (980803) */static inline struct list_head *ptype_head(const struct packet_type *pt){if (pt->type == htons(ETH_P_ALL))// 接收所有类型的包return &ptype_all;elsereturn &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];}/** *dev_add_pack - add packet handler *@pt: packet type declaration * *Add a protocol handler to the networking stack. The passed &packet_type *is linked into kernel lists and may not be freed until it has been *removed from the kernel lists. * *This call does not sleep therefore it can not *guarantee all CPU's that are in middle of receiving packets *will see the new packet type (until the next received packet). */void dev_add_pack(struct packet_type *pt){struct list_head *head = ptype_head(pt);// 得到要挂载的列表spin_lock(&ptype_lock);list_add_rcu(&pt->list, head);// 将pt挂到列表上spin_unlock(&ptype_lock);}EXPORT_SYMBOL(dev_add_pack);
对于IP包,定义了如下的类型:

[ net/ipv4/af_inet.c ]

/* Link-layer packet type entry for IPv4: ETH_P_IP frames go to ip_rcv. */
static struct packet_type ip_packet_type __read_mostly = {
	.type	= cpu_to_be16(ETH_P_IP),	/* Internet Protocol packet */
	.func	= ip_rcv,
};
它会在初始化IP模块时注册到全局列表当中去。其中的ip_rcv就是接收数据包的函数。

下面就可以看IP层的初始化:
[ net/ipv4/af_inet.c ]
/*
 * inet_init - bring up the IPv4 stack.
 *
 * Registers the TCP/UDP/RAW/PING protos, the PF_INET family, the base
 * net_protocol handlers and the inetsw entries, runs the per-subsystem
 * init routines, and finally registers the ETH_P_IP packet type.
 *
 * Returns 0 on success. Returns -EINVAL if the reserved-ports bitmap
 * cannot be allocated, or the proto_register() error after undoing the
 * earlier registrations.
 */
static int __init inet_init(void)
{
	struct inet_protosw *q;
	struct list_head *r;
	int rc = -EINVAL;

	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb));

	/* 65536 / 8 = 8192 zero-filled bytes */
	sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
	if (!sysctl_local_reserved_ports)
		goto out;

	rc = proto_register(&tcp_prot, 1);	/* TCP socket interface */
	if (rc)
		goto out_free_reserved_ports;

	rc = proto_register(&udp_prot, 1);	/* UDP socket interface */
	if (rc)
		goto out_unregister_tcp_proto;

	rc = proto_register(&raw_prot, 1);	/* RAW socket interface */
	if (rc)
		goto out_unregister_udp_proto;

	rc = proto_register(&ping_prot, 1);	/* PING socket interface */
	if (rc)
		goto out_unregister_raw_proto;

	/*
	 *	Tell SOCKET that we are alive...
	 */

	(void)sock_register(&inet_family_ops);	/* register the PF_INET family */

#ifdef CONFIG_SYSCTL
	/* sysctl registration (route-related, per the original note) */
	ip_static_sysctl_init();
#endif

	/*
	 *	Add all the base protocols.
	 *	A failure here is only logged; initialisation continues.
	 */

	if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
		pr_crit("%s: Cannot add ICMP protocol\n", __func__);
	if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
		pr_crit("%s: Cannot add UDP protocol\n", __func__);
	if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
		pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
	if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
		pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif

	/* Register the socket-side information for inet_create. */
	/* First make every per-type list in inetsw empty... */
	for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
		INIT_LIST_HEAD(r);

	/* ...then populate inetsw from inetsw_array. */
	for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
		inet_register_protosw(q);

	/*
	 *	Set the ARP module up
	 */

	arp_init();

	/*
	 *	Set the IP module up
	 */

	ip_init();

	tcp_v4_init();

	/* Setup TCP slab cache for open requests. */
	tcp_init();

	/* Setup UDP memory threshold */
	udp_init();

	/* Add UDP-Lite (RFC 3828) */
	udplite4_register();

	ping_init();

	/*
	 *	Set the ICMP layer up
	 */

	if (icmp_init() < 0)
		panic("Failed to create the ICMP control socket.\n");

	/*
	 *	Initialise the multicast router
	 */
#if defined(CONFIG_IP_MROUTE)
	if (ip_mr_init())
		pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
#endif
	/*
	 *	Initialise per-cpu ipv4 mibs
	 */

	if (init_ipv4_mibs())
		pr_crit("%s: Cannot init ipv4 mibs\n", __func__);

	ipv4_proc_init();

	ipfrag_init();

	dev_add_pack(&ip_packet_type);	/* register the ETH_P_IP packet type */

	rc = 0;
out:
	return rc;
	/* Error unwinding: undo the proto registrations in reverse order. */
out_unregister_raw_proto:
	proto_unregister(&raw_prot);
out_unregister_udp_proto:
	proto_unregister(&udp_prot);
out_unregister_tcp_proto:
	proto_unregister(&tcp_prot);
out_free_reserved_ports:
	kfree(sysctl_local_reserved_ports);
	goto out;
}

fs_initcall(inet_init);	/* run inet_init during system initialisation */

为提高接收和发送的效率,尤其是在大负载下的效率,内核作了特别的处理

[ net/ipv4/af_inet.c ]

/*
 * ipv4_offload_init - register the IPv4 offload handlers.
 *
 * UDP/TCP offload registration failures are logged only; the function
 * always returns 0.
 */
static int __init ipv4_offload_init(void)
{
	/*
	 * Add offloads
	 */
	if (udpv4_offload_init() < 0)
		pr_crit("%s: Cannot add UDP protocol offload\n", __func__);
	if (tcpv4_offload_init() < 0)
		pr_crit("%s: Cannot add TCP protocol offload\n", __func__);

	dev_add_offload(&ip_packet_offload);
	inet_add_offload(&ipip_offload, IPPROTO_IPIP);
	return 0;
}

fs_initcall(ipv4_offload_init);	/* run ipv4_offload_init during kernel initialisation */
所有接收的包都有不同的类型,如IP,802.3,ARP,IPv6等,每种类型都有对应的packet_offload类型。其中IP层处理的数据包的类型是ETH_P_IP,对应结构为ip_packet_offload

[ net/ipv4/af_inet.c ]

/*
 *	IP protocol layer initialiser
 *
 *	Offload callbacks (GSO and GRO) for ETH_P_IP packets.
 */
static struct packet_offload ip_packet_offload __read_mostly = {
	.type = cpu_to_be16(ETH_P_IP),
	.callbacks = {
		.gso_send_check	= inet_gso_send_check,
		.gso_segment	= inet_gso_segment,
		.gro_receive	= inet_gro_receive,
		.gro_complete	= inet_gro_complete,
	},
};
内核声明了一个全局列表offload_base,通过下面的函数将packet_offload注册到该列表中:

[ net/core/dev.c ]

static struct list_head offload_base __read_mostly;/** *dev_add_offload - register offload handlers *@po: protocol offload declaration * *Add protocol offload handlers to the networking stack. The passed *&proto_offload is linked into kernel lists and may not be freed until *it has been removed from the kernel lists. * *This call does not sleep therefore it can not *guarantee all CPU's that are in middle of receiving packets *will see the new offload handlers (until the next received packet). */void dev_add_offload(struct packet_offload *po){struct list_head *head = &offload_base;// 全局列表spin_lock(&offload_lock);list_add_rcu(&po->list, head);spin_unlock(&offload_lock);}EXPORT_SYMBOL(dev_add_offload);
上面提到内核为支持不同的协议(如UDP,TCP),声明了全局数组inet_protos。内核用相同的方法为各协议注册offload处理:

[ net/ipv4/protocol.c ]

const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;int inet_add_offload(const struct net_offload *prot, unsigned char protocol){    return !cmpxchg((const struct net_offload **)&inet_offloads[protocol],            NULL, prot) ? 0 : -1;}EXPORT_SYMBOL(inet_add_offload);
这样就把不同协议的net_offload结构注册到了全局数组inet_offloads中了。而对于IP层对应的结构为:

[ net/ipv4/af_inet.c ]

/* net_offload entry with GSO callbacks only; no GRO callbacks are set. */
static const struct net_offload ipip_offload = {
	.callbacks = {
		.gso_send_check	= inet_gso_send_check,
		.gso_segment	= inet_gso_segment,
	},
};

0 0
原创粉丝点击