学习linux协议栈关键数据结构

来源:互联网 发布:python与量化投资 编辑:程序博客网 时间:2024/05/17 22:22

0x01 缘由

     从大学做算法题开始,就一直被强调数据结构的重要性:良好的数据结构设计会使程序设计更加合理和健壮。

0x02 关键结构

     先摘录一个图,了解各数据结构之间的关系。--摘录于《Linux TCP IP 协议栈分析.pdf》
          

     1.struct sk_buff - socket buffer


struct sk_buff {    /* 这两个结构必须放在此结构的前面,主要方便数据的强制转换*/    struct sk_buff        *next; //双向链表结构,指向下一个sk_buff    struct sk_buff        *prev; //指向前一个sk_buff结构    struct sock        *sk;     //这个指针指向一个套接字sock数据结构。当数据在本地产生或者本地进程接受时,需要这个指针;里面的数据会有tcp/udp和用户态程序使用。如果是转发此指针为NULL。后续详解。    ktime_t            tstamp;    //包到达的时间戳    struct net_device    *dev; //网络设备,哪个网卡、虚拟网卡,后续结构详解。    unsigned long        _skb_dst;     /*     * 这是控制缓冲区。 每层都可以自由使用 请把您的私有变量放在那里。 如果你想让他们跨层,你必须先做一个skb_clone()。 这是由谁拥有skb排队的ATM拥有。     */    char            cb[48];    unsigned int        len, //数据包的全部数据长度,包括data指向的数据和end后面的分片的数据的总长                data_len;  //本分片所包含的数据长度    __u16            mac_len,    //mac包头长度                hdr_len; //硬件头部长度    union {        __wsum        csum;        struct {            __u16    csum_start;            __u16    csum_offset;        };    };  //校验和    __u32            priority;  //QoS等级    kmemcheck_bitfield_begin(flags1);    __u8            local_df:1,                cloned:1,                ip_summed:2,                nohdr:1,                nfctinfo:3;    __u8            pkt_type:3,        // 根据L2层帧的目的地址进行类型划分。                fclone:2,                        //sk_buff克隆状态                ipvs_property:1,        //IP虚拟服务器属性                peeked:1,           //这个数据包已经被看到了,所以已经做了统计,不要再做了                nf_trace:1;        //netfilter 包记录标识    __be16            protocol:16; //从L2层设备驱动看使用在下一个较高层的协议。    kmemcheck_bitfield_end(flags1);    void            (*destructor)(struct sk_buff *skb); //析构函数#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)    struct nf_conntrack    *nfct; skb与连接的关系    struct sk_buff        *nfct_reasm; netfilter conntrack重组指针#endif#ifdef CONFIG_BRIDGE_NETFILTER    struct nf_bridge_info    *nf_bridge;//桥接帧数据#endif    int            iif; //到达的设备的索引,网卡索引编号#ifdef CONFIG_NET_SCHED    __u16            tc_index;    //流量控制索引#ifdef CONFIG_NET_CLS_ACT    __u16            tc_verd;    //流量控制决定#endif#endif    
kmemcheck_bitfield_begin(flags2);    __u16            queue_mapping:16; //多队列网卡设备的映射关系#ifdef CONFIG_IPV6_NDISC_NODETYPE    __u8            ndisc_nodetype:2; //路由类型,来自链路层;#endif    kmemcheck_bitfield_end(flags2);    /* 0/14 bit hole */#ifdef CONFIG_NET_DMA    dma_cookie_t        dma_cookie; //被DMA相关函数完成的相关操作cookeie#endif#ifdef CONFIG_NETWORK_SECMARK    __u32            secmark; //安全相关标记#endif    __u32            mark;  //通用标记    __u16            vlan_tci; //vlan标签控制信息    sk_buff_data_t        transport_header;  //传输层头    sk_buff_data_t        network_header;    //网络层头    sk_buff_data_t        mac_header;                 //以太网层头    /* These elements must be at the end, see alloc_skb() for details.  */    sk_buff_data_t        tail;                 sk_buff_data_t        end;                               unsigned char        *head,                                               *data;                                                    //head和end指向的是数据区的开端和尾、,data和tail指向的是实际数据的开头和结尾    unsigned int        truesize;                            //此缓冲区总大小,包括sk_buff。sk_buff只不过是个指针的集合,他所指的才是真正的数据区,所以是两部分。    atomic_t        users;  //引用计数,使用这个sk_buff的使用者的数目,可能有多个函数要使用同一个sk_buff所以防止提前释放掉,设置此计数}; 


2.struct sock - sockets的网络层描述


struct sock { sock_common    __sk_common; //套接口在网络层的最小表示#define sk_node            __sk_common.skc_node#define sk_nulls_node        __sk_common.skc_nulls_node#define sk_refcnt        __sk_common.skc_refcnt#define sk_copy_start        __sk_common.skc_hash#define sk_hash            __sk_common.skc_hash#define sk_family        __sk_common.skc_family#define sk_state        __sk_common.skc_state#define sk_reuse        __sk_common.skc_reuse#define sk_bound_dev_if        __sk_common.skc_bound_dev_if#define sk_bind_node        __sk_common.skc_bind_node#define sk_prot            __sk_common.skc_prot#define sk_net            __sk_common.skc_net    kmemcheck_bitfield_begin(flags);    unsigned int        sk_shutdown  : 2, //是一组标志位,SEND_SHUTDOWN and/or RCV_SHUTDOWN。                sk_no_check  : 2,        //不对包进行检查标识                sk_userlocks : 4,        // %SO_SNDBUF 和 %SO_RCVBUF 缓存设置锁                sk_protocol  : 8,                       sk_type      : 16;    kmemcheck_bitfield_end(flags);    int            sk_rcvbuf;                //接收缓存区大小    socket_lock_t        sk_lock;  //同步锁    /*     * The backlog queue is special, it is always used with     * the per-socket spinlock held and requires low latency     * access. Therefore we special case it's implementation.     
*/    struct {        struct sk_buff *head;        struct sk_buff *tail;    } sk_backlog;   //总是被自旋锁持有    wait_queue_head_t    *sk_sleep; //在队列中等待的socket    struct dst_entry    *sk_dst_cache; //目的地址的缓存#ifdef CONFIG_XFRM    struct xfrm_policy    *sk_policy[2];#endif    rwlock_t        sk_dst_lock;  //目的缓存读写锁    atomic_t        sk_rmem_alloc; //表示接收队列已提交的字节数。    atomic_t        sk_wmem_alloc; //表示发送队列已提交的字节数。    atomic_t        sk_omem_alloc; //用“O”或“other”做选项    int            sk_sndbuf;    struct sk_buff_head    sk_receive_queue; //表示接收的数据包的队列。    struct sk_buff_head    sk_write_queue;   //表示发送的数据包的队列。#ifdef CONFIG_NET_DMA    struct sk_buff_head    sk_async_wait_queue; //DMA复制数据包#endif    int            sk_wmem_queued;            //维持的队列大小    int            sk_forward_alloc;   //转发空间分配    gfp_t            sk_allocation;        //分配空间的模式    int            sk_route_caps;            //路由容量    int            sk_gso_type;                //GSO type (e.g. %SKB_GSO_TCPV4)    unsigned int        sk_gso_max_size;//最大的GSO段大小    int            sk_rcvlowat;    //    unsigned long         sk_flags; //%SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, %SO_OOBINLINE 标识设定, %SO_TIMESTAMPING 标识设定    unsigned long            sk_lingertime; //SO_LINGER设定    struct sk_buff_head    sk_error_queue;  //非常少用    struct proto        *sk_prot_creator; //原始socket创建器    rwlock_t        sk_callback_lock;    //    int            sk_err,                sk_err_soft;    atomic_t        sk_drops; //raw/udp drop计数    unsigned short        sk_ack_backlog; //当前监听队列数    unsigned short        sk_max_ack_backlog; //在listen()中设置的数目    __u32            sk_priority;  //优先级    struct ucred        sk_peercred;    long            sk_rcvtimeo;    long            sk_sndtimeo;    struct sk_filter          *sk_filter; //socket 过滤结构    void            *sk_protinfo;    //私有区域    struct timer_list    sk_timer; //socket清理定时器    ktime_t            sk_stamp;    //最后一包接收时间    struct socket        *sk_socket;    //IO信号    
void            *sk_user_data;    //RPC层私有数据    struct page        *sk_sndmsg_page; //sndmsg缓存    struct sk_buff        *sk_send_head; //转发数据头    __u32            sk_sndmsg_off; // sndmsg缓存偏移    int            sk_write_pending;#ifdef CONFIG_SECURITY    void            *sk_security;#endif    __u32            sk_mark;    /* XXX 4 bytes hole on 64 bit */    void            (*sk_state_change)(struct sock *sk);    void            (*sk_data_ready)(struct sock *sk, int bytes);    void            (*sk_write_space)(struct sock *sk);    void            (*sk_error_report)(struct sock *sk);      int            (*sk_backlog_rcv)(struct sock *sk,                          struct sk_buff *skb);     void                    (*sk_destruct)(struct sock *sk);};struct sock_common {  unsigned short      skc_family;         /*地址族*/  volatile unsigned char  skc_state;      /*连接状态*/  unsigned char       skc_reuse;          /*SO_REUSEADDR设置*/  int         skc_bound_dev_if;  struct hlist_node   skc_node;  struct hlist_node   skc_bind_node;      /*哈希表相关*/  atomic_t        skc_refcnt;             /*引用计数*/};

3.struct net_device


struct net_device{    char            name[IFNAMSIZ]; //网络设备名称,如eth0    struct hlist_node    name_hlist; //这个字段用于构建网络设备名的哈希散列表,而struct net中的name_hlist就指向每个哈希散列表的链表头;    char             *ifalias; //网络设备的别名;    /*网络设备内存映射时在主机中的内存区域*/    unsigned long        mem_end;    /* 共享内存结束    */    unsigned long        mem_start;    /* 共享内存开始    */    unsigned long        base_addr;    /* 网络设备I/O基地址     */    unsigned int        irq;        /* 设备终端号*/    unsigned char        if_port;    /* 传输介质,如双绞线、同轴电缆等,在多端口设备中指定使用哪个端口*/    unsigned char        dma;        /* DMA通道*/    unsigned long        state; /* 网络设备物理上的工作状态 */    struct list_head    dev_list; //网络设备链表    struct list_head    napi_list;//支持NAPI传输的网络设备链表    /* Net device features */    unsigned long        features; //设备硬件功能特性#define NETIF_F_SG        1    /* Scatter/gather IO. */#define NETIF_F_IP_CSUM        2    /* Can checksum TCP/UDP over IPv4. */#define NETIF_F_NO_CSUM        4    /* Does not require checksum. F.e. loopack. */#define NETIF_F_HW_CSUM        8    /* Can checksum all the packets. */#define NETIF_F_IPV6_CSUM    16    /* Can checksum TCP/UDP over IPV6 */#define NETIF_F_HIGHDMA        32    /* Can DMA to high memory. */#define NETIF_F_FRAGLIST    64    /* Scatter/gather IO. */#define NETIF_F_HW_VLAN_TX    128    /* Transmit VLAN hw acceleration */#define NETIF_F_HW_VLAN_RX    256    /* Receive VLAN hw acceleration */#define NETIF_F_HW_VLAN_FILTER    512    /* Receive filtering on VLAN */#define NETIF_F_VLAN_CHALLENGED    1024    /* Device cannot handle VLAN packets */#define NETIF_F_GSO        2048    /* Enable software GSO. */#define NETIF_F_LLTX        4096    /* LockLess TX - deprecated. 
Please */                    /* do not use LLTX in new drivers */#define NETIF_F_NETNS_LOCAL    8192    /* Does not change network namespaces */#define NETIF_F_GRO        16384    /* Generic receive offload */#define NETIF_F_LRO        32768    /* large receive offload *//* the GSO_MASK reserves bits 16 through 23 */#define NETIF_F_FCOE_CRC    (1 << 24) /* FCoE CRC32 */#define NETIF_F_SCTP_CSUM    (1 << 25) /* SCTP checksum offload */#define NETIF_F_FCOE_MTU    (1 << 26) /* Supports max FCoE MTU, 2158 bytes*/    /* Segmentation offload features */#define NETIF_F_GSO_SHIFT    16#define NETIF_F_GSO_MASK    0x00ff0000#define NETIF_F_TSO        (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT)#define NETIF_F_UFO        (SKB_GSO_UDP << NETIF_F_GSO_SHIFT)#define NETIF_F_GSO_ROBUST    (SKB_GSO_DODGY << NETIF_F_GSO_SHIFT)#define NETIF_F_TSO_ECN        (SKB_GSO_TCP_ECN << NETIF_F_GSO_SHIFT)#define NETIF_F_TSO6        (SKB_GSO_TCPV6 << NETIF_F_GSO_SHIFT)#define NETIF_F_FSO        (SKB_GSO_FCOE << NETIF_F_GSO_SHIFT)    /* List of features with software fallbacks. */#define NETIF_F_GSO_SOFTWARE    (NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6)#define NETIF_F_GEN_CSUM    (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)#define NETIF_F_V4_CSUM        (NETIF_F_GEN_CSUM | NETIF_F_IP_CSUM)#define NETIF_F_V6_CSUM        (NETIF_F_GEN_CSUM | NETIF_F_IPV6_CSUM)#define NETIF_F_ALL_CSUM    (NETIF_F_V4_CSUM | NETIF_F_V6_CSUM)    /*     * If one device supports one of these features, then enable them     * for all in netdev_increment_features.     */#define NETIF_F_ONE_FOR_ALL    (NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ROBUST | \                 NETIF_F_SG | NETIF_F_HIGHDMA |        \                 NETIF_F_FRAGLIST)    /* Interface index. Unique device identifier    */    int            ifindex;//标识网络设备的唯一索引号       int            iflink;//用于虚拟网络设备    struct net_device_stats    stats; //统计信息#ifdef CONFIG_WIRELESS_EXT    /* List of functions to handle Wireless Extensions (instead of ioctl).     
* See <net/iw_handler.h> for details. Jean II */    const struct iw_handler_def *    wireless_handlers;    /* Instance data managed by the core of Wireless Extensions. */    struct iw_public_data *    wireless_data;#endif    /* Management operations */    const struct net_device_ops *netdev_ops;//网络设备驱动程序需要实现的一组操作函数    const struct ethtool_ops *ethtool_ops;//支持ethtool功能的一组操作函数       /* Hardware header description */    const struct header_ops *header_ops;//数据链路层协议头相关的一组操作函数    unsigned int        flags;    /* 它们的可能取值定义在linux-2.6.38.8/include/linux/if.h文件中。*/    unsigned short        gflags;        unsigned short          priv_flags; /* Like 'flags' but invisible to userspace. */    unsigned short        padded;    /* 分配net_device结构体及其私有数据时为对齐所需的填充位数目*/    unsigned char        operstate; /*//RFC 2863操作状态  */    unsigned char        link_mode; /* 映射到RFC2863兼容状态的策略  */    unsigned        mtu;    /* MTU        */    unsigned short        type;    /* 网络设备硬件类型,如10Mbps以太网ARPHRD_ETHER       */    unsigned short        hard_header_len;    /* 硬件数据帧头的长度,以太网为14字节       */    /* extra head- and tailroom the hardware may need, but not in all cases     * can this be guaranteed, especially tailroom. Some cases also use     * LL_MAX_HEADER instead to allocate the skb.     
*/    unsigned short        needed_headroom; //分配套接字缓冲区时预留空间的长度    unsigned short        needed_tailroom;    struct net_device    *master; /* 分组状态 */    /* 硬件(如MAC)地址长度以及设备的硬件地址   */    unsigned char        perm_addr[MAX_ADDR_LEN]; /* permanent hw address */    unsigned char        addr_len;    /* hardware address length    */    unsigned short          dev_id;        /* for shared network cards */    struct netdev_hw_addr_list    uc;    /* 网络设备硬件地址组成的链表   */    int            uc_promisc; //混杂模式时的单播地址个数       spinlock_t        addr_list_lock;//防止单播地址链表和组播地址链表被并发访问的自旋锁     struct dev_addr_list    *mc_list;    /* Multicast mac addresses    */    int            mc_count;    /* Number of installed mcasts    */    unsigned int        promiscuity; //混杂模式的计数器    unsigned int        allmulti;//监听所有组播地址       /* 网络层协议特定数据 */#ifdef CONFIG_NET_DSA    void            *dsa_ptr;    /* dsa specific data */#endif    void             *atalk_ptr;    /* AppleTalk link     */    void            *ip_ptr;    /* IPv4 specific data    */    void                    *dn_ptr;        /* DECnet specific data */    void                    *ip6_ptr;       /* IPv6 specific data */    void            *ec_ptr;    /* Econet specific data    */    void            *ax25_ptr;    /* AX.25 specific data */    struct wireless_dev    *ieee80211_ptr;    /* IEEE 802.11 specific data,                           assign before registering *//* * Cache line mostly used on receive path (including eth_type_trans()) */    unsigned long        last_rx;    /* 最后接收数据包的时间   */    /* Interface address info used in eth_type_trans() */    unsigned char        *dev_addr;    /* hw address, (before bcast                           because most packets are                           unicast) */    struct netdev_hw_addr_list    dev_addrs; /* //网络设备硬件地址组成的链表    */    unsigned char        broadcast[MAX_ADDR_LEN];    /* //广播地址    */    struct netdev_queue    rx_queue; //接收队列,RPS(Receive Packet Steering)特性       struct netdev_queue   
 *_tx ____cacheline_aligned_in_smp;    /* Number of TX queues allocated at alloc_netdev_mq() time  */    unsigned int        num_tx_queues; //发送队列    /* Number of TX queues currently active in device  */    unsigned int        real_num_tx_queues;    /* root qdisc from userspace point of view */    struct Qdisc        *qdisc;    unsigned long        tx_queue_len;    /* Max frames per queue allowed */    spinlock_t        tx_global_lock;/* * One part is mostly used on xmit path (device) */    /* These may be needed for future network-power-down code. */    /*     * trans_start here is expensive for high speed devices on SMP,     * please use netdev_queue->trans_start instead.     */    unsigned long        trans_start;    /* //最近传送数据包的时间       */    int            watchdog_timeo; //发生传输超时时,设置的标志     struct timer_list    watchdog_timer;//网络层设置的传送数据包超时的时钟         /* Number of references to this device */    atomic_t        refcnt ____cacheline_aligned_in_smp;    /* delayed register/unregister */    struct list_head    todo_list;//延迟注册/注销的网络设备链表       /* device index hash chain */    struct hlist_node    index_hlist;//以索引号为关键字的网络设备哈希链表       struct net_device    *link_watch_next;    /* register/unregister state machine */    enum { NETREG_UNINITIALIZED=0,           NETREG_REGISTERED,    /* completed register_netdevice */           NETREG_UNREGISTERING,    /* called unregister_netdevice */           NETREG_UNREGISTERED,    /* completed unregister todo */           NETREG_RELEASED,        /* called free_netdev */           NETREG_DUMMY,        /* dummy device for NAPI poll */    } reg_state; //设备注册/注销状态机    /* Called from unregister, can be used to call free_netdev */    void (*destructor)(struct net_device *dev);#ifdef CONFIG_NETPOLL    struct netpoll_info    *npinfo;//NETPOLL相关信息 #endif#ifdef CONFIG_NET_NS    /* Network namespace this network device is inside */    struct net        *nd_net; //网络命名空间#endif    /* mid-layer private */    void            *ml_priv; 
//中间层的私有数据    /* bridge stuff */    struct net_bridge_port    *br_port; //桥接模式    /* macvlan */    struct macvlan_port    *macvlan_port;    /* GARP */    struct garp_port    *garp_port;    /* class/net/name entry */    struct device        dev; //在sysfs文件系统中输出网络设备信息    /* space for optional statistics and wireless sysfs groups */    const struct attribute_group *sysfs_groups[3];    /* rtnetlink link ops */    const struct rtnl_link_ops *rtnl_link_ops; //rtnetlink操作函数       /* VLAN feature mask */    unsigned long vlan_features; //虚拟局域网相关     /* for setting kernel sock attribute on TCP connection setup */#define GSO_MAX_SIZE        65536    unsigned int        gso_max_size; //GSO最大值   #ifdef CONFIG_DCB    /* Data Center Bridging netlink ops */    struct dcbnl_rtnl_ops *dcbnl_ops; //DCB操作函数 #endif#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)    /* max exchange id for FCoE LRO by ddp */    unsigned int        fcoe_ddp_xid;#endif};
参考:http://blog.sina.com.cn/s/blog_636a55070101qfse.html

0x03 总结

     在学习过程中,发现相关代码都有良好的注释。仅仅需要做下翻译,然后理解其目的。
阅读全文
0 0