TCP

来源:互联网 发布:ubuntu编译配置lnmp 编辑:程序博客网 时间:2024/05/21 18:49

位置:net/ipv4/tcp_offload.c

在网卡驱动中,当支持NAPI时,在中断中接收到数据后,最后通过调用napi_gro_receive将数据交给上层协议。在此过程中,会在每层协议中调用不同的回调函数,下面来看TCP层的情况。首先要声明一个结构:

/*
 * Offload descriptor for TCP over IPv4: groups the GSO (send side) and
 * GRO (receive side) callbacks in a single net_offload instance.
 */
static const struct net_offload tcpv4_offload = {
	.callbacks = {
		/* receive-side aggregation */
		.gro_receive    = tcp4_gro_receive,
		.gro_complete   = tcp4_gro_complete,
		/* send-side segmentation */
		.gso_send_check = tcp_v4_gso_send_check,
		.gso_segment    = tcp_gso_segment,
	},
};
这里定义了一系列的回调函数,这个结构要注册到内核中去:

/*
 * tcpv4_offload_init - register the TCP/IPv4 offload callbacks.
 *
 * Passes tcpv4_offload to inet_add_offload() for IPPROTO_TCP and returns
 * whatever that call returns.
 */
int __init tcpv4_offload_init(void)
{
	int err = inet_add_offload(&tcpv4_offload, IPPROTO_TCP);

	return err;
}
下面来看接收函数:

static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb){/* Use the IP hdr immediately proceeding for this transport */const struct iphdr *iph = skb_gro_network_header(skb);__wsum wsum;/* Don't bother verifying checksum if we're going to flush anyway. */if (NAPI_GRO_CB(skb)->flush)goto skip_csum;wsum = NAPI_GRO_CB(skb)->csum;switch (skb->ip_summed) {case CHECKSUM_NONE:wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb),    0);/* fall through */case CHECKSUM_COMPLETE:if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,  wsum)) {skb->ip_summed = CHECKSUM_UNNECESSARY;break;}NAPI_GRO_CB(skb)->flush = 1;return NULL;}skip_csum:return tcp_gro_receive(head, skb);}

/*
 * tcp_gro_receive - try to merge skb into a held packet of the same flow.
 * @head: list of held packets, walked through ->next
 * @skb:  newly received packet
 *
 * Returns pp: NULL by default, or the list position of the matched held
 * packet when the final checks decide it must not be held any longer.
 */
struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct sk_buff *p;
	struct tcphdr *th;
	struct tcphdr *th2;
	unsigned int len;
	unsigned int thlen;
	__be32 flags;
	/* mss == 1 makes the final "len < mss" test true only for len == 0. */
	unsigned int mss = 1;
	unsigned int hlen;
	unsigned int off;
	/* Default to flush so the early "goto out" paths mark skb for flush. */
	int flush = 1;
	int i;

这时,NAPI_GRO_CB(skb)->data_offset已经是TCP头部的位置了。

        /* off: current GRO offset, i.e. where the TCP header starts. */
        off = skb_gro_offset(skb);
        /* Pass 1: make the fixed-size part of the TCP header accessible. */
        hlen = off + sizeof(*th);
        /*
         * NOTE(review): presumably _fast returns a direct pointer, _hard
         * reports that hlen bytes are not directly reachable, and _slow
         * fetches them another way - helper bodies are not shown here.
         */
        th = skb_gro_header_fast(skb, off);
        if (skb_gro_header_hard(skb, hlen)) {
                th = skb_gro_header_slow(skb, hlen, off);
                if (unlikely(!th))
                        goto out;
        }

        /* doff is scaled by 4 here to obtain the header length in bytes. */
        thlen = th->doff * 4;
        /* A header shorter than struct tcphdr is rejected. */
        if (thlen < sizeof(*th))
                goto out;

        /* Pass 2: same fetch again, now for the full header length thlen. */
        hlen = off + thlen;
        if (skb_gro_header_hard(skb, hlen)) {
                th = skb_gro_header_slow(skb, hlen, off);
                if (unlikely(!th))
                        goto out;
        }
上面先得到TCP头部,并检查头部长度是否合法。下面将NAPI_GRO_CB(skb)->data_offset设为指向实际的数据(即跟在TCP头部之后的数据),然后得到实际数据的长度,并取出TCP标志字:

        /* Step over the thlen header bytes; len is what skb_gro_len() reports afterwards. */
        skb_gro_pull(skb, thlen);
        len = skb_gro_len(skb);
        /* Keep the TCP flag word of the new segment for the checks that follow. */
        flags = tcp_flag_word(th);
下面是一个循环,对napi的列表进行遍历:

        /* Walk the held packets; head always points at the link to p. */
        for (; (p = *head); head = &p->next) {
                /* Already ruled out as a different flow: skip. */
                if (!NAPI_GRO_CB(p)->same_flow)
                        continue;

                th2 = tcp_hdr(p);

                /*
                 * Compare the 32 bits starting at ->source in one go; any
                 * difference clears same_flow for this held packet.
                 */
                if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
                        NAPI_GRO_CB(p)->same_flow = 0;
                        continue;
                }

                goto found;
        }

        /* List exhausted without a match (p is NULL here). */
        goto out_check_final;
同样是设置same_flow的值:这里把从source字段开始的32位一次取出进行比较(在struct tcphdr中source之后紧接着是dest,因此相当于同时比较源端口和目的端口),不一致则把same_flow清0。

found:
	/* Include the IP ID check below from the inner most IP hdr */
	flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id;
	/* CWR set on the new segment forces a flush. */
	flush |= (__force int)(flags & TCP_FLAG_CWR);
	/* So does any flag difference other than CWR, FIN or PSH. */
	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
	/* The acknowledgment numbers must be identical. */
	flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
	/* Everything between the fixed header and thlen must match word by word. */
	for (i = sizeof(*th); i < thlen; i += 4)
		flush |= *(u32 *)((u8 *)th + i) ^
			 *(u32 *)((u8 *)th2 + i);

	mss = tcp_skb_mss(p);

	/* len is unsigned: true for len == 0 (wraps around) and for len > mss. */
	flush |= (len - 1) >= mss;
	/* Non-zero unless th->seq equals th2->seq + skb_gro_len(p). */
	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);

	/*
	 * No merge if any check failed or skb_gro_receive() returned non-zero.
	 * mss is reset to 1 so that "len < mss" at out_check_final is true
	 * only for len == 0.
	 */
	if (flush || skb_gro_receive(head, skb)) {
		mss = 1;
		goto out_check_final;
	}

	/* Reload p: skb_gro_receive() can store a new skb into *head. */
	p = *head;
	th2 = tcp_hdr(p);
	/* Propagate FIN/PSH of the merged segment into the held header. */
	tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
如果在上面找到了属于同一个流的sk_buff,则计算flush。会设置flush的情况包括:被匹配的p自身已带有flush或flush_id标记;新报文带有TCP_FLAG_CWR(拥塞窗口减小)标志;两个报文的标志位在TCP_FLAG_CWR、TCP_FLAG_FIN(结束会话)、TCP_FLAG_PSH(数据立即上交)之外存在差异;ack_seq不同;TCP头部中固定部分之后的内容(选项)不相同;新报文的数据长度为0或大于mss;新报文的序列号与p中已有的数据不连续。

如果设置了flush,则不用合并,否则要进行合并:

/*
 * skb_gro_receive - append the data of skb to the held packet *head.
 * @head: link to the held packet p; may be rewritten to a new skb
 * @skb:  packet whose data (from its GRO offset on) is to be merged
 *
 * Returns 0 on success, -E2BIG when the merge is refused, or -ENOMEM when
 * a needed sk_buff cannot be allocated.
 */
int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
	unsigned int offset = skb_gro_offset(skb);
	unsigned int headlen = skb_headlen(skb);
	struct sk_buff *nskb, *lp, *p = *head;
	unsigned int len = skb_gro_len(skb);
	unsigned int delta_truesize;
	unsigned int headroom;

	/* Refuse merges that would bring p->len to 65536 or more. */
	if (unlikely(p->len + len >= 65536))
		return -E2BIG;

	/* lp is the skb recorded as ->last for p; pinfo is lp's shared info. */
	lp = NAPI_GRO_CB(p)->last;
	pinfo = skb_shinfo(lp);
offset为skb实际数据的位置,headlen为skb主buffer的长度(不包含分片的长度),len为实际数据的长度。
/* Case 1: the GRO offset is at or past the end of skb's linear area. */
if (headlen <= offset) {
	skb_frag_t *frag;
	skb_frag_t *frag2;
	int i = skbinfo->nr_frags;
	int nr_frags = pinfo->nr_frags + i;

	/* Combined frag count does not fit into the frags array: chain instead. */
	if (nr_frags > MAX_SKB_FRAGS)
		goto merge;

这里是对Scatter-Gather I/O的处理,先得到两个sk_buff的总的分片数。在skb_shared_info中有一个skb_frag_t数组,大小为MAX_SKB_FRAGS(其取值与内核版本和页大小有关,通常为16到18,请以所用内核的skbuff.h为准)。如果总分片数超过数组大小,则跳转到merge,改为通过frag_list把skb整个链接到p上。

                /* From here on offset is relative to the end of the linear area. */
                offset -= headlen;
                pinfo->nr_frags = nr_frags;
                skbinfo->nr_frags = 0;

                /*
                 * Copy skb's i frag descriptors behind the existing ones,
                 * walking both arrays backwards from their ends.
                 * NOTE(review): the do/while runs at least once, so this
                 * assumes i >= 1 - confirm against the callers.
                 */
                frag = pinfo->frags + nr_frags;
                frag2 = skbinfo->frags + i;
                do {
                        *--frag = *--frag2;
                } while (--i);
把skbinfo中的分片信息拷贝到pinfo的后面。
/*
 * After the backwards copy, frag refers to the first descriptor that was
 * copied; skip the remaining offset bytes inside it.
 */
frag->page_offset += offset;
skb_frag_size_sub(frag, offset);

/* all fragments truesize : remove (head size + sk_buff) */
delta_truesize = skb->truesize - SKB_TRUESIZE(skb_end_offset(skb));

/* skb no longer accounts for any paged data. */
skb->truesize -= skb->data_len;
skb->len -= skb->data_len;
skb->data_len = 0;

NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
goto done;
调整frag(此时指向拷贝过来的第一个分片)的偏移和大小,跳过其中不属于实际数据的offset字节;同时调整skb的truesize、len和data_len。最后将此skb标记为可以释放(NAPI_GRO_FREE)。
/* Case 2: part of the data is in the linear area and skb->head_frag is set. */
} else if (skb->head_frag) {
	int nr_frags = pinfo->nr_frags;
	/* First unused descriptor slot of the held packet. */
	skb_frag_t *frag = pinfo->frags + nr_frags;
	struct page *page = virt_to_head_page(skb->head);
	/* Bytes of the linear area that lie behind the GRO offset. */
	unsigned int first_size = headlen - offset;
	unsigned int first_offset;

	/* One slot for the linear data plus one per frag of skb must fit. */
	if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
		goto merge;
否则,如果skb->head_frag被置位(从字段名和后面的用法看,表示skb的线性缓冲区本身位于内存页中,可以直接当作一个分片来引用),则进入这个分支。page为skb->head所在的内存页。
/* Offset of the first data byte, measured from the start of page. */
first_offset = skb->data -
	       (unsigned char *)page_address(page) +
	       offset;

pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;

/* Describe the linear data of skb as a new frag of the held packet. */
frag->page.p	  = page;
frag->page_offset = first_offset;
skb_frag_size_set(frag, first_size);

/* Followed by all frag descriptors skb already had. */
memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
/* We dont need to clear skbinfo->nr_frags here */

delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
goto done;
}
这里先把skb线性区中的实际数据作为一个新的分片加入pinfo,然后直接将skbinfo中的分片信息通过memcpy拷贝到它的后面,并将skb标记为NAPI_GRO_FREE_STOLEN_HEAD。
/* A frag_list already exists: go straight to chaining. */
if (pinfo->frag_list)
	goto merge;
/* Refuse unless p's data length equals its gso_size. */
if (skb_gro_len(p) != pinfo->gso_size)
	return -E2BIG;

/* New skb sized for p's headroom plus the bytes before p's GRO offset. */
headroom = skb_headroom(p);
nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC);
if (unlikely(!nskb))
	return -ENOMEM;
分配一个新的sk_buff。
/* NOTE(review): __copy_skb_header presumably clones p's metadata into nskb - helper not shown. */
__copy_skb_header(nskb, p);
nskb->mac_len = p->mac_len;

/* Same headroom as p, then room for the bytes before p's GRO offset. */
skb_reserve(nskb, headroom);
__skb_put(nskb, skb_gro_offset(p));

/* Header offsets in nskb are set from the corresponding offsets in p. */
skb_set_mac_header(nskb, skb_mac_header(p) - p->data);
skb_set_network_header(nskb, skb_network_offset(p));
skb_set_transport_header(nskb, skb_transport_offset(p));

/* Pull p by its GRO offset, then copy the bytes from p's MAC header up to p->data. */
__skb_pull(p, skb_gro_offset(p));
memcpy(skb_mac_header(nskb), skb_mac_header(p),
       p->data - skb_mac_header(p));
将p的头部(从MAC头到实际数据之前的部分)拷贝到新的sk_buff中,并把p的data指针移到实际数据处。
/* p becomes the first entry of nskb's frag_list; gso_size moves to nskb. */
skb_shinfo(nskb)->frag_list = p;
skb_shinfo(nskb)->gso_size = pinfo->gso_size;
pinfo->gso_size = 0;
skb_header_release(p);
NAPI_GRO_CB(nskb)->last = p;

/* Account p's length and truesize in nskb. */
nskb->data_len += p->len;
nskb->truesize += p->truesize;
nskb->len += p->len;

/* nskb takes p's place in the list. */
*head = nskb;
nskb->next = p->next;
p->next = NULL;

/* From here on p refers to the new head skb. */
p = nskb;
调整新的sk_buff中的各值:把p挂到nskb的frag_list上,把gso_size转移给nskb,把p的长度计入nskb,并让nskb取代p在链表中的位置(*head指向nskb)。
merge:
	/* skb's whole truesize is what gets added to p at done:. */
	delta_truesize = skb->truesize;
	if (offset > headlen) {
		/*
		 * The GRO offset reaches into the paged data: remove the
		 * excess from frags[0] and from skb's length counters, so
		 * that only headlen bytes remain to be pulled.
		 */
		unsigned int eat = offset - headlen;

		skbinfo->frags[0].page_offset += eat;
		skb_frag_size_sub(&skbinfo->frags[0], eat);
		skb->data_len -= eat;
		skb->len -= eat;
		offset = headlen;
	}


__skb_pull(skb, offset);if (NAPI_GRO_CB(p)->last == p)skb_shinfo(p)->frag_list = skb;elseNAPI_GRO_CB(p)->last->next = skb;NAPI_GRO_CB(p)->last = skb;skb_header_release(skb);lp = p;


done:
	/* One more segment merged into p; grow p's length and truesize. */
	NAPI_GRO_CB(p)->count++;
	p->data_len += len;
	p->truesize += delta_truesize;
	p->len += len;
	/* When lp is a different skb than p, it gets the same additions. */
	if (lp != p) {
		lp->data_len += len;
		lp->truesize += delta_truesize;
		lp->len += len;
	}
	NAPI_GRO_CB(skb)->same_flow = 1;
	return 0;
}
EXPORT_SYMBOL_GPL(skb_gro_receive);




out_check_final:
	/* Flush when the segment is shorter than mss ... */
	flush = len < mss;
	/* ... or carries any of URG, PSH, RST, SYN or FIN. */
	flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
					TCP_FLAG_RST | TCP_FLAG_SYN |
					TCP_FLAG_FIN));

	/*
	 * A held packet was matched (p != NULL) and either skb was not merged
	 * (same_flow unset) or a flush is required: report its list position.
	 */
	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
		pp = head;

out:
	/* Record the decision on skb itself; an existing flush mark is kept. */
	NAPI_GRO_CB(skb)->flush |= (flush != 0);

	return pp;
}


0 0
原创粉丝点击