Linux source code: network packet reception -- from interrupt to protocol stack


This article is based on the 4.11 kernel.

After a hardware interrupt fires, Linux enters the do_IRQ function (arch/x86/kernel/irq.c):

/*
 * do_IRQ handles all normal device IRQ's (the special
 * SMP cross-CPU interrupts have their own specific
 * handlers).
 */
__visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
{
    struct pt_regs *old_regs = set_irq_regs(regs);
    struct irq_desc * desc;
    /* high bit used in ret_from_ code  */
    unsigned vector = ~regs->orig_ax;

    /*
     * NB: Unlike exception entries, IRQ entries do not reliably
     * handle context tracking in the low-level entry code.  This is
     * because syscall entries execute briefly with IRQs on before
     * updating context tracking state, so we can take an IRQ from
     * kernel mode with CONTEXT_USER.  The low-level entry code only
     * updates the context if we came from user mode, so we won't
     * switch to CONTEXT_KERNEL.  We'll fix that once the syscall
     * code is cleaned up enough that we can cleanly defer enabling
     * IRQs.
     */
    entering_irq();

    /* entering_irq() tells RCU that we're not quiescent.  Check it. */
    RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");

    desc = __this_cpu_read(vector_irq[vector]);

    if (!handle_irq(desc, regs)) {
        ack_APIC_irq();

        if (desc != VECTOR_RETRIGGERED) {
            pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n",
                         __func__, smp_processor_id(),
                         vector);
        } else {
            __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
        }
    }

    exiting_irq();

    set_irq_regs(old_regs);
    return 1;
}

set_irq_regs records the register snapshot (pt_regs) of the interrupted context in a per-CPU pointer and returns the previous value of that pointer. Saving the old value at the start of do_IRQ and restoring it at the end is what allows interrupts to nest.
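
The generic implementation is just a per-CPU pointer swap; a minimal sketch of set_irq_regs/get_irq_regs, modeled on include/asm-generic/irq_regs.h (details may vary by architecture), looks like this:

DECLARE_PER_CPU(struct pt_regs *, __irq_regs);

/* Return the register snapshot of the context this CPU interrupted. */
static inline struct pt_regs *get_irq_regs(void)
{
    return __this_cpu_read(__irq_regs);
}

/* Install a new snapshot pointer and hand back the old one, so the caller
 * can restore it on exit -- this is what lets IRQs nest cleanly. */
static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
{
    struct pt_regs *old_regs;

    old_regs = __this_cpu_read(__irq_regs);
    __this_cpu_write(__irq_regs, new_regs);

    return old_regs;
}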

desc = __this_cpu_read(vector_irq[vector]) looks up the interrupt descriptor registered for this vector. The interrupt descriptor has the following structure:

/**
 * struct irq_desc - interrupt descriptor
 * @irq_common_data:    per irq and chip data passed down to chip functions
 * @kstat_irqs:         irq stats per cpu
 * @handle_irq:         highlevel irq-events handler
 * @preflow_handler:    handler called before the flow handler (currently used by sparc)
 * @action:             the irq action chain
 * @status:             status information
 * @core_internal_state__do_not_mess_with_it: core internal status information
 * @depth:              disable-depth, for nested irq_disable() calls
 * @wake_depth:         enable depth, for multiple irq_set_irq_wake() callers
 * @irq_count:          stats field to detect stalled irqs
 * @last_unhandled:     aging timer for unhandled count
 * @irqs_unhandled:     stats field for spurious unhandled interrupts
 * @threads_handled:    stats field for deferred spurious detection of threaded handlers
 * @threads_handled_last: comparator field for deferred spurious detection of theraded handlers
 * @lock:               locking for SMP
 * @affinity_hint:      hint to user space for preferred irq affinity
 * @affinity_notify:    context for notification of affinity changes
 * @pending_mask:       pending rebalanced interrupts
 * @threads_oneshot:    bitfield to handle shared oneshot threads
 * @threads_active:     number of irqaction threads currently running
 * @wait_for_threads:   wait queue for sync_irq to wait for threaded handlers
 * @nr_actions:         number of installed actions on this descriptor
 * @no_suspend_depth:   number of irqactions on a irq descriptor with
 *                      IRQF_NO_SUSPEND set
 * @force_resume_depth: number of irqactions on a irq descriptor with
 *                      IRQF_FORCE_RESUME set
 * @rcu:                rcu head for delayed free
 * @kobj:               kobject used to represent this struct in sysfs
 * @dir:                /proc/irq/ procfs entry
 * @name:               flow handler name for /proc/interrupts output
 */
struct irq_desc {
    struct irq_common_data  irq_common_data;
    struct irq_data         irq_data;
    unsigned int __percpu   *kstat_irqs;
    irq_flow_handler_t      handle_irq;
#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
    irq_preflow_handler_t   preflow_handler;
#endif
    struct irqaction        *action;        /* IRQ action list */
    unsigned int            status_use_accessors;
    unsigned int            core_internal_state__do_not_mess_with_it;
    unsigned int            depth;          /* nested irq disables */
    unsigned int            wake_depth;     /* nested wake enables */
    unsigned int            irq_count;      /* For detecting broken IRQs */
    unsigned long           last_unhandled; /* Aging timer for unhandled count */
    unsigned int            irqs_unhandled;
    atomic_t                threads_handled;
    int                     threads_handled_last;
    raw_spinlock_t          lock;
    struct cpumask          *percpu_enabled;
    const struct cpumask    *percpu_affinity;
#ifdef CONFIG_SMP
    const struct cpumask    *affinity_hint;
    struct irq_affinity_notify *affinity_notify;
#ifdef CONFIG_GENERIC_PENDING_IRQ
    cpumask_var_t           pending_mask;
#endif
#endif
    unsigned long           threads_oneshot;
    atomic_t                threads_active;
    wait_queue_head_t       wait_for_threads;
#ifdef CONFIG_PM_SLEEP
    unsigned int            nr_actions;
    unsigned int            no_suspend_depth;
    unsigned int            cond_suspend_depth;
    unsigned int            force_resume_depth;
#endif
#ifdef CONFIG_PROC_FS
    struct proc_dir_entry   *dir;
#endif
#ifdef CONFIG_SPARSE_IRQ
    struct rcu_head         rcu;
    struct kobject          kobj;
#endif
    int                     parent_irq;
    struct module           *owner;
    const char              *name;
} ____cacheline_internodealigned_in_smp;

handle_irq essentially just performs a kernel-stack overflow check and then dispatches to desc->handle_irq. Afterwards do_IRQ calls exiting_irq to finish interrupt handling; exiting_irq in turn calls irq_exit, the function name familiar from earlier kernel versions.
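
On x86-64, handle_irq is short; roughly (from arch/x86/kernel/irq_64.c as I recall it, so treat the exact details as approximate):

bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
{
    stack_overflow_check(regs);        /* the overflow check mentioned above */

    if (IS_ERR_OR_NULL(desc))
        return false;

    generic_handle_irq_desc(desc);     /* calls desc->handle_irq(desc) */
    return true;
}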

/*
 * Exit an interrupt context. Process softirqs if needed and possible:
 */
void irq_exit(void)
{
#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
    local_irq_disable();
#else
    WARN_ON_ONCE(!irqs_disabled());
#endif

    account_irq_exit_time(current);
    preempt_count_sub(HARDIRQ_OFFSET);
    if (!in_interrupt() && local_softirq_pending())
        invoke_softirq();

    tick_irq_exit();
    rcu_irq_exit();
    trace_hardirq_exit(); /* must be last! */
}

irq_exit decrements the hardirq part of preempt_count. in_interrupt looks at the softirq and hardirq counters packed into preempt_count to decide whether we are still nested inside another interrupt; if we are, the counter is still non-zero even after this decrement. local_softirq_pending checks this CPU's __softirq_pending word to see whether any softirqs are pending. If we are not nested and there is pending softirq work, invoke_softirq is called.
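
Both checks boil down to cheap bit tests. A simplified sketch of how they are defined follows; the bit layout is my summary of include/linux/preempt.h, so take the exact widths as approximate:

/* preempt_count() packs several counters into one word, roughly:
 *   bits  0-7   PREEMPT  - preempt_disable() nesting
 *   bits  8-15  SOFTIRQ  - softirq nesting / local_bh_disable()
 *   bits 16-19  HARDIRQ  - hardirq nesting
 *   bit  20     NMI
 */
#define in_interrupt() \
    (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK | NMI_MASK))

/* One pending bit per softirq number (HI, TIMER, NET_TX, NET_RX, BLOCK, ...),
 * kept in per-CPU data and read by local_softirq_pending(). */
#define local_softirq_pending() \
    (__this_cpu_read(irq_stat.__softirq_pending))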

static inline void invoke_softirq(void)
{
    if (ksoftirqd_running())
        return;

    if (!force_irqthreads) {
#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
        /*
         * We can safely execute softirq on the current stack if
         * it is the irq stack, because it should be near empty
         * at this stage.
         */
        __do_softirq();
#else
        /*
         * Otherwise, irq_exit() is called on the task stack that can
         * be potentially deep already. So call softirq in its own stack
         * to prevent from any overrun.
         */
        do_softirq_own_stack();
#endif
    } else {
        wakeup_softirqd();
    }
}

force_irqthreads is a switch: when it is enabled, all softirqs are handled in the ksoftirqd threads, but by default it is off. I once ran into a puzzle here. I was running a program that did nothing but receive UDP packets. At an input rate of 100k pps (packets per second), mpstat -P ALL 1 showed soft% at only 1%; at 400k pps, soft% rose to 8%; and at 900k pps, soft% hit 100%. I could not make sense of it at the time. I later learned that this soft% figure only reflects the CPU time spent in ksoftirqd, and ksoftirqd is woken only when softirqs are preempted often enough for work to pile up. CPU consumed by softirqs run straight off the back of the hard interrupt is accounted to the process context that the hard interrupt interrupted.

When the interrupt rate and load are low, softirqs are finished right on the tail of the hard interrupt and ksoftirqd is never woken, so even though handling 100k pps should have cost roughly 11% of a core, almost none of that work ran in the ksoftirqd thread and the reported soft% stayed tiny. As the load rises, more and more softirq work gets preempted by hard interrupts and accumulates, ksoftirqd is woken more and more often, and soft% keeps climbing until, at saturation, it reads 100%. I asked this on StackOverflow and later answered it myself: https://stackoverflow.com/questions/44063602/the-linux-softirq-cpu-usage-looks-strange/44716705#44716705

asmlinkage __visible void __softirq_entry __do_softirq(void)
{
    unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
    unsigned long old_flags = current->flags;
    int max_restart = MAX_SOFTIRQ_RESTART;
    struct softirq_action *h;
    bool in_hardirq;
    __u32 pending;
    int softirq_bit;

    /*
     * Mask out PF_MEMALLOC s current task context is borrowed for the
     * softirq. A softirq handled such as network RX might set PF_MEMALLOC
     * again if the socket is related to swap
     */
    current->flags &= ~PF_MEMALLOC;

    pending = local_softirq_pending();
    account_irq_enter_time(current);

    __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
    in_hardirq = lockdep_softirq_start();

restart:
    /* Reset the pending bitmask before enabling irqs */
    set_softirq_pending(0);

    local_irq_enable();

    h = softirq_vec;

    while ((softirq_bit = ffs(pending))) {
        unsigned int vec_nr;
        int prev_count;

        h += softirq_bit - 1;

        vec_nr = h - softirq_vec;
        prev_count = preempt_count();

        kstat_incr_softirqs_this_cpu(vec_nr);

        trace_softirq_entry(vec_nr);
        h->action(h);
        trace_softirq_exit(vec_nr);
        if (unlikely(prev_count != preempt_count())) {
            pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
                   vec_nr, softirq_to_name[vec_nr], h->action,
                   prev_count, preempt_count());
            preempt_count_set(prev_count);
        }
        h++;
        pending >>= softirq_bit;
    }

    rcu_bh_qs();
    local_irq_disable();

    pending = local_softirq_pending();
    if (pending) {
        if (time_before(jiffies, end) && !need_resched() &&
            --max_restart)
            goto restart;

        wakeup_softirqd();
    }

    lockdep_softirq_end(in_hardirq);
    account_irq_exit_time(current);
    __local_bh_enable(SOFTIRQ_OFFSET);
    WARN_ON_ONCE(in_interrupt());
    tsk_restore_flags(current, old_flags, PF_MEMALLOC);
}

MAX_SOFTIRQ_TIME is the longest a single softirq pass may run, and MAX_SOFTIRQ_RESTART is the maximum number of times the loop may restart. account_irq_enter_time accounts the time spent handling softirqs, and __local_bh_disable_ip bumps the softirq count in preempt_count, which in effect prevents the task from being scheduled away while softirqs run.
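
Both limits are small compile-time constants in kernel/softirq.c; as far as I recall they are:

/* A single __do_softirq() invocation may run for about 2ms and restart its
 * scan at most 10 times before the rest is deferred to ksoftirqd. */
#define MAX_SOFTIRQ_TIME    msecs_to_jiffies(2)
#define MAX_SOFTIRQ_RESTART 10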

Then comes the loop. set_softirq_pending(0) first clears the pending bitmask, and local_irq_enable re-enables hard interrupts, so from this point softirq processing can itself be preempted by hard interrupts. The loop then picks the highest-priority pending softirq (the lowest set bit), bumps its per-CPU statistics, and runs h->action; when the bitmask is exhausted, hard interrupts are disabled again. __softirq_pending is re-read to see whether new softirqs were raised in the meantime; if there are, and the time budget is not used up, the restart limit is not exhausted, and no reschedule is needed, the code jumps back to restart, otherwise it calls wakeup_softirqd to wake the ksoftirqd thread.

Finally, the function accounts the softirq time against the current process, drops the softirq count in preempt_count again, and restores the process's PF_MEMALLOC flag.
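
When the wakeup_softirqd path is taken, the per-CPU ksoftirqd kthread eventually picks up the leftover work; its thread function is essentially the following (simplified from kernel/softirq.c):

static void run_ksoftirqd(unsigned int cpu)
{
    local_irq_disable();
    if (local_softirq_pending()) {
        /* Same entry point as the irq_exit() path, but now running in a
         * schedulable kernel thread, which is why this CPU time shows up
         * under ksoftirqd in tools like mpstat. */
        __do_softirq();
        local_irq_enable();
        cond_resched_rcu_qs();
        return;
    }
    local_irq_enable();
}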

The NIC's hard interrupt handler is what starts packet reception.

Taking drivers/net/ethernet/realtek/r8169.c as an example:

static irqreturn_t rtl8169_interrupt(int irq, void *dev_instance)
{
    struct net_device *dev = dev_instance;
    struct rtl8169_private *tp = netdev_priv(dev);
    int handled = 0;
    u16 status;

    status = rtl_get_events(tp);
    if (status && status != 0xffff) {
        status &= RTL_EVENT_NAPI | tp->event_slow;
        if (status) {
            handled = 1;

            rtl_irq_disable(tp);
            napi_schedule(&tp->napi);
        }
    }

    return IRQ_RETVAL(handled);
}

The handler first reads the status value from the NIC's interrupt status register; if any receive events are set, reception proceeds through NAPI, and the device's interrupts stay disabled (rtl_irq_disable) while the poll runs. napi_schedule calls napi_schedule_prep to check that the NAPI preconditions are met and then schedules the poll.
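
The intermediate wrappers are thin; roughly (simplified from include/linux/netdevice.h and net/core/dev.c):

/* Schedule a NAPI poll for this device unless one is already scheduled. */
static inline void napi_schedule(struct napi_struct *n)
{
    if (napi_schedule_prep(n))          /* test-and-set NAPI_STATE_SCHED */
        __napi_schedule(n);
}

void __napi_schedule(struct napi_struct *n)
{
    unsigned long flags;

    local_irq_save(flags);
    ____napi_schedule(this_cpu_ptr(&softnet_data), n);
    local_irq_restore(flags);
}

____napi_schedule then queues the device on the per-CPU poll list and raises the receive softirq: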

/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
                                     struct napi_struct *napi)
{
    list_add_tail(&napi->poll_list, &sd->poll_list);
    __raise_softirq_irqoff(NET_RX_SOFTIRQ);
}

__raise_softirq_irqoff raises a NET_RX_SOFTIRQ softirq, and from this point the packet is handled in softirq context.
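
NET_RX_SOFTIRQ was bound to its handler when the network stack initialized; as far as I remember, net_dev_init() in net/core/dev.c registers it roughly like this:

/* open_softirq() fills softirq_vec[], the array that __do_softirq() walks
 * with h->action(h). Most of net_dev_init() is elided here. */
static int __init net_dev_init(void)
{
    /* ... per-CPU softnet_data and backlog setup ... */
    open_softirq(NET_TX_SOFTIRQ, net_tx_action);
    open_softirq(NET_RX_SOFTIRQ, net_rx_action);
    /* ... */
    return 0;
}

So the h->action(h) call in __do_softirq lands in net_rx_action: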

static __latent_entropy void net_rx_action(struct softirq_action *h)
{
    struct softnet_data *sd = this_cpu_ptr(&softnet_data);
    unsigned long time_limit = jiffies + 2;
    int budget = netdev_budget;
    LIST_HEAD(list);
    LIST_HEAD(repoll);

    local_irq_disable();
    list_splice_init(&sd->poll_list, &list);
    local_irq_enable();

    for (;;) {
        struct napi_struct *n;

        if (list_empty(&list)) {
            if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
                goto out;
            break;
        }

        n = list_first_entry(&list, struct napi_struct, poll_list);
        budget -= napi_poll(n, &repoll);

        /* If softirq window is exhausted then punt.
         * Allow this to run for 2 jiffies since which will allow
         * an average latency of 1.5/HZ.
         */
        if (unlikely(budget <= 0 ||
                     time_after_eq(jiffies, time_limit))) {
            sd->time_squeeze++;
            break;
        }
    }

    local_irq_disable();

    list_splice_tail_init(&sd->poll_list, &list);
    list_splice_tail(&repoll, &list);
    list_splice(&list, &sd->poll_list);
    if (!list_empty(&sd->poll_list))
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);

    net_rps_action_and_irq_enable(sd);
out:
    __kfree_skb_flush();
}

The softirq handler calls the NAPI poll callbacks to pull in packets, and hard interrupts are enabled while the list is being walked, so new entries can still be appended to poll_list. This poll_list is exactly the list that napi_schedule added to earlier with
list_add_tail(&napi->poll_list, &sd->poll_list);
The function is bounded both by time (roughly 2 jiffies) and by a total packet budget (netdev_budget); if either runs out, it stops polling. If the loop exits for that reason and poll_list still has entries, another NET_RX_SOFTIRQ is raised. Finally the flow moves on to the RPS handling in net_rps_action_and_irq_enable.
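
For reference, napi_poll (a static helper in net/core/dev.c) is what dispatches into the driver; the sketch below is simplified from memory and leaves out netpoll locking and GRO flushing, so treat it as approximate:

static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
    int work = 0;
    int weight = n->weight;             /* set by netif_napi_add(), e.g. 64 */

    list_del_init(&n->poll_list);

    if (test_bit(NAPI_STATE_SCHED, &n->state))
        work = n->poll(n, weight);      /* the driver's poll, e.g. rtl8169_poll */

    if (work < weight)
        return work;                    /* the driver completed NAPI itself */

    /* Weight exhausted: keep this device scheduled and poll it again later. */
    list_add_tail(&n->poll_list, repoll);
    return work;
}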

In the r8169 driver, the receive work behind the NAPI poll callback is done by rtl_rx:

static int rtl_rx(struct net_device *dev, struct rtl8169_private *tp, u32 budget)
{
    unsigned int cur_rx, rx_left;
    unsigned int count;

    cur_rx = tp->cur_rx;

    for (rx_left = min(budget, NUM_RX_DESC); rx_left > 0; rx_left--, cur_rx++) {
        unsigned int entry = cur_rx % NUM_RX_DESC;
        struct RxDesc *desc = tp->RxDescArray + entry;
        u32 status;

        status = le32_to_cpu(desc->opts1) & tp->opts1_mask;
        if (status & DescOwn)
            break;

        /* This barrier is needed to keep us from reading
         * any other fields out of the Rx descriptor until
         * we know the status of DescOwn
         */
        dma_rmb();

        if (unlikely(status & RxRES)) {
            netif_info(tp, rx_err, dev, "Rx ERROR. status = %08x\n",
                       status);
            dev->stats.rx_errors++;
            if (status & (RxRWT | RxRUNT))
                dev->stats.rx_length_errors++;
            if (status & RxCRC)
                dev->stats.rx_crc_errors++;
            if (status & RxFOVF) {
                rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING);
                dev->stats.rx_fifo_errors++;
            }
            if ((status & (RxRUNT | RxCRC)) &&
                !(status & (RxRWT | RxFOVF)) &&
                (dev->features & NETIF_F_RXALL))
                goto process_pkt;
        } else {
            struct sk_buff *skb;
            dma_addr_t addr;
            int pkt_size;

process_pkt:
            addr = le64_to_cpu(desc->addr);
            if (likely(!(dev->features & NETIF_F_RXFCS)))
                pkt_size = (status & 0x00003fff) - 4;
            else
                pkt_size = status & 0x00003fff;

            /*
             * The driver does not support incoming fragmented
             * frames. They are seen as a symptom of over-mtu
             * sized frames.
             */
            if (unlikely(rtl8169_fragmented_frame(status))) {
                dev->stats.rx_dropped++;
                dev->stats.rx_length_errors++;
                goto release_descriptor;
            }

            skb = rtl8169_try_rx_copy(tp->Rx_databuff[entry],
                                      tp, pkt_size, addr);
            if (!skb) {
                dev->stats.rx_dropped++;
                goto release_descriptor;
            }

            rtl8169_rx_csum(skb, status);
            skb_put(skb, pkt_size);
            skb->protocol = eth_type_trans(skb, dev);

            rtl8169_rx_vlan_tag(desc, skb);

            if (skb->pkt_type == PACKET_MULTICAST)
                dev->stats.multicast++;

            napi_gro_receive(&tp->napi, skb);

            u64_stats_update_begin(&tp->rx_stats.syncp);
            tp->rx_stats.packets++;
            tp->rx_stats.bytes += pkt_size;
            u64_stats_update_end(&tp->rx_stats.syncp);
        }
release_descriptor:
        desc->opts2 = 0;
        rtl8169_mark_to_asic(desc, rx_buf_sz);
    }

    count = cur_rx - tp->cur_rx;
    tp->cur_rx = cur_rx;

    return count;
}

In outline this is one big loop that keeps reading packets out of the rx ring, parses the Ethernet header (eth_type_trans), and either drops the frame or hands the resulting skb (socket buffer) to napi_gro_receive, which eventually passes it to netif_receive_skb and thus into the kernel protocol stack. rtl_rx is called from rtl8169_poll; during driver initialization the poll callback is registered with
netif_napi_add(dev, &tp->napi, rtl8169_poll, R8169_NAPI_WEIGHT);
For more on the NAPI poll mechanism, see reference 4 below.
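
rtl8169_poll follows the standard NAPI poll contract: process at most budget packets, and only when fewer than budget were handled, complete NAPI and re-enable the device interrupt. A hedged sketch of that contract follows; my_priv, my_dev_rx and my_dev_enable_irq are hypothetical stand-ins for the driver's own state and helpers, not the actual r8169 code:

struct my_priv {
    struct napi_struct napi;
    /* ... device state ... */
};

/* Generic NAPI poll callback pattern, as registered with netif_napi_add(). */
static int my_poll(struct napi_struct *napi, int budget)
{
    struct my_priv *priv = container_of(napi, struct my_priv, napi);
    int work_done;

    work_done = my_dev_rx(priv, budget);     /* drain up to budget packets */

    if (work_done < budget) {
        /* Ring is empty: stop polling and let the next packet raise a
         * hard interrupt again. */
        napi_complete_done(napi, work_done);
        my_dev_enable_irq(priv);
    }
    return work_done;
}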

If the driver does not implement NAPI, the poll function used by default is the per-CPU backlog poller, process_backlog:

static int process_backlog(struct napi_struct *napi, int quota)
{
    struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
    bool again = true;
    int work = 0;

    /* Check if we have pending ipi, its better to send them now,
     * not waiting net_rx_action() end.
     */
    if (sd_has_rps_ipi_waiting(sd)) {
        local_irq_disable();
        net_rps_action_and_irq_enable(sd);
    }

    napi->weight = dev_rx_weight;
    while (again) {
        struct sk_buff *skb;

        while ((skb = __skb_dequeue(&sd->process_queue))) {
            rcu_read_lock();
            __netif_receive_skb(skb);
            rcu_read_unlock();
            input_queue_head_incr(sd);
            if (++work >= quota)
                return work;
        }

        local_irq_disable();
        rps_lock(sd);
        if (skb_queue_empty(&sd->input_pkt_queue)) {
            /*
             * Inline a custom version of __napi_complete().
             * only current cpu owns and manipulates this napi,
             * and NAPI_STATE_SCHED is the only possible flag set
             * on backlog.
             * We can use a plain write instead of clear_bit(),
             * and we dont need an smp_mb() memory barrier.
             */
            napi->state = 0;
            again = false;
        } else {
            skb_queue_splice_tail_init(&sd->input_pkt_queue,
                                       &sd->process_queue);
        }
        rps_unlock(sd);
        local_irq_enable();
    }

    return work;
}

This function dequeues packets from sd->process_queue, handles each skb, and likewise ends up calling __netif_receive_skb to push the packet into the protocol stack. The protocol stack itself is, of course, also bottom-half (softirq) work.
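
input_pkt_queue/process_queue are filled by netif_rx, which is what a legacy (non-NAPI) driver calls from its hard interrupt handler. A hedged sketch of such a handler follows; the my_hw_* helpers are hypothetical and only stand in for device-specific register access:

/* Legacy (non-NAPI) receive path: build the skb in hard-irq context and hand
 * it to netif_rx(), which queues it on this CPU's softnet_data backlog;
 * process_backlog() above drains it later in softirq context. */
static irqreturn_t my_legacy_rx_irq(int irq, void *dev_id)
{
    struct net_device *dev = dev_id;
    struct sk_buff *skb;
    int len;

    len = my_hw_read_frame_len(dev);            /* hypothetical helper */
    skb = netdev_alloc_skb(dev, len + NET_IP_ALIGN);
    if (!skb)
        return IRQ_HANDLED;                     /* drop on allocation failure */

    skb_reserve(skb, NET_IP_ALIGN);
    my_hw_copy_frame(dev, skb_put(skb, len));   /* hypothetical helper */
    skb->protocol = eth_type_trans(skb, dev);

    netif_rx(skb);       /* enqueue_to_backlog() -> sd->input_pkt_queue */
    return IRQ_HANDLED;
}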

Looking back over all of this, a packet goes through a great deal of work before it even reaches the layer 3 protocol stack, which is a large part of why the Linux kernel stack tops out at roughly a million packets per second per core. In high-rate scenarios, kernel-bypass techniques are therefore used to skip the kernel stack altogether, since the stack proper adds yet more locking and other performance limits on top of everything described here.

References:

  1. Linux NIC driver processing: http://blog.csdn.net/yuan1164345228/article/details/18078539
  2. How Linux interrupts are raised: http://www.cnblogs.com/tolimit/p/4444850.html
  3. The Linux softirq mechanism: http://www.cnblogs.com/tolimit/p/4495128.html
  4. How NAPI works: http://blog.csdn.net/zhangskd/article/details/21627963