x86体系结构下Linux-2.6.26的中断处理

来源：互联网发布：零点有数科技知乎编辑：程序博客网时间：2024/05/22 13:30

http://home.ustc.edu.cn/~hchunhui/linux_intr.html

PB09210183 何春晖

Linux对x86中断机制的利用

IDT的初始化
8259A的初始化和基本操作
中断、异常、系统调用入口
上下文的保存与恢复
总结

Linux中断处理的体系结构无关层

irq_chip
irqaction
irq_desc
do_IRQ

中断处理程序的注册与运行

运行

软中断、tasklet和工作队列机制

软中断
tasklet
工作队列

中断实践——截获键盘中断

Linux对x86中断机制的利用

x86系统中，当CPU处于实模式和保护模式时，中断机制是不同的。由于Linux只工作在保护模式下，因此只说明保护模式下的中断机制。

在保护模式下，CPU根据中断描述符表（IDT）来决定中断和异常的处理。当中断或异常发生时，CPU通过查中断向量号对应IDT的表项决定动作。IDT最多有256项。根据Intel的分配，IDT的前32项是由Intel定义的异常和保留项。

IDT的每一项为一个门描述符，记录了该下标中断发生时要运行的程序的入口和权限信息。在x86 CPU中，门被分为多种类型，且权限的检查十分繁琐。Linux主体部分只使用这个机制很简单的一部分。

另外，传统的x86系统使用8259A来作为中断控制器，因此还需要对8259A进行编程，才能组成完整的x86中断系统。

下面从几个方面来说明Linux对上述机制的利用。

IDT的初始化

IDT在内核进入保护模式时就已经初始化，不过真正的初始化在init/main.c::start_kernel调用的 arch/x86/kernel/traps_32.c::trap_init和arch/x86/kernel/i8259_32.c::init_IRQ中：

void __init trap_init(void){....        set_trap_gate(0,  &divide_error);        set_intr_gate(1,  &debug);        set_intr_gate(2,  &nmi);        set_system_intr_gate(3, &int3); /* int3/4 can be called from all */        set_system_gate(4, &overflow);        set_trap_gate(5,  &bounds);        set_trap_gate(6,  &invalid_op);        set_trap_gate(7,  &device_not_available);        set_task_gate(8,  GDT_ENTRY_DOUBLEFAULT_TSS);        set_trap_gate(9,  &coprocessor_segment_overrun);        set_trap_gate(10, &invalid_TSS);        set_trap_gate(11, &segment_not_present);        set_trap_gate(12, &stack_segment);        set_trap_gate(13, &general_protection);        set_intr_gate(14, &page_fault);        set_trap_gate(15, &spurious_interrupt_bug);        set_trap_gate(16, &coprocessor_error);        set_trap_gate(17, &alignment_check);#ifdef CONFIG_X86_MCE        set_trap_gate(18, &machine_check);#endif        set_trap_gate(19, &simd_coprocessor_error);....        set_system_gate(SYSCALL_VECTOR, &system_call);....}

void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));void __init native_init_IRQ(void){....        /* all the set up before the call gates are initialised */        pre_intr_init_hook();....        for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {                int vector = FIRST_EXTERNAL_VECTOR + i;                if (i >= NR_IRQS)                        break;                /* SYSCALL_VECTOR was reserved in trap_init. */                if (!test_bit(vector, used_vectors))                        set_intr_gate(vector, interrupt[i]);        }....}

这两段程序设置好了中断、异常和系统调用的表项。

8259A的初始化和基本操作

8259A初始化流程初始调用是上面native_init_IRQ的pre_intr_init_hook()函数，辗转到达arch/x86/kernel/i8259_32.c::init_ISA_irqs和init_8259A中：

/*
 * Set up the 16 legacy ISA interrupt lines.
 *
 * Calls init_bsp_APIC() when CONFIG_X86_LOCAL_APIC is set, programs the
 * 8259A pair via init_8259A() with auto_eoi == 0, and then registers
 * i8259A_chip together with the handle_level_irq flow handler (named "XT")
 * for each of IRQ 0..15.
 */
void __init init_ISA_irqs (void)
{
        int i;

#ifdef CONFIG_X86_LOCAL_APIC
        init_bsp_APIC();
#endif
        init_8259A(0);

        /*
         * 16 old-style INTA-cycle interrupts:
         */
        for (i = 0; i < 16; i++) {
                set_irq_chip_and_handler_name(i, &i8259A_chip,
                                              handle_level_irq, "XT");
        }
}

/*
 * (Re)program both 8259A controllers.
 *
 * @auto_eoi: non-zero selects Auto EOI for the master; the value is also
 *            recorded in i8259A_auto_eoi and decides which routine is
 *            installed as i8259A_chip.mask_ack.
 *
 * The whole sequence runs under i8259A_lock with interrupts saved/disabled.
 * Both controllers are fully masked first, the initialisation words are
 * written (master then slave), and finally the cached masks are restored.
 * The order of the writes matters: after ICW1 on the command port the
 * following writes to the data (IMR) port are taken as ICW2, ICW3 and ICW4.
 */
void init_8259A(int auto_eoi)
{
        unsigned long flags;

        i8259A_auto_eoi = auto_eoi;

        spin_lock_irqsave(&i8259A_lock, flags);

        outb(0xff, PIC_MASTER_IMR);     /* mask all of 8259A-1 */
        outb(0xff, PIC_SLAVE_IMR);      /* mask all of 8259A-2 */

        /*
         * outb_pic - this has to work on a wide range of PC hardware.
         */
        outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
        outb_pic(0x20 + 0, PIC_MASTER_IMR);     /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
        outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */
        if (auto_eoi)   /* master does Auto EOI */
                outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
        else            /* master expects normal EOI */
                outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);

        outb_pic(0x11, PIC_SLAVE_CMD);  /* ICW1: select 8259A-2 init */
        outb_pic(0x20 + 8, PIC_SLAVE_IMR);      /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
        outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);        /* 8259A-2 is a slave on master's IR2 */
        outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */

        if (auto_eoi)
                /*
                 * In AEOI mode we just have to mask the interrupt
                 * when acking.
                 */
                i8259A_chip.mask_ack = disable_8259A_irq;
        else
                i8259A_chip.mask_ack = mask_and_ack_8259A;

        udelay(100);            /* wait for 8259A to initialize */

        outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
        outb(cached_slave_mask, PIC_SLAVE_IMR);   /* restore slave IRQ mask */

        spin_unlock_irqrestore(&i8259A_lock, flags);
}

上面一段程序用set_irq_chip_and_handler_name向上注册了8259A芯片；下面的程序重新对8259A编程，将中断向量设为0x20~0x27(master)、0x28~0x2f(slave)。

在i8259_32.c中，还有一些基本操作代码，最终形成了如下结构：

/*
 * irq_chip descriptor for the 8259A ("XT-PIC").
 *
 * Both .mask and .disable map to disable_8259A_irq; .unmask maps to
 * enable_8259A_irq. .mask_ack is only a default here: init_8259A()
 * overwrites it with disable_8259A_irq when Auto EOI is selected.
 */
static struct irq_chip i8259A_chip = {
        .name           = "XT-PIC",
        .mask           = disable_8259A_irq,
        .disable        = disable_8259A_irq,
        .unmask         = enable_8259A_irq,
        .mask_ack       = mask_and_ack_8259A,
};

中断、异常、系统调用入口

这三者的入口都在arch/x86/kernel/entry_32.S中。其中异常和系统调用的入口可以直接找到，而中断入口interrupt数组是用下列宏语句生成的：

/*
 * Build the interrupt[] table and the per-IRQ entry stubs.
 *
 * The table symbol `interrupt` is opened in .rodata and the stubs
 * (irq_entries_start) in .text. Each of the NR_IRQS iterations of the
 * .rept emits one stub at local label 1 that pushes ~vector and jumps to
 * common_interrupt, then switches section with .previous to emit
 * `.long 1b` (the address of that stub) and switches back to .text.
 * NOTE(review): .previous is presumed to select .rodata here, i.e. the
 * section that was active before .text - confirm against the GAS manual.
 */
.section .rodata,"a"
ENTRY(interrupt)
.text

ENTRY(irq_entries_start)
        RING0_INT_FRAME
vector=0
.rept NR_IRQS
        ALIGN
 /* every stub after the first undoes the CFA adjustment of the previous push */
 .if vector
        CFI_ADJUST_CFA_OFFSET -4
 .endif
1:      pushl $~(vector)
        CFI_ADJUST_CFA_OFFSET 4
        jmp common_interrupt
 .previous
        .long 1b
 .text
vector=vector+1
.endr
END(irq_entries_start)

.previous
END(interrupt)

可见每个中断向量的处理程序都是在压栈~vector后，就转到了common_interrupt处，最终进入do_IRQ。

上下文的保存与恢复

在进入common_interrupt后，代码如下：

common_interrupt:        SAVE_ALL        TRACE_IRQS_OFF        movl %esp,%eax        call do_IRQ        jmp ret_from_intr...ret_from_intr:        GET_THREAD_INFO(%ebp)check_userspace:        movl PT_EFLAGS(%esp), %eax      # mix EFLAGS and CS        movb PT_CS(%esp), %al        andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax        cmpl $USER_RPL, %eax        jb resume_kernel                # not returning to v8086 or userspaceENTRY(resume_userspace)...        jmp restore_allEND(ret_from_exception)...ENTRY(resume_kernel)...

可见向量号压栈后，首先调用了SAVE_ALL宏来保存所有的通用寄存器，然后进入do_IRQ函数执行。在执行完毕后，跳到ret_from_intr中，最终到restore_all中恢复保存的上下文并返回。

总结

Linux通过上述代码利用x86中断机制进行了最底层的操作，为在上层使用体系结构无关的描述提供了基础。

Linux中断处理的体系结构无关层

Linux中断处理的体系结构无关层是由irqaction、irq_desc和irq_chip几个结构向上向下联系起来，从而达到了隔离体系结构相关和无关代码的目的。

irq_chip

irq_chip是无关层向下的接口，它提供了对中断控制器的抽象。前面已经看到，在传统x86系统上，8259A将进行注册。

irqaction

irqaction是无关层向上的接口，它记录了上方代码对某IRQ所进行的响应的信息。在include/linux/interrupt.h中，如下：

/* Signature of an interrupt handler: (irq number, opaque cookie). */
typedef irqreturn_t (*irq_handler_t)(int, void *);

/*
 * One registered response to an IRQ. Several irqactions for the same IRQ
 * are chained through ->next, which is how a line is shared.
 */
struct irqaction {
        irq_handler_t handler;          /* function to run, e.g. timer_interrupt */
        unsigned long flags;            /* IRQF_* flags */
        cpumask_t mask;                 /* CPU mask, e.g. cpumask_of_cpu(0) for the timer */
        const char *name;               /* descriptive name, e.g. "timer" */
        void *dev_id;                   /* opaque cookie; presumably the second handler argument - TODO confirm */
        struct irqaction *next;         /* next action on the same IRQ, NULL at the tail */
        int irq;                        /* IRQ number, filled in by setup_irq() */
        struct proc_dir_entry *dir;     /* procfs entry; reset to NULL by setup_irq() before register_handler_proc() */
};

响应相同IRQ的irqaction通过next形成链表，从而实现了IRQ复用。

irq_desc

irq_desc是一个软件层面上的IRQ描述符（类似于IDT，但是体系结构无关的），它记录了每个IRQ号底部的控制器和对应的irqaction等内容。定义在include/linux/irq.h中，如下：

/*
 * Per-IRQ software descriptor: ties an IRQ number to its flow handler, the
 * controller (irq_chip) underneath it and the list of irqactions above it.
 * One descriptor exists for each of the NR_IRQS entries of irq_desc[].
 */
struct irq_desc {
        irq_flow_handler_t      handle_irq;     /* flow handler, e.g. handle_level_irq */
        struct irq_chip         *chip;          /* controller operations, e.g. &i8259A_chip */
        struct msi_desc         *msi_desc;
        void                    *handler_data;
        void                    *chip_data;
        struct irqaction        *action;        /* IRQ action list */
        unsigned int            status;         /* IRQ status */
        unsigned int            depth;          /* nested irq disables */
        unsigned int            wake_depth;     /* nested wake enables */
        unsigned int            irq_count;      /* For detecting broken IRQs */
        unsigned int            irqs_unhandled;
        unsigned long           last_unhandled; /* Aging timer for unhandled count */
        spinlock_t              lock;           /* held by setup_irq() while the action list is modified */
#ifdef CONFIG_SMP
        cpumask_t               affinity;
        unsigned int            cpu;
#endif
#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
        cpumask_t               pending_mask;
#endif
#ifdef CONFIG_PROC_FS
        struct proc_dir_entry   *dir;
#endif
        const char              *name;
} ____cacheline_internodealigned_in_smp;

/* The descriptor table, indexed by IRQ number. */
extern struct irq_desc irq_desc[NR_IRQS];

do_IRQ

do_IRQ是由硬件中断驱动上层动作的核心函数。

do_IRQ()函数的等价代码（摘自课件）：

        int irq = ~regs->orig_ax;               //1        irq_desc[irq]->handle_irq(irq, desc);   //2        mask_ack_irq(desc, irq);                //3        handle_IRQ_event(irq,&regs,irq_desc[irq].action);//4        irq_desc[irq].handler->end(irq);        //5        irq_exit(); //其中invoke_softirq  //6

其中：

1句取得对应的中断向量
2句调用中断处理句柄，对8259，就是handle_level_irq
3句应答PIC的中断，并禁用这条IRQ线。(为串行处理同类型中断)
4调用handle_IRQ_event()执行中断服务例程，例如timer_interrupt
5句通知PIC重新激活这条IRQ线，允许处理同类型中断
6必要时触发softirq

由此可以看出Linux在处理中断时的策略。

中断处理程序的注册与运行

下面以时钟中断为例分析中断的注册与运行。

注册

时钟中断IRQ为0，在arch/x86/mach-default/setup.c注册：

/*
 * irqaction for the system timer on IRQ 0. The CPU mask starts out empty
 * and is replaced with CPU 0 in time_init_hook() before registration.
 */
static struct irqaction irq0  = {
        .handler = timer_interrupt,
        .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL,
        .mask = CPU_MASK_NONE,
        .name = "timer"
};

/**
 * time_init_hook - do any specific initialisations for the system timer.
 *
 * Description:
 *      Must plug the system timer interrupt source at HZ into the IRQ listed
 *      in irq_vectors.h:TIMER_IRQ
 *
 *      Here that means binding irq0 to CPU 0 and registering it on IRQ 0
 *      through setup_irq().
 **/
void __init time_init_hook(void)
{
        irq0.mask = cpumask_of_cpu(0);
        setup_irq(0, &irq0);
}

可见kernel/irq/manage.c::setup_irq函数负责中断处理程序的注册，主干如下：

int setup_irq(unsigned int irq, struct irqaction *new){....        spin_lock_irqsave(&desc->lock, flags);        p = &desc->action;        old = *p;        if (old) {....                /* add new interrupt at end of irq queue */                do {                        p = &old->next;                        old = *p;                } while (old);                shared = 1;        }        *p = new;....        spin_unlock_irqrestore(&desc->lock, flags);        new->irq = irq;        register_irq_proc(irq);        new->dir = NULL;        register_handler_proc(irq, new);        return 0;....}

可见主要工作就是把irqaction插入到irq_desc::action的最后。

不过，更加常见的中断处理程序注册是通过request_irq来进行的，如drivers/net/phy/phy.c::phy_start_interrupts，request_irq最终也调用setup_irq。

运行

当时钟中断到来时，CPU查询IDT，并由记录转到interrupt[0]处执行，之后流程为：

common_interrupt
do_IRQ
....
arch/x86/kernel/time_32.c::timer_interrupt

软中断、tasklet和工作队列机制

这三种机制是内核对可延迟中断处理的支持。这三者特点为：

tasklet在软中断之上实现
在同一个CPU上软中断/tasklet不嵌套
同类tasklet不能并发
软中断由内核静态分配，tasklet可以在运行时分配和初始化
软中断/tasklet不能睡眠、阻塞；工作队列以内核线程身份运行，可以睡眠、阻塞

可延迟函数上可以执行4种操作：

初始化：定义一个新的可延迟函数，通常在内核初始化时进行
激活：设置可延迟函数在下一轮处理中执行
屏蔽：有选择地屏蔽一个可延迟函数，这样即使被激活也不会被运行
执行：在特定的时间执行可延迟函数

软中断

以TIMER_SOFTIRQ为例。

初始化

kernel/timer.c::init_timers中，有：

        open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);

open_softirq在kernel/softirq.c中，它做的事情很简单：

/*
 * Install the handler for softirq number @nr: store @action and @data in
 * softirq_vec[nr]. Nothing else is done here; no bounds check on @nr and
 * no locking are visible in this function.
 */
void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)
{
        softirq_vec[nr].data = data;
        softirq_vec[nr].action = action;
}

而run_timer_softirq就在kernel/timer.c中。当这个软中断激活时，就会被执行。

激活

激活过程颇为曲折。时钟中断来临时，会依次走如下流程：

arch/x86/kernel/time_32.c::timer_interrupt
include/asm-x86/mach-default/do_timer.h::do_timer_interrupt_hook ¹
kernel/time/tick-common.c::tick_periodic
kernel/timer.c::update_process_times
kernel/timer.c::run_local_timers
kernel/softirq.c::raise_softirq

raise_softirq最终会将local_softirq_pending对应位置置位，这就完成了激活。

注：

1. event_handler最终注册为kernel/time/tick-common.c::tick_periodic，分析见http://blog.chinaunix.net/space.php?uid=20729605&do=blog&id=1884329

执行

前面在分析do_IRQ函数时，说到最后调用irq_exit。这个函数如下：

void irq_exit(void){        account_system_vtime(current);        trace_hardirq_exit();        sub_preempt_count(IRQ_EXIT_OFFSET);        if (!in_interrupt() && local_softirq_pending())                invoke_softirq();....

此函数先用sub_preempt_count退出一层中断上下文。若中断没有嵌套，且local_softirq_pending有置位（假设已经raise_softirq），则会运行invoke_softirq，最终进入__do_softirq：

asmlinkage void __do_softirq(void){....        pending = local_softirq_pending();....restart:        /* Reset the pending bitmask before enabling irqs */        set_softirq_pending(0);        local_irq_enable();        h = softirq_vec;        do {                if (pending & 1) {                        h->action(h);                        rcu_bh_qsctr_inc(cpu);                }                h++;                pending >>= 1;        } while (pending);        local_irq_disable();        pending = local_softirq_pending();        if (pending && --max_restart)                goto restart;        if (pending)                wakeup_softirqd();....}

这里的do-while循环在允许中断的环境下将每个pending的软中断执行一遍。然后重新查看local_softirq_pending，若再次非空，则重启循环继续执行。但是若负荷特别重，以致max_restart所允许的重启次数用尽，则唤醒ksoftirqd内核线程，交由它继续执行，而自己退出，以免正常任务饿死。

这里，kernel/timer.c::run_timer_softirq会被执行。

tasklet

在kernel/softirq.c::softirq_init中，有：

        open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL);        open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL);

因此tasklet是使用softirq机制实现的。

这里以键盘驱动为例（drivers/char/keyboard.c），说明tasklet的接口。tasklet_hi类似，只不过优先级高。

初始化

在drivers/char/keyboard.c的1030行有：

DECLARE_TASKLET_DISABLED(keyboard_tasklet, kbd_bh, 0);

这就定义了一个叫keyboard_tasklet的tasklet_struct结构，延迟函数为kbd_bh。

使能和屏蔽

在这个文件中，有多处使能和屏蔽，分布在kbd_start和kbd_init中。

使能用的是tasklet_enable(&keyboard_tasklet)，屏蔽用的是tasklet_disable(&keyboard_tasklet)。

激活

在kbd_event和kbd_init中，各有一次激活操作，用的都是tasklet_schedule(&keyboard_tasklet)。

在include/linux/interrupt.h中，有：

/*
 * Request that tasklet @t be run. The TASKLET_STATE_SCHED bit in t->state
 * is tested and set; only when it was previously clear is the tasklet
 * actually queued via __tasklet_schedule(). A tasklet that is already
 * marked scheduled is therefore not queued a second time.
 */
static inline void tasklet_schedule(struct tasklet_struct *t)
{
        if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
                __tasklet_schedule(t);
}

kernel/softirq.c中：

/*
 * Append @t to the tail of the current CPU's tasklet_vec list and raise
 * TASKLET_SOFTIRQ. Local interrupts are disabled around the list update
 * and the raise, and restored to their previous state afterwards.
 */
void __tasklet_schedule(struct tasklet_struct *t)
{
        unsigned long flags;

        local_irq_save(flags);
        t->next = NULL;
        /* tail points at the last ->next slot (or at head when the list is empty) */
        *__get_cpu_var(tasklet_vec).tail = t;
        __get_cpu_var(tasklet_vec).tail = &(t->next);
        raise_softirq_irqoff(TASKLET_SOFTIRQ);
        local_irq_restore(flags);
}

因此，对tasklet的激活操作是：将tasklet插入tasklet_vec队列尾，并raise_softirq。并且由tasklet_schedule中的if判断，若此tasklet已经schedule但还没执行，激活操作将被忽略。这里体现了tasklet机制与softirq的区别。

执行

由于套用softirq机制，前半段执行与softirq相同。softirq机制最终调用tasklet_action函数：

/*
 * Softirq handler registered for TASKLET_SOFTIRQ.
 *
 * Detaches the current CPU's whole tasklet_vec list (with local interrupts
 * disabled) and walks it. A tasklet is run when tasklet_trylock() succeeds
 * and its count is zero; otherwise it is put back at the tail of
 * tasklet_vec and TASKLET_SOFTIRQ is raised again so it is retried later.
 */
static void tasklet_action(struct softirq_action *a)
{
        struct tasklet_struct *list;

        /* take the pending list and leave an empty one behind */
        local_irq_disable();
        list = __get_cpu_var(tasklet_vec).head;
        __get_cpu_var(tasklet_vec).head = NULL;
        __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head;
        local_irq_enable();

        while (list) {
                struct tasklet_struct *t = list;

                /* advance first: t->next is overwritten if t is requeued below */
                list = list->next;

                if (tasklet_trylock(t)) {
                        /* NOTE(review): a non-zero count presumably means the tasklet is disabled - confirm */
                        if (!atomic_read(&t->count)) {
                                /* a queued tasklet must still carry the SCHED bit */
                                if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
                                        BUG();
                                t->func(t->data);
                                tasklet_unlock(t);
                                continue;
                        }
                        tasklet_unlock(t);
                }

                /* could not run it now: requeue at the tail and re-raise the softirq */
                local_irq_disable();
                t->next = NULL;
                *__get_cpu_var(tasklet_vec).tail = t;
                __get_cpu_var(tasklet_vec).tail = &(t->next);
                __raise_softirq_irqoff(TASKLET_SOFTIRQ);
                local_irq_enable();
        }
}

这个函数做的事情为：从tasklet_vec取下链表，并依次执行。但若某个tasklet正在别的CPU上面执行（tasklet_trylock失败），则将其放回tasklet_vec链，并再次raise_softirq。

这里说明，tasklet是严格串行执行的。

在这个例子中，kbd_bh函数得到执行。

工作队列

这里以PS/2鼠标驱动为例（drivers/input/mouse/psmouse-base.c）。

初始化

在此文件psmouse_init有：

        kpsmoused_wq = create_singlethread_workqueue("kpsmoused");

这里创建了一个kpsmoused的工作队列。

这句调用最终执行kernel/workqueue.c::__create_workqueue_key。最终创建一个名为kpsmoused的内核线程。线程函数为worker_thread。

激活

在psmouse_interrupt，有两次激活操作，用的都是queue_work(kpsmoused_wq, &psmouse->resync_work)，加入了一个psmouse->resync_work工作。而此工作是运行psmouse_resync函数。

queue_work阅读起来和tasklet_schedule差不多，都是将工作插入一个链表。然后，使用工作队列的more_work这个类似信号量的东西，唤醒对应内核线程。

执行

在某一时刻，kpsmoused线程得到调度，在worker_thread函数中运行：

static int worker_thread(void *__cwq){....        for (;;) {                prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);                if (!freezing(current) &&                    !kthread_should_stop() &&                    list_empty(&cwq->worklist))                        schedule();                finish_wait(&cwq->more_work, &wait);....                run_workqueue(cwq);        }....}

由于前面已经激活，死循环中的finish_wait顺利通过，向下执行run_workqueue函数。

run_workqueue函数与tasklet_action也类似，也是将工作逐一取出执行，不再具体分析。

这时，psmouse_resync函数得到运行。

中断实践——截获键盘中断

键盘IRQ是1。思路是写一个内核模块，用request_irq注册处理程序。如下：

/*
 * kbd_hook - minimal module that attaches an additional handler to the
 * keyboard interrupt line and logs every interrupt it observes.
 */
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/module.h>   /* Specifically, a module */

MODULE_LICENSE("GPL");

#define KBD_IRQ   1             /* legacy PC keyboard interrupt line */
#define HOOK_NAME "kbd_hook"    /* name registered for this handler */

/*
 * Cookie passed as dev_id. A shared handler needs a non-NULL cookie that
 * identifies it, and free_irq() must be given the same value to remove
 * exactly this handler; the address of a module-owned object is a valid
 * pointer that no other driver can be using.
 */
static int hook_dev_id;

/*
 * Interrupt handler: only logs that the interrupt fired.
 *
 * @irq:    interrupt number that fired (unused)
 * @dev_id: cookie given to request_irq() (unused)
 *
 * Returns IRQ_HANDLED unconditionally.
 */
static irqreturn_t hook_handler(int irq, void *dev_id)
{
        printk(KERN_INFO "kbd_hook: intr!\n");
        return IRQ_HANDLED;
}

/*
 * Module entry point: register hook_handler on the keyboard IRQ as a
 * shared handler. Returns the result of request_irq(), so a failed
 * registration makes the module load fail.
 */
static int __init hook_init(void)
{
        int retval;

        retval = request_irq(KBD_IRQ,
                             hook_handler,
                             IRQF_SHARED | IRQF_DISABLED,
                             HOOK_NAME,
                             &hook_dev_id);
        printk(KERN_INFO "kbd_hook: retval=%d\n", retval);
        return retval;
}

/*
 * Module exit point: unregister the handler using the same IRQ number and
 * cookie that were passed to request_irq().
 */
static void __exit hook_exit(void)
{
        free_irq(KBD_IRQ, &hook_dev_id);
        printk(KERN_INFO "kbd_hook: exit\n");
}

module_init(hook_init);
module_exit(hook_exit);

最后，套用模板编译并insmod。结果如下：

[52232.201128] kbd_hook: retval=0[52232.256559] kbd_hook: intr![52233.324266] kbd_hook: intr![52233.395786] kbd_hook: intr![52233.589061] kbd_hook: intr![52233.655376] kbd_hook: intr![52233.667734] kbd_hook: intr![52233.712986] kbd_hook: intr!....

最后rmmod卸载。

0 0