Linux等待队列waitqueue

来源：互联网发布：win7 443端口服务编辑：程序博客网时间：2024/05/20 10:21

waitqueue

- waitqueue
  - 创建一个等待队列
  - 让当前进程开始等待
    - 1 wait_event
    - 2 wait_event_timeout
    - 3 wait_event_interruptible
    - 4 wait_event_interruptible_timeout
  - 唤醒等待队列上的进程

内核中提供了等待队列，作用是实现阻塞操作。等待队列用于使进程等待某一特定的事件发生而无需频繁的轮询，进程在等待期间睡眠，在某些事件发生时，由内核自动唤醒。

首先，Linux中所有的进程都是由task_struct这个结构管理。在生成进程的时候会分配一个task_struct结构，之后将通过这个结构对进程进行管理。task_struct位于独立的连续区间。task_struct结构中有一个state成员，有下面几种状态：

| 状态 | 说明 |
| --- | --- |
| TASK_RUNNING | 可运行状态（正在运行或就绪） |
| TASK_INTERRUPTIBLE | 等待状态，可接受信号 |
| TASK_UNINTERRUPTIBLE | 等待状态，不能接受信号 |
| TASK_ZOMBIE | 僵尸状态，exit后的状态 |
| TASK_STOPPED | 延缓（暂停）状态 |

1. 创建一个等待队列

Linux内核中，wait_queue_head_t代表一个等待队列头，wait_queue_head_t数据结构如下：

struct __wait_queue_head {    spinlock_t lock;            // 自旋锁，确保对链表操作的原子性    struct list_head task_list; // 链表};typedef struct __wait_queue_head wait_queue_head_t;

等待队列中每个元素用wait_queue_t来表示，wait_queue_t数据结构如下：

typedef struct __wait_queue wait_queue_t;struct __wait_queue {    unsigned int flags;         // WQ_FLAG_EXCLUSIVE-表示等待进程想要被独占地唤醒； 0-可以和其他进程一起唤醒。#define WQ_FLAG_EXCLUSIVE 0x01  // 在结构体中定义宏跟一般的宏没区别，这里表示flags会用到该宏，提高直观性。    void *private;              // 指向等待进程的task_struct地址    wait_queue_func_t func;     // 用于唤醒被挂起任务的回调函数    struct list_head task_list; // 链表元素，用于链接到wait_queue_head_t中的task_list链表中};

① 可以调用init_waitqueue_head接口来初始化此队列，init_waitqueue_head主要是将wait_queue_head_t结构体中的两个成员进行初始化。

staitc wait_queue_head_t prod_wq;init_waitqueue_head(&prod_wq);#define init_waitqueue_head(q)                  \    do {                                        \        static struct lock_class_key __key;     \                                                \        __init_waitqueue_head((q), #q, &__key); \    } while (0)void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key){    spin_lock_init(&q->lock);   // 初始化自旋锁    lockdep_set_class_and_name(&q->lock, key, name);    //和防止死锁有关    INIT_LIST_HEAD(&q->task_list);  // 初始化链表}

② 也可以使用DECLARE_WAIT_QUEUE_HEAD来定义和初始化等待队列头。

/*
 * Static initializer for the wait queue head called name: lock is set with
 * __SPIN_LOCK_UNLOCKED() and both pointers of task_list refer back to
 * name.task_list itself.
 */
#define __WAIT_QUEUE_HEAD_INITIALIZER(name) {                           \
        .lock           = __SPIN_LOCK_UNLOCKED(name.lock),              \
        .task_list      = { &(name).task_list, &(name).task_list } }

/* Define a wait_queue_head_t called name and initialize it in one step. */
#define DECLARE_WAIT_QUEUE_HEAD(name) \
        wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name)

③ 定义和初始化等待队列项

/*
 * Static initializer for a wait queue entry: private is set to tsk, the
 * wake-up callback is default_wake_function, and both task_list pointers
 * start out as NULL. The name argument is not used by the initializer.
 */
#define __WAITQUEUE_INITIALIZER(name, tsk) {                            \
        .private        = tsk,                                          \
        .func           = default_wake_function,                        \
        .task_list      = { NULL, NULL } }

/* Define a wait_queue_t called name for task tsk and initialize it. */
#define DECLARE_WAITQUEUE(name, tsk)                                    \
        wait_queue_t name = __WAITQUEUE_INITIALIZER(name, tsk)

2. 让当前进程开始等待

内核提供了如下的接口来让当前进程在条件不满足的情况下，阻塞等待：

wait_event(wq, condition)
wait_event_timeout(wq, condition, timeout)
wait_event_interruptible(wq, condition)
wait_event_interruptible_timeout(wq, condition, timeout)

2.1 wait_event

wait_event的实现如下：

/*
 * Queue wait on q as an exclusive waiter: WQ_FLAG_EXCLUSIVE is set on the
 * entry and it is inserted with __add_wait_queue_tail() while q->lock is
 * held (interrupt state is saved and restored around the critical section).
 */
void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
{
        unsigned long flags;

        wait->flags |= WQ_FLAG_EXCLUSIVE;
        spin_lock_irqsave(&q->lock, flags);
        __add_wait_queue_tail(q, wait);
        spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(add_wait_queue_exclusive);

/*
 * Queue wait on q as a non-exclusive waiter and set the current task's
 * state to state. WQ_FLAG_EXCLUSIVE is cleared first; under q->lock the
 * entry is added with __add_wait_queue() only when its task_list is empty
 * (it is not linked yet), so calling this repeatedly in a loop does not
 * queue the entry twice.
 */
void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
        unsigned long flags;

        wait->flags &= ~WQ_FLAG_EXCLUSIVE;
        spin_lock_irqsave(&q->lock, flags);
        if (list_empty(&wait->task_list))
                __add_wait_queue(q, wait);
        set_current_state(state);
        spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(prepare_to_wait);

/*
 * End a wait: put the current task back into TASK_RUNNING and unlink wait
 * from q if it is still linked.
 */
void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
{
        unsigned long flags;

        __set_current_state(TASK_RUNNING);
        /*
         * We can check for list emptiness outside the lock
         * IFF:
         *  - we use the "careful" check that verifies both
         *    the next and prev pointers, so that there cannot
         *    be any half-pending updates in progress on other
         *    CPU's that we haven't seen yet (and that might
         *    still change the stack area.
         * and
         *  - all other users take the lock (ie we can only
         *    have _one_ other CPU that looks at or modifies
         *    the list).
         */
        if (!list_empty_careful(&wait->task_list)) {
                spin_lock_irqsave(&q->lock, flags);
                list_del_init(&wait->task_list);
                spin_unlock_irqrestore(&q->lock, flags);
        }
}

/*
 * Slow path of wait_event(): define a wait entry with DEFINE_WAIT, then
 * loop: queue it in TASK_UNINTERRUPTIBLE, re-test condition, and call
 * schedule() while condition is false. finish_wait() runs once the loop
 * ends. wq is the queue head object itself (the macro takes its address).
 */
#define __wait_event(wq, condition)                                     \
do {                                                                    \
        DEFINE_WAIT(__wait);                                            \
                                                                        \
        for (;;) {                                                      \
                prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE);    \
                if (condition)                                          \
                        break;                                          \
                schedule();                                             \
        }                                                               \
        finish_wait(&wq, &__wait);                                      \
} while (0)

/*
 * Block until condition becomes true. If condition already holds nothing is
 * queued; otherwise __wait_event() is used. condition may be evaluated more
 * than once.
 */
#define wait_event(wq, condition)                                       \
do {                                                                    \
        if (condition)                                                  \
                break;                                                  \
        __wait_event(wq, condition);                                    \
} while (0)

里面有个宏定义即DEFINE_WAIT，详细如下：

/*
 * Define a wait_queue_t called name for the current task: private is
 * current, the wake-up callback is function, and task_list is initialized
 * with LIST_HEAD_INIT on itself.
 */
#define DEFINE_WAIT_FUNC(name, function)                                \
        wait_queue_t name = {                                           \
                .private        = current,                              \
                .func           = function,                             \
                .task_list      = LIST_HEAD_INIT((name).task_list),     \
        }

/* DEFINE_WAIT_FUNC with autoremove_wake_function as the wake-up callback. */
#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)

可以看到private成员是当前task对象的地址current， func成员是autoremove_wake_function(在下面wake_up时再做说明)。
所以整个wait_event的逻辑就是：
① 首先判断条件是否满足，如果满足，直接退出；如果不满足，调用__wait_event

② __wait_event中首先基于当前进程构建一个等待队列项；然后进入死循环：
- 调用prepare_to_wait，该函数将新建的等待队列项加入到等待队列中，并修改当前任务的state为TASK_UNINTERRUPTIBLE；（注，该函数flags的结果必然是0，也就是说这个函数是将非独占进程添加到等待队列当中。而add_wait_queue_exclusive函数则是将独占进程添加到等待队列的尾部，也就是说一个等待队列，非独占进程总是在前面，独占进程总是在后面）
- 判断condition条件，满足就退出循环，不满足继续
- 调用schedule()进行任务调度后，重新开始循环

③ 退出循环后调用finish_wait，将当前任务的state设置为TASK_RUNNING，并将新建的等待队列项从等待队列中删除。

2.2 wait_event_timeout

wait_event_timeout 的实现如下：

/*
 * Call schedule() with a timer armed to fire after timeout jiffies.
 *
 * timeout: number of jiffies to wait. MAX_SCHEDULE_TIMEOUT means "no timer":
 *          schedule() is called directly and the value is returned unchanged.
 *          A negative value is reported with printk()/dump_stack(), the task
 *          state is set to TASK_RUNNING and 0 is returned.
 *
 * Returns the jiffies left until the requested expiry (expire - jiffies),
 * clamped to 0 when the expiry has already passed.
 *
 * Apart from the negative-timeout error path this function does not set the
 * task state itself.
 */
signed long __sched schedule_timeout(signed long timeout)
{
        struct timer_list timer;
        unsigned long expire;

        switch (timeout)
        {
        case MAX_SCHEDULE_TIMEOUT:
                /*
                 * These two special cases are useful to be comfortable
                 * in the caller. Nothing more. We could take
                 * MAX_SCHEDULE_TIMEOUT from one of the negative value
                 * but I'd like to return a valid offset (>=0) to allow
                 * the caller to do everything it want with the retval.
                 */
                schedule();
                goto out;
        default:
                /*
                 * Another bit of PARANOID. Note that the retval will be
                 * 0 since no piece of kernel is supposed to do a check
                 * for a negative retval of schedule_timeout() (since it
                 * should never happens anyway). You just have the printk()
                 * that will tell you if something is gone wrong and where.
                 */
                if (timeout < 0) {
                        printk(KERN_ERR "schedule_timeout: wrong timeout "
                              "value %lx\n", timeout);
                        dump_stack();
                        current->state = TASK_RUNNING;
                        goto out;
                }
        }

        expire = timeout + jiffies;

        /* On-stack timer whose handler process_timeout receives current. */
        setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
        __mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
        schedule();
        del_singleshot_timer_sync(&timer);

        /* Remove the timer from the object tracker */
        destroy_timer_on_stack(&timer);

        timeout = expire - jiffies;

 out:
        return timeout < 0 ? 0 : timeout;
}

/*
 * Slow path of wait_event_timeout(). ret carries the remaining jiffies in
 * and out: each pass queues the entry in TASK_UNINTERRUPTIBLE, re-tests
 * condition, and otherwise sleeps via schedule_timeout(ret). The loop ends
 * when condition is true or ret reaches 0. If ret is 0 but condition holds
 * at that point, ret is set to 1 so the caller does not see a timeout.
 */
#define __wait_event_timeout(wq, condition, ret)                        \
do {                                                                    \
        DEFINE_WAIT(__wait);                                            \
                                                                        \
        for (;;) {                                                      \
                prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE);    \
                if (condition)                                          \
                        break;                                          \
                ret = schedule_timeout(ret);                            \
                if (!ret)                                               \
                        break;                                          \
        }                                                               \
        if (!ret && (condition))                                        \
                ret = 1;                                                \
        finish_wait(&wq, &__wait);                                      \
} while (0)

/*
 * Wait until condition is true or timeout jiffies have elapsed. Evaluates
 * to the value left in __ret: the original timeout when condition already
 * holds, otherwise whatever __wait_event_timeout() stores (0 on timeout).
 */
#define wait_event_timeout(wq, condition, timeout)                      \
({                                                                      \
        long __ret = timeout;                                           \
        if (!(condition))                                               \
                __wait_event_timeout(wq, condition, __ret);             \
        __ret;                                                          \
})

wait_event_timeout 和 wait_event逻辑类似，就一个地方差异较大，即schedule_timeout。
schedule_timeout中构建了一个定时器，该定时器到期后将调用process_timeout(通过中断的形式)，传入的参数则是当前进程的指针current。然后调用schedule，等待调度器回到该位置(由于任务状态为TASK_UNINTERRUPTIBLE，不能通过调度或信号回到该位置)。这个时候就有两种情况(唤醒在后面wake_up部分详细说明)：
① 超时了，调用process_timeout函数，该函数调用wake_up_process函数，核心代码类似wake_up_xxx(current)
② 在其他任务中调用了wake_up_xxx(wq)函数，将任务状态修改为TASK_RUNNING
一旦任务状态为TASK_RUNNING，就又回到了cpu的run queue中，可以通过调度回到函数中的schedule位置。

wait_event_timeout 返回值如下：
- 大于0: 表示condition满足，返回值表示距离设定超时还有多久(jiffies)；如果超时的同时condition恰好满足，则返回1
- 等于0: 表示超时发生

2.3 wait_event_interruptible

wait_event_interruptible 的实现如下：

/*
 * Slow path of wait_event_interruptible(). Each pass queues the entry in
 * TASK_INTERRUPTIBLE and re-tests condition. While no signal is pending for
 * current it calls schedule() and loops again; once a signal is pending,
 * ret is set to -ERESTARTSYS and the loop ends. ret is left untouched when
 * the loop ends because condition is true.
 */
#define __wait_event_interruptible(wq, condition, ret)                  \
do {                                                                    \
        DEFINE_WAIT(__wait);                                            \
                                                                        \
        for (;;) {                                                      \
                prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE);      \
                if (condition)                                          \
                        break;                                          \
                if (!signal_pending(current)) {                         \
                        schedule();                                     \
                        continue;                                       \
                }                                                       \
                ret = -ERESTARTSYS;                                     \
                break;                                                  \
        }                                                               \
        finish_wait(&wq, &__wait);                                      \
} while (0)

/*
 * Wait until condition is true or a signal is pending. Evaluates to 0 when
 * condition became true and to -ERESTARTSYS when a signal ended the wait.
 */
#define wait_event_interruptible(wq, condition)                         \
({                                                                      \
        int __ret = 0;                                                  \
        if (!(condition))                                               \
                __wait_event_interruptible(wq, condition, __ret);       \
        __ret;                                                          \
})

wait_event_interruptible的实现和wait_event类似，区别是多了一个signal_pending操作。
signal_pending检查给定进程是否有信号需要处理，返回0表示没有信号需要处理。
所以此时退出循环的条件是：满足 condition 和有信号两者之一就行 (如果执行到schedule，需要另外一个进程调用wake_up_xxx(&wq)操作，或者该进程收到了信号，将任务加入到run queue中。)

wait_event_interruptible 返回值如下：
- -ERESTARTSYS: 表示被信号激活唤醒。该错误码表示系统调用在睡眠等待期间被信号打断；信号处理完成后，该系统调用可以被重新执行一次。
- 等于0：表示condition满足

2.4 wait_event_interruptible_timeout

wait_event_interruptible_timeout 的实现如下：

/*
 * Slow path of wait_event_interruptible_timeout(). ret carries the
 * remaining jiffies in and out. Each pass queues the entry in
 * TASK_INTERRUPTIBLE and re-tests condition. While no signal is pending it
 * sleeps via schedule_timeout(ret) and stops when ret reaches 0; once a
 * signal is pending, ret becomes -ERESTARTSYS. If ret is 0 but condition
 * holds after the loop, ret is set to 1 so the caller does not see a
 * timeout.
 */
#define __wait_event_interruptible_timeout(wq, condition, ret)          \
do {                                                                    \
        DEFINE_WAIT(__wait);                                            \
                                                                        \
        for (;;) {                                                      \
                prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE);      \
                if (condition)                                          \
                        break;                                          \
                if (!signal_pending(current)) {                         \
                        ret = schedule_timeout(ret);                    \
                        if (!ret)                                       \
                                break;                                  \
                        continue;                                       \
                }                                                       \
                ret = -ERESTARTSYS;                                     \
                break;                                                  \
        }                                                               \
        if (!ret && (condition))                                        \
                ret = 1;                                                \
        finish_wait(&wq, &__wait);                                      \
} while (0)

/*
 * Wait until condition is true, a signal is pending, or timeout jiffies
 * have elapsed. Evaluates to the value left in __ret: the original timeout
 * when condition already holds, otherwise the remaining jiffies, 0 on
 * timeout, or -ERESTARTSYS when a signal ended the wait.
 */
#define wait_event_interruptible_timeout(wq, condition, timeout)        \
({                                                                      \
        long __ret = timeout;                                           \
        if (!(condition))                                               \
                __wait_event_interruptible_timeout(wq, condition, __ret); \
        __ret;                                                          \
})

wait_event_interruptible_timeout的实现和上面wait_event等类似，退出循环的条件是：满足 condition、timeout 和有信号三者之一(如果执行到schedule，需要另外一个进程调用wake_up_xxx(&wq)操作，或者超时了，或者该进程收到了信号，这三者都会将任务加入到run queue中。)。

wait_event_interruptible_timeout 返回值如下：
- -ERESTARTSYS: 表示被信号激活唤醒
- 大于0: 表示condition满足，返回值表示距离设定超时还有多久(jiffies)；如果超时的同时condition恰好满足，则返回1
- 等于0: 表示超时发生

3. 唤醒等待队列上的进程

内核提供了如下接口来唤醒等待队列上的进程：

/*
 * Wake-up entry points. Each one forwards to __wake_up(), __wake_up_locked()
 * or __wake_up_sync() with a task-state mask (TASK_NORMAL or
 * TASK_INTERRUPTIBLE) and an nr_exclusive count (1, nr, or 0).
 */

/* Mask TASK_NORMAL, going through __wake_up(). */
#define wake_up(x)                      __wake_up(x, TASK_NORMAL, 1, NULL)
#define wake_up_nr(x, nr)               __wake_up(x, TASK_NORMAL, nr, NULL)
#define wake_up_all(x)                  __wake_up(x, TASK_NORMAL, 0, NULL)

/* Mask TASK_NORMAL, going through __wake_up_locked(). */
#define wake_up_locked(x)               __wake_up_locked((x), TASK_NORMAL, 1)
#define wake_up_all_locked(x)           __wake_up_locked((x), TASK_NORMAL, 0)

/* Mask TASK_INTERRUPTIBLE. */
#define wake_up_interruptible(x)        __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
#define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL)
#define wake_up_interruptible_all(x)    __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL)
#define wake_up_interruptible_sync(x)   __wake_up_sync((x), TASK_INTERRUPTIBLE, 1)

可以看到这些接口调用了三个函数__wake_up，__wake_up_locked，__wake_up_sync。先看看__wake_up的实现:

/*
 * Walk the entries on q->task_list and invoke each entry's func callback
 * with (entry, mode, wake_flags, key).
 *
 * The walk stops early only when all three tests hold for an entry, in this
 * order: the callback returned non-zero, the entry had WQ_FLAG_EXCLUSIVE set,
 * and decrementing nr_exclusive brought it to 0. Because of short-circuit
 * evaluation nr_exclusive is decremented only for exclusive entries whose
 * callback returned non-zero.
 *
 * This function takes no lock itself; __wake_up() below calls it with
 * q->lock held.
 */
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
                        int nr_exclusive, int wake_flags, void *key)
{
        wait_queue_t *curr, *next;

        /* The _safe iterator keeps next, so func may unlink curr. */
        list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
                /* Read flags before calling func. */
                unsigned flags = curr->flags;

                if (curr->func(curr, mode, wake_flags, key) &&
                                (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
                        break;
        }
}

/*
 * Wake waiters on q: runs __wake_up_common() with wake_flags 0 while holding
 * q->lock (interrupt state saved and restored around the call).
 *
 * q:            wait queue head
 * mode:         passed through to each entry's func callback
 * nr_exclusive: passed through as the exclusive-waiter count
 * key:          passed through to each entry's func callback
 */
void __wake_up(wait_queue_head_t *q, unsigned int mode,
                        int nr_exclusive, void *key)
{
        unsigned long flags;

        spin_lock_irqsave(&q->lock, flags);
        __wake_up_common(q, mode, nr_exclusive, 0, key);
        spin_unlock_irqrestore(&q->lock, flags);
}

可以看到__wake_up会调到__wake_up_common函数，该函数的逻辑是，遍历等待队列上的wait_queue_t结构体，进行如下的操作：
① 获取curr->flags值放入flags中。
② 进行判断，如果同时满足三个条件就退出循环。（注对于if来说，如果前面有一项不满足，后续的判断就不会做）
第一个条件是curr->func的返回结果，依据前面的说明，该函数实际上就是autoremove_wake_function，其详细说明如下，如果返回1，表明已经将相关的任务加入到cpu的run queue，并修改任务的状态成功。依据前面定义的wait_event_xxx的实现，该项正常来说均返回1。
第二个条件是flags & WQ_FLAG_EXCLUSIVE，如果该wait_queue_t是独占的，就为真。对于一个任务队列来说，只有前面的非互斥项执行curr->func之后，才轮到互斥进程，也只有互斥进程flags & WQ_FLAG_EXCLUSIVE才为真。
第三个条件是!--nr_exclusive，如果nr_exclusive为0，依据常理，该项始终为假；如果nr_exclusive为1，则第一次就为真；如果nr_exclusive为一个正整数nr，则第nr次，该项为真。
所以可以得出：
wake_up 唤醒全部的非独占任务，唤醒一个独占任务。
wake_up_nr 唤醒全部的非独占任务，唤醒nr个独占任务。
wake_up_all 唤醒全部的非独占任务，唤醒全部独占任务。
wake_up_interruptible_xxx等函数类似上面。

static inttry_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags){        unsigned long flags;        int cpu, success = 0;        /*         * If we are going to wake up a thread waiting for CONDITION we         * need to ensure that CONDITION=1 done by the caller can not be         * reordered with p->state check below. This pairs with mb() in         * set_current_state() the waiting thread does.         */        smp_mb__before_spinlock();        raw_spin_lock_irqsave(&p->pi_lock, flags);        if (!(p->state & state))                goto out;        success = 1; /* we're going to change ->state */        cpu = task_cpu(p);  /* 获取最后执行该任务的CPU */        if (p->on_rq && ttwu_remote(p, wake_flags))                goto stat;    /* support smp 在很多架构上还不支持smp可以忽略此处     * 判断是否要将任务转移到另外一个CPU的执行队列上，负载均衡     * /#ifdef CONFIG_SMP        /*         * If the owning (remote) cpu is still in the middle of schedule() with         * this task as prev, wait until its done referencing the task.         */        while (p->on_cpu)                cpu_relax();        /*         * Pairs with the smp_wmb() in finish_lock_switch().         
*/        smp_rmb();        p->sched_contributes_to_load = !!task_contributes_to_load(p);        p->state = TASK_WAKING;        if (p->sched_class->task_waking)                p->sched_class->task_waking(p);        cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);        if (task_cpu(p) != cpu) {                wake_flags |= WF_MIGRATED;                set_task_cpu(p, cpu);        }#endif /* CONFIG_SMP */        ttwu_queue(p, cpu);stat:        ttwu_stat(p, cpu, wake_flags);out:        raw_spin_unlock_irqrestore(&p->pi_lock, flags);        return success;}int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,                          void *key){        return try_to_wake_up(curr->private, mode, wake_flags);}int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key){        int ret = default_wake_function(wait, mode, sync, key);        if (ret) // 如果返回真，将该任务从等待队列中移除。                list_del_init(&wait->task_list);        return ret;}

以上是autoremove_wake_function的实现，具体看try_to_wake_up函数。
该函数有3个参数：
- p 任务结构体指针
- state 需要唤醒的进程状态掩码，即需要唤醒符合该状态掩码的进程
- wake_flags 此处等待队列传过来的值为0。表示是同步唤醒sync，还是异步唤醒 async；

阅读全文

1 0