linux内核工作队列讲解和源码详细注释

来源:互联网 发布:ubuntu 只能游客登录 编辑:程序博客网 时间:2024/04/30 07:11

1. 前言

  工作队列(workqueue)的Linux内核中的定义的用来处理不是很紧急事件的回调方式处理方法。

  以下代码的linux内核版本为2.6.19.2, 源代码文件主要为kernel/workqueue.c.

  2. 数据结构

  /* include/linux/workqueue.h */ // 工作节点结构struct work_struct { // 等待时间unsigned long pending;// 链表节点struct list_head entry;// workqueue回调函数void (*func)(void *);// 回调函数func的数据void *data;// 指向CPU相关数据, 一般指向struct cpu_workqueue_struct结构void *wq_data;// 定时器struct timer_list timer;};

  struct execute_work { struct work_struct work;};

  /* kernel/workqueue.c */ /* * The per-CPU workqueue (if single thread, we always use the first * possible cpu)。

  * * The sequence counters are for flush_scheduled_work()。 It wants to wait * until all currently-scheduled works are completed, but it doesn't * want to be livelocked by new, incoming ones. So it waits until * remove_sequence is >= the insert_sequence which pertained when * flush_scheduled_work() was called. */ // 这个结构是针对每个CPU的struct cpu_workqueue_struct { // 结构锁spinlock_t lock;// 下一个要执行的节点序号long remove_sequence; /* Least-recently added (next to run) */ // 下一个要插入节点的序号long insert_sequence; /* Next to add */ // 工作机构链表节点struct list_head worklist;// 要进行处理的等待队列wait_queue_head_t more_work;// 处理完的等待队列wait_queue_head_t work_done;// 工作队列节点struct workqueue_struct *wq;// 进程指针struct task_struct *thread;int run_depth; /* Detect run_workqueue() recursion depth */ } ____cacheline_aligned;/* * The externally visible workqueue abstraction is an array of * per-CPU workqueues:*/ // 工作队列结构struct workqueue_struct { struct cpu_workqueue_struct *cpu_wq;const char *name;struct list_head list; /* Empty if single thread */ };

  kernel/workqueue.c中定义了一个工作队列链表, 所有工作队列可以挂接到这个链表中:static LIST_HEAD(workqueues);

  3. 一些宏定义

  /* include/linux/workqueue.h */ // 初始化工作队列#define __WORK_INITIALIZER(n, f, d) { // 初始化list。entry = { &(n)。entry, &(n)。entry },// 回调函数。func = (f),// 回调函数参数。data = (d),// 初始化定时器。timer = TIMER_INITIALIZER(NULL, 0, 0),}

  // 声明工作队列并初始化#define DECLARE_WORK(n, f, d)

  struct work_struct n = __WORK_INITIALIZER(n, f, d)

  /* * initialize a work-struct's func and data pointers:*/ // 重新定义工作结构参数#define PREPARE_WORK(_work, _func, _data)

  do {(_work)->func = _func;(_work)->data = _data;} while (0)

  /* * initialize all of a work-struct:*/ // 初始化工作结构, 和__WORK_INITIALIZER功能相同,不过__WORK_INITIALIZER用在// 参数初始化定义, 而该宏用在程序之中对工作结构赋值#define INIT_WORK(_work, _func, _data)

  do { INIT_LIST_HEAD(&(_work)->entry);(_work)->pending = 0;PREPARE_WORK((_work), (_func), (_data));init_timer(&(_work)->timer);} while (0)

  4. 操作函数

  4.1 创建工作队列

  一般的创建函数是create_workqueue, 但这其实只是一个宏:/* include/linux/workqueue.h */ #define create_workqueue(name) __create_workqueue((name), 0)

  在workqueue的初始化函数中, 定义了一个针对内核中所有线程可用的事件工作队列, 其他内核线程建立的事件工作结构就都挂接到该队列:void init_workqueues(void)

  {……

  keventd_wq = create_workqueue("events");……

  }

  核心创建函数是__create_workqueue:

  struct workqueue_struct *__create_workqueue(const char *name,int singlethread)

  { int cpu, destroy = 0;struct workqueue_struct *wq;struct task_struct *p;// 分配工作队列结构空间wq = kzalloc(sizeof(*wq), GFP_KERNEL);if (!wq)

  return NULL;// 为每个CPU分配单独的工作队列空间wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);if (!wq->cpu_wq) { kfree(wq);return NULL;} wq->name = name;mutex_lock(&workqueue_mutex);if (singlethread) { // 使用create_workqueue宏时该参数始终为0 // 如果是单一线程模式, 在单线程中调用各个工作队列// 建立一个的工作队列内核线程INIT_LIST_HEAD(&wq->list);// 建立工作队列的线程p = create_workqueue_thread(wq, singlethread_cpu);if (!p)

  destroy = 1;else // 唤醒该线程wake_up_process(p);} else { // 链表模式, 将工作队列添加到工作队列链表list_add(&wq->list, &workqueues);// 为每个CPU建立一个工作队列线程for_each_online_cpu(cpu) { p = create_workqueue_thread(wq, cpu);if (p) { // 绑定CPU kthread_bind(p, cpu);// 唤醒线程wake_up_process(p);} else destroy = 1;} mutex_unlock(&workqueue_mutex);/* * Was there any error during startup? If yes then clean up:*/ if (destroy) { // 建立线程失败, 释放工作队列destroy_workqueue(wq);wq = NULL;} return wq;} EXPORT_SYMBOL_GPL(__create_workqueue);

  // 创建工作队列线程static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,int cpu)

  { // 每个CPU的工作队列struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);struct task_struct *p;spin_lock_init(&cwq->lock);// 初始化cwq->wq = wq;cwq->thread = NULL;cwq->insert_sequence = 0;cwq->remove_sequence = 0;INIT_LIST_HEAD(&cwq->worklist);// 初始化等待队列more_work, 该队列处理要执行的工作结构init_waitqueue_head(&cwq->more_work);// 初始化等待队列work_done, 该队列处理执行完的工作结构init_waitqueue_head(&cwq->work_done);// 建立内核线程work_thread if (is_single_threaded(wq))

  p = kthread_create(worker_thread, cwq, "%s", wq->name);else p = kthread_create(worker_thread, cwq, "%s/%d", wq->name, cpu);if (IS_ERR(p))

  return NULL;// 保存线程指针cwq->thread = p;return p;} static int worker_thread(void *__cwq)

  { struct cpu_workqueue_struct *cwq = __cwq;// 声明一个等待队列DECLARE_WAITQUEUE(wait, current);// 信号struct k_sigaction sa;sigset_t blocked;current->flags |= PF_NOFREEZE;// 降低进程优先级, 工作进程不是个很紧急的进程,不和其他进程抢占CPU,通常在系统空闲时运行set_user_nice(current, -5);/* Block and flush all signals */ // 阻塞所有信号sigfillset(&blocked);sigprocmask(SIG_BLOCK, &blocked, NULL);flush_signals(current);/* * We inherited MPOL_INTERLEAVE from the booting kernel. * Set MPOL_DEFAULT to insure node local allocations. */ numa_default_policy();/* SIG_IGN makes children autoreap: see do_notify_parent()。 */ // 信号处理都是忽略sa.sa.sa_handler = SIG_IGN;sa.sa.sa_flags = 0;siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0);// 进程可中断set_current_state(TASK_INTERRUPTIBLE);// 进入循环, 没明确停止该进程就一直运行while (!kthread_should_stop()) { // 设置more_work等待队列, 当有新work结构链入队列中时会激发此等待队列add_wait_queue(&cwq->more_work, &wait);if (list_empty(&cwq->worklist))

  // 工作队列为空, 睡眠schedule();else // 进行运行状态__set_current_state(TASK_RUNNING);// 删除等待队列remove_wait_queue(&cwq->more_work, &wait);// 按链表遍历执行工作任务if (!list_empty(&cwq->worklist))

  run_workqueue(cwq);// 执行完工作, 设置进程是可中断的, 重新循环等待工作set_current_state(TASK_INTERRUPTIBLE);} __set_current_state(TASK_RUNNING);return 0;}

  // 运行工作结构static void run_workqueue(struct cpu_workqueue_struct *cwq)

  { unsigned long flags;/* * Keep taking off work from the queue until * done. */ // 加锁spin_lock_irqsave(&cwq->lock, flags);// 统计已经递归调用了多少次了cwq->run_depth++;if (cwq->run_depth > 3) { // 递归调用此时太多/* morton gets to eat his hat */ printk("%s: recursion depth exceeded: %dn",__FUNCTION__, cwq->run_depth);dump_stack();} // 遍历工作链表while (!list_empty(&cwq->worklist)) { // 获取的是next节点的struct work_struct *work = list_entry(cwq->worklist.next,struct work_struct, entry);void (*f) (void *) = work->func;void *data = work->data;// 删除节点, 同时节点中的list参数清空list_del_init(cwq->worklist.next);// 解锁// 现在在执行以下代码时可以中断,run_workqueue本身可能会重新被调用, 所以要判断递归深度spin_unlock_irqrestore(&cwq->lock, flags);BUG_ON(work->wq_data != cwq);// 工作结构已经不在链表中clear_bit(0, &work->pending);// 执行工作函数f(data);// 重新加锁spin_lock_irqsave(&cwq->lock, flags);// 执行完的工作序列号递增cwq->remove_sequence++;// 唤醒工作完成等待队列, 供释放工作队列wake_up(&cwq->work_done);} // 减少递归深度cwq->run_depth——;// 解锁spin_unlock_irqrestore(&cwq->lock, flags);}

  4.2 释放工作队列/** * destroy_workqueue - safely terminate a workqueue * @wq: target workqueue * * Safely destroy a workqueue. All work currently pending will be done first. */ void destroy_workqueue(struct workqueue_struct *wq)

  { int cpu;// 清除当前工作队列中的所有工作flush_workqueue(wq);/* We don't need the distraction of CPUs appearing and vanishing. */ mutex_lock(&workqueue_mutex);// 结束该工作队列的线程if (is_single_threaded(wq))

  cleanup_workqueue_thread(wq, singlethread_cpu);else { for_each_online_cpu(cpu)

  cleanup_workqueue_thread(wq, cpu);list_del(&wq->list);} mutex_unlock(&workqueue_mutex);// 释放工作队列中对应每个CPU的工作队列数据free_percpu(wq->cpu_wq);kfree(wq);} EXPORT_SYMBOL_GPL(destroy_workqueue);

  /** * flush_workqueue - ensure that any scheduled work has run to completion. * @wq: workqueue to flush * * Forces execution of the workqueue and blocks until its completion. * This is typically used in driver shutdown handlers. * * This function will sample each workqueue's current insert_sequence number and * will sleep until the head sequence is greater than or equal to that. This * means that we sleep until all works which were queued on entry have been * handled, but we are not livelocked by new incoming ones. * * This function used to run the workqueues itself. Now we just wait for the * helper threads to do it. */ void fastcall flush_workqueue(struct workqueue_struct *wq)

  { // 该进程可以睡眠might_sleep();// 清空每个CPU上的工作队列if (is_single_threaded(wq)) { /* Always use first cpu's area. */ flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu));} else { int cpu;mutex_lock(&workqueue_mutex);for_each_online_cpu(cpu)

  flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));mutex_unlock(&workqueue_mutex);} EXPORT_SYMBOL_GPL(flush_workqueue);

  flush_workqueue的核心处理函数为flush_cpu_workqueue:static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)

  { if (cwq->thread == current) { // 如果是工作队列进程正在被调度/* * Probably keventd trying to flush its own queue. So simply run * it by hand rather than deadlocking. */ // 执行完该工作队列run_workqueue(cwq);} else { // 定义等待DEFINE_WAIT(wait);long sequence_needed;// 加锁spin_lock_irq(&cwq->lock);// 最新工作结构序号sequence_needed = cwq->insert_sequence;// 该条件是判断队列中是否还有没有执行的工作结构while (sequence_needed - cwq->remove_sequence > 0) { // 有为执行的工作结构// 通过work_done等待队列等待prepare_to_wait(&cwq->work_done, &wait,TASK_UNINTERRUPTIBLE);// 解锁spin_unlock_irq(&cwq->lock);// 睡眠, 由wake_up(&cwq->work_done)来唤醒schedule();// 重新加锁spin_lock_irq(&cwq->lock);} // 等待清除finish_wait(&cwq->work_done, &wait);spin_unlock_irq(&cwq->lock);}

  4.3 调度工作

  在大多数情况下, 并不需要自己建立工作队列,而是只定义工作, 将工作结构挂接到内核预定义的事件工作队列中调度, 在kernel/workqueue.c中定义了一个静态全局量的工作队列keventd_wq:static struct workqueue_struct *keventd_wq;

  4.3.1 立即调度// 在其他函数中使用以下函数来调度工作结构, 是把工作结构挂接到工作队列中进行调度/** * schedule_work - put work task in global workqueue * @work: job to be done * * This puts a job in the kernel-global workqueue. */ // 调度工作结构, 将工作结构添加到事件工作队列keventd_wq int fastcall schedule_work(struct work_struct *work)

  { return queue_work(keventd_wq, work);} EXPORT_SYMBOL(schedule_work);

  /** * queue_work - queue work on a workqueue * @wq: workqueue to use * @work: work to queue * * Returns 0 if @work was already on a queue, non-zero otherwise. * * We queue the work to the CPU it was submitted, but there is no * guarantee that it will be processed by that CPU. */ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)

  { int ret = 0, cpu = get_cpu();if (!test_and_set_bit(0, &work->pending)) { // 工作结构还没在队列, 设置pending标志表示把工作结构挂接到队列中if (unlikely(is_single_threaded(wq)))

  cpu = singlethread_cpu;BUG_ON(!list_empty(&work->entry));// 进行具体的排队__queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);ret = 1;} put_cpu();return ret;} EXPORT_SYMBOL_GPL(queue_work);/* Preempt must be disabled. */ // 不能被抢占static void __queue_work(struct cpu_workqueue_struct *cwq,struct work_struct *work)

  { unsigned long flags;// 加锁spin_lock_irqsave(&cwq->lock, flags);// 指向CPU工作队列work->wq_data = cwq;// 挂接到工作链表list_add_tail(&work->entry, &cwq->worklist);// 递增插入的序列号cwq->insert_sequence++;// 唤醒等待队列准备处理工作结构wake_up(&cwq->more_work);spin_unlock_irqrestore(&cwq->lock, flags);}

  4.3.2 延迟调度

  4.3.2.1 schedule_delayed_work /** * schedule_delayed_work - put work task in global workqueue after delay * @work: job to be done * @delay: number of jiffies to wait * * After waiting for a given time this puts a job in the kernel-global * workqueue. */ // 延迟调度工作, 延迟一定时间后再将工作结构挂接到工作队列int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay)

  { return queue_delayed_work(keventd_wq, work, delay);} EXPORT_SYMBOL(schedule_delayed_work);

  /** * queue_delayed_work - queue work on a workqueue after delay * @wq: workqueue to use * @work: work to queue * @delay: number of jiffies to wait before queueing * * Returns 0 if @work was already on a queue, non-zero otherwise. */ int fastcall queue_delayed_work(struct workqueue_struct *wq,struct work_struct *work, unsigned long delay)

  { int ret = 0;// 定时器, 此时的定时器应该是不起效的, 延迟将通过该定时器来实现struct timer_list *timer = &work->timer;if (!test_and_set_bit(0, &work->pending)) { // 工作结构还没在队列, 设置pending标志表示把工作结构挂接到队列中// 如果现在定时器已经起效, 出错BUG_ON(timer_pending(timer));// 工作结构已经挂接到链表, 出错BUG_ON(!list_empty(&work->entry));/* This stores wq for the moment, for the timer_fn */ // 保存工作队列的指针work->wq_data = wq;// 定时器初始化timer->expires = jiffies + delay;timer->data = (unsigned long)work;// 定时函数timer->function = delayed_work_timer_fn;// 定时器生效, 定时到期后再添加到工作队列add_timer(timer);ret = 1;} return ret;} EXPORT_SYMBOL_GPL(queue_delayed_work);

  // 定时中断函数static void delayed_work_timer_fn(unsigned long __data)

  { struct work_struct *work = (struct work_struct *)__data;struct workqueue_struct *wq = work->wq_data;// 获取CPU int cpu = smp_processor_id();if (unlikely(is_single_threaded(wq)))

  cpu = singlethread_cpu;// 将工作结构添加到工作队列,注意这是在时间中断调用__queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);}

  4.3.2.2 schedule_delayed_work_on

  指定CPU的延迟调度工作结构, 和schedule_delayed_work相比增加了一个CPU参数, 其他都相同/** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay * @cpu: cpu to use * @work: job to be done * @delay: number of jiffies to wait * * After waiting for a given time this puts a job in the kernel-global * workqueue on the specified CPU. */ int schedule_delayed_work_on(int cpu,struct work_struct *work, unsigned long delay)

  { return queue_delayed_work_on(cpu, keventd_wq, work, delay);}

  /** * queue_delayed_work_on - queue work on specific CPU after delay * @cpu: CPU number to execute work on * @wq: workqueue to use * @work: work to queue * @delay: number of jiffies to wait before queueing * * Returns 0 if @work was already on a queue, non-zero otherwise. */ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,struct work_struct *work, unsigned long delay)

  { int ret = 0;struct timer_list *timer = &work->timer;if (!test_and_set_bit(0, &work->pending)) { BUG_ON(timer_pending(timer));BUG_ON(!list_empty(&work->entry));/* This stores wq for the moment, for the timer_fn */ work->wq_data = wq;timer->expires = jiffies + delay;timer->data = (unsigned long)work;timer->function = delayed_work_timer_fn;add_timer_on(timer, cpu);ret = 1;} return ret;} EXPORT_SYMBOL_GPL(queue_delayed_work_on);

  5. 结论

  工作队列和定时器函数处理有点类似, 都是执行一定的回调函数, 但和定时器处理函数不同的是定时器回调函数只执行一次, 而且执行定时器回调函数的时候是在时钟中断中, 限制比较多, 因此回调程序不能太复杂; 而工作队列是通过内核线程实现, 一直有效, 可重复执行, 由于执行时降低了线程的优先级, 执行时可能休眠, 因此工作队列处理的应该是那些不是很紧急的任务, 如垃圾回收处理等, 通常在系统空闲时执行,在xfrm库中就广泛使用了workqueue,使用时,只需要定义work结构,然后调用schedule_(delayed_)work即可。