linux 消息队列

来源：互联网发布：python3 urllib json 编辑：程序博客网时间：2024/05/20 04:50

Init 1

msgget 2

msgsnd 2

Schedule() 3

总结 17

前面都是一些实现细节，看看总结就可以了

Init

在前面文档中说的查找 ipc_namespace 是通过当前进程的 task_struct 检索到的

这个结构体定义在 ipc/util.c 中

struct ipc_namespace init_ipc_ns

不管是共享内存还是消息队列，都是把自己的ids 放到ipc_namespace 中的结构体指针数组ids 中去。

ipc/msg.c

void __init msg_init(void)

__msg_init_ns

设置各种大小的限制值

#define MSGMNI 16 /* <= IPCMNI */ /* max # of msg queue identifiers */

#define MSGMAX 8192 /* <= INT_MAX */ /* max size of message (bytes) */

#define MSGMNB 16384 /* <= INT_MAX */ /* default max size of a message queue */

也就是说系统中最多只能有16个消息队列

然后创建 sysvipc/msg

msgget

asmlinkage long sys_msgget(key_t key, int msgflg)

如果是父子进程间使用msg，key 可以设置成 IPC_PRIVATE

创建一个新的msg_queue

放入entries 指向的结构体数组中

初始化消息链表、发送者链表、接收者链表

如果不是IPC_PRIVATE，在entries 中查找，如果找不到且没有创建标志，失败

如果找到对应的msg_queue，首先需要进行权限验证

生成一个内部id后返回

msgsnd

asmlinkage long

sys_msgsnd(int msqid, struct msgbuf __user *msgp, size_t msgsz, int msgflg)

创建了结构体 msg_sender

struct msg_sender {

struct list_head list;

struct task_struct *tsk;

};

如果消息队列中的消息的数目和当前所有消息的字节数不超过限定值，

就遍历rev 队列

如果是接收者需要的消息类型，且长度合适，就直接把消息送给该接收者，并且唤醒接收者进程，

否则就得存下来

Schedule()

进程调度函数

Linux完整地支持内核抢占。

if (unlikely(in_atomic() && !current->exit_state)) {

printk(KERN_ERR "BUG: scheduling while atomic: "

"%s/0x%08x/%d\n",

current->comm, preempt_count(), current->pid);

debug_show_held_locks(current);

if (irqs_disabled())

print_irqtrace_events(current);

dump_stack();

}

判断的条件是进程处于原子状态且进程尚未退出（exit_state 为 0）

# define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != kernel_locked())

#define kernel_locked() (current->lock_depth >= 0)

lock_depth初始值为-1，而preempt_count初始值为1

每次占用大内核锁，各自加1，重复进锁，lock_depth 增加而preempt_count不加

也就是说在pxa310中，如果进程占用了1次的大内核锁就认为是原子状态，

即进程占用了大内核锁且处于运行状态就不能schedule

接下来

preempt_disable();

prev = current;

release_kernel_lock(prev);

preempt 加1 ，有大内核锁就得释放

rq = this_rq(); 获取本CPU对应的runqueue

if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {

printk(KERN_ERR "bad: scheduling from the idle thread!\n");

dump_stack();

}

idle 线程在状态不是 TASK_RUNNING 的情况下不允许 schedule

now = sched_clock();

if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {

run_time = now - prev->timestamp;

if (unlikely((long long)(now - prev->timestamp) < 0))

run_time = 0;

} else

run_time = NS_MAX_SLEEP_AVG;

以ns为单位，取当前调度的时间戳，与本进程上一次调度的时间戳比较，获取

进程本次的运行时间（上限为 NS_MAX_SLEEP_AVG，出现负值时按 0 处理）

if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {

switch_count = &prev->nvcsw;

if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&

unlikely(signal_pending(prev))))

prev->state = TASK_RUNNING;

else {

if (prev->state == TASK_UNINTERRUPTIBLE)

rq->nr_uninterruptible++;

deactivate_task(prev, rq);

}

都到这了：进程不处于运行状态，且不是被内核抢占（未设置 PREEMPT_ACTIVE）。如果状态是可中断睡眠，且有未决信号，则重新置为运行状态；否则需要从运行队列中去掉

cpu = smp_processor_id();

if (unlikely(!rq->nr_running)) {

idle_balance(cpu, rq);

if (!rq->nr_running) {

next = rq->idle;

rq->expired_timestamp = 0;

goto switch_tasks;

}

如果当前CPU运行队列中可运行的进程数为0，就得进行负载均衡了

但我们的 pxa310 是单CPU的，就不用考虑啦

array = rq->active;

if (unlikely(!array->nr_active)) {

* Switch the active and expired arrays.

schedstat_inc(rq, sched_switch);

rq->active = rq->expired;

rq->expired = array;

array = rq->active;

rq->expired_timestamp = 0;

rq->best_expired_prio = MAX_PRIO;

}

如果nr_active为0，则交换 active 和expired

活动队列的结构有一个140bit的位图，可以根据这个活动位图来快速检索一个活动的task_struct

还有一个140个元素的链表头数组

每个链表头串着一串task_struct

idx = sched_find_first_bit(array->bitmap);

queue = array->queue + idx;

next = list_entry(queue->next, struct task_struct, run_list);

现在找到了一个task_struct

三种调度策略：SCHED_FIFO，SCHED_RR和SCHED_NORMAL。
FIFO实时调度算法当调度器将CPU指定给某个进程时，它把该进程放到运行队列首；除非有更高优先级的进程，否则该进程将一直占用CPU。
Round Robin实时进程调度把CPU指定给某进程，把它放到运行队列尾。时间片运行完再选择其他进程调度。这样保证了同优先级的公平竞争CPU。
SCHED_NORMAL是普通的基于运行时间和等待时间等，动态调整进程优先级的一种调度策略。
实时进程优先级0～99，普通进程100～139（数值越小优先级越高）。

实时进程调度的时机
1)      该进程被更高优先级的进程抢占；
2)      进程执行一个阻塞操作，被放到睡眠队列，状态为TASK_INTERRUPTIBLE或TASK_UNINTERRUPTIBLE；
3)      进程被停止(状态为TASK_STOPPED 或TASK_TRACED)，或者进程被杀死(状态为EXIT_ZOMBIE 或 EXIT_DEAD)
4)      进程调用sched_yield()主动放弃CPU；
5)      RR实时进程用完了CPU分配的时间片；

prefetch(next);

使用PLD 预取一下该内存地址

prefetch_stack(next);

clear_tsk_need_resched(prev);

清除thread_info中的重新调度标志（TIF_NEED_RESCHED）

context_switch

static inline struct task_struct *

context_switch(struct rq *rq, struct task_struct *prev,

struct task_struct *next)

{

struct mm_struct *mm = next->mm;

struct mm_struct *oldmm = prev->active_mm;

/*

* For paravirt, this is coupled with an exit in switch_to to

* combine the page table reload and the switch backend into

* one hypercall.

*/

重新加载页表

arch_enter_lazy_cpu_mode();

if (!mm) {

next->active_mm = oldmm;

atomic_inc(&oldmm->mm_count);

enter_lazy_tlb(oldmm, next);

} else

switch_mm(oldmm, mm, next);

看一下 switch_mm 这个函数

static inline void

switch_mm(struct mm_struct *prev, struct mm_struct *next,

struct task_struct *tsk)

{

#ifdef CONFIG_MMU

unsigned int cpu = smp_processor_id();

获取当前的CPU ID

if (prev != next) {

如果需要切换内存

cpu_set(cpu, next->cpu_vm_mask);

使用获取到的CPU_ID 设置虚拟内存空间的掩码

check_context(next);

cpu_switch_mm(next->pgd, next);

有如下定义

#define cpu_switch_mm(pgd,mm) cpu_do_switch_mm(virt_to_phys(pgd),mm)

#define cpu_do_switch_mm(pgd,mm) processor.switch_mm(pgd,mm)

struct processor processor;

这个结构体中包含了一些CPU相关的方法

对应着 proc-xscale.S 中的cpu_xscale_switch_mm 方法

#define PTE_CACHE_WRITE_ALLOCATE 0

/*

* cpu_xscale_switch_mm(pgd)

*

* Set the translation base pointer to be as described by pgd.

*

* pgd: new page tables

*/

.align 5

ENTRY(cpu_xscale_switch_mm)

clean_d_cache r1, r2

mcr p15, 0, ip, c7, c5, 0 @ Invalidate I cache & BTB

mcr p15, 0, ip, c7, c10, 4 @ Drain Write (& Fill) Buffer

mcr p15, 0, r0, c2, c0, 0 @ load page table pointer

mcr p15, 0, ip, c8, c7, 0 @ invalidate I & D TLBs

cpwait_ret lr, ip

重新加载了页目录表的基地址

if (cache_is_vivt())

cpu_clear(cpu, prev->cpu_vm_mask);

}

#endif

}

if (!prev->mm) {

prev->active_mm = NULL;

WARN_ON(rq->prev_mm);

rq->prev_mm = oldmm;

}

/*

* Since the runqueue lock will be released by the next

* task (which is an invalid locking op but in the case

* of the scheduler it's an obvious special-case), so we

* do an early lockdep release here:

*/

#ifndef __ARCH_WANT_UNLOCKED_CTXSW

spin_release(&rq->lock.dep_map, 1, _THIS_IP_);

#endif

/* Here we just switch the register state and the stack. */

switch_to(prev, next, prev);

return prev;

}

#define switch_to(prev,next,last) \

do { \

last = __switch_to(prev,task_thread_info(prev), task_thread_info(next)); \

} while (0)

Arm 的__switch_to实现如下

arch/arm/kernel/entry-armv.S

__switch_to

主要做了4件事

1保存现场

2设置tls domain 寄存器

3通知线程切换事件

4切换至新的现场

* Register switch for ARMv3 and ARMv4 processors

* r0 = previous task_struct, r1 = previous thread_info, r2 = next thread_info

* previous and next are guaranteed not to be the same.

ENTRY(__switch_to)

add ip, r1, #TI_CPU_SAVE

在arm中有如下定义

DEFINE(TI_CPU_SAVE, offsetof(struct thread_info, cpu_context));

由于之前r1 中已经是 previous thread_info

所以 ip 现在是cpu_context

ldr r3, [r2, #TI_TP_VALUE]

有定义

DEFINE(TI_TP_VALUE, offsetof(struct thread_info, tp_value));

将thread_info 中 tp_value 的值放入r3 中

stmia ip!, {r4 - sl, fp, sp, lr} @ Store most regs on stack

ip现在指向的是一个cpu_context_save 结构

struct cpu_context_save {

__u32 r4;

__u32 r5;

__u32 r6;

__u32 r7;

__u32 r8;

__u32 r9;

__u32 sl;

__u32 fp;

__u32 sp;

__u32 pc;

};

该指令把 r4-sl、fp、sp、lr 的值都放入了 cpu_context_save 中（lr 保存在 pc 字段）

#ifdef CONFIG_MMU

ldr r6, [r2, #TI_CPU_DOMAIN]

#endif

配置了MMU

将新的thread_info 中的cpu_domain 放入r6中

#if __LINUX_ARM_ARCH__ >= 6

#ifdef CONFIG_CPU_32v6K

clrex

清除独占

#else

独占

strex r5, r4, [ip] @ Clear exclusive monitor

#endif

#if defined(CONFIG_HAS_TLS_REG)

mcr p15, 0, r3, c13, c0, 3 @ set TLS register

现在的 r3 是 tp_value，这里把它写入 CP15 的 TLS 寄存器

#elif !defined(CONFIG_TLS_REG_EMUL)

mov r4, #0xffff0fff

str r3, [r4, #-15] @ TLS val at 0xffff0ff0

#endif

#ifdef CONFIG_MMU

mcr p15, 0, r6, c3, c0, 0 @ Set domain register

设置了cpu_domain

#endif

mov r5, r0

r0 中为task_struct 地址传给r5

add r4, r2, #TI_CPU_SAVE

r4 指向保存的cpu_context

ldr r0, =thread_notify_head

在process.c 中定义了

ATOMIC_NOTIFIER_HEAD(thread_notify_head);

mov r1, #THREAD_NOTIFY_SWITCH

有定义

#define THREAD_NOTIFY_SWITCH 2

bl atomic_notifier_call_chain

也就是将线程切换的消息通知出去

mov r0, r5

ldmia r4, {r4 - sl, fp, sp, pc} @ Load all regs saved previously

真正切换到新的现场

context_switch 主要的作用就是：更新页目录表基地址、保存CPU寄存器现场、切换到新的现场

到这里就将schedule 的流程梳理了一遍

再来看一下

do_msgrcv

按照指定的模式在消息列表中找，找到就好

如果找不到，就构造一个接收者（msg_receiver）结构体，放入接收者链表

重新调度，当再次执行的时候，循环判断自身等待的消息是否存在，如果有了就退出

总结：

消息队列究竟是怎么回事？

消息队列是 Linux 在内核中实现、并向应用层提供接口的一种进程间通信机制，当然也可用于进程内，呵呵。

对于接受方来说,

有几种获取消息的方式

1 接受所有消息

2 接受指定消息或除了某种消息外的所有消息

3 接收类型值不大于某个指定值的消息（优先取其中类型值最小的）

当前的设计让接受方只能知道消息类型和具体消息，至于发送消息的进程ID，发送时间等内容并没有返回到上层（出错的时候带上了进程ID）

以阻塞的方式接受。

对同一个消息队列来说，发送者可以有多个，所有发往同一个消息队列的消息都能被监听该消息队列的接受者得到

对于发送方来说

也分阻塞发送，和非阻塞发送2种

如果消息队列满了，阻塞发送就得等，直到有了对应的接受者或有了足够的空间存放，

非阻塞在这种情况下就直接返回了，当然消息也就丢掉了

Pxa310 所用的系统中最多可以支持16个消息队列

在调试的时候可以cat /proc/sysvipc/msg

来查看msg使用的详细情况。