基于ARM平台分析Linux系统调用过程

来源：互联网 | 发布：测试端口是否开启 | 编辑：程序博客网 | 时间：2024/05/24 06:30

Linux 3.10.44 + ARM V7 + Android:4.4.2

一 open

以open系统调用为例，其在bionic中的入口代码位于bionic/libc/bionic/open.c中，源代码如下：

int open(const char *pathname, int flags, ...)

{

mode_t mode = 0;

flags |= O_LARGEFILE;

if (flags & O_CREAT)

{

va_list args;

va_start(args, flags);

mode = (mode_t) va_arg(args, int);

va_end(args);

}

return __open(pathname, flags, mode);

}

仅在创建新文件时，mode参数才有作用。

二 __open

代码位于bionic/libc/arch-arm/syscalls/__open.S中，源代码如下：

ENTRY(__open)

mov ip, r7

ldr r7, =__NR_open

swi #0

mov r7, ip

cmn r0, #(MAX_ERRNO + 1)

bxls lr

neg r0, r0

b __set_errno

END(__open)

首先将r7的内容保存到ip寄存器中，再将系统调用号放到r7中，然后执行SWI（软中断，在新的汇编语法中已改名为SVC）指令陷入内核模式，后边的处理就交给内核了。从内核返回后首先恢复r7寄存器，然后通过cmn指令判断调用是否成功：如果返回值r0落在[-MAX_ERRNO, -1]范围内则表示失败，此时将r0取反得到错误码，再跳转到__set_errno设置errno；否则直接返回。

三 vector_swi

异常处理的代码位于arch/arm/kernel/entry-armv.S，摘取如下：

__vectors_start:

W(b) vector_rst

W(b) vector_und

W(ldr) pc, __vectors_start + 0x1000

W(b) vector_pabt

W(b) vector_dabt

W(b) vector_addrexcptn

W(b) vector_irq

W(b) vector_fiq

在0x08偏移处（即SWI/SVC异常对应的向量）的指令将__vectors_start + 0x1000地址中的内容加载到PC，那么该地址中存放的是什么值呢？参考该文件下边的内容：.stubs段的第一个字就是vector_swi的地址，因此该指令执行后会跳转到vector_swi中执行。

.section .stubs, "ax", %progbits

__stubs_start:

@ This must be the first word

.word vector_swi

vector_swi源代码位于arch/arm/kernel/entry-common.S

S_FRAME_SIZE及S_PC定义在arch/arm/kernel/asm-offsets.c和arch/arm/include/uapi/asm/ptrace.h中，参考如下定义，整个frame包含18个寄存器，除了r0~r15外再增加cpsr和原r0。

#ifndef __KERNEL__

struct pt_regs {

long uregs[18];

};

#endif /* __KERNEL__ */

#define ARM_cpsr uregs[16]

#define ARM_pc uregs[15]

#define ARM_lr uregs[14]

#define ARM_sp uregs[13]

#define ARM_ip uregs[12]

#define ARM_fp uregs[11]

#define ARM_r10 uregs[10]

#define ARM_r9 uregs[9]

#define ARM_r8 uregs[8]

#define ARM_r7 uregs[7]

#define ARM_r6 uregs[6]

#define ARM_r5 uregs[5]

#define ARM_r4 uregs[4]

#define ARM_r3 uregs[3]

#define ARM_r2 uregs[2]

#define ARM_r1 uregs[1]

#define ARM_r0 uregs[0]

#define ARM_ORIG_r0 uregs[17]

以下是SWI handler的代码：已去掉一些宏开关，并假设CONFIG_OABI_COMPAT和CONFIG_ARM_THUMB都设置为Y。

/*=============================================================================

* SWI handler

*-----------------------------------------------------------------------------

.align 5

ENTRY(vector_swi)

//首先保存寄存器信息

sub sp, sp, #S_FRAME_SIZE

stmia sp, {r0 - r12} @ Calling r0 - r12

ARM( add r8, sp, #S_PC )

ARM( stmdb r8, {sp, lr}^ ) @ Calling sp, lr

THUMB( mov r8, sp )

THUMB( store_user_sp_lr r8, r10, S_SP ) @ calling sp, lr

mrs r8, spsr @ called from non-FIQ mode, so ok.

str lr, [sp, #S_PC] @ Save calling PC

str r8, [sp, #S_PSR] @ Save CPSR

str r0, [sp, #S_OLD_R0] @ Save OLD_R0

zero_fp//fp清零

//遗留

#ifdef CONFIG_ALIGNMENT_TRAP

ldr ip, __cr_alignment

ldr ip, [ip]

mcr p15, 0, ip, c1, c0 @ update control register

#endif

//开中断

enable_irq

//调用内核用户态跟踪的函数

ct_user_exit

//获取thread_info指针到r9寄存器

get_thread_info tsk

* Get the system call number.

* If we have CONFIG_OABI_COMPAT then we need to look at the swi

* value to determine if it is an EABI or an old ABI call.

tst r8, #PSR_T_BIT

movne r10, #0 @ no thumb OABI emulation

USER( ldreq r10, [lr, #-4] ) @ get SWI instruction

ARM_BE8(rev r10, r10) @ little endian instruction

adr tbl, sys_call_table @ load syscall table pointer

* If the swi argument is zero, this is an EABI call and we do nothing.

* If this is an old ABI call, get the syscall number into scno and

* get the old ABI syscall table address.

bics r10, r10, #0xff000000

eorne scno, r10, #__NR_OABI_SYSCALL_BASE

ldrne tbl, =sys_oabi_call_table

local_restart:

ldr r10, [tsk, #TI_FLAGS] @ check for syscall tracing

stmdb sp!, {r4, r5} @ push fifth and sixth args

tst r10, #_TIF_SYSCALL_WORK @ are we tracing syscalls?

bne __sys_trace

cmp scno, #NR_syscalls @ check upper syscall limit

//设置返回地址为ret_fast_syscall

adr lr, BSYM(ret_fast_syscall) @ return address

//如果是正常的系统调用那么执行系统调用函数

ldrcc pc, [tbl, scno, lsl #2] @ call sys_* routine

//以下处理特殊的系统调用

add r1, sp, #S_OFF

2: mov why, #0 @ no longer a real syscall

cmp scno, #(__ARM_NR_BASE - __NR_SYSCALL_BASE)

eor r0, scno, #__NR_SYSCALL_BASE @ put OS number back

bcs arm_syscall

b sys_ni_syscall @ not private func

* We failed to handle a fault trying to access the page

* containing the swi instruction, but we're not really in a

* position to return -EFAULT. Instead, return back to the

* instruction and re-enter the user fault handling path trying

* to page it in. This will likely result in sending SEGV to the

* current task.

//异常处理的功能，把lr减4，那么将再次做系统调用

9001:

sub lr, lr, #4

str lr, [sp, #S_PC]

b ret_fast_syscall

ENDPROC(vector_swi)

四 关于USER宏的作用

USER( ldreq r10, [lr, #-4] ) @ get SWI instruction

#define USER(x...) \

9999: x; \

.pushsection __ex_table,"a"; \

.align 3; \

.long 9999b,9001f; \

.popsection

ldreq r10, [lr, #-4]在读取内容时存在该页尚未换入的可能，这样直接访问就可能

导致page fault。USER宏的作用是在__ex_table中增加一个异常处理表项，表示该指

令出错后会执行9001标签后的代码。

以下是这个patch的原始说明：

ARM: 7748/1: oabi: handle faults when loading swi instruction from userspace

Running an OABI_COMPAT kernel on an SMP platform can lead to fun and

games with page aging.

If one CPU issues a swi instruction immediately before another CPU

decides to mkold the page containing the swi instruction, then we will

fault attempting to load the instruction during the vector_swi handler

in order to retrieve its immediate field. Since this fault is not

currently dealt with by our exception tables, this results in a panic:

Unable to handle kernel paging request at virtual address 4020841c

pgd = c490c000

[4020841c] *pgd=84451831, *pte=bf05859d, *ppte=00000000

Internal error: Oops: 17 [#1] PREEMPT SMP ARM

Modules linked in: hid_sony(O)

CPU: 1 Tainted: G W O (3.4.0-perf-gf496dca-01162-gcbcc62b #1)

PC is at vector_swi+0x28/0x88

LR is at 0x40208420

This patch wraps all of the swi instruction loads with the USER macro

and provides a shared exception table entry which simply rewinds the

saved user PC and returns from the system call (without setting tbl, so

there's no worries with tracing or syscall restarting). Returning to

userspace will re-enter the page fault handler, from where we will

probably send SIGSEGV to the current task.

Reported-by: Wang, Yalin <yalin.wang@sonymobile.com>

Reviewed-by: Nicolas Pitre <nico@linaro.org>

Signed-off-by: Will Deacon <will.deacon@arm.com>

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>

从以上说明看，问题发生在一个核刚执行完swi陷入内核后，另一个核就把包含swi指令的页标记为old

（mkold，即页老化）了，这个时候内核读取swi指令就会触发异常并导致oops。增加了异常处理表项后，

如果发生page fault，那么首先会产生data abort，异常处理流程会跳转到9001处的代码，这段代码会将

lr减4并写回保存的用户态PC，再返回到用户空间，这样用户空间会重新执行swi指令，此时产生缺页异常

再次陷入内核，内核将该页换入，然后再陷入内核执行系统调用（patch说明中也提到，此时也可能向当前任务发送SIGSEGV）。

五 ret_fast_syscall

系统调用执行完毕后就会通过调用ret_fast_syscall返回到用户空间。

.align 5

* This is the fast syscall return path. We do as little as

* possible here, and this includes saving r0 back into the SVC

* stack.

ret_fast_syscall:

UNWIND(.fnstart )

UNWIND(.cantunwind )

disable_irq @ disable interrupts

ldr r1, [tsk, #TI_FLAGS]

tst r1, #_TIF_WORK_MASK

//检查返回用户空间前是否需要做些其他事情，比如need rescheduled到其他进程

bne fast_work_pending

asm_trace_hardirqs_on

/* perform architecture specific actions before user return */

arch_ret_to_user r1, lr

ct_user_enter

//返回到用户空间

restore_user_regs fast = 1, offset = S_OFF

UNWIND(.fnend )

六 系统调用过程中的异常

因为在vector_swi中会通过zero_fp将fp设置为0，且对于内核态来说vector_swi是调用的起点，

所以在vector_swi中发生异常后是没有调用栈的，并且会提示“no frame pointer”，如下所示：

Unable to handle kernel paging request at virtual address 400da2dc

pgd = d91c4000

[400da2dc] *pgd=1fead831, *pte=1793c59d, *ppte=00000000

Internal error: Oops: 17 [#1] PREEMPT SMP ARM

Modules linked in:

CPU: 1 PID: 2204 Comm: ActivityManager Not tainted 3.10.24-00056-g2526063 #1

task: d9353a00 ti: d3c4a000 task.ti: d3c4a000

PC is at vector_swi+0x2c/0x58

LR is at 0x400da2e0

pc : [<c000e2ec>] lr : [<400da2e0>] psr: 600f0093

sp : d3c4bfb0 ip : 40108384 fp : 00000000

r10: 000000a4 r9 : 629e2a98 r8 : 200f0010

r7 : 00000003 r6 : ada00009 r5 : 5fbd69d4 r4 : 5b505c80

r3 : 00000000 r2 : 000000ff r1 : 5fbd69d4 r0 : 000000a4

Flags: nZCv IRQs off FIQs on Mode SVC_32 ISA ARM Segment user

Control: 10c5387d Table: 1fdc406a DAC: 00000015

PC: 0xc000e26c:

e26c e16ff001 f57ff01f e95d7fff e1a00000 e28dd00c e1b0f00e eb0122cd e3550000

e28c 11a00004 128fe000 11a0f005 e1a096ad e1a09689 eaffffeb e320f000 e320f000

e2ac e320f000 e320f000 e320f000 e320f000 e320f000 e24dd048 e88d1fff e28d803c

e2cc e9486000 e14f8000 e58de03c e58d8040 e58d0044 e3a0b000 e3180020 13a0a000

e2ec 051ea004 e59fc0a8 e59cc000 ee01cf10 f1080080 e1a096ad e1a09689 e28f8098

e30c e3daa4ff 122a7609 159f8088 e599a000 e92d0030 e31a0c0f 1a000008 e3570f5f

e32c e24fef4d 3798f107 e28d1008 e3a08000 e357080f e2270000 2a0013df ea00eb4e

e34c e1a01007 e28d0008 eb000b1e e28fe024 e1a07000 e28d1008 e3570f5f 3891007f

SP: 0xd3c4bf30:

bf30 d3c4bf54 d3c4bf40 c0101090 c00f1b54 00000000 000000a4 c000e2ec 600f0093

bf50 ffffffff d3c4bf9c 00000000 d3c4bf68 c000dd58 c000849c 000000a4 5fbd69d4

bf70 000000ff 00000000 5b505c80 5fbd69d4 ada00009 00000003 200f0010 629e2a98

bf90 000000a4 00000000 40108384 d3c4bfb0 400da2e0 c000e2ec 600f0093 ffffffff

bfb0 000000a4 5fbd69d4 000000ff 00000000 5b505c80 5fbd69d4 ada00009 00000003

bfd0 ebc00001 629e2a98 000000a4 00000000 40108384 5fbd69b0 401f5527 400da2e0

bff0 200f0010 000000a4 ff566758 59777795 00000000 00000002 bf000000 d93b3a00

c010 c0a5161c 00000002 00000015 d93b3a00 00000004 d3c4c000 c0a469b0 c0b052d8

Process ActivityManager (pid: 2204, stack limit = 0xd3c4a238)

Stack: (0xd3c4bfb0 to 0xd3c4c000)

bfa0: 000000a4 5fbd69d4 000000ff 00000000

bfc0: 5b505c80 5fbd69d4 ada00009 00000003 ebc00001 629e2a98 000000a4 00000000

bfe0: 40108384 5fbd69b0 401f5527 400da2e0 200f0010 000000a4 ff566758 59777795

Backtrace: no frame pointer

Code: e58d0044 e3a0b000 e3180020 13a0a000 (051ea004)

---[ end trace 10d7d4bd070793====1401323113.82689

除vector_swi中会将fp设置为0外，还有几个地方也会设置fp为0：

1. 不合法的异常入口

2. fiq入口

3. 在user模式进入各种异常时

这些地方的共同特点是：对于内核来讲它们都是调用的起点，所以fp为0是正常的。

common_invalid:

zero_fp

ldmia r0, {r4 - r6}

add r0, sp, #S_PC @ here for interlock avoidance

mov r7, #-1 @ "" "" "" ""

str r4, [sp] @ save preserved r0

stmia r0, {r5 - r7} @ lr_<exception>,

@ cpsr_<exception>, "old_r0"

mov r0, sp

b bad_mode

ENDPROC(__und_invalid)

因为在调用内核中的系统调用处理函数前，lr寄存器会被设置成ret_fast_syscall所在的位置，所以

如果这个时候内核发生异常，那么调用栈的最后一级显示的是ret_fast_syscall。实际上这个是不准确的，因为

系统调用处理函数的真正调用起点是vector_swi，这里只是调用栈的显示问题。

Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000100

CPU: 3 PID: 1 Comm: init Tainted: G W 3.10.44-g516054c #1

Backtrace:

[<c0013410>] (dump_backtrace+0x0/0x10c) from [<c0013748>] (show_stack+0x18/0x1c)

r6:e1660000 r5:c0a1bab4 r4:c0d8dae8 r3:00000000

[<c0013730>] (show_stack+0x0/0x1c) from [<c083aac0>] (dump_stack+0x20/0x28)

[<c083aaa0>] (dump_stack+0x0/0x28) from [<c0836ac8>] (panic+0x98/0x1fc)

[<c0836a30>] (panic+0x0/0x1fc) from [<c002a10c>] (do_exit+0x7d8/0x930)

r0:c0a1bab4[2014:06:27 01:38:10][pid:1,cpu3,init]

r7:e02c9e00

[<c0029934>] (do_exit+0x0/0x930) from [<c002a3e0>] (do_group_exit+0x44/0xb8)

r7:000000f8

[<c002a39c>] (do_group_exit+0x0/0xb8) from [<c002a46c>] (__wake_up_parent+0x0/0x28)

r7:000000f8 r6:beac5d54 r5:00000000 r4:beac5c34

[<c002a454>] (SyS_exit_group+0x0/0x18) from [<c000e880>] (ret_fast_syscall+0x0/0x30)

遗留问题：

1. 以上的分析都是基于如下假设：内核态不能直接处理这种缺页异常，如果异常处理表中有对应的表项就不会发生

oops，如果没有就会发生oops。这部分相关的代码尚未分析。

2. asm_trace_hardirqs_on的具体作用

0 0