kernel 系统调用----system call
来源:互联网 发布:民航网络信息安全 编辑:程序博客网 时间:2024/05/17 04:20
Init
在trap_init中对SYSCALL_VECTOR(编号0x80)的向量进行初始化。
808 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
将system_call初始化为trap门,并加入到IDT中。发生中断以后,CPU会跳转到system_call的地址,执行后续的中断流程。从发生中断到跳转执行中断向量的过程,在《kernel 中断分析三——中断处理流程》一文中有详细解释,本篇只关注system_call的运行过程。
ENTRY(system_call)
499 /* 500 * syscall stub including irq exit should be protected against kprobes 501 */ 502 .pushsection .kprobes.text, "ax" 503 # system call handler stub 504 ENTRY(system_call) 505 RING0_INT_FRAME # can't unwind into user space anyway 506 ASM_CLAC 507 pushl_cfi %eax # save orig_eax --------------1 508 SAVE_ALL -----------2 509 GET_THREAD_INFO(%ebp) -----------3 510 # system call tracing in operation / emulation 511 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) ---------4 512 jnz syscall_trace_entry 513 cmpl $(NR_syscalls), %eax 514 jae syscall_badsys 515 syscall_call: --------5 516 call *sys_call_table(,%eax,4) 517 syscall_after_call: --------6 518 movl %eax,PT_EAX(%esp) # store the return value 519 syscall_exit: --------7 520 LOCKDEP_SYS_EXIT 521 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt 522 # setting need_resched or sigpending 523 # between sampling and the iret 524 TRACE_IRQS_OFF 525 movl TI_flags(%ebp), %ecx 526 testl $_TIF_ALLWORK_MASK, %ecx # current->work 527 jne syscall_exit_work 528
- RING0_INT_FRAME只是一组CFI调试标注,用来描述进入内核时的栈帧布局(供栈回溯使用),本身不生成指令,也不会修改esp、eip;随后pushl_cfi将eax中的系统调用号入栈,保存为orig_eax
- 保存现场,即用户态的一些寄存器值
- 将thread_info的地址保存到ebp寄存器
- 当前进程是否有被trace,如果有就执行相关的动作保存当时的追踪信息
- 调用对应的系统调用函数
- 将系统调用的返回值(eax)写入栈上已保存的寄存器现场中eax对应的位置(PT_EAX(%esp)),这样返回用户态恢复寄存器时,用户程序就能在eax中拿到返回值
- 关闭本地中断,避免在读取标志位和执行iret之间漏掉设置need_resched或sigpending的中断。然后读取TI_flags,检测当前进程是否还有工作没有完成(_TIF_ALLWORK_MASK),如果有,那么跳转到syscall_exit_work
- 然后恢复userspace被压入栈的寄存器,返回userspace
529 restore_all: 530 TRACE_IRQS_IRET 531 restore_all_notrace: 532 #ifdef CONFIG_X86_ESPFIX32 533 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS ------------1 534 # Warning: PT_OLDSS(%esp) contains the wrong/random values if we 535 # are returning to the kernel. 536 # See comments in process.c:copy_thread() for details. 537 movb PT_OLDSS(%esp), %ah 538 movb PT_CS(%esp), %al 539 andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax 540 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax 541 CFI_REMEMBER_STATE 542 je ldt_ss # returning to user-space with LDT SS 543 #endif 544 restore_nocheck: 545 RESTORE_REGS 4 # skip orig_eax/error_code 546 irq_return: 547 INTERRUPT_RETURN ---------- 548 .section .fixup,"ax" 549 ENTRY(iret_exc) 550 pushl $0 # no error code 551 pushl $do_iret_error 552 jmp error_code 553 .previous 554 _ASM_EXTABLE(irq_return,iret_exc) 555 556 #ifdef CONFIG_X86_ESPFIX32 557 CFI_RESTORE_STATE 558 ldt_ss: 559 #ifdef CONFIG_PARAVIRT 560 /* 561 * The kernel can't run on a non-flat stack if paravirt mode 562 * is active. Rather than try to fixup the high bits of 563 * ESP, bypass this code entirely. This may break DOSemu 564 * and/or Wine support in a paravirt VM, although the option 565 * is still available to implement the setting of the high 566 * 16-bits in the INTERRUPT_RETURN paravirt-op. 567 */ 568 cmpl $0, pv_info+PARAVIRT_enabled 569 jne restore_nocheck 570 #endif 571 572 /* 573 * Setup and switch to ESPFIX stack 574 * 575 * We're returning to userspace with a 16 bit stack. The CPU will not 576 * restore the high word of ESP for us on executing iret... This is an 577 * "official" bug of all the x86-compatible CPUs, which we can work 578 * around to make dosemu and wine happy. We do this by preloading the 579 * high word of ESP with the high word of the userspace ESP while 580 * compensating for the offset by changing to the ESPFIX segment with 581 * a base address that matches for the difference. 
582 */ 583 #define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) 584 mov %esp, %edx /* load kernel esp */ 585 mov PT_OLDESP(%esp), %eax /* load userspace esp */ 586 mov %dx, %ax /* eax: new kernel esp */ 587 sub %eax, %edx /* offset (low word is 0) */ 588 shr $16, %edx 589 mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ 590 mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ 591 pushl_cfi $__ESPFIX_SS 592 pushl_cfi %eax /* new kernel esp */ 593 /* Disable interrupts, but do not irqtrace this section: we 594 * will soon execute iret and the tracer was already set to 595 * the irqstate after the iret */ 596 DISABLE_INTERRUPTS(CLBR_EAX) 597 lss (%esp), %esp /* switch to espfix segment */ 598 CFI_ADJUST_CFA_OFFSET -8 599 jmp restore_nocheck 600 #endif 601 CFI_ENDPROC 602 ENDPROC(system_call)
syscall_exit_work
_TIF_ALLWORK_MASK 的定义如下:
144 /* Work to do on any return to user space. */145 #define _TIF_ALLWORK_MASK \146 (_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SINGLESTEP|\147 _TIF_ASYNC_TLB|_TIF_NOTIFY_RESUME)
当以下情况之一发生时,返回用户态之前需要进入syscall_exit_work处理:
1. 当前进程有信号pending
2. 当前进程需要被重新调度
3. 设置了_TIF_SINGLESTEP,返回用户态时需要恢复单步执行(singlestep)状态
4. 在内核态发生了异步TLB fault(_TIF_ASYNC_TLB)
5. 返回用户态之前有回调需要执行(_TIF_NOTIFY_RESUME)
670 syscall_exit_work: 671 testl $_TIF_WORK_SYSCALL_EXIT, %ecx----------1 672 jz work_pending 673 TRACE_IRQS_ON 674 ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call 675 # schedule() instead 676 movl %esp, %eax 677 call syscall_trace_leave 678 jmp resume_userspace------------------------2 679 END(syscall_exit_work)
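- 检测_TIF_WORK_SYSCALL_EXIT中的标志位是否被设置;如果都没有设置,则跳转到work_pending处理其余的pending工作
- 否则开中断,以esp(即pt_regs指针)为参数调用syscall_trace_leave,然后跳转到resume_userspace返回用户态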
607 work_pending: 608 testb $_TIF_NEED_RESCHED, %cl -------------1 609 jz work_notifysig -------------2 610 work_resched: -------------3 611 call schedule 612 LOCKDEP_SYS_EXIT 613 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt 614 # setting need_resched or sigpending 615 # between sampling and the iret 616 TRACE_IRQS_OFF 617 movl TI_flags(%ebp), %ecx 618 andl $_TIF_WORK_MASK, %ecx # is there any work to be done other 619 # than syscall tracing? 620 jz restore_all 621 testb $_TIF_NEED_RESCHED, %cl 622 jnz work_resched 623 624 work_notifysig: # deal with pending signals and-------------------4 625 # notify-resume requests 626 #ifdef CONFIG_VM86 627 testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) 628 movl %esp, %eax 629 jne work_notifysig_v86 # returning to kernel-space or 630 # vm86-space 631 1: 632 #else 633 movl %esp, %eax 634 #endif 635 TRACE_IRQS_ON 636 ENABLE_INTERRUPTS(CLBR_NONE) 637 movb PT_CS(%esp), %bl 638 andb $SEGMENT_RPL_MASK, %bl 639 cmpb $USER_RPL, %bl 640 jb resume_kernel 641 xorl %edx, %edx 642 call do_notify_resume -------------------5 643 jmp resume_userspace 644 645 #ifdef CONFIG_VM86 646 ALIGN 647 work_notifysig_v86: 648 pushl_cfi %ecx # save ti_flags for do_notify_resume 649 call save_v86_state # %eax contains pt_regs pointer 650 popl_cfi %ecx 651 movl %eax, %esp 652 jmp 1b 653 #endif 654 END(work_pending)
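- 检测_TIF_NEED_RESCHED,若被设置,则继续向下执行work_resched;否则跳转到work_notifysig,进行信号处理
- 调用schedule主动让出CPU;schedule返回后关中断并重新读取TI_flags:如果已经没有其他工作,则跳转到restore_all;如果_TIF_NEED_RESCHED仍被设置,则回到work_resched再次调度;否则继续进入work_notifysig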
- 处理pending的信号,具体的处理流程在do_notify_resume 中的do_signal
整个处理流程用流程图表现得更加直观:
0 0
- kernel 系统调用----system call
- 系统调用system call
- linux 系统调用system call
- Android 中用内核模块实现系统调用(Implement system call by kernel module in Android)
- linux系统调用表(system call table)
- fast system call 快速系统调用
- 系统调用(system call)与应用程序接口(API)
- System Call 扒开系统调用的三层皮[未完待续~]
- 系统调用(system call)和库函数调用(Library functions)
- 远程调用内核接口(remote call kernel)
- base-kernel-系统调用
- Linux X86 系统调用列表 system call table 32 bits and 64 bits
- msgrcv出错errno=4[Interrupted system call]系统调用被信号中断
- measure the cost of a system call(测量系统调用时间)
- Adding a New System Call into the Linux Kernel 2.6
- linux新增system call(for kernel 2.6)
- ZZ: linux新增system call(for kernel 2.6)
- linux新增system call(for kernel 2.6)
- http://blog.chinaunix.net/uid-31438209-id-5760176.html
- 框架梳理|企业大数据管理之道
- VC下寻找某个进程并关闭
- 并查集详解
- MySql性能优化一
- kernel 系统调用----system call
- OkHttp的实现原理(一)之同步
- jQuery方法扩展代码整理
- 程序员的鄙视链
- yarn资源隔离
- mysql无法更改初始密码,mysql忘记登录密码
- js点击复制文本
- ProtocolBuffer Java Jar 生成指导
- Git推送报错:The remote end hung up unexpectedly的解决办法