kernel 系统调用----system call

来源:互联网 发布:民航网络信息安全 编辑:程序博客网 时间:2024/05/17 04:20

Init

在trap_init中对SYSCALL_VECTOR(编号0x80)的向量进行初始化。

 808     set_system_trap_gate(SYSCALL_VECTOR, &system_call);

将system_call初始化为trap门,并加入到IDT中。发生中断(int 0x80)以后,CPU会跳转到system_call的地址去执行后续的中断流程。从发生中断到跳转执行中断向量的过程,在《kernel 中断分析三——中断处理流程》中有详细解释,本篇只关注system_call的运行过程。

ENTRY(system_call)

 499 /*
 500  * syscall stub including irq exit should be protected against kprobes
 501  */
 502     .pushsection .kprobes.text, "ax"
 503     # system call handler stub
 504 ENTRY(system_call)
 505     RING0_INT_FRAME         # can't unwind into user space anyway
 506     ASM_CLAC
 507     pushl_cfi %eax          # save orig_eax  --------------1
 508     SAVE_ALL                                    -----------2
 509     GET_THREAD_INFO(%ebp)                       -----------3
 510                     # system call tracing in operation / emulation
 511     testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) ---------4
 512     jnz syscall_trace_entry
 513     cmpl $(NR_syscalls), %eax
 514     jae syscall_badsys
 515 syscall_call:                                      --------5
 516     call *sys_call_table(,%eax,4)
 517 syscall_after_call:                                --------6
 518     movl %eax,PT_EAX(%esp)      # store the return value
 519 syscall_exit:                                      --------7
 520     LOCKDEP_SYS_EXIT
 521     DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
 522                     # setting need_resched or sigpending
 523                     # between sampling and the iret
 524     TRACE_IRQS_OFF
 525     movl TI_flags(%ebp), %ecx
 526     testl $_TIF_ALLWORK_MASK, %ecx  # current->work
 527     jne syscall_exit_work
 528
  1. RING0_INT_FRAME只是CFI调试信息注解(描述栈帧中esp、eip的位置,供栈回溯使用),本身不产生指令;随后将eax中的系统调用号入栈,保存为orig_eax
  2. 保存现场,即用户态的一些寄存器值
  3. 将thread_info的地址保存到ebp寄存器
  4. 检查当前进程是否被trace(_TIF_WORK_SYSCALL_ENTRY),如果是就跳转到syscall_trace_entry,执行相关的动作保存当时的追踪信息;随后检查eax中的系统调用号,若大于等于NR_syscalls则跳转到syscall_badsys
  5. 调用对应的系统调用函数
  6. 将返回值(eax)保存到栈上已保存的eax位置,即PT_EAX(%esp),而不是再次压栈
  7. 关闭本地中断,避免在检查标志与iret之间漏掉设置need_resched或sigpending的中断。然后检测当前进程是否还有工作没有完成(_TIF_ALLWORK_MASK),如果有,那么跳转到syscall_exit_work
  8. 如果没有未完成的工作,则继续执行下面的restore_all,恢复userspace被压入栈的寄存器,返回userspace
 529 restore_all:
 530     TRACE_IRQS_IRET
 531 restore_all_notrace:
 532 #ifdef CONFIG_X86_ESPFIX32
 533     movl PT_EFLAGS(%esp), %eax  # mix EFLAGS, SS and CS     ------------1
 534     # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
 535     # are returning to the kernel.
 536     # See comments in process.c:copy_thread() for details.
 537     movb PT_OLDSS(%esp), %ah
 538     movb PT_CS(%esp), %al
 539     andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
 540     cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
 541     CFI_REMEMBER_STATE
 542     je ldt_ss           # returning to user-space with LDT SS
 543 #endif
 544 restore_nocheck:
 545     RESTORE_REGS 4          # skip orig_eax/error_code
 546 irq_return:
 547     INTERRUPT_RETURN                            ----------
 548 .section .fixup,"ax"
 549 ENTRY(iret_exc)
 550     pushl $0            # no error code
 551     pushl $do_iret_error
 552     jmp error_code
 553 .previous
 554     _ASM_EXTABLE(irq_return,iret_exc)
 555
 556 #ifdef CONFIG_X86_ESPFIX32
 557     CFI_RESTORE_STATE
 558 ldt_ss:
 559 #ifdef CONFIG_PARAVIRT
 560     /*
 561      * The kernel can't run on a non-flat stack if paravirt mode
 562      * is active.  Rather than try to fixup the high bits of
 563      * ESP, bypass this code entirely.  This may break DOSemu
 564      * and/or Wine support in a paravirt VM, although the option
 565      * is still available to implement the setting of the high
 566      * 16-bits in the INTERRUPT_RETURN paravirt-op.
 567      */
 568     cmpl $0, pv_info+PARAVIRT_enabled
 569     jne restore_nocheck
 570 #endif
 571
 572 /*
 573  * Setup and switch to ESPFIX stack
 574  *
 575  * We're returning to userspace with a 16 bit stack. The CPU will not
 576  * restore the high word of ESP for us on executing iret... This is an
 577  * "official" bug of all the x86-compatible CPUs, which we can work
 578  * around to make dosemu and wine happy. We do this by preloading the
 579  * high word of ESP with the high word of the userspace ESP while
 580  * compensating for the offset by changing to the ESPFIX segment with
 581  * a base address that matches for the difference.
 582  */
 583 #define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
 584     mov %esp, %edx          /* load kernel esp */
 585     mov PT_OLDESP(%esp), %eax   /* load userspace esp */
 586     mov %dx, %ax            /* eax: new kernel esp */
 587     sub %eax, %edx          /* offset (low word is 0) */
 588     shr $16, %edx
 589     mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
 590     mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
 591     pushl_cfi $__ESPFIX_SS
 592     pushl_cfi %eax          /* new kernel esp */
 593     /* Disable interrupts, but do not irqtrace this section: we
 594      * will soon execute iret and the tracer was already set to
 595      * the irqstate after the iret */
 596     DISABLE_INTERRUPTS(CLBR_EAX)
 597     lss (%esp), %esp        /* switch to espfix segment */
 598     CFI_ADJUST_CFA_OFFSET -8
 599     jmp restore_nocheck
 600 #endif
 601     CFI_ENDPROC
 602 ENDPROC(system_call)

syscall_exit_work

_TIF_ALLWORK_MASK 的定义如下:

 144 /* Work to do on any return to user space. */
 145 #define _TIF_ALLWORK_MASK \
 146   (_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SINGLESTEP|\
 147    _TIF_ASYNC_TLB|_TIF_NOTIFY_RESUME)

当以下情况之一发生时,返回用户态之前需要进入syscall_exit_work处理:
1. 当前进程有信号pending
2. 当前进程需要被重新调度
3. 设置了_TIF_SINGLESTEP:返回用户态时需要恢复单步执行(singlestep)
4. 设置了_TIF_ASYNC_TLB:在内核中发生了异步TLB fault
5. 设置了_TIF_NOTIFY_RESUME:返回用户态之前需要执行回调

 670 syscall_exit_work:
 671     testl $_TIF_WORK_SYSCALL_EXIT, %ecx----------1
 672     jz work_pending
 673     TRACE_IRQS_ON
 674     ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
 675                     # schedule() instead
 676     movl %esp, %eax
 677     call syscall_trace_leave
 678     jmp resume_userspace------------------------2
 679 END(syscall_exit_work)
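  1. 检测TI_flags中是否设置了_TIF_WORK_SYSCALL_EXIT包含的标志(系统调用退出时需要做的跟踪类工作);如果没有,跳转到work_pending
  2. 否则开中断,以pt_regs地址(%esp)为参数调用syscall_trace_leave,然后跳转到resume_userspace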
 607 work_pending:
 608     testb $_TIF_NEED_RESCHED, %cl     -------------1
 609     jz work_notifysig                 -------------2
 610 work_resched:                         -------------3
 611     call schedule
 612     LOCKDEP_SYS_EXIT
 613     DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
 614                     # setting need_resched or sigpending
 615                     # between sampling and the iret
 616     TRACE_IRQS_OFF
 617     movl TI_flags(%ebp), %ecx
 618     andl $_TIF_WORK_MASK, %ecx  # is there any work to be done other
 619                     # than syscall tracing?
 620     jz restore_all
 621     testb $_TIF_NEED_RESCHED, %cl
 622     jnz work_resched
 623
 624 work_notifysig:             # deal with pending signals and-------------------4
 625                     # notify-resume requests
 626 #ifdef CONFIG_VM86
 627     testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
 628     movl %esp, %eax
 629     jne work_notifysig_v86      # returning to kernel-space or
 630                     # vm86-space
 631 1:
 632 #else
 633     movl %esp, %eax
 634 #endif
 635     TRACE_IRQS_ON
 636     ENABLE_INTERRUPTS(CLBR_NONE)
 637     movb PT_CS(%esp), %bl
 638     andb $SEGMENT_RPL_MASK, %bl
 639     cmpb $USER_RPL, %bl
 640     jb resume_kernel
 641     xorl %edx, %edx
 642     call do_notify_resume -------------------5
 643     jmp resume_userspace
 644
 645 #ifdef CONFIG_VM86
 646     ALIGN
 647 work_notifysig_v86:
 648     pushl_cfi %ecx          # save ti_flags for do_notify_resume
 649     call save_v86_state     # %eax contains pt_regs pointer
 650     popl_cfi %ecx
 651     movl %eax, %esp
 652     jmp 1b
 653 #endif
 654 END(work_pending)
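  1. 检测_TIF_NEED_RESCHED是否被设置
  2. 若未被设置,跳转到work_notifysig,进行信号处理;若被设置,则顺序进入work_resched
  3. 调用schedule主动让出CPU;重新被调度回来后关中断并再次检查TI_flags:没有待处理的工作则跳转到restore_all,仍需要调度则回到work_resched,否则继续进入work_notifysig
  4. 处理pending的信号以及notify-resume请求
  5. 具体的处理流程在do_notify_resume 中的do_signal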

整个处理流程用流程图表现得更加直观:
这里写图片描述

0 0
原创粉丝点击