android 抓取crash信息流程简介

来源：互联网发布：牡丹江管理局电视网络编辑：程序博客网时间：2024/05/21 22:32

通过上一篇博客我们知道，linker 完成自身重定位之后，在对可执行程序进行重定位的过程中，会初始化debuggerd，也就是注册异常处理函数，以便在程序发生异常的时候抓取异常信息。

4185  /*4186   * This code is called after the linker has linked itself and4187   * fixed it's own GOT. It is safe to make references to externs4188   * and other non-local data at this point.4189   */4190  static ElfW(Addr) __linker_init_post_relocation(KernelArgumentBlock& args, ElfW(Addr) linker_base) {...4202    debuggerd_init();  //进行debuggerd 的初始化...4389    TRACE("[ Ready to execute \"%s\" @ %p ]", si->get_realpath(), reinterpret_cast<void*>(si->entry));4390    return si->entry;4391  }

追踪debuggerd_init() 这个函数

bionic/linker/debugger.cpp

 __LIBC_HIDDEN__ void debuggerd_init() {/* * bionic/libc/kernel/uapi/asm-generic/signal.h * struct sigaction { * __sighandler_t sa_handler;  //信号对应的处理函数 *   unsigned long sa_flags; *  #ifdef SA_RESTORER *   __sigrestore_t sa_restorer;  //处理完成之后的返回函数，一般不设置，kernel会在设置 * #endif *   sigset_t sa_mask; * };*/303    struct sigaction action;304    memset(&action, 0, sizeof(action));305    sigemptyset(&action.sa_mask);306    action.sa_sigaction = debuggerd_signal_handler;  // debuggerd_signal_handler 就是处理函数307    action.sa_flags = SA_RESTART | SA_SIGINFO;308  309    // Use the alternate signal stack if available so we can catch stack overflows.310    action.sa_flags |= SA_ONSTACK;  //使用独立的栈空间311  312    sigaction(SIGABRT, &action, nullptr);313    sigaction(SIGBUS, &action, nullptr);314    sigaction(SIGFPE, &action, nullptr);315    sigaction(SIGILL, &action, nullptr);316    sigaction(SIGSEGV, &action, nullptr);317  #if defined(SIGSTKFLT)318    sigaction(SIGSTKFLT, &action, nullptr);319  #endif320    sigaction(SIGTRAP, &action, nullptr);321  }

debuggerd_signal_handler 就是程序收到SIGABRT，SIGBUS，SIGFPE，SIGILL，SIGSEGV等这几个信号时，会调用的处理函数。

bionic/libc/bionic/sigaction.cpp

36  extern "C" int __rt_sigaction(int, const struct __kernel_sigaction*, struct __kernel_sigaction*, size_t);37  38  int sigaction(int signal, const struct sigaction* bionic_new_action, struct sigaction* bionic_old_action) {39    __kernel_sigaction kernel_new_action;40    if (bionic_new_action != NULL) {41      kernel_new_action.sa_flags = bionic_new_action->sa_flags;42      kernel_new_action.sa_handler = bionic_new_action->sa_handler;43      kernel_new_action.sa_mask = bionic_new_action->sa_mask;44  #if defined(SA_RESTORER)45      kernel_new_action.sa_restorer = bionic_new_action->sa_restorer;46  #if defined(__aarch64__)47      // arm64 has sa_restorer, but unwinding works best if you just let the48      // kernel supply the default restorer from [vdso]. gdb doesn't care, but49      // libgcc needs the nop that the kernel includes before the actual code.50      // (We could add that ourselves, but why bother?)51  #else52      if (!(kernel_new_action.sa_flags & SA_RESTORER)) {53        kernel_new_action.sa_flags |= SA_RESTORER;54        kernel_new_action.sa_restorer = &__restore_rt;  //用户空间处理函数执行完后返回内核空间的函数55      }56  #endif57  #endif58    }59  60    __kernel_sigaction kernel_old_action;61    int result = __rt_sigaction(signal,62                                (bionic_new_action != NULL) ? &kernel_new_action : NULL,63                                (bionic_old_action != NULL) ? &kernel_old_action : NULL,64                                sizeof(sigset_t));65  66    if (bionic_old_action != NULL) {67      bionic_old_action->sa_flags = kernel_old_action.sa_flags;68      bionic_old_action->sa_handler = kernel_old_action.sa_handler;69      bionic_old_action->sa_mask = kernel_old_action.sa_mask;70  #if defined(SA_RESTORER)71      bionic_old_action->sa_restorer = kernel_old_action.sa_restorer;72  #endif73    }74  75    return result;76  }

这个函数中把传入的参数sigaction 转成__kernel_sigaction类型，这两个结构体其实是一样的，然后调用__rt_sigaction()注册。

bionic/libc/arch-arm64/syscalls/__rt_sigaction.S

3  #include <private/bionic_asm.h>4  5  ENTRY(__rt_sigaction)6      mov     x8, __NR_rt_sigaction7      svc     #0  //系统调用8  9      cmn     x0, #(MAX_ERRNO + 1)10      cneg    x0, x0, hi11      b.hi    __set_errno_internal12  13      ret14  END(__rt_sigaction)15  .hidden __rt_sigaction

__rt_sigaction()是一个系统调用，kernel 中对应的处理函数是do_sigaction()，系统调用的过程在fork() 对应的博客中有详细分析，这里不再分析，所以我们理所当然地认为调用了__rt_sigaction()函数后，就会跑到kernel中的do_sigaction()。

linux-4.10/kernel/signal.c

3065  int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)3066  {/* * linux-4.10/include/linux/signal.h *  struct sigaction {   // sigaction 在kernel 中的定义 *  #ifndef __ARCH_HAS_IRIX_SIGACTION *  __sighandler_tsa_handler; *  unsigned longsa_flags; *  #else *   unsigned intsa_flags; *  __sighandler_tsa_handler; *   #endif *   #ifdef __ARCH_HAS_SA_RESTORER *   __sigrestore_t sa_restorer; *   #endif *   sigset_tsa_mask;/* mask last for extensibility */ *  }; *  *   struct k_sigaction { // k_sigaction在kernel 中的定义 *   struct sigaction sa;   //相当于把sigaction 的成员搬到这里 *   #ifdef __ARCH_HAS_KA_RESTORER *  __sigrestore_t ka_restorer; *   #endif *  };*/3067  struct task_struct *p = current, *t; //p 当前进程的task_struct 结构体，也就是PCB3068  struct k_sigaction *k;3069  sigset_t mask;3070  3071  if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))3072  return -EINVAL;3073  /* * linux-4.10/include/linux/sched.h * struct task_struct { * .../*   signal handlers  *   struct signal_struct *signal; * struct sighand_struct *sighand;  //保存信号相关的处理函数 *  * sigset_t blocked, real_blocked; * sigset_t saved_sigmask;// restored if set_restore_sigmask() was used  *  struct sigpending pending; *   *  unsigned long sas_ss_sp;  //信号处理函数独立的堆栈 *   size_t sas_ss_size;   //堆栈的大小 * unsigned sas_ss_flags;   //相关标志位 * ... 
* } *  *  struct sighand_struct { *  atomic_tcount; * struct k_sigactionaction[_NSIG];  #define _NSIG64 *   spinlock_tsiglock; *   wait_queue_head_tsignalfd_wqh; *  };*/3074  k = &p->sighand->action[sig-1];  // struct k_sigaction指针，可以认为是对应信号的处理函数3075  3076  spin_lock_irq(&p->sighand->siglock);3077  if (oact)3078  *oact = *k;  //指向之前注册的处理函数3079  3080  sigaction_compat_abi(act, oact);3081  3082  if (act) {3083  sigdelsetmask(&act->sa.sa_mask,3084        sigmask(SIGKILL) | sigmask(SIGSTOP));3085  *k = *act;  //将user space 也就是应用程序中的handle 函数保存（注册进来）...3097  if (sig_handler_ignored(sig_handler(p, sig), sig)) {  //handler 为 0或者 1的会被特殊处理3098  sigemptyset(&mask);3099  sigaddset(&mask, sig);3100  flush_sigqueue_mask(&mask, &p->signal->shared_pending);3101  for_each_thread(p, t)3102  flush_sigqueue_mask(&mask, &t->pending);3103  }3104  }3105  3106  spin_unlock_irq(&p->sighand->siglock);3107  return 0;3108  }

56  static void __user *sig_handler(struct task_struct *t, int sig)  //返回之前的handle指针57  {58  return t->sighand->action[sig - 1].sa.sa_handler;59  }6061  static int sig_handler_ignored(void __user *handler, int sig)62  {63  /* Is it explicitly or implicitly ignored? */64  return handler == SIG_IGN || // #define SIG_IGN((__force __sighandler_t)1)/* ignore signal */65  (handler == SIG_DFL && sig_kernel_ignore(sig)); //#define SIG_DFL((__force __sighandler_t)0)/* default signal handling */66  }

经过上面的流程，程序中SIGABRT，SIGBUS，SIGFPE，SIGILL，SIGSEGV等信号对应的处理函数就注册完成了。接下来我们分析一下程序出现异常时，信号被处理的流程。

linux-4.10/arch/arm64/kernel/entry.S

317  ENTRY(vectors)…327  328  ventryel0_sync// Synchronous 64-bit EL0  //用户空间访问非法地址后走这里329  ventryel0_irq// IRQ 64-bit EL0330  ventryel0_fiq_invalid// FIQ 64-bit EL0331  ventryel0_error_invalid// Error 64-bit EL0…344  END(vectors)

当程序在运行过程中访问到非法地址，比如空指针，或者未映射的地址，就会被处理器捕获到异常，走到对应的异常处理函数。怎么走到异常处理函数，其实很好理解，处理器捕获到异常后会跳到一个约定好的地址，kernel在初始化的时候往这些地址写对应的处理函数地址，这样就能走到处理函数中了。

512   * EL0 mode handlers.513   */514  .align6515  el0_sync:516  kernel_entry 0  //保存用户空间的寄存器517  mrsx25, esr_el1// read the syndrome register518  lsrx24, x25, #ESR_ELx_EC_SHIFT// exception class519  cmpx24, #ESR_ELx_EC_SVC64// SVC in 64-bit state520  b.eqel0_svc521  cmpx24, #ESR_ELx_EC_DABT_LOW// data abort in EL0  //走这条flow522  b.eqel0_da523  cmpx24, #ESR_ELx_EC_IABT_LOW// instruction abort in EL0524  b.eqel0_ia

发生异常时，处理器会把一些信息保存到对应的寄存器，具体是哪个寄存器，会保存什么样的信息，这里不详细介绍，异常处理函数根据寄存器中的信息，再跳到更加细化的处理函数中，程序访问非法地址，会走el0_da。

589  el0_da:590  /*591   * Data abort handling592   */593  mrsx26, far_el1594  // enable interrupts before calling the main handler595  enable_dbg_and_irq596  ct_user_exit597  bicx0, x26, #(0xff << 56)  // x0 = x26 & ~(0xff << 56)，即清除地址的最高8位598  movx1, x25   // x1 = x25599  movx2, sp    // x2 = sp600  bldo_mem_abort  //x0 , x1 , x2作为do_mem_abort() 的三个参数601  bret_to_user

linux-4.10/arch/arm64/mm/fault.c

568  /*569   * Dispatch a data abort to the relevant handler.570   */571  asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,572   struct pt_regs *regs)573  {574  const struct fault_info *inf = fault_info + (esr & 63);575  struct siginfo info;576  577  if (!inf->fn(addr, esr, regs))  //如果是缺页异常，尝试修复，成功就直接返回578  return;579  580  pr_alert("Unhandled fault: %s (0x%08x) at 0x%016lx\n",581   inf->name, esr, addr);582  583  info.si_signo = inf->sig;584  info.si_errno = 0;585  info.si_code  = inf->code;586  info.si_addr  = (void __user *)addr;587  arm64_notify_die("", regs, &info, esr);   588  }

进入到do_mem_abort()后，会先调用该异常类型对应的处理函数inf->fn()，判断是不是缺页等可以修复的情况，修复成功就直接返回；如果修复不了，说明的确是访问了非法地址，就会走arm64_notify_die()。

linux-4.10/arch/arm64/kernel/traps.c

298  void arm64_notify_die(const char *str, struct pt_regs *regs,299        struct siginfo *info, int err)300  {301  if (user_mode(regs)) {  //如果是用户空间的进程执行导致的异常302  current->thread.fault_address = 0;303  current->thread.fault_code = err;304  force_sig_info(info->si_signo, info, current); 305  } else { //否则是kernel 里面的异常，走这里，最后会走到panic306  die(str, regs, err);307  }308  }

如果是用户空间的程序访问了非法地址，会调用force_sig_info()发送信号给对应程序。

linux-4.10/kernel/signal.c

1165  int1166  force_sig_info(int sig, struct siginfo *info, struct task_struct *t)1167  {1168  unsigned long int flags;1169  int ret, blocked, ignored;1170  struct k_sigaction *action;1171  1172  spin_lock_irqsave(&t->sighand->siglock, flags);1173  action = &t->sighand->action[sig-1];  //保存在task_struct 里面 k_sigaction1174  ignored = action->sa.sa_handler == SIG_IGN;1175  blocked = sigismember(&t->blocked, sig); //测试参数sig 代表的信号是否已加入至参数set信号集里. 如果信号集里已有该信号则返回1，否则返回0。1176  if (blocked || ignored) {1177  action->sa.sa_handler = SIG_DFL;1178  if (blocked) {1179  sigdelset(&t->blocked, sig);1180  recalc_sigpending_and_wake(t);1181  }1182  }1183  if (action->sa.sa_handler == SIG_DFL)1184  t->signal->flags &= ~SIGNAL_UNKILLABLE;1185  ret = specific_send_sig_info(sig, info, t);1186  spin_unlock_irqrestore(&t->sighand->siglock, flags);1187  1188  return ret;1189  }

接着走到specific_send_sig_info()

1134  static int1135  specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)1136  {1137  return send_signal(sig, info, t, 0);1138  }

specific_send_sig_info() 接着调用到send_signal()

1082  static int send_signal(int sig, struct siginfo *info, struct task_struct *t,1083  int group)1084  {1085  int from_ancestor_ns = 0;1086  1087  #ifdef CONFIG_PID_NS1088  from_ancestor_ns = si_fromuser(info) &&1089     !task_pid_nr_ns(current, task_active_pid_ns(t));1090  #endif1091  1092  return __send_signal(sig, info, t, group, from_ancestor_ns);1093  }

send_signal() 也没有太多的处理逻辑，继续调用到__send_signal()，传入的group 参数的值是0，也就是说信号会挂到目标线程私有的pending 队列（t->pending），而不是线程组共享的shared_pending 队列。

978  static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,979  int group, int from_ancestor_ns)980  {...989  if (!prepare_signal(sig, t,  //对特殊信号做相应处理990  from_ancestor_ns || (info == SEND_SIG_FORCED)))991  goto ret;992  993  pending = group ? &t->signal->shared_pending : &t->pending; //这时group 为0，所以信号是发给这个进程，而不是进程里面的所有线程...1073  out_set:1074  signalfd_notify(t, sig);  // Deliver the signal to listening signalfd1075  sigaddset(&pending->signal, sig);1076  complete_signal(sig, t, group); //继续往下走到这里1077  ret:1078  trace_signal_generate(sig, info, t, group, result);1079  return ret;1080  }

__send_signal()里面做了各种各样的check，因为我们这里主要是熟悉这个流程，对于里面的具体细节，不做过多的介绍，需要了解的同学可以自己看源码。

876  static void complete_signal(int sig, struct task_struct *p, int group)877  {878  struct signal_struct *signal = p->signal;879  struct task_struct *t;880  881  /*882   * Now find a thread we can wake up to take the signal off the queue.883   *884   * If the main thread wants the signal, it gets first crack.885   * Probably the least surprising to the average bear.886   */887  if (wants_signal(sig, p))888  t = p;889  else if (!group || thread_group_empty(p))890  /*891   * There is just one thread and it does not need to be woken.892   * It will dequeue unblocked signals before it runs again.893   */894  return;895  else {896  /*897   * Otherwise try to find a suitable thread.898   */899  t = signal->curr_target;900  while (!wants_signal(sig, t)) {901  t = next_thread(t);902  if (t == signal->curr_target)903  /*904   * No thread needs to be woken.905   * Any eligible threads will see906   * the signal in the queue soon.907   */908  return;909  }910  signal->curr_target = t;911  }912  913  /*914   * Found a killable thread.  
If the signal will be fatal,915   * then start taking the whole group down immediately.916   */917  if (sig_fatal(p, sig) &&918      !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&919      !sigismember(&t->real_blocked, sig) &&920      (sig == SIGKILL || !t->ptrace)) {921  /*922   * This signal will be fatal to the whole group.923   */924  if (!sig_kernel_coredump(sig)) {925  /*926   * Start a group exit and wake everybody up.927   * This way we don't have other threads928   * running and doing things after a slower929   * thread has the fatal signal pending.930   */931  signal->flags = SIGNAL_GROUP_EXIT;932  signal->group_exit_code = sig;933  signal->group_stop_count = 0;934  t = p;935  do {936  task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);937  sigaddset(&t->pending.signal, SIGKILL);938  signal_wake_up(t, 1);939  } while_each_thread(p, t);940  return;941  }942  }943  944  /*945   * The signal is already in the shared-pending queue.946   * Tell the chosen thread to wake up and dequeue it.947   */948  signal_wake_up(t, sig == SIGKILL);949  return;950  }

complete_signal() 对信号进行了进一步处理，最后调用到signal_wake_up()

linux-4.10/include/uapi/asm-generic/signal.h

10  #define SIGHUP 111  #define SIGINT 212  #define SIGQUIT 313  #define SIGILL 4 //执行了非法指令. 通常是因为可执行文件本身出现错误, 或者试图执行数据段. 堆栈溢出时也有可能产生这个信号。14  #define SIGTRAP 515  #define SIGABRT 616  #define SIGIOT 6  //调用abort函数生成的信号17  #define SIGBUS 718  #define SIGFPE 819  #define SIGKILL 9 //用来立即结束程序的运行. 本信号不能被阻塞、处理和忽略。20  #define SIGUSR11021  #define SIGSEGV11 //试图访问未分配给自己的内存, 或试图往没有写权限的内存地址写数据.22  #define SIGUSR21223  #define SIGPIPE13  //管道破裂。这个信号通常在进程间通信产生24  #define SIGALRM1425  #define SIGTERM1526  #define SIGSTKFLT   1627  #define SIGCHLD1728  #define SIGCONT18  //让一个停止(stopped)的进程继续执行. 本信号不能被阻塞.29  #define SIGSTOP1930  #define SIGTSTP20

上面给出了一些信号对应的值，在实际中，遇到最多的情况就是SIGSEGV，也就是访问了非法地址，应该90%以上是这种情况。

linux-4.10/include/linux/sched.h

3520  static inline void signal_wake_up(struct task_struct *t, bool resume)3521  {3522  signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0);3523  }

现在考虑的是SIGSEGV 信号的情况，所以上面传下来的resume 为0，调用到signal_wake_up_state() 传入的第二个参数也是0

linux-4.10/kernel/signal.c

645  void signal_wake_up_state(struct task_struct *t, unsigned int state)646  {647  set_tsk_thread_flag(t, TIF_SIGPENDING);648  /*649   * TASK_WAKEKILL also means wake it up in the stopped/traced/killable650   * case. We don't check t->state here because there is a race with it651   * executing another processor and just now entering stopped state.652   * By using wake_up_state, we ensure the process will wake up and653   * handle its death signal.654   */655  if (!wake_up_state(t, state | TASK_INTERRUPTIBLE))  656  kick_process(t);657  }

走了一圈，又回到signal.c 这个文件里面的代码，set_tsk_thread_flag()会在thread_info.flags 中置上TIF_SIGPENDING 这个标志位，wake_up_state()和kick_process()就不深入了解了。

linux-4.10/arch/arm64/kernel/entry.S

589  el0_da:590  /*591   * Data abort handling592   */593  mrsx26, far_el1594  // enable interrupts before calling the main handler595  enable_dbg_and_irq596  ct_user_exit597  bicx0, x26, #(0xff << 56)598  movx1, x25599  movx2, sp//bl (表示带返回值的跳转) 带链接的跳转。 首先将当前指令的下一条指令地址保存在LR寄存器，然后跳转的lable//b 表示无条件跳转600  bldo_mem_abort  //do_mem_abort在kernel 走了一圈，返回了601  bret_to_user  //

重新回到entry.S 的el0_da 代码块，do_mem_abort跑完了之后，会继续往下，跑到ret_to_user

/*770   * "slow" syscall return path.771   */772  ret_to_user:773  disable_irq// disable interrupts774  ldrx1, [tsk, #TSK_TI_FLAGS]  //tsk.reqx28 current thread_info  x1 = thread_info.flags775  andx2, x1, #_TIF_WORK_MASK  // x2 = x1 & _TIF_WORK_MASK   也就是取出thread_info.flags 的标志位776  cbnzx2, work_pending         // 如果x2 != 0 跳到 work_pending 777  finish_ret_to_user:778  enable_step_tsk x1, x2779  kernel_exit 0780  ENDPROC(ret_to_user)

前面说了set_tsk_thread_flag()会把thread_info.flags设置成TIF_SIGPENDING(增加这个flag)，所以这里会走到work_pending。

758  /*759   * Ok, we need to do extra processing, enter the slow path.760   */761  work_pending:762  movx0, sp// 'regs'763  bldo_notify_resume764  #ifdef CONFIG_TRACE_IRQFLAGS765  bltrace_hardirqs_on// enabled while in userspace766  #endif767  ldrx1, [tsk, #TSK_TI_FLAGS]// re-check for single-step768  bfinish_ret_to_user

继续往下会调用到do_notify_resume()函数

linux-4.10/arch/arm64/kernel/signal.c

402  asmlinkage void do_notify_resume(struct pt_regs *regs,403   unsigned int thread_flags)404  {405  /*406   * The assembly code enters us with IRQs off, but it hasn't407   * informed the tracing code of that for efficiency reasons.408   * Update the trace code with the current status.409   */410  trace_hardirqs_off();411  do {412  if (thread_flags & _TIF_NEED_RESCHED) {413  schedule();414  } else {415  local_irq_enable();416  417  if (thread_flags & _TIF_UPROBE)418  uprobe_notify_resume(regs);419  420  if (thread_flags & _TIF_SIGPENDING)  //走的是这里421  do_signal(regs);422  423  if (thread_flags & _TIF_NOTIFY_RESUME) {424  clear_thread_flag(TIF_NOTIFY_RESUME);425  tracehook_notify_resume(regs);426  }427  428  if (thread_flags & _TIF_FOREIGN_FPSTATE)429  fpsimd_restore_current_state();430  }431  432  local_irq_disable();433  thread_flags = READ_ONCE(current_thread_info()->flags);434  } while (thread_flags & _TIF_WORK_MASK);435  }

通过前面的了解，我们知道thread_flags & _TIF_SIGPENDING 这个条件是成立的，所以继续走到do_signal(regs);

static void do_signal(struct pt_regs *regs)332  {...366  /*367   * Get the signal to deliver. When running under ptrace, at this point368   * the debugger may change all of our registers.369   */370  if (get_signal(&ksig)) {...385  handle_signal(&ksig, regs);386  return;387  }...399  restore_saved_sigmask();400  }

接着会调用handle_signal() 函数

285  /*286   * OK, we're invoking a handler287   */288  static void handle_signal(struct ksignal *ksig, struct pt_regs *regs)289  {290  struct task_struct *tsk = current;291  sigset_t *oldset = sigmask_to_save();292  int usig = ksig->sig;293  int ret;294  295  /*296   * Set up the stack frame297   */298  if (is_compat_task()) {299  if (ksig->ka.sa.sa_flags & SA_SIGINFO)300  ret = compat_setup_rt_frame(usig, ksig, oldset, regs);301  else302  ret = compat_setup_frame(usig, ksig, oldset, regs);303  } else {304  ret = setup_rt_frame(usig, ksig, oldset, regs);  //走这里305  }306  307  /*308   * Check that the resulting registers are actually sane.309   */310  ret |= !valid_user_regs(&regs->user_regs, current);311  312  /*313   * Fast forward the stepping logic so we step into the signal314   * handler.315   */316  if (!ret)317  user_fastforward_single_step(tsk);318  319  signal_setup_done(ret, ksig, 0);320  }

我们重点看setup_rt_frame()做了什么。

250  static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set,251    struct pt_regs *regs)252  {253  struct rt_sigframe __user *frame;254  int err = 0;255  256  frame = get_sigframe(ksig, regs);  //获取用户空间处理信号的栈257  if (!frame)258  return 1;259  260  __put_user_error(0, &frame->uc.uc_flags, err);261  __put_user_error(NULL, &frame->uc.uc_link, err);262  263  err |= __save_altstack(&frame->uc.uc_stack, regs->sp);264  err |= setup_sigframe(frame, regs, set);265  if (err == 0) {266  setup_return(regs, &ksig->ka, frame, usig);267  if (ksig->ka.sa.sa_flags & SA_SIGINFO) {268  err |= copy_siginfo_to_user(&frame->info, &ksig->info);269  regs->regs[1] = (unsigned long)&frame->info;270  regs->regs[2] = (unsigned long)&frame->uc;271  }272  }273  274  return err;275  }

setup_rt_frame()会为用户空间执行信号处理函数准备好栈和相关参数。

232  static void setup_return(struct pt_regs *regs, struct k_sigaction *ka,233   void __user *frame, int usig)234  {235  __sigrestore_t sigtramp;236  237  regs->regs[0] = usig;238  regs->sp = (unsigned long)frame;239  regs->regs[29] = regs->sp + offsetof(struct rt_sigframe, fp);240  regs->pc = (unsigned long)ka->sa.sa_handler; //之前注册的用户空间处理函数241  242  if (ka->sa.sa_flags & SA_RESTORER)243  sigtramp = ka->sa.sa_restorer;244  else245  sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp);  246  247  regs->regs[30] = (unsigned long)sigtramp; //用户空间的处理函数执行完后，会调用这个函数再返回内核空间248  }

regs->pc 设置成之前用户空间传下来的处理函数，所以返回用户空间时会执行处理函数；regs->regs[30] 设置成sigtramp，用户空间执行完处理函数后，会通过这个函数再返回内核空间。

linux-4.10/arch/arm64/kernel/vdso/vdso.lds.S

/*96   * Make the sigreturn code visible to the kernel.97   */98  VDSO_sigtramp= __kernel_rt_sigreturn;

linux-4.10/arch/arm64/kernel/vdso/sigreturn.S

28  ENTRY(__kernel_rt_sigreturn)29  .cfi_startproc30  .cfi_signal_frame31  .cfi_def_cfax29, 032  .cfi_offsetx29, 0 * 833  .cfi_offsetx30, 1 * 834  movx8, #__NR_rt_sigreturn35  svc#036  .cfi_endproc37  ENDPROC(__kernel_rt_sigreturn)

所以上面的sigtramp 里面插入了一个系统调用__NR_rt_sigreturn，也就是信号处理后返回。

bionic/linker/debugger.cpp

302  __LIBC_HIDDEN__ void debuggerd_init() {...306    action.sa_sigaction = debuggerd_signal_handler;...321  }

再回到debuggerd_init()中，kernel 将信号发送出来后，经过许多环节的处理，会返回用户空间，调用debuggerd_signal_handler()这个处理函数。

262  static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void*) {...271    send_debuggerd_packet(info);...294    int rc = syscall(SYS_rt_tgsigqueueinfo, getpid(), gettid(), signal_number, info);295    if (rc != 0) {296      __libc_format_log(ANDROID_LOG_FATAL, "libc", "failed to resend signal during crash: %s",297                        strerror(errno));298      _exit(0);299    }300  }

这里会调用send_debuggerd_packet() 向debuggerd进程发送信息。

208  static void send_debuggerd_packet(siginfo_t* info) {...226    int s = socket_abstract_client(DEBUGGER_SOCKET_NAME, SOCK_STREAM | SOCK_CLOEXEC); //创建socket连接227    if (s == -1) {228      __libc_format_log(ANDROID_LOG_FATAL, "libc", "Unable to open connection to debuggerd: %s",229                        strerror(errno));230      return;231    }232  233    // debuggerd knows our pid from the credentials on the234    // local socket but we need to tell it the tid of the crashing thread.235    // debuggerd will be paranoid and verify that we sent a tid236    // that's actually in our process.237    debugger_msg_t msg;238    msg.action = DEBUGGER_ACTION_CRASH;239    msg.tid = gettid();  //消息中有tid，也就是出现异常的线程号240    msg.abort_msg_address = reinterpret_cast<uintptr_t>(g_abort_message);241    msg.original_si_code = (info != nullptr) ? info->si_code : 0;242    ret = TEMP_FAILURE_RETRY(write(s, &msg, sizeof(msg)));  //将消息通过socket 发送给debuggerd...255    close(s);256  }

接着debuggerd 会收到发送过来的消息

system/core/debuggerd/debuggerd.cpp

921  int main(int argc, char** argv) {922    union selinux_callback cb;923    if (argc == 1) {924      cb.func_audit = audit_callback;925      selinux_set_callback(SELINUX_CB_AUDIT, cb);926      cb.func_log = selinux_log_callback;927      selinux_set_callback(SELINUX_CB_LOG, cb);928      return do_server();929    }930  931    bool dump_backtrace = false;932    bool have_tid = false;933    pid_t tid = 0;934    for (int i = 1; i < argc; i++) {935      if (!strcmp(argv[i], "-b")) {936        dump_backtrace = true;937      } else if (!have_tid) {938        tid = atoi(argv[i]);939        have_tid = true;940      } else {941        usage();942        return 1;943      }944    }945    if (!have_tid) {946      usage();947      return 1;948    }949    return do_explicit_dump(tid, dump_backtrace);950  }

debuggerd进程运行起来之后，会根据参数走不同的流程，如果是默认的走do_server()，也就是创建一个socket server，等待client端连接，处理相应的请求。另一种情况就是我们手动调用它dump 某个进程的backtrace，前面有讲过，adb shell debuggerd -b pid .

842  static int do_server() {843    // debuggerd crashes can't be reported to debuggerd.844    // Reset all of the crash handlers.845    signal(SIGABRT, SIG_DFL);846    signal(SIGBUS, SIG_DFL);847    signal(SIGFPE, SIG_DFL);848    signal(SIGILL, SIG_DFL);849    signal(SIGSEGV, SIG_DFL);850  #ifdef SIGSTKFLT851    signal(SIGSTKFLT, SIG_DFL);852  #endif853    signal(SIGTRAP, SIG_DFL);854  855    // Ignore failed writes to closed sockets856    signal(SIGPIPE, SIG_IGN);857  858    // Block SIGCHLD so we can sigtimedwait for it.859    sigset_t sigchld;860    sigemptyset(&sigchld);861    sigaddset(&sigchld, SIGCHLD);862    sigprocmask(SIG_SETMASK, &sigchld, nullptr);863  864    int s = socket_local_server(SOCKET_NAME, ANDROID_SOCKET_NAMESPACE_ABSTRACT, //创建socket ，充当server端865                                SOCK_STREAM | SOCK_CLOEXEC);866    if (s == -1) return 1;867  868    // Fork a process that stays root, and listens on a pipe to pause and resume the target.869    if (!start_signal_sender()) {870      ALOGE("debuggerd: failed to fork signal sender");871      return 1;872    }873  874    ALOGI("debuggerd: starting\n");875  876    for (;;) {  //循环等待877      sockaddr_storage ss;878      sockaddr* addrp = reinterpret_cast<sockaddr*>(&ss);879      socklen_t alen = sizeof(ss);880  881      ALOGV("waiting for connection\n");882      int fd = accept4(s, addrp, &alen, SOCK_CLOEXEC); //等待client端来连接883      if (fd == -1) {884        ALOGE("accept failed: %s\n", strerror(errno));885        continue;886      }887  888      handle_request(fd); //处理client端的请求889    }890    return 0;891  }

do_server() 会先把debuggerd 自身各个crash 信号的处理方式恢复为默认（SIG_DFL），因为debuggerd 自己的crash 无法再上报给debuggerd；然后创建socket，充当server，调用accept4()等待client端来连接，收到连接后，调用handle_request(fd)处理。

801  static void handle_request(int fd) {802    ALOGV("handle_request(%d)\n", fd);803  804    ScopedFd closer(fd);805    debugger_request_t request;806    memset(&request, 0, sizeof(request));807    int status = read_request(fd, &request); //读取client端发生过来的消息808    if (status != 0) {809      return;810    }...831    // Fork a child to handle the rest of the request.832    pid_t fork_pid = fork();833    if (fork_pid == -1) {834      ALOGE("debuggerd: failed to fork: %s\n", strerror(errno));835    } else if (fork_pid == 0) {836      worker_process(fd, request);  //fork 出子进程来处理837    } else {838      monitor_worker_process(fork_pid, request);839    }840  }

handle_request()会把client 端，也就是信号处理函数中发送过来的信息读出来，然后创建出一个子进程继续处理。

565  static void worker_process(int fd, debugger_request_t& request) {...598    // Attach to the target process.599    if (!ptrace_attach_thread(request.pid, request.tid)) {  //ptrace 进程600      ALOGE("debuggerd: ptrace attach failed: %s", strerror(errno));601      exit(1);602    }...608    if (request.action == DEBUGGER_ACTION_CRASH) { //通过前面的代码我们知道action是 DEBUGGER_ACTION_CRASH609      pid_t pid;610      uid_t uid;611      gid_t gid;612      if (get_process_info(request.tid, &pid, &uid, &gid) != 0) {613        ALOGE("debuggerd: failed to get process info for tid '%d'", request.tid);614        exit(1);615      }...624    }625  626    // Don't attach to the sibling threads if we want to attach gdb.627    // Supposedly, it makes the process less reliable.628    bool attach_gdb = should_attach_gdb(request);629    if (attach_gdb) {630      // Open all of the input devices we need to listen for VOLUMEDOWN before dropping privileges.631      if (init_getevent() != 0) {632        ALOGE("debuggerd: failed to initialize input device, not waiting for gdb");633        attach_gdb = false;634      }636    }...662    int crash_signal = SIGKILL;663    succeeded = perform_dump(request, fd, tombstone_fd, backtrace_map.get(), siblings, //dump 异常进程的 寄存器 和backtrace 信息664                             &crash_signal, amfd_data.get());...692    for (pid_t sibling : siblings) {693      ptrace(PTRACE_DETACH, sibling, 0, 0); //ptrace DETACH694    }...717  718    close(amfd);  //关闭socket连接719  720    exit(!succeeded);721  }

worker_process()中，会PTRACE_ATTACH 上发生异常的进程，然后dump 出进程的信息用于debug，最后PTRACE_DETACH 该进程，关于ptrace 的功能，这里不再介绍，用户空间调用ptrace 实际上是系统调用的接口，真正的实现在kernel中。

483  static bool perform_dump(const debugger_request_t& request, int fd, int tombstone_fd,484                           BacktraceMap* backtrace_map, const std::set<pid_t>& siblings,485                           int* crash_signal, std::string* amfd_data) {...492    while (true) {493      int signal = wait_for_signal(request.tid, &total_sleep_time_usec);494      switch (signal) {495        case -1:496          ALOGE("debuggerd: timed out waiting for signal");497          return false;498  ...517        case SIGABRT:518        case SIGBUS:519        case SIGFPE:520        case SIGILL:521        case SIGSEGV:522  #ifdef SIGSTKFLT523        case SIGSTKFLT:524  #endif525        case SIGSYS:526        case SIGTRAP:527          ALOGV("stopped -- fatal signal\n");528          *crash_signal = signal;529          engrave_tombstone(tombstone_fd, backtrace_map, request.pid, request.tid, siblings, signal,530                            request.original_si_code, request.abort_msg_address, amfd_data);531          break;532  533        default:534          ALOGE("debuggerd: process stopped due to unexpected signal %d\n", signal);535          break;536      }537      break;538    }539  540    return true;541  }

ptrace 上出现异常的进程后，该进程会重新跑起来，还是跑出现异常的那段代码，所以又会发生异常，但是这时候的异常信号不会发送给debuggerd_signal_handler()处理函数，而是给当前的debuggerd进程。debuggerd进程收到信号后会调用engrave_tombstone()。

system/core/debuggerd/tombstone.cpp

688  void engrave_tombstone(int tombstone_fd, BacktraceMap* map, pid_t pid, pid_t tid,689                         const std::set<pid_t>& siblings, int signal, int original_si_code,690                         uintptr_t abort_msg_address, std::string* amfd_data) {691    log_t log;692    log.current_tid = tid;693    log.crashed_tid = tid;694  695    if (tombstone_fd < 0) {696      ALOGE("debuggerd: skipping tombstone write, nothing to do.\n");697      return;698    }699  700    log.tfd = tombstone_fd;701    log.amfd_data = amfd_data;702    dump_crash(&log, map, pid, tid, siblings, signal, original_si_code, abort_msg_address);703  }

走到dump_crash() dump相关信息

607  // Dumps all information about the specified pid to the tombstone.608  static void dump_crash(log_t* log, BacktraceMap* map, pid_t pid, pid_t tid,609                         const std::set<pid_t>& siblings, int signal, int si_code,610                         uintptr_t abort_msg_address) {611    // don't copy log messages to tombstone unless this is a dev device612    char value[PROPERTY_VALUE_MAX];613    property_get("ro.debuggable", value, "0");614    bool want_logs = (value[0] == '1');615  616    _LOG(log, logtype::HEADER,617         "*** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***\n");618    dump_header_info(log);619    dump_thread(log, pid, tid, map, signal, si_code, abort_msg_address, true);620    if (want_logs) {621      dump_logs(log, pid, 5);622    }623  624    if (!siblings.empty()) {625      for (pid_t sibling : siblings) {626        dump_thread(log, pid, sibling, map, 0, 0, 0, false);627      }628    }629  630    if (want_logs) {631      dump_logs(log, pid, 0);632    }633  }

阅读全文

0 0