Native Exception产生

来源：互联网发布：scada系统数据分析编辑：程序博客网时间：2024/06/16 18:58

自己创建一个异常

Native Exception，简称NE，是发生于C/C++ code里面最常见的一种异常,
android
我们在写代码的时候一些常见的操作都会导致NE，比如空指针赋值，数组越界访问等，现在我在代码里面人为的添加一个简单的exception:

test.c:

  1 #include <stdio.h>  2   3 void func4()  4 {  5     char *p = NULL;  6     *p = 0x5;//异常产生的地方  7 }  8   9 void func3() 10 { 11     int var4 = 4; 12 } 13  14  15 void func2() 16 { 17     int var3 = 3; 18     func3(); 19     func4(); 20 } 21  22 void func1() 23 { 24     int var1,var2; 25     var1 = 2; 26 } 27  28 void main() 29 { 30     int var0 = 1; 31     func1(); 32     func2(); 33     return; 34 }

Android.mk

 1 LOCAL_PATH := $(call my-dir)  2 include $(CLEAR_VARS)  3   4 LOCAL_CFLAGS += -g3 -O0  5       6 LOCAL_SRC_FILES := test.c  7   8 LOCAL_MODULE := test  9  10 LOCAL_MULTILIB := 32 11      12 include $(BUILD_EXECUTABLE)

我把它放入pls/vendor/mediatek/proprietary/external/libtest/目录下面，我们对它进行编译并push到手机里面:

mmm vendor/mediatek/proprietary/external/libtest/
out/target/product/xxx/system/bin/test
adb push out/target/product/xxx/system/bin/test  system/bin/

gdb-server调试程序

启动gdbserver：

$ adb shell ./system/bin/gdbserver :1234 system/bin/test    Process system/bin/test created; pid = 4130    Listening on port 1234

$ adb forward tcp:1234 tcp:1234

gdb 调试这个bin文件：

$ ./prebuilts/gcc/linux-x86/arm/cit-arm-linux-androideabi-4.8/bin/arm-linux-androideabi-gdb out/target/product/xxx/symbols/system/bin/test

Reading symbols from out/target/product/xxx/symbols/system/bin/test...done.(gdb) set solib-search-path out/target/product/xxx/symbols/system/lib/(gdb) set solib-absolute-prefix out/target/product/xxx/symbols/(gdb) target remote:1234Remote debugging using :1234Reading symbols from out/target/product/xxx/symbols/system/bin/linker...done.Loaded symbols for out/target/product/xxx/symbols/system/bin/linker__dl__start () at bionic/linker/arch/arm/begin.S:3232    mov r0, sp(gdb) list

现在test程序加载成功了：

(gdb) b main //设置断点Breakpoint 1 at 0xaaaaa772: file vendor/mediatek/proprietary/external/libtest/test.c, line 30.(gdb) n //单步执行33    bl __linker_init

当运行到func4函数里面就出现异常：

(gdb) nProgram received signal SIGSEGV, Segmentation fault.0xaaaaa740 in func4 () at vendor/mediatek/proprietary/external/libtest/test.c:66       *p = 0x5;

可以很清楚地知道，问题出现在libtest的test.c里面。AEE是MTK平台自己的一套处理异常的工具，代码是封装好的。当应用app发生了异常，它会收集异常信息并压缩到DB文件里面，我们需要用GAT工具才能打开这个文件。在main_log里面，我们可以搜索到如下信息：

01-02 04:16:18.768  4180  4180 I AEE_AED : Build fingerprint: 'xxx:7.0/NRD90M/v6H5E-2:eng/test-keys'01-02 04:16:18.768  4180  4180 I AEE_AED : Revision: '0'01-02 04:16:18.768  4180  4180 I AEE_AED : ABI: 'arm'01-02 04:16:18.768  4180  4180 I AEE_AED : pid: 4142, tid: 4142, name: test  >>> system/bin/test <<<01-02 04:16:18.769  4180  4180 I AEE_AED : signal 11 (SIGSEGV), code 1 (SEGV_MAPERR), fault addr 0x001-02 04:16:18.770  4180  4180 I AEE_AED :     r0 00000000  r1 00000005  r2 fffefa2c  r3 0000000001-02 04:16:18.770  4180  4180 I AEE_AED :     r4 aaaaa76f  r5 fffefa24  r6 00000001  r7 fffefa2c01-02 04:16:18.771  4180  4180 I AEE_AED :     r8 00000000  r9 00000000  sl 00000000  fp fffefa0001-02 04:16:18.771  4180  4180 I AEE_AED :     ip f750085c  sp fffef9cc  lr aaaaa761  pc aaaaa740  cpsr 0007003001-02 04:16:18.796  4180  4180 I AEE_AED : 01-02 04:16:18.796  4180  4180 I AEE_AED : backtrace:01-02 04:16:18.799  4180  4180 I AEE_AED :     #00 pc 00000740  /system/bin/test01-02 04:16:18.800  4180  4180 I AEE_AED :     #01 pc 0000075d  /system/bin/test01-02 04:16:18.800  4180  4180 I AEE_AED :     #02 pc 0000077b  /system/bin/test01-02 04:16:18.800  4180  4180 I AEE_AED :     #03 pc 0001708c  /system/lib/libc.so (__libc_init+84)01-02 04:16:18.800  4180  4180 I AEE_AED :     #04 pc 00000660  /system/bin/test01-02 04:16:18.819   290   290 I wmt_launcher: fw log ctrl flag has been set

当native层程序发生异常的时候，系统kernel就会进入异常模式，并发送一个signal给到user空间这边，处理这个异常signal的就是android的debuggerd这个进程，我们会在log当中找到类似如下log：

libc    : Fatal signal 11 (SIGSEGV), code 1, fault addr 0x14 in tid 9765 (Capture@CmdQue)

此进程可以侦测到程序崩溃，并将崩溃时的进程状态信息输出到文件和串口中，以供开发人员分析调试使用。Debuggerd的数据被保存在/data/tombstones/目录下。Linux kernel有自己的一套signal机制，在应用程序崩溃时，通常系统内核都会发送signal到出问题的进程，以通知进程出现什么异常，这些进程可以捕获这些signal并对其做相应的处理。

debuggerd创建一个名为 “android:debuggerd”的socket，作为server端等待其他client端进程的连接，接收client端进程发送来的tid和action信息，并将由tid指定的那个进程的运行信息，按照由action指定的动作dump到文件；

c/c++程序client端

下面就将简单介绍debuggerd进程的处理过程：

在应用程序入口地址__start后，__linker_init（具体在__linker_init_post_relocation中）调用debuggerd_init()函数来注册异常信号处理handler，以实现拦截系统异常的几个signal：SIGABRT、SIGBUS、SIGFPE、SIGILL、SIGSEGV、SIGSTKFLT和SIGTRAP：

bionic/linker/linker.cpp：

4172static ElfW(Addr) __linker_init_post_relocation(KernelArgumentBlock& args, ElfW(Addr) linker_base) {4173#if TIMING4174  struct timeval t0, t1;4175  gettimeofday(&t0, 0);4176#endif4179  __libc_init_AT_SECURE(args);41804184  debuggerd_init();

bionic/linker/debugger.cpp：

302__LIBC_HIDDEN__ void debuggerd_init() {303  struct sigaction action;304  memset(&action, 0, sizeof(action));305  sigemptyset(&action.sa_mask);306  action.sa_sigaction = debuggerd_signal_handler;//异常处理函数；307  action.sa_flags = SA_RESTART | SA_SIGINFO;308309  // Use the alternate signal stack if available so we can catch stack overflows.310  action.sa_flags |= SA_ONSTACK;311312  sigaction(SIGABRT, &action, nullptr);313  sigaction(SIGBUS, &action, nullptr);314  sigaction(SIGFPE, &action, nullptr);315  sigaction(SIGILL, &action, nullptr);316  sigaction(SIGSEGV, &action, nullptr);317#if defined(SIGSTKFLT)318  sigaction(SIGSTKFLT, &action, nullptr);319#endif320  sigaction(SIGTRAP, &action, nullptr);321}

bionic库中的链接器会对以下七种信号设置Handler(debuggerd_signal_handler)：

    SIGILL(非法指令异常)//后文cameraserver进程的tombstone示例中，内核发送的就是这个信号
    SIGABRT(abort退出异常)
    SIGBUS(硬件访问异常)
    SIGFPE(浮点运算异常)
    SIGSEGV(内存访问异常)//前面对空指针赋值，内核那边就发送这个信号给进程test
    SIGSTKFLT(协处理器栈异常)
    SIGTRAP(断点/跟踪陷阱异常)

262static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void*) {263  // It's possible somebody cleared the SA_SIGINFO flag, which would mean264  // our "info" arg holds an undefined value.265  if (!have_siginfo(signal_number)) {266    info = nullptr;267  }268269  log_signal_summary(signal_number, info);//打印出现问题进程信息；270271  send_debuggerd_packet(info);//现在处于clinet端，通过socket跟service 进行connect，     //然后通过write(s, &msg, sizeof(msg)把info发给debuggerd，DEBUGGER_ACTION_CRASH为采取的行为；272273  // We need to return from the signal handler so that debuggerd can dump the274  // thread that crashed, but returning here does not guarantee that the signal275  // will be thrown again, even for SIGSEGV and friends, since the signal could276  // have been sent manually. Resend the signal with rt_tgsigqueueinfo(2) to277  // preserve the SA_SIGINFO contents.278  signal(signal_number, SIG_DFL);//设置该信号关联的动作，SIG_DFL表示默认操作，恢复到默认；279280  struct siginfo si;281  if (!info) {282    memset(&si, 0, sizeof(si));283    si.si_code = SI_USER;284    si.si_pid = getpid();285    si.si_uid = getuid();286    info = &si;287  } else if (info->si_code >= 0 || info->si_code == SI_TKILL) {288    // rt_tgsigqueueinfo(2)'s documentation appears to be incorrect on kernels289    // that contain commit 66dd34a (3.9+). The manpage claims to only allow290    // negative si_code values that are not SI_TKILL, but 66dd34a changed the291    // check to allow all si_code values in calls coming from inside the house.292  }293294  int rc = syscall(SYS_rt_tgsigqueueinfo, getpid(), gettid(), signal_number, info);//系统调用tgsigqueueinfo：信号将被传递给线程组的任意成员；295  if (rc != 0) {296    __libc_format_log(ANDROID_LOG_FATAL, "libc", "failed to resend signal during crash: %s",297                      strerror(errno));298    _exit(0);299  }300}

debuggerd进程server端：

system/core/debuggerd/debuggerd.cpp

871int main(int argc, char** argv) {872  union selinux_callback cb;873  if (argc == 1) {874    cb.func_audit = audit_callback;875    selinux_set_callback(SELINUX_CB_AUDIT, cb);876    cb.func_log = selinux_log_callback;877    selinux_set_callback(SELINUX_CB_LOG, cb);878    return do_server();//没有-b参数就调用这个流程879  }880895  if (!have_tid) {896    usage();897    return 1;898  }899  return do_explicit_dump(tid, dump_backtrace);//手动导出 debuggerd -b tid 900}

当启动debuggerd进程时argc为1（即除程序名外没有传递任何参数）时，debuggerd将作为一个后台服务进程，专门接收应用程序异常退出消息而产生tombstone：

792static int do_server() {793  // debuggerd crashes can't be reported to debuggerd.794  // Reset all of the crash handlers.     //忽略debuggerd自身crash的处理；795  signal(SIGABRT, SIG_DFL);796  signal(SIGBUS, SIG_DFL);797  signal(SIGFPE, SIG_DFL);798  signal(SIGILL, SIG_DFL);799  signal(SIGSEGV, SIG_DFL);800#ifdef SIGSTKFLT801  signal(SIGSTKFLT, SIG_DFL);802#endif803  signal(SIGTRAP, SIG_DFL);804805  // Ignore failed writes to closed sockets806  signal(SIGPIPE, SIG_IGN);807808  // Block SIGCHLD so we can sigtimedwait for it.809  sigset_t sigchld;810  sigemptyset(&sigchld);811  sigaddset(&sigchld, SIGCHLD);812  sigprocmask(SIG_SETMASK, &sigchld, nullptr);813  //建立socket通信的server端；814  int s = socket_local_server(SOCKET_NAME, ANDROID_SOCKET_NAMESPACE_ABSTRACT,815                              SOCK_STREAM | SOCK_CLOEXEC);816  if (s == -1) return 1;817818  // Fork a process that stays root, and listens on a pipe to pause and resume the target.819  if (!start_signal_sender()) {820    ALOGE("debuggerd: failed to fork signal sender");821    return 1;822  }823824  ALOGI("debuggerd: starting\n");825826  for (;;) {827    sockaddr_storage ss;828    sockaddr* addrp = reinterpret_cast<sockaddr*>(&ss);829    socklen_t alen = sizeof(ss);830831    ALOGV("waiting for connection\n");832    int fd = accept4(s, addrp, &alen, SOCK_CLOEXEC);833    if (fd == -1) {834      ALOGE("accept failed: %s\n", strerror(errno));835      continue;836    }837838    handle_request(fd);//handle_request 处理请求；839  }840  return 0;841}

system/core/debuggerd/debuggerd.cpp：

751static void handle_request(int fd) {752  ALOGV("handle_request(%d)\n", fd);753754  ScopedFd closer(fd);755  debugger_request_t request;756  memset(&request, 0, sizeof(request));757  int status = read_request(fd, &request);     //读取client端进程发送来的数据,socket上读取debugger_msg_t结构体;758  if (status != 0) {759    return;760  }781  // Fork a child to handle the rest of the request.782  pid_t fork_pid = fork();783  if (fork_pid == -1) {784    ALOGE("debuggerd: failed to fork: %s\n", strerror(errno));785  } else if (fork_pid == 0) {786    worker_process(fd, request);//创建一个子进程去处理dump的工作；787  } else {788    monitor_worker_process(fork_pid, request);//父进程监控子进程操作，结束后就会杀敌子进程；789  }790}

先看子进程的操作：system/core/debuggerd/debuggerd.cpp：

537static void worker_process(int fd, debugger_request_t& request) {538  // Open the tombstone file if we need it.539  std::string tombstone_path;540  int tombstone_fd = -1;541  switch (request.action) {542    case DEBUGGER_ACTION_DUMP_TOMBSTONE:543    case DEBUGGER_ACTION_CRASH:544      tombstone_fd = open_tombstone(&tombstone_path);553     //打开一个tombstone文件，限制最多10个，超过了就会被覆盖掉；554    default:555      ALOGE("debuggerd: unexpected request action: %d", request.action);556      exit(1);557  }569570  // Attach to the target process.571  if (ptrace(PTRACE_ATTACH, request.tid, 0, 0) != 0) {     //跟踪指定进程,成为它的父进程,并停止该进程，debuggerd可也拦截发送给这个thread的信号除了      //SIGKILL，所以现在kernel那边发送过来的信号将被debuggered拦截;     //ATTACH之后,会让kernel那边发送SIGSTOP信号给原来问题进程，这个信号将被debuggerd拦截；572    ALOGE("debuggerd: ptrace attach failed: %s", strerror(errno));573    exit(1);574  }575576  // Don't attach to the sibling threads if we want to attach gdb.577  // Supposedly, it makes the process less reliable.578  bool attach_gdb = should_attach_gdb(request);587  //是否调用gdb调试，是就会终止正常的crash588  std::set<pid_t> siblings;589  if (!attach_gdb) {590    ptrace_siblings(request.pid, request.tid, siblings);       //同时跟踪问题thread相关联的thread；591  }592593  // Generate the backtrace map before dropping privileges.594  std::unique_ptr<BacktraceMap> backtrace_map(BacktraceMap::Create(request.pid));595 //生成backtrace map；596  int amfd = -1;597  std::unique_ptr<std::string> amfd_data;598  if (request.action == DEBUGGER_ACTION_CRASH) {599    // Connect to the activity manager before dropping privileges.600    amfd = activity_manager_connect();601    amfd_data.reset(new std::string);602  }603604  bool succeeded = false;605606  // Now that we've done everything that requires privileges, we can drop them.607  if (!drop_privileges()) {608    ALOGE("debuggerd: failed to drop privileges, exiting");609    _exit(1);610  }611612  int crash_signal = SIGKILL;613  succeeded = perform_dump(request, fd, tombstone_fd, backtrace_map.get(), 
siblings,614                           &crash_signal, amfd_data.get());     //根据sinal信号类型然后通过engrave_tombstone把信息写到tombstone；615  if (succeeded) {616    if (request.action == DEBUGGER_ACTION_DUMP_TOMBSTONE) {617      if (!tombstone_path.empty()) {618        android::base::WriteFully(fd, tombstone_path.c_str(), tombstone_path.length());619      }620    }621  }631  if (!attach_gdb) {632    // Tell the Activity Manager about the crashing process. If we are633    // waiting for gdb to attach, do not send this or Activity Manager634    // might kill the process before anyone can attach.635    activity_manager_write(request.pid, crash_signal, amfd, *amfd_data.get());636  }637  //解除对问题tread的跟踪；638  if (ptrace(PTRACE_DETACH, request.tid, 0, 0) != 0) {639    ALOGE("debuggerd: ptrace detach from %d failed: %s", request.tid, strerror(errno));640  }641  //解除对问题相关联tread的跟踪；642  for (pid_t sibling : siblings) {643    ptrace(PTRACE_DETACH, sibling, 0, 0);644  }645646  // Send the signal back to the process if it crashed and we're not waiting for gdb.647  if (!attach_gdb && request.action == DEBUGGER_ACTION_CRASH) {648    if (!send_signal(request.pid, request.tid, crash_signal)) {649      ALOGE("debuggerd: failed to kill process %d: %s", request.pid, strerror(errno));650    }651  }667668  close(amfd);669670  exit(!succeeded);671}

455static bool perform_dump(const debugger_request_t& request, int fd, int tombstone_fd,456                         BacktraceMap* backtrace_map, const std::set<pid_t>& siblings,457                         int* crash_signal, std::string* amfd_data) {458  if (TEMP_FAILURE_RETRY(write(fd, "\0", 1)) != 1) {459    ALOGE("debuggerd: failed to respond to client: %s\n", strerror(errno));460    return false;461  }462463  int total_sleep_time_usec = 0;464  while (true) {465    int signal = wait_for_signal(request.tid, &total_sleep_time_usec);       //第一次发送等到的是stop信号，第二次才是出现问题类型的真正信号；466    switch (signal) {467      case -1:468        ALOGE("debuggerd: timed out waiting for signal");469        return false;470471      case SIGSTOP:480          ALOGV("debuggerd: stopped -- continuing");481          if (ptrace(PTRACE_CONT, request.tid, 0, 0) != 0) {//将目标问题进程切换位出现问题时刻的上下文状态；482            ALOGE("debuggerd: ptrace continue failed: %s", strerror(errno));483            return false;484          }485          continue;  // loop again486        }487        break;488489      case SIGABRT:490      case SIGBUS:491      case SIGFPE:492      case SIGILL:493      case SIGSEGV:494#ifdef SIGSTKFLT495      case SIGSTKFLT:496#endif497      case SIGSYS:498      case SIGTRAP:499        ALOGV("stopped -- fatal signal\n");500        *crash_signal = signal;//当在一次信号过来,就会通过下面的函数导出此刻问题进程的信息；501        engrave_tombstone(tombstone_fd, backtrace_map, request.pid, request.tid, siblings, signal,502                          request.original_si_code, request.abort_msg_address, amfd_data);503        break;504505      default:506        ALOGE("debuggerd: process stopped due to unexpected signal %d\n", signal);507        break;508    }509    break;510  }511512  return true;513}

如下就是从tombstone导出来的信息：

*** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***Build fingerprint: 'xxx/5049S/xxx:7.0/NRD90M/xxx:eng/test-keys'Revision: '0'ABI: 'arm'pid: 505, tid: 1046, name: Binder:505_1  >>> /system/bin/cameraserver <<<signal 4 (SIGILL), code 1 (ILL_ILLOPC), fault addr 0xe09a4988    r0 e8d1a2c8  r1 e09a8004  r2 00000001  r3 00000002    r4 e1106200  r5 00000001  r6 de83b891  r7 e8d438d0    r8 00000416  r9 e8d39990  sl e930bf6d  fp e127f910    ip de855c1c  sp e127f678  lr de83aba1  pc e09a4988  cpsr 200f0030backtrace:    #00 pc 00000988  /system/vendor/lib/libcancer.so (_ZN7android6Cancer15destroyInstanceEv+39)    #01 pc 0001db9d  /system/vendor/lib/libcam.client.so (_ZN7android15NSDisplayClient13DisplayClient4initEv+60)    #02 pc 0000d145  /system/vendor/lib/libcam.device1.so (_ZN7android14Cam1DeviceBase17initDisplayClientEP18preview_stream_ops+684)

整个tombstone包含的信息有：
(1). 创建1个tombstone文件。
最多10个,如果已存在10个,则覆盖最旧的文件。
(2). 版本信息
主要是fingerprint,可以看出异常版本是eng还是user。
(3). 寄存器信息
主要查看是哪个进程崩溃,信号是什么。寄存器信息需要配合下面的调用栈信息及数据信息结合GNU的工具(objdump -S反汇编)分析。
(4). 调用栈信息
这个是最直接可以看出异常的信息。
(5). 其他线程信息
如果异常线程和其他线程有逻辑关系的话,可以查看对应线程的信息。
(6). main log信息

最后添加一张流程图：
debuggered

0 0