Atlas源码剖析(三)

来源:互联网 发布:延边网络电视台 编辑:程序博客网 时间:2024/04/27 14:55

keepalive进程的开启

Atlas调用chassis_unix_proc_keepalive函数来keep process alive。

调用fork创建子进程,其中子进程直接返回,处理接下来的任务,父进程将接收到的SIGINT、SIGTERM、SIGHUP、SIGUSR1和SIGUSR2信号发送给子进程进行处理,若系统支持wait4则调用wait4等待子进程结束,否则调用waitpid。

若子进程退出,对pid_file调用unlink(这个文件保存了子进程的pid)。检查子进程的退出原因,若是调用exit退出的,写入日志,同时父进程返回1,这将会导致父进程的结束;若是因为信号而退出,写入日志,同时sleep 2秒后重启子进程。

若是出现其他错误(例如wait4/waitpid调用失败,且errno不是EINTR),则写入日志,返回-1结束。

这样就以keepalive的形式启动了整个程序,子进程负责对外提供服务,父进程则监控子进程,保证子进程一直都处于活跃状态,若子进程终止,则重新创建子进程,以保证服务的不中断。

int chassis_unix_proc_keepalive(int *child_exit_status, const char *pid_file) {#ifdef _WIN32   g_assert_not_reached();    return 0; #else   int nprocs = 0;   pid_t child_pid = -1;    /* we ignore SIGINT and SIGTERM and just let it be forwarded to the child instead    * as we want to collect its PID before we shutdown too     *    * the child will have to set its own signal handlers for this    */   for (;;) {      /* try to start the children */      while (nprocs < 1) {         pid_t pid = fork();         if (pid == 0) {            /* child */            g_debug("%s: we are the child: %d",                         G_STRLOC,                         getpid());            return 0;         } else if (pid < 0) {            /* fork() failed */            g_critical("%s: fork() failed: %s (%d)",                         G_STRLOC,                         g_strerror(errno),                         errno);            return -1;          } else {            /* we are the angel, let's see what the child did */            g_message("%s: [angel] we try to keep PID=%d alive",                         G_STRLOC,                         pid);            /* forward a few signals that are sent to us to the child instead */            signal(SIGINT, chassis_unix_signal_forward);            signal(SIGTERM, chassis_unix_signal_forward);            signal(SIGHUP, chassis_unix_signal_forward);            signal(SIGUSR1, chassis_unix_signal_forward);            signal(SIGUSR2, chassis_unix_signal_forward);            child_pid = pid;            nprocs++;         }         }         if (child_pid != -1) {         struct rusage rusage;         int exit_status;         pid_t exit_pid;         g_debug("%s: waiting for %d",                      G_STRLOC,                      child_pid);#ifdef HAVE_WAIT4         exit_pid = wait4(child_pid, &exit_status, 0, &rusage);#else         memset(&rusage, 0, sizeof(rusage)); /* make sure everything is zero'ed out */         exit_pid = waitpid(child_pid, 
&exit_status, 0);#endif         g_debug("%s: %d returned: %d",                      G_STRLOC,                      child_pid,                      exit_pid);         if (exit_pid == child_pid) {            /* delete pid file */            if (pid_file) {               unlink(pid_file);            }            /* our child returned, let's see how it went */            if (WIFEXITED(exit_status)) {               g_message("%s: [angel] PID=%d exited normally with exit-code = %d (it used %ld kBytes max)",                              G_STRLOC,                              child_pid,                              WEXITSTATUS(exit_status),                              rusage.ru_maxrss / 1024);               if (child_exit_status) *child_exit_status = WEXITSTATUS(exit_status);                  return 1;               } else if (WIFSIGNALED(exit_status)) {                  int time_towait = 2;                  /* our child died on a signal                   *                   * log it and restart */                  g_critical("%s: [angel] PID=%d died on signal=%d (it used %ld kBytes "                                       "... 
waiting 3min before restart",                                  G_STRLOC,                                  child_pid,                                  WTERMSIG(exit_status),                                  rusage.ru_maxrss / 1024);                  /**                   * to make sure we don't loop as fast as we can, sleep a bit between                    * restarts                   */                  signal(SIGINT, SIG_DFL);                  signal(SIGTERM, SIG_DFL);                  signal(SIGHUP, SIG_DFL);                  while (time_towait > 0) time_towait = sleep(time_towait);                  nprocs--;                  child_pid = -1;               } else if (WIFSTOPPED(exit_status)) {               } else {                  g_assert_not_reached();               }            } else if (-1 == exit_pid) {               /* EINTR is ok, all others bad */               if (EINTR != errno) {                  /* how can this happen ? */                  g_critical("%s: wait4(%d, ...) failed: %s (%d)",                                  G_STRLOC,                                  child_pid,                                  g_strerror(errno),                                  errno);                  return -1;               }            } else {               g_assert_not_reached();            }      }   }#endif}
0 0
原创粉丝点击