Hadoop2.x NodeManager启动之服务启动

来源:互联网 发布:淘宝客服销售话术大全 编辑:程序博客网 时间:2024/06/07 06:35
在NM各服务初始化完毕之后,会进入启动阶段,该阶段的调用栈如下

由于NM也是一个综合服务类,所以他会进入CompositeService的serviceStart循环启动各个服务,代码如下:
protected void serviceStart() throws Exception {//获得服务列表  List<Service> services = getServices();  if (LOG.isDebugEnabled()) {    LOG.debug(getName() + ": starting services, size=" + services.size());  }  for (Service service : services) {    // start the service. If this fails that service    // will be stopped and an exception raised    service.start();//启动相应服务  }  super.serviceStart();}
这段代码循环启动如下服务

NodeHealthCheckerService服务:综合服务类内部包含LocalDirsHandlerService服务,该功能是否开启受yarn.nodemanager.disk-health-checker.enable控制默认开启,默认2分钟检测一次,检测目录为yarn.nodemanager.local-dirs yarn.nodemanager.log-dirs ,默认每2分钟检测一次
//LocalDirsHandlerService.serviceStart() line: 154@Overrideprotected void serviceStart() throws Exception {  if (isDiskHealthCheckerEnabled) {    dirsHandlerScheduler = new Timer("DiskHealthMonitor-Timer", true);    dirsHandlerScheduler.scheduleAtFixedRate(monitoringTimerTask,        diskHealthCheckInterval, diskHealthCheckInterval);  }  super.serviceStart();}
目录检测类构建如下:
public MonitoringTimerTask(Configuration conf) throws YarnRuntimeException {  localDirs = new DirectoryCollection(      validatePaths(conf.getTrimmedStrings(YarnConfiguration.NM_LOCAL_DIRS)));  logDirs = new DirectoryCollection(      validatePaths(conf.getTrimmedStrings(YarnConfiguration.NM_LOG_DIRS)));  localDirsAllocator = new LocalDirAllocator(      YarnConfiguration.NM_LOCAL_DIRS);  logDirsAllocator = new LocalDirAllocator(YarnConfiguration.NM_LOG_DIRS);}
检测函数如下,因为是对文件系统的检测,无非是读写执行这些权限,所以这里就不继续贴下去了
private void checkDirs() {    boolean newFailure = false;    if (localDirs.checkDirs()) {      newFailure = true;    }    if (logDirs.checkDirs()) {      newFailure = true;    }    if (newFailure) {      updateDirsAfterFailure();    }    lastDisksCheckTime = System.currentTimeMillis();}
ContainerManager服务:该服务会启动一个RPC服务器,协议为:ContainerManagementProtocol用于AM与NM之间的通信,handler线程数量由yarn.nodemanager.container-manager.thread-count控制
@Overrideprotected void serviceStart() throws Exception {  // Enqueue user dirs in deletion context  Configuration conf = getConfig();  Configuration serverConf = new Configuration(conf);  // always enforce it to be token-based.  serverConf.set(    CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION,    SaslRpcServer.AuthMethod.TOKEN.toString());    YarnRPC rpc = YarnRPC.create(conf);  InetSocketAddress initialAddress = conf.getSocketAddr(      YarnConfiguration.NM_ADDRESS,      YarnConfiguration.DEFAULT_NM_ADDRESS,      YarnConfiguration.DEFAULT_NM_PORT);//创建RPC服务器,用于接收AM的控制,具体协议可参考ContainerManagementProtocol  server =      rpc.getServer(ContainerManagementProtocol.class, this, initialAddress,           serverConf, this.context.getNMTokenSecretManager(),          conf.getInt(YarnConfiguration.NM_CONTAINER_MGR_THREAD_COUNT,               YarnConfiguration.DEFAULT_NM_CONTAINER_MGR_THREAD_COUNT));    // Enable service authorization?  if (conf.getBoolean(      CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHORIZATION,       false)) {    refreshServiceAcls(conf, new NMPolicyProvider());  }  //服务器启动阶段不接受客户端的控制请求  LOG.info("Blocking new container-requests as container manager rpc" +  " server is still starting.");  this.setBlockNewContainerRequests(true);  server.start();  InetSocketAddress connectAddress = NetUtils.getConnectAddress(server);  NodeId nodeId = NodeId.newInstance(      connectAddress.getAddress().getCanonicalHostName(),      connectAddress.getPort());  ((NodeManager.NMContext)context).setNodeId(nodeId);  this.context.getNMTokenSecretManager().setNodeId(nodeId);  this.context.getContainerTokenSecretManager().setNodeId(nodeId);  LOG.info("ContainerManager started at " + connectAddress);  super.serviceStart();}
ContainerManager子服务之ResourceLocalizationService
@Overridepublic void serviceStart() throws Exception {  cacheCleanup.scheduleWithFixedDelay(new CacheCleanup(dispatcher),      cacheCleanupPeriod, cacheCleanupPeriod, TimeUnit.MILLISECONDS);  server = createServer();  server.start();  localizationServerAddress =      getConfig().updateConnectAddr(YarnConfiguration.NM_LOCALIZER_ADDRESS,                                    server.getListenerAddress());  LOG.info("Localizer started on port " + server.getPort());  super.serviceStart();}
ContainerManager子服务之ContainersMonitor:
public void run() {  while (true) {    // Print the processTrees for debugging.    .....    // 添加新的容器    synchronized (containersToBeAdded) {      for (Entry<ContainerId, ProcessTreeInfo> entry : containersToBeAdded          .entrySet()) {        ContainerId containerId = entry.getKey();        ProcessTreeInfo processTreeInfo = entry.getValue();        LOG.info("Starting resource-monitoring for " + containerId);        trackingContainers.put(containerId, processTreeInfo);      }      containersToBeAdded.clear();    }    // 删除完成容器    synchronized (containersToBeRemoved) {      for (ContainerId containerId : containersToBeRemoved) {        trackingContainers.remove(containerId);        LOG.info("Stopping resource-monitoring for " + containerId);      }      containersToBeRemoved.clear();    }    // 检测所有已跟踪容器,如果资源使用超出限制则kill    long vmemStillInUsage = 0;    long pmemStillInUsage = 0;    //遍历容器    for (Iterator<Map.Entry<ContainerId, ProcessTreeInfo>> it =        trackingContainers.entrySet().iterator(); it.hasNext();) {      Map.Entry<ContainerId, ProcessTreeInfo> entry = it.next();      ContainerId containerId = entry.getKey();      ProcessTreeInfo ptInfo = entry.getValue();      try {        String pId = ptInfo.getPID();        // 如果PID为空则需要重新初始化        if (pId == null) {          // get pid from ContainerId          pId = containerExecutor.getProcessId(ptInfo.getContainerId());          if (pId != null) {            // pId will be null, either if the container is not spawned yet            // or if the container's pid is removed from ContainerExecutor            LOG.debug("Tracking ProcessTree " + pId                + " for the first time");            ResourceCalculatorProcessTree pt =                ResourceCalculatorProcessTree.getResourceCalculatorProcessTree(pId, processTreeClass, conf);            ptInfo.setPid(pId);            ptInfo.setProcessTree(pt);          }        }        // End of initializing any uninitialized processTrees//初始化后仍然为空的则放弃跟踪        if (pId == null) {          continue; // processTree cannot be tracked        }        LOG.debug("Constructing ProcessTree for : PID = " + pId            + " ContainerId = " + containerId);        ResourceCalculatorProcessTree pTree = ptInfo.getProcessTree();        pTree.updateProcessTree();    // update process-tree        //计算当前内存使用:物理内存和虚拟内存        long currentVmemUsage = pTree.getCumulativeVmem();        long currentPmemUsage = pTree.getCumulativeRssmem();        // as processes begin with an age 1, we want to see if there        // are processes more than 1 iteration old.        long curMemUsageOfAgedProcesses = pTree.getCumulativeVmem(1);        long curRssMemUsageOfAgedProcesses = pTree.getCumulativeRssmem(1);        //计算阈值        long vmemLimit = ptInfo.getVmemLimit();        long pmemLimit = ptInfo.getPmemLimit();        LOG.info(String.format(            "Memory usage of ProcessTree %s for container-id %s: ",                 pId, containerId.toString()) +            formatUsageString(currentVmemUsage, vmemLimit, currentPmemUsage, pmemLimit));        boolean isMemoryOverLimit = false;        String msg = "";        //判断内存使用是否超出阈值        if (isVmemCheckEnabled()            && isProcessTreeOverLimit(containerId.toString(),                currentVmemUsage, curMemUsageOfAgedProcesses, vmemLimit)) {          // Container (the root process) is still alive and overflowing          // memory.          // Dump the process-tree and then clean it up.          msg = formatErrorMessage("virtual",              currentVmemUsage, vmemLimit,              currentPmemUsage, pmemLimit,              pId, containerId, pTree);          isMemoryOverLimit = true;        } else if (isPmemCheckEnabled()            && isProcessTreeOverLimit(containerId.toString(),                currentPmemUsage, curRssMemUsageOfAgedProcesses,                pmemLimit)) {          // Container (the root process) is still alive and overflowing          // memory.          // Dump the process-tree and then clean it up.          msg = formatErrorMessage("physical",              currentVmemUsage, vmemLimit,              currentPmemUsage, pmemLimit,              pId, containerId, pTree);          isMemoryOverLimit = true;        }//如果超出阈值,则杀死该容器        if (isMemoryOverLimit) {          // Virtual or physical memory over limit. Fail the container and          // remove          // the corresponding process tree          LOG.warn(msg);          // warn if not a leader          if (!pTree.checkPidPgrpidForMatch()) {            LOG.error("Killed container process with PID " + pId                + " but it is not a process group leader.");          }          // kill the container          eventDispatcher.getEventHandler().handle(              new ContainerKillEvent(containerId, msg));          it.remove();          LOG.info("Removed ProcessTree with root " + pId);        } else {          // Accounting the total memory in usage for all containers that          // are still          // alive and within limits.          vmemStillInUsage += currentVmemUsage;          pmemStillInUsage += currentPmemUsage;        }      } catch (Exception e) {        // Log the exception and proceed to the next container.        LOG.warn("Uncaught exception in ContainerMemoryManager "            + "while managing memory of " + containerId, e);      }    }//休息一下    try {      Thread.sleep(monitoringInterval);    } catch (InterruptedException e) {      LOG.warn(ContainersMonitorImpl.class.getName()          + " is interrupted. Exiting.");      break;    }  }}
NodeStatusUpdater服务:接收来自RM命令,并执行相应操作,类似MR1中TT的主循环
protected void startStatusUpdater() {statusUpdaterRunnable = new Runnable() {  @Override  @SuppressWarnings("unchecked")  public void run() {    int lastHeartBeatID = 0;    while (!isStopped) {      // Send heartbeat      try {      //执行心跳操作,并捕获返回结果        NodeHeartbeatResponse response = null;        NodeStatus nodeStatus =            getNodeStatusAndUpdateContainersInContext(lastHeartBeatID);                NodeHeartbeatRequest request =            NodeHeartbeatRequest.newInstance(nodeStatus,              NodeStatusUpdaterImpl.this.context                .getContainerTokenSecretManager().getCurrentKey(),              NodeStatusUpdaterImpl.this.context.getNMTokenSecretManager()                .getCurrentKey());        response = resourceTracker.nodeHeartbeat(request);        //get next heartbeat interval from response        nextHeartBeatInterval = response.getNextHeartBeatInterval();        updateMasterKeys(response);//节点关闭操作        if (response.getNodeAction() == NodeAction.SHUTDOWN) {          LOG            .warn("Recieved SHUTDOWN signal from Resourcemanager as part of heartbeat,"                + " hence shutting down.");          LOG.warn("Message from ResourceManager: "              + response.getDiagnosticsMessage());          dispatcher.getEventHandler().handle(              new NodeManagerEvent(NodeManagerEventType.SHUTDOWN));          break;        }        //重新同步操作        if (response.getNodeAction() == NodeAction.RESYNC) {          LOG.warn("Node is out of sync with ResourceManager,"              + " hence resyncing.");          LOG.warn("Message from ResourceManager: "              + response.getDiagnosticsMessage());          // Invalidate the RMIdentifier while resync          NodeStatusUpdaterImpl.this.rmIdentifier =              ResourceManagerConstants.RM_INVALID_IDENTIFIER;          dispatcher.getEventHandler().handle(              new NodeManagerEvent(NodeManagerEventType.RESYNC));          break;        }//清理相应容器        lastHeartBeatID = response.getResponseId();        List<ContainerId> containersToCleanup = response            .getContainersToCleanup();        if (!containersToCleanup.isEmpty()) {          dispatcher.getEventHandler().handle(              new CMgrCompletedContainersEvent(containersToCleanup,                CMgrCompletedContainersEvent.Reason.BY_RESOURCEMANAGER));        }        List<ApplicationId> appsToCleanup =            response.getApplicationsToCleanup();        //Only start tracking for keepAlive on FINISH_APP        trackAppsForKeepAlive(appsToCleanup);        if (!appsToCleanup.isEmpty()) {          dispatcher.getEventHandler().handle(              new CMgrCompletedAppsEvent(appsToCleanup,                  CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER));        }      } catch (ConnectException e) {        //catch and throw the exception if tried MAX wait time to connect RM        dispatcher.getEventHandler().handle(            new NodeManagerEvent(NodeManagerEventType.SHUTDOWN));        throw new YarnRuntimeException(e);      } catch (Throwable e) {        // TODO Better error handling. Thread can die with the rest of the        // NM still running.        LOG.error("Caught exception in status-updater", e);      } finally {        synchronized (heartbeatMonitor) {          nextHeartBeatInterval = nextHeartBeatInterval <= 0 ?              YarnConfiguration.DEFAULT_RM_NM_HEARTBEAT_INTERVAL_MS :                nextHeartBeatInterval;          try {            heartbeatMonitor.wait(nextHeartBeatInterval);          } catch (InterruptedException e) {            // Do Nothing          }        }      }    }}


0 0
原创粉丝点击