
来源:互联网 发布:hbase性能优化方法总结 编辑:程序博客网 时间:2024/05/21 17:54



// Obtain a proxy for DatanodeProtocol from RPC.waitForProxy, targeting nameNodeAddr, and keep it
// in this.namenode; the result is cast to DatanodeProtocol.
this.namenode = (DatanodeProtocol)       RPC.waitForProxy(DatanodeProtocol.class,                       DatanodeProtocol.versionID,                       nameNodeAddr, conf);

RPC.waitForProxy()方法调用了RPC.getProxy()方法,创建一个用于IPC通信(关于IPC请参考前面几篇分析Hadoop IPC的博文)的对象;方法返回后,该对象被赋值给namenode。RPC.waitForProxy()方法如下:

static VersionedProtocol waitForProxy(Class<? extends VersionedProtocol> protocol,                                      long clientVersion, InetSocketAddress addr,                                      Configuration conf, int rpcTimeout,                                      long connTimeout)throws IOException {     long startTime = System.currentTimeMillis();    IOException ioe;    while (true) {      try {        return getProxy(protocol, clientVersion, addr, conf, rpcTimeout);      } catch(ConnectException se) {  // namenode has not been started        LOG.info("Server at " + addr + " not available yet, Zzzzz...");        ioe = se;      } catch(SocketTimeoutException te) {  // namenode is busy        LOG.info("Problem connecting to server: " + addr);        ioe = te;      }      // check if timed out      if (System.currentTimeMillis()-connTimeout >= startTime) {        throw ioe;      }      // wait for retry      try {        Thread.sleep(1000);      } catch (InterruptedException ie) {        // IGNORE      }    }  }





private NamespaceInfo handshake() throws IOException {    NamespaceInfo nsInfo = new NamespaceInfo();    while (shouldRun) {      try {        nsInfo = namenode.versionRequest();//调用远程方法,获取名字节点的信息        break;      } catch(SocketTimeoutException e) {  // namenode is busy        LOG.info("Problem connecting to server: " + getNameNodeAddr());        try {          Thread.sleep(1000);        } catch (InterruptedException ie) {}      }    }    if (!isPermittedVersion(nsInfo)) {      String errorMsg = "Shutting down. Incompatible version or revision." +          "DataNode version '" + VersionInfo.getVersion() +          "' and revision '" + VersionInfo.getRevision() +          "' and NameNode version '" + nsInfo.getVersion() +          "' and revision '" + nsInfo.getRevision() +          " and " + CommonConfigurationKeys.HADOOP_RELAXED_VERSION_CHECK_KEY +          " is " + (relaxedVersionCheck ? "enabled" : "not enabled") +          " and " + CommonConfigurationKeys.HADOOP_SKIP_VERSION_CHECK_KEY +          " is " + (noVersionCheck ? "enabled" : "not enabled");      LOG.fatal(errorMsg);      notifyNamenode(DatanodeProtocol.NOTIFY, errorMsg);        throw new IOException( errorMsg );    }    assert FSConstants.LAYOUT_VERSION == nsInfo.getLayoutVersion() :      "Data-node and name-node layout versions must be the same."      + "Expected: "+ FSConstants.LAYOUT_VERSION + " actual "+ nsInfo.getLayoutVersion();    return nsInfo;  }


/**
 * Version-related fields carried in addition to the state inherited from
 * {@code StorageInfo}.
 *
 * <p>NOTE(review): this is an abridged excerpt; the {@code Writable} methods
 * are not shown here.
 */
public class NamespaceInfo extends StorageInfo implements Writable {

  /** Revision number of the system build. */
  String revision;

  /** Hadoop version. */
  String version;

  /** Used for the version check performed before a datanode upgrade. */
  int distributedUpgradeVersion;
}





/**
 * Handles a datanode registration request.
 *
 * <p>The version reported in {@code nodeReg} is checked with
 * {@code verifyVersion} first; the registration is then handed to
 * {@code namesystem.registerDatanode}.
 *
 * @param nodeReg registration sent by the datanode
 * @return the same {@code nodeReg} instance that was passed in
 * @throws IOException propagated from the version check or the registration
 */
public DatanodeRegistration register(DatanodeRegistration nodeReg)
    throws IOException {
  verifyVersion(nodeReg.getVersion());
  namesystem.registerDatanode(nodeReg);
  return nodeReg;
}



public synchronized void registerDatanode(DatanodeRegistration nodeReg                                            ) throws IOException {    String dnAddress = Server.getRemoteAddress();//获得数据节点的地址    if (dnAddress == null) {      // Mostly called inside an RPC.      // But if not, use address passed by the data-node.      dnAddress = nodeReg.getHost();    }          // check if the datanode is allowed to be connect to the namenode    // 该数据节点是否允许连接到这个名字节点(根据include和exclude文件)    if (!verifyNodeRegistration(nodeReg, dnAddress)) {      throw new DisallowedDatanodeException(nodeReg);    }    String hostName = nodeReg.getHost();          // update the datanode's name with ip:port    //使用IP:端口更新数据节点    DatanodeID dnReg = new DatanodeID(dnAddress + ":" + nodeReg.getPort(),                                      nodeReg.getStorageID(),                                      nodeReg.getInfoPort(),                                      nodeReg.getIpcPort());    nodeReg.updateRegInfo(dnReg);    nodeReg.exportedKeys = getBlockKeys();          NameNode.stateChangeLog.info(                                 "BLOCK* registerDatanode: "                                 + "node registration from " + nodeReg.getName()                                 + " storage " + nodeReg.getStorageID());    DatanodeDescriptor nodeS = datanodeMap.get(nodeReg.getStorageID());    DatanodeDescriptor nodeN = host2DataNodeMap.getDatanodeByName(nodeReg.getName());          if (nodeN != null && nodeN != nodeS) {//数据节点使用新的存储标识进行注册      NameNode.LOG.info("BLOCK* registerDatanode: "                        + "node from name: " + nodeN.getName());      // nodeN previously served a different data storage,       // which is not served by anybody anymore.      
removeDatanode(nodeN);      // physically remove node from datanodeMap      wipeDatanode(nodeN);      nodeN = null;    }    if (nodeS != null) {//重复注册      if (nodeN == nodeS) {        // The same datanode has been just restarted to serve the same data         // storage. We do not need to remove old data blocks, the delta will        // be calculated on the next block report from the datanode        NameNode.stateChangeLog.debug("BLOCK* registerDatanode: "                                      + "node restarted");      } else {        // nodeS is found        /* The registering datanode is a replacement node for the existing           data storage, which from now on will be served by a new node.          If this message repeats, both nodes might have same storageID           by (insanely rare) random chance. User needs to restart one of the          nodes with its data cleared (or user can just remove the StorageID          value in "VERSION" file under the data directory of the datanode,          but this is might not work if VERSION file format has changed        */                NameNode.stateChangeLog.info( "BLOCK* registerDatanode: "                                      + "node " + nodeS.getName()                                      + " is replaced by " + nodeReg.getName() +                                       " with the same storageID " +                                      nodeReg.getStorageID());      }      // update cluster map      clusterMap.remove(nodeS);      nodeS.updateRegInfo(nodeReg);      nodeS.setHostName(hostName);            // resolve network location      resolveNetworkLocation(nodeS);      clusterMap.add(nodeS);              // also treat the registration message as a heartbeat      synchronized(heartbeats) {        if( !heartbeats.contains(nodeS)) {          heartbeats.add(nodeS);          //update its timestamp          nodeS.updateHeartbeat(0L, 0L, 0L, 0);          nodeS.isAlive = true;        }      }      return;    }     // this 
is a new datanode serving a new data storage    if (nodeReg.getStorageID().equals("")) {      // this data storage has never been registered      // it is either empty or was created by pre-storageID version of DFS      nodeReg.storageID = newStorageID();      NameNode.stateChangeLog.debug(                                    "BLOCK* registerDatanode: "                                    + "new storageID " + nodeReg.getStorageID() + " assigned");    }    // register new datanode,登记新的数据节点    DatanodeDescriptor nodeDescr       = new DatanodeDescriptor(nodeReg, NetworkTopology.DEFAULT_RACK, hostName);    resolveNetworkLocation(nodeDescr);    unprotectedAddDatanode(nodeDescr);    clusterMap.add(nodeDescr);          // also treat the registration message as a heartbeat    // 加入到心跳检查列表中,注册信息相当与心跳    synchronized(heartbeats) {      heartbeats.add(nodeDescr);      nodeDescr.isAlive = true;      // no need to update its timestamp      // because its is done when the descriptor is created    }    if (safeMode != null) {      safeMode.checkMode();    }    return;  }







这三种情况都在FSNamesystem.registerDatanode()方法中进行处理。FSNamesystem类中有两个成员变量用来处理这些情况,分别是FSNamesystem.datanodeMap和FSNamesystem.host2DataNodeMap。其中,FSNamesystem.datanodeMap成员变量记录了在当前NameNode节点注册过的所有DataNode节点,键值对为StorageID -> DatanodeDescriptor;这个变量定义为TreeMap类型,可以快速地根据键storageID查找到对应的值DatanodeDescriptor对象。FSNamesystem.host2DataNodeMap变量提供了在NameNode节点上注册的DataNode节点名称和其DatanodeDescriptor对象的映射,即可以根据DataNode的名称查找该DataNode的注册信息。在注册方法中,通过存储标识storageID在datanodeMap中获取的数据节点描述符对象为nodeS,通过DataNode节点服务器名和端口号在host2DataNodeMap中获取的描述符为nodeN。那么NameNode节点为什么要保存两个注册信息的映射呢?试想有一个DataNode节点在NameNode上注册过了,但是这个DataNode节点在某个时间点进行了格式化,再重新启动,那么这个数据节点需要重新向NameNode节点发送一个注册信息,此时nodeS对象(为null)和nodeN对象就不相等,这就是上面所说的情况3。这时,nodeN就是一个过时的节点信息对象,于是调用FSNamesystem.removeDatanode()和FSNamesystem.wipeDatanode()方法清理原有节点在NameNode节点上注册过的信息,再将nodeN赋值为null,这样后面的处理就和情况1一样了。





/**
 * Main service loop of the datanode; runs for as long as {@code shouldRun} is true.
 *
 * <p>Each iteration, in order:
 * <ol>
 *   <li>sends a heartbeat if more than {@code heartBeatInterval} has passed and
 *       processes the returned commands;</li>
 *   <li>reports the blocks queued in {@code receivedBlockList} through
 *       {@code namenode.blockReceived} and removes them from the queue;</li>
 *   <li>sends a full block report if more than {@code blockReportInterval} has
 *       passed and the asynchronous report is ready, otherwise requests one;</li>
 *   <li>starts the block scanner thread once, after the upgrade has completed;</li>
 *   <li>waits on {@code receivedBlockList} until the next heartbeat is due or the
 *       monitor is notified.</li>
 * </ol>
 *
 * <p>A {@code RemoteException} whose class name is
 * {@code UnregisteredDatanodeException}, {@code DisallowedDatanodeException} or
 * {@code IncorrectVersionException} shuts the datanode down and ends the loop;
 * other {@code RemoteException}s and {@code IOException}s are logged and the
 * loop continues.
 *
 * @throws Exception declared by the signature; exceptions other than
 *         {@code IOException} are not caught inside the loop
 */
public void offerService() throws Exception {
  while (shouldRun) {
    try {
      long startTime = now();

      //
      // Every so often, send heartbeat or block-report
      //

      if (startTime - lastHeartbeat > heartBeatInterval) {
        // send a heartbeat once per heartbeat interval
        lastHeartbeat = startTime;
        DatanodeCommand[] cmds = namenode.sendHeartbeat(dnRegistration, // registration of this datanode
                                                     data.getCapacity(), // storage capacity of this datanode
                                                     data.getDfsUsed(), // capacity currently in use
                                                     data.getRemaining(), // remaining capacity
                                                     xmitsInProgress.get(), // number of threads currently copying blocks
                                                     getXceiverCount()); // number of service threads in DataXceiverServer
        myMetrics.addHeartBeat(now() - startTime);
        // The NameNode returns commands in the heartbeat response; the DataNode
        // executes them. A false result skips the rest of this iteration.
        if (!processCommand(cmds))
          continue;
      }

      // check if there are newly received blocks
      Block [] blockArray=null;
      String [] delHintArray=null;
      // receivedBlockList holds the blocks received since the last report
      synchronized(receivedBlockList) {
        synchronized(delHints) {
          int numBlocks = receivedBlockList.size();
          if (numBlocks > 0) {
            if(numBlocks!=delHints.size()) {
              LOG.warn("Panic: receiveBlockList and delHints are not of the same length" );
            }
            // Send newly-received blockids to namenode
            blockArray = receivedBlockList.toArray(new Block[numBlocks]);
            delHintArray = delHints.toArray(new String[numBlocks]);
          }
        }
      }
      if (blockArray != null) {
        if(delHintArray == null || delHintArray.length != blockArray.length ) {
          LOG.warn("Panic: block array & delHintArray are not the same" );
        }
        // report the recently received blocks (done outside the monitors)
        namenode.blockReceived(dnRegistration, blockArray, delHintArray);
        // After the report, remove exactly the reported entries; anything
        // queued in the meantime stays for the next iteration.
        synchronized (receivedBlockList) {
          synchronized (delHints) {
            for(int i=0; i<blockArray.length; i++) {
              receivedBlockList.remove(blockArray[i]);
              delHints.remove(delHintArray[i]);
            }
          }
        }
      }

      // Periodically, the datanode reports all blocks it manages.
      if (startTime - lastBlockReport > blockReportInterval) {
        if (data.isAsyncBlockReportReady()) {
          // Create block report
          long brCreateStartTime = now();
          Block[] bReport = data.retrieveAsyncBlockReport();

          // Send block report
          long brSendStartTime = now();
          DatanodeCommand cmd = namenode.blockReport(dnRegistration,
                  BlockListAsLongs.convertToArrayLongs(bReport));

          // Log the block report processing stats from Datanode perspective
          long brSendCost = now() - brSendStartTime;
          long brCreateCost = brSendStartTime - brCreateStartTime;
          myMetrics.addBlockReport(brSendCost);
          LOG.info("BlockReport of " + bReport.length
              + " blocks took " + brCreateCost + " msec to generate and "
              + brSendCost + " msecs for RPC and NN processing");

          // If we have sent the first block report, then wait a random
          // time before we start the periodic block reports.
          if (resetBlockReportTime) {
            lastBlockReport = startTime -
                R.nextInt((int)(blockReportInterval));
            resetBlockReportTime = false;
          } else {
            /* say the last block report was at 8:20:14. The current report
             * should have started around 9:20:14 (default 1 hour interval).
             * If current time is :
             *   1) normal like 9:20:18, next report should be at 10:20:14
             *   2) unexpected like 11:35:43, next report should be at
             *      12:20:14
             */
            lastBlockReport += (now() - lastBlockReport) /
                                blockReportInterval * blockReportInterval;
          }
          processCommand(cmd);
        } else {
          data.requestAsyncBlockReport();
          if (lastBlockReport > 0) { // this isn't the first report
            long waitingFor =
                startTime - lastBlockReport - blockReportInterval;
            String msg = "Block report is due, and been waiting for it for " +
                (waitingFor/1000) + " seconds...";
            if (waitingFor > LATE_BLOCK_REPORT_WARN_THRESHOLD) {
              LOG.warn(msg);
            } else if (waitingFor > LATE_BLOCK_REPORT_INFO_THRESHOLD) {
              LOG.info(msg);
            } else if (LOG.isDebugEnabled()) {
              LOG.debug(msg);
            }
          }
        }
      }

      // start block scanner
      if (blockScanner != null && blockScannerThread == null &&
          upgradeManager.isUpgradeCompleted()) {
        LOG.info("Starting Periodic block scanner");
        blockScannerThread = new Daemon(blockScanner);
        blockScannerThread.start();
      }

      //
      // There is no work to do;  sleep until hearbeat timer elapses,
      // or work arrives, and then iterate again.
      //
      long waitTime = heartBeatInterval - (System.currentTimeMillis() - lastHeartbeat);
      synchronized(receivedBlockList) {
        if (waitTime > 0 && receivedBlockList.size() == 0) {
          try {
            receivedBlockList.wait(waitTime);
          } catch (InterruptedException ie) {
            // NOTE(review): the interrupt is swallowed here; the loop relies on
            // shouldRun being re-checked at the top of the next iteration.
          }
          delayBeforeBlockReceived();
        }
      } // synchronized
    } catch(RemoteException re) {
      String reClass = re.getClassName();
      if (UnregisteredDatanodeException.class.getName().equals(reClass) ||
          DisallowedDatanodeException.class.getName().equals(reClass) ||
          IncorrectVersionException.class.getName().equals(reClass)) {
        LOG.warn("DataNode is shutting down: " +
                  StringUtils.stringifyException(re));
        shutdown();
        return;
      }
      LOG.warn(StringUtils.stringifyException(re));
    } catch (IOException e) {
      LOG.warn(StringUtils.stringifyException(e));
    }
  } // while (shouldRun)
} // offerService







  1. nodeReg,表示当前DataNode在NameNode上的注册信息;
  2. capacity,表示当前DataNode的存储容量;
  3. dfsUsed,表示当前DataNode目前已经使用的容量;
  4. remaining,表示当前DataNode剩余的容量;
  5. xmitsInProgress,表示当前DataNode中正在进行数据块拷贝的线程数;
  6. xceiverCount,表示当前DataNode中DataXceiverServer中的服务线程数。


DatanodeCommand[] handleHeartbeat(DatanodeRegistration nodeReg,      long capacity, long dfsUsed, long remaining,      int xceiverCount, int xmitsInProgress) throws IOException {    DatanodeCommand cmd = null;    synchronized (heartbeats) {      synchronized (datanodeMap) {        DatanodeDescriptor nodeinfo = null;        try {          nodeinfo = getDatanode(nodeReg);//从datanodeMap中得到当前DataNode信息        } catch(UnregisteredDatanodeException e) {          return new DatanodeCommand[]{DatanodeCommand.REGISTER};        }                  // Check if this datanode should actually be shutdown instead.         // 检查当前DataNode节点状态是否是AdminStates.DECOMMISSIONED,如果是,表明该节点不允许连接到NameNode节点        if (nodeinfo != null && shouldNodeShutdown(nodeinfo)) {          setDatanodeDead(nodeinfo);          throw new DisallowedDatanodeException(nodeinfo);        }        if (nodeinfo == null || !nodeinfo.isAlive) {          return new DatanodeCommand[]{DatanodeCommand.REGISTER};        }        updateStats(nodeinfo, false);//先减去这个DataNode节点上次心跳上报的数据        nodeinfo.updateHeartbeat(capacity, dfsUsed, remaining, xceiverCount);//更新DataNode节点的数据        updateStats(nodeinfo, true);//加上这个DataNode节点这次上报的数据                //check lease recovery,更新租约        cmd = nodeinfo.getLeaseRecoveryCommand(Integer.MAX_VALUE);        if (cmd != null) {          return new DatanodeCommand[] {cmd};        }        //返回的指令        ArrayList<DatanodeCommand> cmds = new ArrayList<DatanodeCommand>();        //check pending replication,复制副本指令        cmd = nodeinfo.getReplicationCommand(              maxReplicationStreams - xmitsInProgress);        if (cmd != null) {          cmds.add(cmd);        }        //check block invalidation,数据块删除指令        cmd = nodeinfo.getInvalidateBlocks(blockInvalidateLimit);        if (cmd != null) {          cmds.add(cmd);        }        // check access key update        if (isAccessTokenEnabled && nodeinfo.needKeyUpdate) {          cmds.add(new 
KeyUpdateCommand(accessTokenHandler.exportKeys()));          nodeinfo.needKeyUpdate = false;        }        // check for balancer bandwidth update        if (nodeinfo.getBalancerBandwidth() > 0) {          cmds.add(new BalancerBandwidthCommand(nodeinfo.getBalancerBandwidth()));          // set back to 0 to indicate that datanode has been sent the new value          nodeinfo.setBalancerBandwidth(0);        }        if (!cmds.isEmpty()) {          return cmds.toArray(new DatanodeCommand[cmds.size()]);        }      }    }    //check distributed upgrade    cmd = getDistributedUpgradeCommand();    if (cmd != null) {      return new DatanodeCommand[] {cmd};    }    return null;  }





《Hadoop技术内幕:深入理解Hadoop Common和HDFS架构设计与实现原理》

0 0