Hadoop Balancer源码解读

来源：互联网发布：网络维护工程师工资编辑：程序博客网时间：2024/06/08 13:50

前言

最近在做一些Hadoop运维的相关工作，发现了一个有趣的问题，我们公司的Hadoop集群磁盘占比数值参差不齐，高的接近80%，低的接近40%，并没有充分利用好上面的资源，但是balance的操作跑的也是正常的啊，所以打算看一下Hadoop的balance的源代码，更深层次的去了解Hadoop Balance的机制。

Balancer和Distpatch

上面2个类的设计就是与Hadoop Balance操作最紧密联系的类，Balancer类负载找出<source, target>这样的起始，目标结果对，然后存入到Distpatch中，然后通过Distpatch进行分发，不过在分发之前，会进行block的验证，判断此block是否能被移动，这里会涉及到一些条件的判断，具体的可以通过后面的代码中的注释去发现。

在Balancer的最后阶段，会将source和Target加入到dispatcher中，详见下面的代码：

/**   * 根据源节点和目标节点,构造任务对   */  private void matchSourceWithTargetToMove(Source source, StorageGroup target) {    long size = Math.min(source.availableSizeToMove(), target.availableSizeToMove());    final Task task = new Task(target, size);    source.addTask(task);    target.incScheduledSize(task.getSize());    //加入分发器中    dispatcher.add(source, target);    LOG.info("Decided to move "+StringUtils.byteDesc(size)+" bytes from "        + source.getDisplayName() + " to " + target.getDisplayName());  }

然后就是dispather类中的代码。然后进行block的分发操作：

/**   * For each datanode, choose matching nodes from the candidates. Either the   * datanodes or the candidates are source nodes with (utilization > Avg), and   * the others are target nodes with (utilization < Avg).   */  private <G extends StorageGroup, C extends StorageGroup>      void chooseStorageGroups(Collection<G> groups, Collection<C> candidates,          Matcher matcher) {    for(final Iterator<G> i = groups.iterator(); i.hasNext();) {      final G g = i.next();      for(; choose4One(g, candidates, matcher); );      if (!g.hasSpaceForScheduling()) {        //如果候选节点没有空间调度,则直接移除掉        i.remove();      }    }  }

继续调用后面的方法

 /**   * Decide all <source, target> pairs and   * the number of bytes to move from a source to a target   * Maximum bytes to be moved per storage group is   * min(1 Band worth of bytes,  MAX_SIZE_TO_MOVE).   * 从源节点列表和目标节点列表中各自选择节点组成一个个对,选择顺序优先为同节点组,同机架,然后是针对所有   * @return total number of bytes to move in this iteration   */  private long chooseStorageGroups() {    // First, match nodes on the same node group if cluster is node group aware    if (dispatcher.getCluster().isNodeGroupAware()) {      //首先匹配的条件是同节点组      chooseStorageGroups(Matcher.SAME_NODE_GROUP);    }        // Then, match nodes on the same rack    //然后是同机架    chooseStorageGroups(Matcher.SAME_RACK);    // At last, match all remaining nodes    //最后是匹配所有的节点    chooseStorageGroups(Matcher.ANY_OTHER);        return dispatcher.bytesToMove();  }

然后再通过调用Dispatch的层层方法，下面是简图：

最后核心的检验block块是否合适的代码为下面这个：

/**   * Decide if the block is a good candidate to be moved from source to target.   * A block is a good candidate if    * 1. the block is not in the process of being moved/has not been moved;   * 移动的块不是正在被移动的块   * 2. the block does not have a replica on the target;   * 在目标节点上没有移动的block块   * 3. doing the move does not reduce the number of racks that the block has   * 移动之后,不同机架上的block块的数量应该是不变的.   */  private boolean isGoodBlockCandidate(Source source, StorageGroup target,      DBlock block) {    if (source.storageType != target.storageType) {      return false;    }    // check if the block is moved or not    //如果所要移动的块是存在于正在被移动的块列表,则返回false    if (movedBlocks.contains(block.getBlock())) {      return false;    }    //如果移动的块已经存在于目标节点上,则返回false,将不会予以移动    if (block.isLocatedOn(target)) {      return false;    }    //如果开启了机架感知的配置,则目标节点不应该有相同的block    if (cluster.isNodeGroupAware()        && isOnSameNodeGroupWithReplicas(target, block, source)) {      return false;    }        //需要维持机架上的block块数量不变    if (reduceNumOfRacks(source, target, block)) {      return false;    }    return true;  }

下面是Balancer.java和Dispatch.java类的完整代码解析：

Balancer.java:

/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements.  See the NOTICE file * distributed with this work for additional information * regarding copyright ownership.  The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License.  You may obtain a copy of the License at * *     http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */package org.apache.hadoop.hdfs.server.balancer;import static com.google.common.base.Preconditions.checkArgument;import java.io.IOException;import java.io.PrintStream;import java.net.URI;import java.text.DateFormat;import java.util.ArrayList;import java.util.Arrays;import java.util.Collection;import java.util.Collections;import java.util.Date;import java.util.Formatter;import java.util.Iterator;import java.util.LinkedList;import java.util.List;import java.util.Set;import org.apache.commons.logging.Log;import org.apache.commons.logging.LogFactory;import org.apache.hadoop.classification.InterfaceAudience;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.conf.Configured;import org.apache.hadoop.fs.Path;import org.apache.hadoop.hdfs.DFSConfigKeys;import org.apache.hadoop.hdfs.DFSUtil;import org.apache.hadoop.hdfs.HdfsConfiguration;import org.apache.hadoop.hdfs.StorageType;import org.apache.hadoop.hdfs.server.balancer.Dispatcher.DDatanode;import org.apache.hadoop.hdfs.server.balancer.Dispatcher.DDatanode.StorageGroup;import org.apache.hadoop.hdfs.server.balancer.Dispatcher.Source;import org.apache.hadoop.hdfs.server.balancer.Dispatcher.Task;import org.apache.hadoop.hdfs.server.balancer.Dispatcher.Util;import org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicy;import org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicyDefault;import org.apache.hadoop.hdfs.server.namenode.UnsupportedActionException;import org.apache.hadoop.hdfs.server.protocol.DatanodeStorageReport;import org.apache.hadoop.hdfs.server.protocol.StorageReport;import org.apache.hadoop.util.StringUtils;import org.apache.hadoop.util.Time;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner;import com.google.common.base.Preconditions;/** <p>The balancer is a tool that balances disk space usage on an HDFS cluster * when some datanodes become full or when new empty nodes join the cluster. * The tool is deployed as an application program that can be run by the  * cluster administrator on a live HDFS cluster while applications * adding and deleting files. *  * <p>SYNOPSIS * <pre> * To start: *      bin/start-balancer.sh [-threshold <threshold>] *      Example: bin/ start-balancer.sh  *                     start the balancer with a default threshold of 10% *               bin/ start-balancer.sh -threshold 5 *                     start the balancer with a threshold of 5% * To stop: *      bin/ stop-balancer.sh * </pre> *  * <p>DESCRIPTION * <p>The threshold parameter is a fraction in the range of (1%, 100%) with a  * default value of 10%. The threshold sets a target for whether the cluster  * is balanced. A cluster is balanced if for each datanode, the utilization  * of the node (ratio of used space at the node to total capacity of the node)  * differs from the utilization of the (ratio of used space in the cluster  * to total capacity of the cluster) by no more than the threshold value.  * The smaller the threshold, the more balanced a cluster will become.  * It takes more time to run the balancer for small threshold values.  * Also for a very small threshold the cluster may not be able to reach the  * balanced state when applications write and delete files concurrently. *  * <p>The tool moves blocks from highly utilized datanodes to poorly  * utilized datanodes iteratively. In each iteration a datanode moves or  * receives no more than the lesser of 10G bytes or the threshold fraction  * of its capacity. Each iteration runs no more than 20 minutes. * 每次移动不超过10G大小，每次移动不超过20分钟。 * At the end of each iteration, the balancer obtains updated datanodes * information from the namenode. *  * <p>A system property that limits the balancer's use of bandwidth is  * defined in the default configuration file: * <pre> * <property> *   <name>dfs.balance.bandwidthPerSec</name> *   <value>1048576</value> * <description>  Specifies the maximum bandwidth that each datanode  * can utilize for the balancing purpose in term of the number of bytes  * per second. </description> * </property> * </pre> *  * <p>This property determines the maximum speed at which a block will be  * moved from one datanode to another. The default value is 1MB/s. The higher  * the bandwidth, the faster a cluster can reach the balanced state,  * but with greater competition with application processes. If an  * administrator changes the value of this property in the configuration  * file, the change is observed when HDFS is next restarted. *  * <p>MONITERING BALANCER PROGRESS * <p>After the balancer is started, an output file name where the balancer  * progress will be recorded is printed on the screen.  The administrator  * can monitor the running of the balancer by reading the output file.  * The output shows the balancer's status iteration by iteration. In each  * iteration it prints the starting time, the iteration number, the total  * number of bytes that have been moved in the previous iterations,  * the total number of bytes that are left to move in order for the cluster  * to be balanced, and the number of bytes that are being moved in this  * iteration. Normally "Bytes Already Moved" is increasing while "Bytes Left  * To Move" is decreasing. *  * <p>Running multiple instances of the balancer in an HDFS cluster is  * prohibited by the tool. *  * <p>The balancer automatically exits when any of the following five  * conditions is satisfied: * <ol> * <li>The cluster is balanced; * <li>No block can be moved; * <li>No block has been moved for five consecutive(连续) iterations; * <li>An IOException occurs while communicating with the namenode; * <li>Another balancer is running. * </ol> * 下面5种情况会导致Balance操作的失败 * 1、整个集群已经达到平衡状态 * 2、经过计算发现没有可以被移动的block块 * 3、在连续5次的迭代中，没有block块被移动 * 4、当datanode节点与namenode节点通信的时候，发生IO异常 * 5、已经存在一个Balance操作 *  * <p>Upon exit, a balancer returns an exit code and prints one of the  * following messages to the output file in corresponding to the above exit  * reasons: * <ol> * <li>The cluster is balanced. Exiting * <li>No block can be moved. Exiting... * <li>No block has been moved for 5 iterations. Exiting... * <li>Received an IO exception: failure reason. Exiting... * <li>Another balancer is running. Exiting... * </ol> * 在下面的5种情况下，balancer操作会自动退出 * 1、整个集群已经达到平衡的状态 * 2、经过计算发现没有可以被移动block块 * 3、在5论的迭代没有block被移动 * 4、接收端发生了I异常 * 5、已经存在一个balanr进程在工作 *  * <p>The administrator can interrupt the execution of the balancer at any  * time by running the command "stop-balancer.sh" on the machine where the  * balancer is running. */@InterfaceAudience.Privatepublic class Balancer {  static final Log LOG = LogFactory.getLog(Balancer.class);  private static final Path BALANCER_ID_PATH = new Path("/system/balancer.id");  private static final long GB = 1L << 30; //1GB  private static final long MAX_SIZE_TO_MOVE = 10*GB;  private static final String USAGE = "Usage: java "      + Balancer.class.getSimpleName()      + "\n\t[-policy <policy>]\tthe balancing policy: "      + BalancingPolicy.Node.INSTANCE.getName() + " or "      + BalancingPolicy.Pool.INSTANCE.getName()      + "\n\t[-threshold <threshold>]\tPercentage of disk capacity"      + "\n\t[-exclude [-f <hosts-file> | comma-sperated list of hosts]]"      + "\tExcludes the specified datanodes."      + "\n\t[-include [-f <hosts-file> | comma-sperated list of hosts]]"      + "\tIncludes only the specified datanodes.";    private final Dispatcher dispatcher;  private final BalancingPolicy policy;  private final double threshold;    // all data node lists  //四种datanode节点类型  private final Collection<Source> overUtilized = new LinkedList<Source>();  private final Collection<Source> aboveAvgUtilized = new LinkedList<Source>();  private final Collection<StorageGroup> belowAvgUtilized      = new LinkedList<StorageGroup>();  private final Collection<StorageGroup> underUtilized      = new LinkedList<StorageGroup>();  /* Check that this Balancer is compatible(兼容) with the Block Placement Policy   * used by the Namenode.   * 检测此balancer均衡工具是否于与目前的namenode节点所用的block存放策略相兼容   */  private static void checkReplicationPolicyCompatibility(Configuration conf      ) throws UnsupportedActionException {    if (!(BlockPlacementPolicy.getInstance(conf, null, null, null) instanceof         BlockPlacementPolicyDefault)) {      throw new UnsupportedActionException(          //如果不兼容则抛异常          "Balancer without BlockPlacementPolicyDefault");    }  }  /**   * Construct a balancer.   * Initialize balancer. It sets the value of the threshold, and    * builds the communication proxies to   * namenode as a client and a secondary namenode and retry proxies   * when connection fails.   *    * 构造一个balancer均衡器,设置threshold值,读取配置中的分发线程数的值   */  Balancer(NameNodeConnector theblockpool, Parameters p, Configuration conf) {    final long movedWinWidth = conf.getLong(        DFSConfigKeys.DFS_BALANCER_MOVEDWINWIDTH_KEY,        DFSConfigKeys.DFS_BALANCER_MOVEDWINWIDTH_DEFAULT);        //移动线程数    final int moverThreads = conf.getInt(        DFSConfigKeys.DFS_BALANCER_MOVERTHREADS_KEY,        DFSConfigKeys.DFS_BALANCER_MOVERTHREADS_DEFAULT);        //分发线程数    final int dispatcherThreads = conf.getInt(        DFSConfigKeys.DFS_BALANCER_DISPATCHERTHREADS_KEY,        DFSConfigKeys.DFS_BALANCER_DISPATCHERTHREADS_DEFAULT);    final int maxConcurrentMovesPerNode = conf.getInt(        DFSConfigKeys.DFS_DATANODE_BALANCE_MAX_NUM_CONCURRENT_MOVES_KEY,        DFSConfigKeys.DFS_DATANODE_BALANCE_MAX_NUM_CONCURRENT_MOVES_DEFAULT);    this.dispatcher = new Dispatcher(theblockpool, p.nodesToBeIncluded,        p.nodesToBeExcluded, movedWinWidth, moverThreads, dispatcherThreads,        maxConcurrentMovesPerNode, conf);    this.threshold = p.threshold;    this.policy = p.policy;  }    /**   * 获取节点总容量大小   */   private static long getCapacity(DatanodeStorageReport report, StorageType t) {    long capacity = 0L;    for(StorageReport r : report.getStorageReports()) {      if (r.getStorage().getStorageType() == t) {        capacity += r.getCapacity();      }    }    return capacity;  }    /**   * 获取节点剩余可用容量大小    */  private static long getRemaining(DatanodeStorageReport report, StorageType t) {    long remaining = 0L;    for(StorageReport r : report.getStorageReports()) {      if (r.getStorage().getStorageType() == t) {        remaining += r.getRemaining();      }    }    return remaining;  }  /**   * Given a datanode storage set, build a network topology and decide   * over-utilized storages, above average utilized storages,    * below average utilized storages, and underutilized storages.    * The input datanode storage set is shuffled in order to randomize   * to the storage matching later on.   * 给定一个datanode集合,创建一个网络拓扑逻辑并划分出过度使用,使用率超出平均值,   * 低于平均值的,以及未能充分利用资源的4种类型   *   * @return the number of bytes needed to move in order to balance the cluster.   */  private long init(List<DatanodeStorageReport> reports) {    // 计算平均使用率    for (DatanodeStorageReport r : reports) {      //累加每个节点上的使用空间      policy.accumulateSpaces(r);    }    //计算出平均值    policy.initAvgUtilization();    long overLoadedBytes = 0L, underLoadedBytes = 0L;    //进行使用率等级的划分,总共4种,over-utilized, above-average, below-average and under-utilized    for(DatanodeStorageReport r : reports) {      final DDatanode dn = dispatcher.newDatanode(r);      for(StorageType t : StorageType.asList()) {        final Double utilization = policy.getUtilization(r, t);        if (utilization == null) { // datanode does not have such storage type           continue;        }                final long capacity = getCapacity(r, t);        final double utilizationDiff = utilization - policy.getAvgUtilization(t);        final double thresholdDiff = Math.abs(utilizationDiff) - threshold;        //计算理论上最大移动空间        final long maxSize2Move = computeMaxSize2Move(capacity,            getRemaining(r, t), utilizationDiff, threshold);        final StorageGroup g;        if (utilizationDiff > 0) {          //使用率超出平均值,加入发送节点列表中          final Source s = dn.addSource(t, maxSize2Move, dispatcher);          if (thresholdDiff <= 0) { // within threshold            //如果在threshold范围之内,则加入aboveAvgUtilized            aboveAvgUtilized.add(s);          } else {            //否则加入overUtilized,并计算超出空间            overLoadedBytes += precentage2bytes(thresholdDiff, capacity);            overUtilized.add(s);          }          g = s;        } else {          //与上面的相反          g = dn.addStorageGroup(t, maxSize2Move);          if (thresholdDiff <= 0) { // within threshold            belowAvgUtilized.add(g);          } else {            underLoadedBytes += precentage2bytes(thresholdDiff, capacity);            underUtilized.add(g);          }        }        dispatcher.getStorageGroupMap().put(g);      }    }    logUtilizationCollections();        Preconditions.checkState(dispatcher.getStorageGroupMap().size()        == overUtilized.size() + underUtilized.size() + aboveAvgUtilized.size()           + belowAvgUtilized.size(),        "Mismatched number of storage groups");        // return number of bytes to be moved in order to make the cluster balanced    return Math.max(overLoadedBytes, underLoadedBytes);  }  private static long computeMaxSize2Move(final long capacity, final long remaining,      final double utilizationDiff, final double threshold) {    final double diff = Math.min(threshold, Math.abs(utilizationDiff));    long maxSizeToMove = precentage2bytes(diff, capacity);    if (utilizationDiff < 0) {      maxSizeToMove = Math.min(remaining, maxSizeToMove);    }    return Math.min(MAX_SIZE_TO_MOVE, maxSizeToMove);  }  private static long precentage2bytes(double precentage, long capacity) {    Preconditions.checkArgument(precentage >= 0,        "precentage = " + precentage + " < 0");    return (long)(precentage * capacity / 100.0);  }  /* log the over utilized & under utilized nodes */  private void logUtilizationCollections() {    logUtilizationCollection("over-utilized", overUtilized);    if (LOG.isTraceEnabled()) {      logUtilizationCollection("above-average", aboveAvgUtilized);      logUtilizationCollection("below-average", belowAvgUtilized);    }    logUtilizationCollection("underutilized", underUtilized);  }  private static <T extends StorageGroup>      void logUtilizationCollection(String name, Collection<T> items) {    LOG.info(items.size() + " " + name + ": " + items);  }  /**   * Decide all <source, target> pairs and   * the number of bytes to move from a source to a target   * Maximum bytes to be moved per storage group is   * min(1 Band worth of bytes,  MAX_SIZE_TO_MOVE).   * 从源节点列表和目标节点列表中各自选择节点组成一个个对,选择顺序优先为同节点组,同机架,然后是针对所有   * @return total number of bytes to move in this iteration   */  private long chooseStorageGroups() {    // First, match nodes on the same node group if cluster is node group aware    if (dispatcher.getCluster().isNodeGroupAware()) {      //首先匹配的条件是同节点组      chooseStorageGroups(Matcher.SAME_NODE_GROUP);    }        // Then, match nodes on the same rack    //然后是同机架    chooseStorageGroups(Matcher.SAME_RACK);    // At last, match all remaining nodes    //最后是匹配所有的节点    chooseStorageGroups(Matcher.ANY_OTHER);        return dispatcher.bytesToMove();  }  /** Decide all <source, target> pairs according to the matcher. */  private void chooseStorageGroups(final Matcher matcher) {    /* first step: match each overUtilized datanode (source) to     * one or more underUtilized datanodes (targets).     *     * 把over组的数据移动under组中     */    chooseStorageGroups(overUtilized, underUtilized, matcher);        /* match each remaining overutilized datanode (source) to      * below average utilized datanodes (targets).     * Note only overutilized datanodes that haven't had that max bytes to move     * satisfied in step 1 are selected          * 把over组的数据移动到below     */    chooseStorageGroups(overUtilized, belowAvgUtilized, matcher);    /* match each remaining underutilized datanode (target) to      * above average utilized datanodes (source).     * Note only underutilized datanodes that have not had that max bytes to     * move satisfied in step 1 are selected.     *      * 然后,再把under组的数据移动一部分到above组中     */    chooseStorageGroups(underUtilized, aboveAvgUtilized, matcher);  }  /**   * For each datanode, choose matching nodes from the candidates. Either the   * datanodes or the candidates are source nodes with (utilization > Avg), and   * the others are target nodes with (utilization < Avg).   */  private <G extends StorageGroup, C extends StorageGroup>      void chooseStorageGroups(Collection<G> groups, Collection<C> candidates,          Matcher matcher) {    for(final Iterator<G> i = groups.iterator(); i.hasNext();) {      final G g = i.next();      for(; choose4One(g, candidates, matcher); );      if (!g.hasSpaceForScheduling()) {        //如果候选节点没有空间调度,则直接移除掉        i.remove();      }    }  }  /**   * For the given datanode, choose a candidate and then schedule it.   * @return true if a candidate is chosen; false if no candidates is chosen.   */  private <C extends StorageGroup> boolean choose4One(StorageGroup g,      Collection<C> candidates, Matcher matcher) {    final Iterator<C> i = candidates.iterator();    final C chosen = chooseCandidate(g, i, matcher);      if (chosen == null) {      return false;    }    if (g instanceof Source) {      matchSourceWithTargetToMove((Source)g, chosen);    } else {      matchSourceWithTargetToMove((Source)chosen, g);    }    if (!chosen.hasSpaceForScheduling()) {      i.remove();    }    return true;  }    /**   * 根据源节点和目标节点,构造任务对   */  private void matchSourceWithTargetToMove(Source source, StorageGroup target) {    long size = Math.min(source.availableSizeToMove(), target.availableSizeToMove());    final Task task = new Task(target, size);    source.addTask(task);    target.incScheduledSize(task.getSize());    //加入分发器中    dispatcher.add(source, target);    LOG.info("Decided to move "+StringUtils.byteDesc(size)+" bytes from "        + source.getDisplayName() + " to " + target.getDisplayName());  }    /** Choose a candidate for the given datanode. */  private <G extends StorageGroup, C extends StorageGroup>      C chooseCandidate(G g, Iterator<C> candidates, Matcher matcher) {    if (g.hasSpaceForScheduling()) {      for(; candidates.hasNext(); ) {        final C c = candidates.next();        if (!c.hasSpaceForScheduling()) {          candidates.remove();        } else if (matcher.match(dispatcher.getCluster(),            g.getDatanodeInfo(), c.getDatanodeInfo())) {          //如果满足匹配的条件,则返回值          return c;        }      }    }    return null;  }  /* reset all fields in a balancer preparing for the next iteration */  private void resetData(Configuration conf) {    this.overUtilized.clear();    this.aboveAvgUtilized.clear();    this.belowAvgUtilized.clear();    this.underUtilized.clear();    this.policy.reset();    dispatcher.reset(conf);;  }    /** Run an iteration for all datanodes. */  private ExitStatus run(int iteration, Formatter formatter,      Configuration conf) {    try {      final List<DatanodeStorageReport> reports = dispatcher.init();      final long bytesLeftToMove = init(reports);      if (bytesLeftToMove == 0) {        System.out.println("The cluster is balanced. Exiting...");        return ExitStatus.SUCCESS;      } else {        LOG.info( "Need to move "+ StringUtils.byteDesc(bytesLeftToMove)            + " to make the cluster balanced." );      }            /* Decide all the nodes that will participate in the block move and       * the number of bytes that need to be moved from one node to another       * in this iteration. Maximum bytes to be moved per node is       * Min(1 Band worth of bytes,  MAX_SIZE_TO_MOVE).       */      final long bytesToMove = chooseStorageGroups();      if (bytesToMove == 0) {        System.out.println("No block can be moved. Exiting...");        return ExitStatus.NO_MOVE_BLOCK;      } else {        LOG.info( "Will move " + StringUtils.byteDesc(bytesToMove) +            " in this iteration");      }      formatter.format("%-24s %10d  %19s  %18s  %17s%n",          DateFormat.getDateTimeInstance().format(new Date()),          iteration,          StringUtils.byteDesc(dispatcher.getBytesMoved()),          StringUtils.byteDesc(bytesLeftToMove),          StringUtils.byteDesc(bytesToMove)          );            /* For each pair of <source, target>, start a thread that repeatedly        * decide a block to be moved and its proxy source,        * then initiates the move until all bytes are moved or no more block       * available to move.       * Exit no byte has been moved for 5 consecutive iterations.       *       * 如果发现在5次连续的迭代中还是没有字节被移动,则退出       */      if (!dispatcher.dispatchAndCheckContinue()) {        return ExitStatus.NO_MOVE_PROGRESS;      }      return ExitStatus.IN_PROGRESS;    } catch (IllegalArgumentException e) {      System.out.println(e + ".  Exiting ...");      return ExitStatus.ILLEGAL_ARGUMENTS;    } catch (IOException e) {      System.out.println(e + ".  Exiting ...");      return ExitStatus.IO_EXCEPTION;    } catch (InterruptedException e) {      System.out.println(e + ".  Exiting ...");      return ExitStatus.INTERRUPTED;    } finally {      dispatcher.shutdownNow();    }  }  /**   * Balance all namenodes.   * For each iteration,   * for each namenode,   * execute a {@link Balancer} to work through all datanodes once.     *    * 开放给外部调用的run方法   */  static int run(Collection<URI> namenodes, final Parameters p,      Configuration conf) throws IOException, InterruptedException {    final long sleeptime = 2000*conf.getLong(        DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY,        DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_DEFAULT);    LOG.info("namenodes  = " + namenodes);    LOG.info("parameters = " + p);        final Formatter formatter = new Formatter(System.out);    System.out.println("Time Stamp               Iteration#  Bytes Already Moved  Bytes Left To Move  Bytes Being Moved");        final List<NameNodeConnector> connectors        = new ArrayList<NameNodeConnector>(namenodes.size());    try {      for (URI uri : namenodes) {        final NameNodeConnector nnc = new NameNodeConnector(            Balancer.class.getSimpleName(), uri, BALANCER_ID_PATH, conf);        nnc.getKeyManager().startBlockKeyUpdater();        connectors.add(nnc);      }          boolean done = false;      for(int iteration = 0; !done; iteration++) {        done = true;        Collections.shuffle(connectors);        for(NameNodeConnector nnc : connectors) {          //初始化均衡器具          final Balancer b = new Balancer(nnc, p, conf);          //均衡器执行balance操作          final ExitStatus r = b.run(iteration, formatter, conf);          // clean all lists          b.resetData(conf);          if (r == ExitStatus.IN_PROGRESS) {            done = false;          } else if (r != ExitStatus.SUCCESS) {            //must be an error statue, return.            return r.getExitCode();          }        }        if (!done) {          Thread.sleep(sleeptime);        }      }    } finally {      for(NameNodeConnector nnc : connectors) {        nnc.close();      }    }    return ExitStatus.SUCCESS.getExitCode();  }  /* Given elaspedTime in ms, return a printable string */  private static String time2Str(long elapsedTime) {    String unit;    double time = elapsedTime;    if (elapsedTime < 1000) {      unit = "milliseconds";    } else if (elapsedTime < 60*1000) {      unit = "seconds";      time = time/1000;    } else if (elapsedTime < 3600*1000) {      unit = "minutes";      time = time/(60*1000);    } else {      unit = "hours";      time = time/(3600*1000);    }    return time+" "+unit;  }  static class Parameters {    static final Parameters DEFAULT = new Parameters(        BalancingPolicy.Node.INSTANCE, 10.0,        Collections.<String> emptySet(), Collections.<String> emptySet());    final BalancingPolicy policy;    final double threshold;    // exclude the nodes in this set from balancing operations    Set<String> nodesToBeExcluded;    //include only these nodes in balancing operations    Set<String> nodesToBeIncluded;    Parameters(BalancingPolicy policy, double threshold,        Set<String> nodesToBeExcluded, Set<String> nodesToBeIncluded) {      this.policy = policy;      this.threshold = threshold;      this.nodesToBeExcluded = nodesToBeExcluded;      this.nodesToBeIncluded = nodesToBeIncluded;    }    @Override    public String toString() {      return Balancer.class.getSimpleName() + "." + getClass().getSimpleName()          + "[" + policy + ", threshold=" + threshold +          ", number of nodes to be excluded = "+ nodesToBeExcluded.size() +          ", number of nodes to be included = "+ nodesToBeIncluded.size() +"]";    }  }  static class Cli extends Configured implements Tool {    /**     * Parse arguments and then run Balancer.     *      * @param args command specific arguments.     * @return exit code. 0 indicates success, non-zero indicates failure.     */    @Override    public int run(String[] args) {      final long startTime = Time.now();      final Configuration conf = getConf();      try {        checkReplicationPolicyCompatibility(conf);        final Collection<URI> namenodes = DFSUtil.getNsServiceRpcUris(conf);        return Balancer.run(namenodes, parse(args), conf);      } catch (IOException e) {        System.out.println(e + ".  Exiting ...");        return ExitStatus.IO_EXCEPTION.getExitCode();      } catch (InterruptedException e) {        System.out.println(e + ".  Exiting ...");        return ExitStatus.INTERRUPTED.getExitCode();      } finally {        System.out.format("%-24s ", DateFormat.getDateTimeInstance().format(new Date()));        System.out.println("Balancing took " + time2Str(Time.now()-startTime));      }    }    /** parse command line arguments */    static Parameters parse(String[] args) {      BalancingPolicy policy = Parameters.DEFAULT.policy;      double threshold = Parameters.DEFAULT.threshold;      Set<String> nodesTobeExcluded = Parameters.DEFAULT.nodesToBeExcluded;      Set<String> nodesTobeIncluded = Parameters.DEFAULT.nodesToBeIncluded;      if (args != null) {        try {          for(int i = 0; i < args.length; i++) {            if ("-threshold".equalsIgnoreCase(args[i])) {              checkArgument(++i < args.length,                "Threshold value is missing: args = " + Arrays.toString(args));              try {                threshold = Double.parseDouble(args[i]);                if (threshold < 1 || threshold > 100) {                  throw new IllegalArgumentException(                      "Number out of range: threshold = " + threshold);                }                LOG.info( "Using a threshold of " + threshold );              } catch(IllegalArgumentException e) {                System.err.println(                    "Expecting a number in the range of [1.0, 100.0]: "                    + args[i]);                throw e;              }            } else if ("-policy".equalsIgnoreCase(args[i])) {              checkArgument(++i < args.length,                "Policy value is missing: args = " + Arrays.toString(args));              try {                policy = BalancingPolicy.parse(args[i]);              } catch(IllegalArgumentException e) {                System.err.println("Illegal policy name: " + args[i]);                throw e;              }            } else if ("-exclude".equalsIgnoreCase(args[i])) {              checkArgument(++i < args.length,                  "List of nodes to exclude | -f <filename> is missing: args = "                  + Arrays.toString(args));              if ("-f".equalsIgnoreCase(args[i])) {                checkArgument(++i < args.length,                    "File containing nodes to exclude is not specified: args = "                    + Arrays.toString(args));                nodesTobeExcluded = Util.getHostListFromFile(args[i], "exclude");              } else {                nodesTobeExcluded = Util.parseHostList(args[i]);              }            } else if ("-include".equalsIgnoreCase(args[i])) {              checkArgument(++i < args.length,                "List of nodes to include | -f <filename> is missing: args = "                + Arrays.toString(args));              if ("-f".equalsIgnoreCase(args[i])) {                checkArgument(++i < args.length,                    "File containing nodes to include is not specified: args = "                    + Arrays.toString(args));                nodesTobeIncluded = Util.getHostListFromFile(args[i], "include");               } else {                nodesTobeIncluded = Util.parseHostList(args[i]);              }            } else {              throw new IllegalArgumentException("args = "                  + Arrays.toString(args));            }          }          checkArgument(nodesTobeExcluded.isEmpty() || nodesTobeIncluded.isEmpty(),              "-exclude and -include options cannot be specified together.");        } catch(RuntimeException e) {          printUsage(System.err);          throw e;        }      }            return new Parameters(policy, threshold, nodesTobeExcluded, nodesTobeIncluded);    }    private static void printUsage(PrintStream out) {      out.println(USAGE + "\n");    }  }  /**   * Run a balancer   * @param args Command line arguments   */  public static void main(String[] args) {    if (DFSUtil.parseHelpArgument(args, USAGE, System.out, true)) {      System.exit(0);    }    try {      System.exit(ToolRunner.run(new HdfsConfiguration(), new Cli(), args));    } catch (Throwable e) {      LOG.error("Exiting balancer due an exception", e);      System.exit(-1);    }  }}

下面是Dispatch.java类的解析：

/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements.  See the NOTICE file * distributed with this work for additional information * regarding copyright ownership.  The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License.  You may obtain a copy of the License at * *     http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */package org.apache.hadoop.hdfs.server.balancer;import static org.apache.hadoop.hdfs.protocolPB.PBHelper.vintPrefixed;import java.io.BufferedInputStream;import java.io.BufferedOutputStream;import java.io.DataInputStream;import java.io.DataOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.OutputStream;import java.net.Socket;import java.util.ArrayList;import java.util.Arrays;import java.util.Collection;import java.util.EnumMap;import java.util.HashMap;import java.util.HashSet;import java.util.Iterator;import java.util.List;import java.util.Map;import java.util.Set;import java.util.concurrent.ExecutionException;import java.util.concurrent.ExecutorService;import java.util.concurrent.Executors;import java.util.concurrent.Future;import java.util.concurrent.atomic.AtomicLong;import org.apache.commons.logging.Log;import org.apache.commons.logging.LogFactory;import org.apache.hadoop.classification.InterfaceAudience;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.CommonConfigurationKeys;import org.apache.hadoop.hdfs.DFSUtil;import org.apache.hadoop.hdfs.StorageType;import org.apache.hadoop.hdfs.protocol.Block;import org.apache.hadoop.hdfs.protocol.DatanodeInfo;import org.apache.hadoop.hdfs.protocol.ExtendedBlock;import org.apache.hadoop.hdfs.protocol.HdfsConstants;import org.apache.hadoop.hdfs.protocol.datatransfer.IOStreamPair;import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;import org.apache.hadoop.hdfs.protocol.datatransfer.TrustedChannelResolver;import org.apache.hadoop.hdfs.protocol.datatransfer.sasl.DataTransferSaslUtil;import org.apache.hadoop.hdfs.protocol.datatransfer.sasl.SaslDataTransferClient;import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.BlockOpResponseProto;import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status;import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;import org.apache.hadoop.hdfs.server.balancer.Dispatcher.DDatanode.StorageGroup;import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations;import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations.BlockWithLocations;import org.apache.hadoop.hdfs.server.protocol.DatanodeStorageReport;import org.apache.hadoop.io.IOUtils;import org.apache.hadoop.net.NetUtils;import org.apache.hadoop.net.NetworkTopology;import org.apache.hadoop.security.token.Token;import org.apache.hadoop.util.HostsFileReader;import org.apache.hadoop.util.StringUtils;import org.apache.hadoop.util.Time;import com.google.common.base.Preconditions;/** Dispatching block replica moves between datanodes. */@InterfaceAudience.Privatepublic class Dispatcher {  static final Log LOG = LogFactory.getLog(Dispatcher.class);  private static final long GB = 1L << 30; // 1GB  private static final long MAX_BLOCKS_SIZE_TO_FETCH = 2 * GB;  private static final int MAX_NO_PENDING_MOVE_ITERATIONS = 5;  private static final long DELAY_AFTER_ERROR = 10 * 1000L; // 10 seconds  private final NameNodeConnector nnc;  private final SaslDataTransferClient saslClient;  /** Set of datanodes to be excluded. */  private final Set<String> excludedNodes;  /** Restrict to the following nodes. */  private final Set<String> includedNodes;  private final Collection<Source> sources = new HashSet<Source>();  private final Collection<StorageGroup> targets = new HashSet<StorageGroup>();  private final GlobalBlockMap globalBlocks = new GlobalBlockMap();  private final MovedBlocks<StorageGroup> movedBlocks;  /** Map (datanodeUuid,storageType -> StorageGroup) */  private final StorageGroupMap storageGroupMap = new StorageGroupMap();  private NetworkTopology cluster;  private final ExecutorService moveExecutor;  private final ExecutorService dispatchExecutor;  /** The maximum number of concurrent blocks moves at a datanode */  private final int maxConcurrentMovesPerNode;  private final AtomicLong bytesMoved = new AtomicLong();  private static class GlobalBlockMap {    private final Map<Block, DBlock> map = new HashMap<Block, DBlock>();    /**     * Get the block from the map;     * if the block is not found, create a new block and put it in the map.     */    private DBlock get(Block b) {      DBlock block = map.get(b);      if (block == null) {        block = new DBlock(b);        map.put(b, block);      }      return block;    }        /** Remove all blocks except for the moved blocks. */    private void removeAllButRetain(MovedBlocks<StorageGroup> movedBlocks) {      for (Iterator<Block> i = map.keySet().iterator(); i.hasNext();) {        if (!movedBlocks.contains(i.next())) {          i.remove();        }      }    }  }  static class StorageGroupMap {    private static String toKey(String datanodeUuid, StorageType storageType) {      return datanodeUuid + ":" + storageType;    }    private final Map<String, StorageGroup> map = new HashMap<String, StorageGroup>();    StorageGroup get(String datanodeUuid, StorageType storageType) {      return map.get(toKey(datanodeUuid, storageType));    }    void put(StorageGroup g) {      final String key = toKey(g.getDatanodeInfo().getDatanodeUuid(), g.storageType);      final StorageGroup existing = map.put(key, g);      Preconditions.checkState(existing == null);    }    int size() {      return map.size();    }    void clear() {      map.clear();    }  }  /** This class keeps track of a scheduled block move */  private class PendingMove {    private DBlock block;    private Source source;    private DDatanode proxySource;    private StorageGroup target;    private PendingMove() {    }    @Override    public String toString() {      final Block b = block.getBlock();      return b + " with size=" + b.getNumBytes() + " from "          + source.getDisplayName() + " to " + target.getDisplayName()          + " through " + proxySource.datanode;    }    /**     * Choose a block & a proxy source for this pendingMove whose source &     * target have already been chosen.     *      * @return true if a block and its proxy are chosen; false otherwise     */    private boolean chooseBlockAndProxy() {      // iterate all source's blocks until find a good one      for (Iterator<DBlock> i = source.getBlockIterator(); i.hasNext();) {        if (markMovedIfGoodBlock(i.next())) {          i.remove();          return true;        }      }      return false;    }    /**     * @return true if the given block is good for the tentative move.     */    private boolean markMovedIfGoodBlock(DBlock block) {      synchronized (block) {        synchronized (movedBlocks) {          if (isGoodBlockCandidate(source, target, block)) {            this.block = block;            if (chooseProxySource()) {              movedBlocks.put(block);              if (LOG.isDebugEnabled()) {                LOG.debug("Decided to move " + this);              }              return true;            }          }        }      }      return false;    }    /**     * Choose a proxy source.     *      * @return true if a proxy is found; otherwise false     */    private boolean chooseProxySource() {      final DatanodeInfo targetDN = target.getDatanodeInfo();      // if node group is supported, first try add nodes in the same node group      if (cluster.isNodeGroupAware()) {        for (StorageGroup loc : block.getLocations()) {          if (cluster.isOnSameNodeGroup(loc.getDatanodeInfo(), targetDN)              && addTo(loc)) {            return true;          }        }      }      // check if there is replica which is on the same rack with the target      for (StorageGroup loc : block.getLocations()) {        if (cluster.isOnSameRack(loc.getDatanodeInfo(), targetDN) && addTo(loc)) {          return true;        }      }      // find out a non-busy replica      for (StorageGroup loc : block.getLocations()) {        if (addTo(loc)) {          return true;        }      }      return false;    }    /** add to a proxy source for specific block movement */    private boolean addTo(StorageGroup g) {      final DDatanode dn = g.getDDatanode();      if (dn.addPendingBlock(this)) {        proxySource = dn;        return true;      }      return false;    }    /** Dispatch the move to the proxy source & wait for the response. */    private void dispatch() {      if (LOG.isDebugEnabled()) {        LOG.debug("Start moving " + this);      }      Socket sock = new Socket();      DataOutputStream out = null;      DataInputStream in = null;      try {        sock.connect(            NetUtils.createSocketAddr(target.getDatanodeInfo().getXferAddr()),            HdfsServerConstants.READ_TIMEOUT);        sock.setKeepAlive(true);        OutputStream unbufOut = sock.getOutputStream();        InputStream unbufIn = sock.getInputStream();        ExtendedBlock eb = new ExtendedBlock(nnc.getBlockpoolID(),            block.getBlock());        final KeyManager km = nnc.getKeyManager();         Token<BlockTokenIdentifier> accessToken = km.getAccessToken(eb);        IOStreamPair saslStreams = saslClient.socketSend(sock, unbufOut,            unbufIn, km, accessToken, target.getDatanodeInfo());        unbufOut = saslStreams.out;        unbufIn = saslStreams.in;        out = new DataOutputStream(new BufferedOutputStream(unbufOut,            HdfsConstants.IO_FILE_BUFFER_SIZE));        in = new DataInputStream(new BufferedInputStream(unbufIn,            HdfsConstants.IO_FILE_BUFFER_SIZE));        sendRequest(out, eb, accessToken);        receiveResponse(in);        bytesMoved.addAndGet(block.getNumBytes());        LOG.info("Successfully moved " + this);      } catch (IOException e) {        LOG.warn("Failed to move " + this + ": " + e.getMessage());        // Proxy or target may have some issues, delay before using these nodes        // further in order to avoid a potential storm of "threads quota        // exceeded" warnings when the dispatcher gets out of sync with work        // going on in datanodes.        proxySource.activateDelay(DELAY_AFTER_ERROR);        target.getDDatanode().activateDelay(DELAY_AFTER_ERROR);      } finally {        IOUtils.closeStream(out);        IOUtils.closeStream(in);        IOUtils.closeSocket(sock);        proxySource.removePendingBlock(this);        target.getDDatanode().removePendingBlock(this);        synchronized (this) {          reset();        }        synchronized (Dispatcher.this) {          Dispatcher.this.notifyAll();        }      }    }    /** Send a block replace request to the output stream */    private void sendRequest(DataOutputStream out, ExtendedBlock eb,        Token<BlockTokenIdentifier> accessToken) throws IOException {      new Sender(out).replaceBlock(eb, target.storageType, accessToken,          source.getDatanodeInfo().getDatanodeUuid(), proxySource.datanode);    }    /** Receive a block copy response from the input stream */    private void receiveResponse(DataInputStream in) throws IOException {      BlockOpResponseProto response =          BlockOpResponseProto.parseFrom(vintPrefixed(in));      while (response.getStatus() == Status.IN_PROGRESS) {        // read intermediate responses        response = BlockOpResponseProto.parseFrom(vintPrefixed(in));      }      if (response.getStatus() != Status.SUCCESS) {        if (response.getStatus() == Status.ERROR_ACCESS_TOKEN) {          throw new IOException("block move failed due to access token error");        }        throw new IOException("block move is failed: " + response.getMessage());      }    }    /** reset the object */    private void reset() {      block = null;      source = null;      proxySource = null;      target = null;    }  }  /** A class for keeping track of block locations in the dispatcher. */  private static class DBlock extends MovedBlocks.Locations<StorageGroup> {    DBlock(Block block) {      super(block);    }  }  /** The class represents a desired move. */  static class Task {    private final StorageGroup target;    private long size; // bytes scheduled to move    Task(StorageGroup target, long size) {      this.target = target;      this.size = size;    }    long getSize() {      return size;    }  }  /** A class that keeps track of a datanode. */  static class DDatanode {    /** A group of storages in a datanode with the same storage type. */    class StorageGroup {      final StorageType storageType;      final long maxSize2Move;      private long scheduledSize = 0L;      private StorageGroup(StorageType storageType, long maxSize2Move) {        this.storageType = storageType;        this.maxSize2Move = maxSize2Move;      }      private DDatanode getDDatanode() {        return DDatanode.this;      }      DatanodeInfo getDatanodeInfo() {        return DDatanode.this.datanode;      }      /** Decide if still need to move more bytes */      synchronized boolean hasSpaceForScheduling() {        return availableSizeToMove() > 0L;      }      /** @return the total number of bytes that need to be moved */      synchronized long availableSizeToMove() {        return maxSize2Move - scheduledSize;      }      /** increment scheduled size */      synchronized void incScheduledSize(long size) {        scheduledSize += size;      }      /** @return scheduled size */      synchronized long getScheduledSize() {        return scheduledSize;      }      /** Reset scheduled size to zero. */      synchronized void resetScheduledSize() {        scheduledSize = 0L;      }      /** @return the name for display */      String getDisplayName() {        return datanode + ":" + storageType;      }      @Override      public String toString() {        return getDisplayName();      }    }    final DatanodeInfo datanode;    final EnumMap<StorageType, StorageGroup> storageMap        = new EnumMap<StorageType, StorageGroup>(StorageType.class);    protected long delayUntil = 0L;    /** blocks being moved but not confirmed yet */    private final List<PendingMove> pendings;    private final int maxConcurrentMoves;    @Override    public String toString() {      return getClass().getSimpleName() + ":" + datanode + ":" + storageMap.values();    }    private DDatanode(DatanodeStorageReport r, int maxConcurrentMoves) {      this.datanode = r.getDatanodeInfo();      this.maxConcurrentMoves = maxConcurrentMoves;      this.pendings = new ArrayList<PendingMove>(maxConcurrentMoves);    }    private void put(StorageType storageType, StorageGroup g) {      final StorageGroup existing = storageMap.put(storageType, g);      Preconditions.checkState(existing == null);    }    StorageGroup addStorageGroup(StorageType storageType, long maxSize2Move) {      final StorageGroup g = new StorageGroup(storageType, maxSize2Move);      put(storageType, g);      return g;    }    Source addSource(StorageType storageType, long maxSize2Move, Dispatcher d) {      final Source s = d.new Source(storageType, maxSize2Move, this);      put(storageType, s);      return s;    }    synchronized private void activateDelay(long delta) {      delayUntil = Time.monotonicNow() + delta;    }    synchronized private boolean isDelayActive() {      if (delayUntil == 0 || Time.monotonicNow() > delayUntil) {        delayUntil = 0;        return false;      }      return true;    }    /** Check if the node can schedule more blocks to move */    synchronized boolean isPendingQNotFull() {      return pendings.size() < maxConcurrentMoves;    }    /** Check if all the dispatched moves are done */    synchronized boolean isPendingQEmpty() {      return pendings.isEmpty();    }    /** Add a scheduled block move to the node */    synchronized boolean addPendingBlock(PendingMove pendingBlock) {      if (!isDelayActive() && isPendingQNotFull()) {        return pendings.add(pendingBlock);      }      return false;    }    /** Remove a scheduled block move from the node */    synchronized boolean removePendingBlock(PendingMove pendingBlock) {      return pendings.remove(pendingBlock);    }  }  /** A node that can be the sources of a block move */  class Source extends DDatanode.StorageGroup {    private final List<Task> tasks = new ArrayList<Task>(2);    private long blocksToReceive = 0L;    /**     * Source blocks point to the objects in {@link Dispatcher#globalBlocks}     * because we want to keep one copy of a block and be aware that the     * locations are changing over time.     */    private final List<DBlock> srcBlocks = new ArrayList<DBlock>();    private Source(StorageType storageType, long maxSize2Move, DDatanode dn) {      dn.super(storageType, maxSize2Move);    }    /** Add a task */    void addTask(Task task) {      Preconditions.checkState(task.target != this,          "Source and target are the same storage group " + getDisplayName());      incScheduledSize(task.size);      tasks.add(task);    }    /** @return an iterator to this source's blocks */    Iterator<DBlock> getBlockIterator() {      return srcBlocks.iterator();    }    /**     * Fetch new blocks of this source from namenode and update this source's     * block list & {@link Dispatcher#globalBlocks}.     *      * @return the total size of the received blocks in the number of bytes.     */    private long getBlockList() throws IOException {      final long size = Math.min(MAX_BLOCKS_SIZE_TO_FETCH, blocksToReceive);      final BlocksWithLocations newBlocks = nnc.getBlocks(getDatanodeInfo(), size);      long bytesReceived = 0;      for (BlockWithLocations blk : newBlocks.getBlocks()) {        bytesReceived += blk.getBlock().getNumBytes();        synchronized (globalBlocks) {          final DBlock block = globalBlocks.get(blk.getBlock());          synchronized (block) {            block.clearLocations();            // update locations            final String[] datanodeUuids = blk.getDatanodeUuids();            final StorageType[] storageTypes = blk.getStorageTypes();            for (int i = 0; i < datanodeUuids.length; i++) {              final StorageGroup g = storageGroupMap.get(                  datanodeUuids[i], storageTypes[i]);              if (g != null) { // not unknown                block.addLocation(g);              }            }          }          if (!srcBlocks.contains(block) && isGoodBlockCandidate(block)) {            // filter bad candidates            srcBlocks.add(block);          }        }      }      return bytesReceived;    }    /** Decide if the given block is a good candidate to move or not */    private boolean isGoodBlockCandidate(DBlock block) {      for (Task t : tasks) {        if (Dispatcher.this.isGoodBlockCandidate(this, t.target, block)) {          return true;        }      }      return false;    }    /**     * Choose a move for the source. The block's source, target, and proxy     * are determined too. When choosing proxy and target, source &     * target throttling has been considered. They are chosen only when they     * have the capacity to support this block move. The block should be     * dispatched immediately after this method is returned.     *      * @return a move that's good for the source to dispatch immediately.     */    private PendingMove chooseNextMove() {      for (Iterator<Task> i = tasks.iterator(); i.hasNext();) {        final Task task = i.next();        final DDatanode target = task.target.getDDatanode();        PendingMove pendingBlock = new PendingMove();        if (target.addPendingBlock(pendingBlock)) {          // target is not busy, so do a tentative block allocation          pendingBlock.source = this;          pendingBlock.target = task.target;          if (pendingBlock.chooseBlockAndProxy()) {            long blockSize = pendingBlock.block.getNumBytes();            incScheduledSize(-blockSize);            task.size -= blockSize;            if (task.size == 0) {              i.remove();            }            return pendingBlock;          } else {            // cancel the tentative move            target.removePendingBlock(pendingBlock);          }        }      }      return null;    }    /** Iterate all source's blocks to remove moved ones */    private void removeMovedBlocks() {      for (Iterator<DBlock> i = getBlockIterator(); i.hasNext();) {        if (movedBlocks.contains(i.next().getBlock())) {          i.remove();        }      }    }    private static final int SOURCE_BLOCKS_MIN_SIZE = 5;    /** @return if should fetch more blocks from namenode */    private boolean shouldFetchMoreBlocks() {      return srcBlocks.size() < SOURCE_BLOCKS_MIN_SIZE && blocksToReceive > 0;    }    private static final long MAX_ITERATION_TIME = 20 * 60 * 1000L; // 20 mins    /**     * This method iteratively does the following: it first selects a block to     * move, then sends a request to the proxy source to start the block move     * when the source's block list falls below a threshold, it asks the     * namenode for more blocks. It terminates when it has dispatch enough block     * move tasks or it has received enough blocks from the namenode, or the     * elapsed time of the iteration has exceeded the max time limit.     */    private void dispatchBlocks() {      final long startTime = Time.monotonicNow();      this.blocksToReceive = 2 * getScheduledSize();      boolean isTimeUp = false;      int noPendingMoveIteration = 0;      while (!isTimeUp && getScheduledSize() > 0          && (!srcBlocks.isEmpty() || blocksToReceive > 0)) {        final PendingMove p = chooseNextMove();        if (p != null) {          // move the block          moveExecutor.execute(new Runnable() {            @Override            public void run() {              p.dispatch();            }          });          continue;        }        // Since we cannot schedule any block to move,        // remove any moved blocks from the source block list and        removeMovedBlocks(); // filter already moved blocks        // check if we should fetch more blocks from the namenode        if (shouldFetchMoreBlocks()) {          // fetch new blocks          try {            blocksToReceive -= getBlockList();            continue;          } catch (IOException e) {            LOG.warn("Exception while getting block list", e);            return;          }        } else {          // source node cannot find a pending block to move, iteration +1          noPendingMoveIteration++;          // in case no blocks can be moved for source node's task,          // jump out of while-loop after 5 iterations.          if (noPendingMoveIteration >= MAX_NO_PENDING_MOVE_ITERATIONS) {            resetScheduledSize();          }        }        // check if time is up or not        if (Time.monotonicNow() - startTime > MAX_ITERATION_TIME) {          isTimeUp = true;          continue;        }        // Now we can not schedule any block to move and there are        // no new blocks added to the source block list, so we wait.        try {          synchronized (Dispatcher.this) {            Dispatcher.this.wait(1000); // wait for targets/sources to be idle          }        } catch (InterruptedException ignored) {        }      }    }  }  public Dispatcher(NameNodeConnector nnc, Set<String> includedNodes,      Set<String> excludedNodes, long movedWinWidth, int moverThreads,      int dispatcherThreads, int maxConcurrentMovesPerNode, Configuration conf) {    this.nnc = nnc;    this.excludedNodes = excludedNodes;    this.includedNodes = includedNodes;    this.movedBlocks = new MovedBlocks<StorageGroup>(movedWinWidth);    this.cluster = NetworkTopology.getInstance(conf);    this.moveExecutor = Executors.newFixedThreadPool(moverThreads);    this.dispatchExecutor = Executors.newFixedThreadPool(dispatcherThreads);    this.maxConcurrentMovesPerNode = maxConcurrentMovesPerNode;    final boolean fallbackToSimpleAuthAllowed = conf.getBoolean(        CommonConfigurationKeys.IPC_CLIENT_FALLBACK_TO_SIMPLE_AUTH_ALLOWED_KEY,        CommonConfigurationKeys.IPC_CLIENT_FALLBACK_TO_SIMPLE_AUTH_ALLOWED_DEFAULT);    this.saslClient = new SaslDataTransferClient(        DataTransferSaslUtil.getSaslPropertiesResolver(conf),        TrustedChannelResolver.getInstance(conf), fallbackToSimpleAuthAllowed);  }  StorageGroupMap getStorageGroupMap() {    return storageGroupMap;  }  NetworkTopology getCluster() {    return cluster;  }    long getBytesMoved() {    return bytesMoved.get();  }  long bytesToMove() {    Preconditions.checkState(        storageGroupMap.size() >= sources.size() + targets.size(),        "Mismatched number of storage groups (" + storageGroupMap.size()            + " < " + sources.size() + " sources + " + targets.size()            + " targets)");    long b = 0L;    for (Source src : sources) {      b += src.getScheduledSize();    }    return b;  }  void add(Source source, StorageGroup target) {    sources.add(source);    targets.add(target);  }  private boolean shouldIgnore(DatanodeInfo dn) {    // ignore decommissioned nodes    final boolean decommissioned = dn.isDecommissioned();    // ignore decommissioning nodes    final boolean decommissioning = dn.isDecommissionInProgress();    // ignore nodes in exclude list    final boolean excluded = Util.isExcluded(excludedNodes, dn);    // ignore nodes not in the include list (if include list is not empty)    final boolean notIncluded = !Util.isIncluded(includedNodes, dn);    if (decommissioned || decommissioning || excluded || notIncluded) {      if (LOG.isTraceEnabled()) {        LOG.trace("Excluding datanode " + dn + ": " + decommissioned + ", "            + decommissioning + ", " + excluded + ", " + notIncluded);      }      return true;    }    return false;  }  /** Get live datanode storage reports and then build the network topology. */  List<DatanodeStorageReport> init() throws IOException {    final DatanodeStorageReport[] reports = nnc.getLiveDatanodeStorageReport();    final List<DatanodeStorageReport> trimmed = new ArrayList<DatanodeStorageReport>();     // create network topology and classify utilization collections:    // over-utilized, above-average, below-average and under-utilized.    for (DatanodeStorageReport r : DFSUtil.shuffle(reports)) {      final DatanodeInfo datanode = r.getDatanodeInfo();      if (shouldIgnore(datanode)) {        continue;      }      trimmed.add(r);      cluster.add(datanode);    }    return trimmed;  }  public DDatanode newDatanode(DatanodeStorageReport r) {    return new DDatanode(r, maxConcurrentMovesPerNode);  }  public boolean dispatchAndCheckContinue() throws InterruptedException {    return nnc.shouldContinue(dispatchBlockMoves());  }  /**   * Dispatch block moves for each source. The thread selects blocks to move &   * sends request to proxy source to initiate block move. The process is flow   * controlled. Block selection is blocked if there are too many un-confirmed   * block moves.   *    * @return the total number of bytes successfully moved in this iteration.   */  private long dispatchBlockMoves() throws InterruptedException {    final long bytesLastMoved = bytesMoved.get();    final Future<?>[] futures = new Future<?>[sources.size()];    final Iterator<Source> i = sources.iterator();    for (int j = 0; j < futures.length; j++) {      final Source s = i.next();      futures[j] = dispatchExecutor.submit(new Runnable() {        @Override        public void run() {          s.dispatchBlocks();        }      });    }    // wait for all dispatcher threads to finish    for (Future<?> future : futures) {      try {        future.get();      } catch (ExecutionException e) {        LOG.warn("Dispatcher thread failed", e.getCause());      }    }    // wait for all block moving to be done    waitForMoveCompletion();    return bytesMoved.get() - bytesLastMoved;  }  /** The sleeping period before checking if block move is completed again */  static private long blockMoveWaitTime = 30000L;  /** set the sleeping period for block move completion check */  static void setBlockMoveWaitTime(long time) {    blockMoveWaitTime = time;  }  /** Wait for all block move confirmations. */  private void waitForMoveCompletion() {    for(;;) {      boolean empty = true;      for (StorageGroup t : targets) {        if (!t.getDDatanode().isPendingQEmpty()) {          empty = false;          break;        }      }      if (empty) {        return; //all pending queues are empty      }      try {        Thread.sleep(blockMoveWaitTime);      } catch (InterruptedException ignored) {      }    }  }  /**   * Decide if the block is a good candidate to be moved from source to target.   * A block is a good candidate if    * 1. the block is not in the process of being moved/has not been moved;   * 移动的块不是正在被移动的块   * 2. the block does not have a replica on the target;   * 在目标节点上没有移动的block块   * 3. doing the move does not reduce the number of racks that the block has   * 移动之后,不同机架上的block块的数量应该是不变的.   */  private boolean isGoodBlockCandidate(Source source, StorageGroup target,      DBlock block) {    if (source.storageType != target.storageType) {      return false;    }    // check if the block is moved or not    //如果所要移动的块是存在于正在被移动的块列表,则返回false    if (movedBlocks.contains(block.getBlock())) {      return false;    }    //如果移动的块已经存在于目标节点上,则返回false,将不会予以移动    if (block.isLocatedOn(target)) {      return false;    }    //如果开启了机架感知的配置,则目标节点不应该有相同的block    if (cluster.isNodeGroupAware()        && isOnSameNodeGroupWithReplicas(target, block, source)) {      return false;    }        //需要维持机架上的block块数量不变    if (reduceNumOfRacks(source, target, block)) {      return false;    }    return true;  }  /**   * Determine whether moving the given block replica from source to target   * would reduce the number of racks of the block replicas.   */  private boolean reduceNumOfRacks(Source source, StorageGroup target,      DBlock block) {    final DatanodeInfo sourceDn = source.getDatanodeInfo();    if (cluster.isOnSameRack(sourceDn, target.getDatanodeInfo())) {      // source and target are on the same rack      return false;    }    boolean notOnSameRack = true;    synchronized (block) {      for (StorageGroup loc : block.getLocations()) {        if (cluster.isOnSameRack(loc.getDatanodeInfo(), target.getDatanodeInfo())) {          notOnSameRack = false;          break;        }      }    }    if (notOnSameRack) {      // target is not on the same rack as any replica      return false;    }    for (StorageGroup g : block.getLocations()) {      if (g != source && cluster.isOnSameRack(g.getDatanodeInfo(), sourceDn)) {        // source is on the same rack of another replica        return false;      }    }    return true;  }  /**   * Check if there are any replica (other than source) on the same node group   * with target. If true, then target is not a good candidate for placing   * specific replica as we don't want 2 replicas under the same nodegroup.   *    * @return true if there are any replica (other than source) on the same node   *         group with target   */  private boolean isOnSameNodeGroupWithReplicas(      StorageGroup target, DBlock block, Source source) {    final DatanodeInfo targetDn = target.getDatanodeInfo();    for (StorageGroup g : block.getLocations()) {      if (g != source && cluster.isOnSameNodeGroup(g.getDatanodeInfo(), targetDn)) {        return true;      }    }    return false;  }  /** Reset all fields in order to prepare for the next iteration */  void reset(Configuration conf) {    cluster = NetworkTopology.getInstance(conf);    storageGroupMap.clear();    sources.clear();    targets.clear();    globalBlocks.removeAllButRetain(movedBlocks);    movedBlocks.cleanup();  }  /** shutdown thread pools */  void shutdownNow() {    dispatchExecutor.shutdownNow();    moveExecutor.shutdownNow();  }  static class Util {    /** @return true if data node is part of the excludedNodes. */    static boolean isExcluded(Set<String> excludedNodes, DatanodeInfo dn) {      return isIn(excludedNodes, dn);    }    /**     * @return true if includedNodes is empty or data node is part of the     *         includedNodes.     */    static boolean isIncluded(Set<String> includedNodes, DatanodeInfo dn) {      return (includedNodes.isEmpty() || isIn(includedNodes, dn));    }    /**     * Match is checked using host name , ip address with and without port     * number.     *      * @return true if the datanode's transfer address matches the set of nodes.     */    private static boolean isIn(Set<String> datanodes, DatanodeInfo dn) {      return isIn(datanodes, dn.getPeerHostName(), dn.getXferPort())          || isIn(datanodes, dn.getIpAddr(), dn.getXferPort())          || isIn(datanodes, dn.getHostName(), dn.getXferPort());    }    /** @return true if nodes contains host or host:port */    private static boolean isIn(Set<String> nodes, String host, int port) {      if (host == null) {        return false;      }      return (nodes.contains(host) || nodes.contains(host + ":" + port));    }    /**     * Parse a comma separated string to obtain set of host names     *      * @return set of host names     */    static Set<String> parseHostList(String string) {      String[] addrs = StringUtils.getTrimmedStrings(string);      return new HashSet<String>(Arrays.asList(addrs));    }    /**     * Read set of host names from a file     *      * @return set of host names     */    static Set<String> getHostListFromFile(String fileName, String type) {      Set<String> nodes = new HashSet<String>();      try {        HostsFileReader.readFileToSet(type, fileName, nodes);        return StringUtils.getTrimmedStrings(nodes);      } catch (IOException e) {        throw new IllegalArgumentException(            "Failed to read host list from file: " + fileName);      }    }  }}

0 0