hbase源码分析-负载均衡过程

来源:互联网 发布:苹果电脑 查看mac地址 编辑:程序博客网 时间:2024/05/01 21:11

HBase 可以根据当前集群的负载,以 region 为单位进行 rebalance。在 HMaster 中,后台会起一个线程定期检查是否需要进行 rebalance,这个线程叫做 BalancerChore。该线程每隔 hbase.balancer.period 执行一次 master.balance() 函数,该配置项默认为 300000 毫秒,即 5 分钟。每次 balance 最多执行 hbase.balancer.max.balancing 指定的时长(毫秒);如果没有配置,则使用 hbase.balancer.period 配置项的值。BalancerChore 类中会调用 HMaster 的 balance 方法:

 

public class BalancerChore extends ScheduledChore {

  private static final Log LOG = LogFactory.getLog(BalancerChore.class);

 

  private final HMaster master;

 

  public BalancerChore(HMaster master) {

    super(master.getServerName() +"-BalancerChore", master, master.getConfiguration().getInt(

      "hbase.balancer.period", 300000));

    this.master = master;

  }

 

  @Override

  protected void chore() {

    try {

      master.balance();

    } catch (IOException e) {

      LOG.error("Failed to balance.", e);

    }

  }

}

loadBalancerTracker去zk上看是否load balance开启,如果开启,则从AssignmentManager中检查当前是否有region处于in transition状态,如果有,则直接返回。否则将集群的状态给balancer以便后续做决策,HMaster的 assignmentManager成员内部维护着一个表在哪些机器上,这些机器上分别有哪些region。对于每张表,都会执行 balancer.balanceCluster()方法。

/**
 * Runs one balancing pass: asks the balancer for region plans, table by table, and
 * executes them until they are all done or the time budget is about to be exceeded.
 *
 * <p>The pass is skipped (returning {@code false}) when the master is not initialized,
 * the balancer switch is off, regions are in transition, dead servers are still being
 * processed, or the master coprocessor's {@code preBalance()} either requests a bypass
 * or throws.
 *
 * @return {@code false} if the pass was skipped for one of the reasons above;
 *         {@code true} otherwise, including when the balancer produced no plans
 * @throws IOException declared by the signature; which of the calls below can raise it
 *         is not visible in this excerpt
 */
public boolean balance() throws IOException {

    // Do not run the balancer until the master has been initialized.

    if (!this.initialized) {

      LOG.debug("Master has not been initialized, don't run balancer.");

      return false;

    }

    // Read the time budget before entering the synchronized block below.

    int maximumBalanceTime =getBalancerCutoffTime();

    synchronized (this.balancer) {

      // If the balancer switch is not on, return without running the balancer.

      if (!this.loadBalancerTracker.isBalancerOn())return false;

      // Only one balance run at a time: skip this pass while any region is in transition.

      if (this.assignmentManager.getRegionStates().isRegionsInTransition()) {

        Map<String, RegionState> regionsInTransition =

          this.assignmentManager.getRegionStates().getRegionsInTransition();

        LOG.debug("Not running balancer because " + regionsInTransition.size() +

          " region(s) in transition: " + org.apache.commons.lang.StringUtils.

            abbreviate(regionsInTransition.toString(), 256));

        return false;

      }

      if (this.serverManager.areDeadServersInProgress()) {

// Skip balancing while dead region servers are still being processed.

        LOG.debug("Not running balancer because processing dead regionserver(s): " +

          this.serverManager.getDeadServers());

        return false;

      }

 

      // Give the master coprocessor (if any) the chance to bypass this pass.

      if (this.cpHost !=null) {

        try {

          if (this.cpHost.preBalance()) {

            LOG.debug("Coprocessor bypassing balancer request");

            return false;

          }

        } catch (IOException ioe) {

          LOG.error("Error invoking master coprocessor preBalance()", ioe);

          return false;

        }

      }

// Current assignments grouped by table: table -> (server -> regions on that server).

      Map<TableName, Map<ServerName, List<HRegionInfo>>> assignmentsByTable =

        this.assignmentManager.getRegionStates().getAssignmentsByTable();

 

      List<RegionPlan> plans = new ArrayList<RegionPlan>();

      //Give the balancer the current cluster state.

      this.balancer.setClusterStatus(getClusterStatus());

      // Ask the balancer for plans one table at a time and collect them all.

      for (Map<ServerName, List<HRegionInfo>> assignments : assignmentsByTable.values()) {

        List<RegionPlan> partialPlans = this.balancer.balanceCluster(assignments);// plans for this table; may be null

        if (partialPlans !=null) plans.addAll(partialPlans);

      }

      // Wall-clock deadline after which no further plan is started.

      long cutoffTime = System.currentTimeMillis() + maximumBalanceTime;

      int rpCount = 0; // number of RegionPlans balanced so far

      long totalRegPlanExecTime = 0;

      if (plans !=null && !plans.isEmpty()) {

        for (RegionPlan plan: plans) {

          LOG.info("balance " + plan);

          long balStartTime = System.currentTimeMillis();

          //TODO: bulk assign

          this.assignmentManager.balance(plan);// carry out this plan

          totalRegPlanExecTime += System.currentTimeMillis()-balStartTime;

          rpCount++;

          // Stop early if plans remain and the average time per plan so far
          // (totalRegPlanExecTime / rpCount) would push the next one past the deadline.

          if (rpCount < plans.size() &&

              // if performing next balance exceeds cutoff time, exit the loop

              (System.currentTimeMillis() + (totalRegPlanExecTime / rpCount)) > cutoffTime) {

            //TODO: After balance, there should not be a cutoff time (keeping it as a security net for now)

            LOG.debug("No more balancing till next balance run; maximumBalanceTime=" +

              maximumBalanceTime);

            break;

          }

        }

      }

      if (this.cpHost !=null) {

        try {

          // Pass only the plans that were actually executed when the loop stopped early.

          this.cpHost.postBalance(rpCount < plans.size() ? plans.subList(0, rpCount) : plans);

        } catch (IOException ioe) {

          // balancing already succeeded so don't change the result

          LOG.error("Error invoking master coprocessor postBalance()", ioe);

        }

      }

    }

    // If LoadBalancer did not generate any plans, it means the cluster is already balanced.

    // Return true indicating a success.

    return true;

  }

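getBalancerCutoffTime() 首先读取 hbase.balancer.max.balancing,读不到时取默认值 -1;如果结果为 -1(即没有配置),则改为读取 hbase.balancer.period,读不到时取默认值 300000。这个地方有个疑问:注释里的 nonsense period 是什么意思?从字面看,它指的是“不合理的周期值”(例如小于等于 0)。不过在这段代码里,balancerCutoffTime 刚刚被赋值为 balancerPeriod,所以 `if (balancerCutoffTime <= 0) balancerCutoffTime = balancerPeriod;` 即使条件成立,也只是把同一个值再赋一遍,实际上不起任何作用。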

private int getBalancerCutoffTime() {

    int balancerCutoffTime =

      getConfiguration().getInt("hbase.balancer.max.balancing", -1);

    if (balancerCutoffTime == -1) {

      // No time period set so create one

      int balancerPeriod =

        getConfiguration().getInt("hbase.balancer.period", 300000);

      balancerCutoffTime = balancerPeriod;

      // If nonsense period, set it to balancerPeriod

      if (balancerCutoffTime <= 0) balancerCutoffTime = balancerPeriod;

    }

    return balancerCutoffTime;

  }

HBase中load balance的策略是可插拔的,开发者可以根据自己业务的需求来开发自己的load balance策略。在HBase中,是通过接口LoadBalancer类实现的。具体使用哪个load balance策略由配置项hbase.master.loadbalancer.class决定,默认使用StochasticLoadBalancer。所有的逻辑都在StochasticLoadBalancer这个负载均衡器的 balanceCluster()方法中。

 

/**
 * Searches for a cheaper region layout by a randomized walk over candidate actions
 * and, if one is found, returns the region plans that lead to it.
 *
 * <p>Each step picks a candidate generator at random, applies the action it proposes,
 * and recomputes the cost. The action is kept only if the cost went down; otherwise it
 * is undone. The walk ends after {@code computedMaxSteps} steps or once
 * {@code maxRunningTime} has elapsed.
 *
 * @param clusterState mapping of server to the regions it currently hosts
 * @return the plans from {@code balanceMasterRegions} when that call yields a non-null
 *         result; otherwise the plans for the cheaper layout found by the walk, or
 *         {@code null} when the input is null or has at most one entry, when only the
 *         master and at most one other server are present, when {@code needsBalance}
 *         says no, or when no cheaper layout was found
 */
public synchronized List<RegionPlan> balanceCluster(Map<ServerName,

List<HRegionInfo>> clusterState) {

// First ask balanceMasterRegions for plans (that helper is not shown in this excerpt;
// presumably it deals with regions hosted on the master - TODO confirm).

List<RegionPlan> plans = balanceMasterRegions(clusterState);

// Return right away if that call produced plans, or if clusterState is null or
// contains at most one server. In the latter two cases plans is null here.

    if (plans !=null || clusterState == null || clusterState.size() <= 1) {

      return plans;

}

// If the master is one of the servers and there are at most two servers in total,
// there is nothing to balance: return null.

    if (masterServerName !=null && clusterState.containsKey(masterServerName)) {

      if (clusterState.size() <= 2) {

        return null;

      }

// Otherwise continue with a copy of the map from which the master has been removed.

      clusterState = new HashMap<ServerName, List<HRegionInfo>>(clusterState);

      clusterState.remove(masterServerName);

    }

 

    // On clusters with lots of HFileLinks or lots of reference files,

    // instantiating the storefile infos can be quite expensive.

    // Allow turning this feature off if the locality cost is not going to

    // be used in any computations.

    RegionLocationFinder finder = null;

    if (this.localityCost !=null && this.localityCost.getMultiplier() > 0) {

      finder = this.regionFinder;

    }

 

    //The clusterState that is given to this method contains the state

    //of all the regions in the table(s) (that's true today)

    // Keep track of servers to iterate through them.

    Cluster cluster = new Cluster(clusterState,loads, finder, rackManager);

    if (!needsBalance(cluster)) {

      return null;

    }

 

    long startTime = EnvironmentEdgeManager.currentTime();

 

    initCosts(cluster);

// Cost of the starting layout. Double.MAX_VALUE is passed as previousCost so that
// computeCost's early-out (total > previousCost) is effectively disabled here.

    double currentCost = computeCost(cluster, Double.MAX_VALUE);

 

    double initCost = currentCost;

    double newCost = currentCost;

// Step budget: numRegions * stepsPerRegion * numServers, capped at maxSteps.

    long computedMaxSteps = Math.min(this.maxSteps,

        ((long)cluster.numRegions * (long)this.stepsPerRegion * (long)cluster.numServers));

    // Perform a stochastic walk to see if we can get a good fit.

    long step;

 

    for (step = 0; step < computedMaxSteps; step++) {

      // Pick one of the candidate generators at random and let it propose an action.

      int generatorIdx =RANDOM.nextInt(candidateGenerators.length);

      CandidateGenerator p = candidateGenerators[generatorIdx];

      Cluster.Action action = p.generate(cluster);

 

      // A NULL action means the generator had nothing to propose; the step still counts.

      if (action.type == Type.NULL) {

        continue;

      }

 

      cluster.doAction(action);

      updateCostsWithAction(cluster, action);

 

      newCost = computeCost(cluster, currentCost);

 

      // Should this be kept?

      if (newCost < currentCost) {

        currentCost = newCost;

      } else {

        // Put things back the way they were before.

        // TODO: undo by remembering old values

        Action undoAction = action.undoAction();

        cluster.doAction(undoAction);

        updateCostsWithAction(cluster, undoAction);

      }

 

      // Stop once the elapsed time exceeds maxRunningTime.

      if (EnvironmentEdgeManager.currentTime() - startTime >

          maxRunningTime) {

        break;

      }

    }

 

    long endTime = EnvironmentEdgeManager.currentTime();

 

    metricsBalancer.balanceCluster(endTime - startTime);

 

    // Only emit plans when the walk ended on a strictly cheaper layout than it started with.

    if (initCost > currentCost) {

      plans = createRegionPlans(cluster);

      if (LOG.isDebugEnabled()) {

        LOG.debug("Finished computing new load balance plan.  Computation took "

            + (endTime - startTime) + "ms to try " + step

            + " different iterations.  Found a solution that moves "

            + plans.size() + " regions; Going from a computed cost of "

            + initCost + " to a new cost of " + currentCost);

      }

      return plans;

    }

    if (LOG.isDebugEnabled()) {

      LOG.debug("Could not find a better load balance plan.  Tried "

          + step + " different configurations in " + (endTime - startTime)

          + "ms, and did not find anything with a computed cost less than " + initCost);

    }

    return null;

  }

balancer 针对的是整个集群的region分布,而不是某个表的region分布。它只保证每个regionserver上分布的region数在平均region数的0.8到1.2倍之间。

avg = 整个集群的总region数/regionserver个数

min = floor(avg*(1-0.2))

max=ceiling(avg*(1+0.2))

即所有regionserver上的regions个数都在min和max之间的话,就不会执行balancer。

protected boolean needsBalance(Cluster c) {

    ClusterLoadState cs = new ClusterLoadState(c.clusterState);

    if (cs.getNumServers() <MIN_SERVER_BALANCE) {

      if (LOG.isDebugEnabled()) {

        LOG.debug("Not running balancer because only " + cs.getNumServers()

            + " active regionserver(s)");

      }

      return false;

    }

    if(areSomeRegionReplicasColocated(c))return true;

    // Check if we even need to do any load balancing

    // HBASE-3681 check sloppiness first

    float average = cs.getLoadAverage();// for logging  获取clusterregion server平均拥有的region数目

    int floor = (int) Math.floor(average * (1 -slop));//floor 返回不大于的最大整数 slop"hbase.regions.slop",默认0.2

    int ceiling = (int) Math.ceil(average * (1 +slop));//ceil返回最小的(最接近负无穷大)double值,大于或相等于参数,并相等于一个整数。特殊情况:

    if (!(cs.getMaxLoad() > ceiling || cs.getMinLoad() < floor)) {//如果cluster的最多和最少regionregion server不在范围内,返回false表明需要进行负载均衡算法。

      NavigableMap<ServerAndLoad, List<HRegionInfo>> serversByLoad = cs.getServersByLoad();

      if (LOG.isTraceEnabled()) {

        // If nothing to balance, then don't say anything unless trace-level logging.

        LOG.trace("Skipping load balancing because balanced cluster; " +

          "servers=" + cs.getNumServers() +

          " regions=" + cs.getNumRegions() +" average=" + average +

          " mostloaded=" + serversByLoad.lastKey().getLoad() +

          " leastloaded=" + serversByLoad.firstKey().getLoad());

      }

      return false;

    }

    return true;

  }

随机取CandidateGenerator策略,从几个角度来计算它的cost,最终根据 (权重*cost值) 加起来的就是总得分。如果新计算的cost小于currentCost,则说明这种region的交换或者迁移是有效的,把新计算的cost赋值给currentCost;否则回退到之前的状态。每次迭代都是基于上次的成果,总共做computedMaxSteps次;computedMaxSteps与cluster的region server总数和region总数有关,上限是maxSteps,其最大值为1000000。

// Create the locality generator first: the array initializer below reads this
// variable, so assigning it afterwards would leave a null in that slot.
localityCandidateGenerator = new LocalityBasedCandidateGenerator(services);

// Generators that balanceCluster() chooses from at random on each step of its walk.
candidateGenerators = new CandidateGenerator[] {

          new RandomCandidateGenerator(),

          new LoadCandidateGenerator(),

          localityCandidateGenerator,

          new RegionReplicaRackCandidateGenerator(),

      };

 

/**

   * This is the main cost function.  It will compute a cost associated with a proposed cluster

   * state.  All different costs will be combined with their multipliers to produce a double cost.

   *这个最主要的cost方法,它会计算目标集群状态相应的 cost。所有不同的cost会同它的权重相结合瀍河一个双重的cost

   * @param cluster The state of the cluster

   * @param previousCost the previous cost. This is used as an early out.之前的cost

   * @return a double of a cost associated with the proposed cluster state.  This cost is an

   *         aggregate of all individual cost functions.

   */

  protected double computeCost(Cluster cluster, double previousCost) {

    double total = 0;

 

    for (CostFunction c:costFunctions) {

      if (c.getMultiplier() <= 0) {//如果权重小于0,则不计算该functioncost

        continue;

      }

 

      total += c.getMultiplier() * c.cost();

 

      if (total > previousCost) {

        return total;

      }

    }

    return total;

  }

 

  costFunctions = new CostFunction[]{

      new RegionCountSkewCostFunction(conf),

      new MoveCostFunction(conf),

      localityCost,

      new TableSkewCostFunction(conf),

      regionReplicaHostCostFunction,

      regionReplicaRackCostFunction,

      regionLoadFunctions[0],

      regionLoadFunctions[1],

      regionLoadFunctions[2],

      regionLoadFunctions[3],

};

regionReplicaHostCostFunction = new RegionReplicaHostCostFunction(conf);

regionReplicaRackCostFunction = new RegionReplicaRackCostFunction(conf);

   regionLoadFunctions = new CostFromRegionLoadFunction[] {

      new ReadRequestCostFunction(conf),

      new WriteRequestCostFunction(conf),

      new MemstoreSizeCostFunction(conf),

      new StoreFileCostFunction(conf)

 };

localityCost = new LocalityCostFunction(conf, services);

0 0