hbase源码分析-负载均衡过程

来源：互联网发布：苹果电脑查看mac地址编辑：程序博客网时间：2024/05/01 21:11

HBase 可以根据当前集群的负载，以 region 为单位进行 rebalance。HMaster 会在后台启动一个名为 BalancerChore 的线程，定期检查是否需要进行 rebalance。该线程每隔 hbase.balancer.period（默认 300000 毫秒，即 5 分钟）执行一次 master.balance()。每次 balance 的最长执行时间由 hbase.balancer.max.balancing 指定；如果没有配置该项，则使用 hbase.balancer.period 的值。下面是 BalancerChore 类，它在 chore() 中调用 HMaster 的 balance() 方法：

public class BalancerChore extends ScheduledChore {

private static final Log LOG = LogFactory.getLog(BalancerChore.class);

private final HMaster master;

public BalancerChore(HMaster master) {

super(master.getServerName() +"-BalancerChore", master, master.getConfiguration().getInt(

"hbase.balancer.period", 300000));

this.master = master;

}

@Override

protected void chore() {

try {

master.balance();

} catch (IOException e) {

LOG.error("Failed to balance.", e);

}

balance() 首先通过 loadBalancerTracker 到 zk 上查看 load balance 是否开启；如果已开启，再通过 AssignmentManager 检查当前是否有 region 处于 in transition 状态，并通过 serverManager 检查是否有挂掉的 region server 正在处理中，只要满足其一就直接返回 false。否则把集群的当前状态交给 balancer，以便后续做决策。HMaster 的 assignmentManager 成员内部维护着每张表分布在哪些机器上、这些机器上分别有哪些 region；对于每张表，都会执行一次 balancer.balanceCluster() 方法。

public boolean balance() throws IOException {

//如果master没有初始化，不能运行balance

if (!this.initialized) {

LOG.debug("Master has not been initialized, don't run balancer.");

return false;

}

//调用外部异步块

int maximumBalanceTime =getBalancerCutoffTime();

synchronized (this.balancer) {

//如果balance设置不是true，则返回，不运行balancer

if (!this.loadBalancerTracker.isBalancerOn())return false;

//一次只能运行一个balance,如果有region处于splitting状态，则不跑负载均衡方法。

if (this.assignmentManager.getRegionStates().isRegionsInTransition()) {

Map<String, RegionState> regionsInTransition =

this.assignmentManager.getRegionStates().getRegionsInTransition();

LOG.debug("Not running balancer because " + regionsInTransition.size() +

" region(s) in transition: " + org.apache.commons.lang.StringUtils.

abbreviate(regionsInTransition.toString(), 256));

return false;

}

if (this.serverManager.areDeadServersInProgress()) {

//如果有挂掉的region server则不执行负载均衡。

LOG.debug("Not running balancer because processing dead regionserver(s): " +

this.serverManager.getDeadServers());

return false;

}

if (this.cpHost !=null) {

try {

if (this.cpHost.preBalance()) {

LOG.debug("Coprocessor bypassing balancer request");

return false;

}

} catch (IOException ioe) {

LOG.error("Error invoking master coprocessor preBalance()", ioe);

return false;

}

//获取table下面的region server 和region

Map<TableName, Map<ServerName, List<HRegionInfo>>> assignmentsByTable =

this.assignmentManager.getRegionStates().getAssignmentsByTable();

List<RegionPlan> plans = new ArrayList<RegionPlan>();

//Give the balancer the current cluster state.设置当前集群状态

this.balancer.setClusterStatus(getClusterStatus());

for (Map<ServerName, List<HRegionInfo>> assignments : assignmentsByTable.values()) {

List<RegionPlan> partialPlans = this.balancer.balanceCluster(assignments);//获取负载均衡计划表

if (partialPlans !=null) plans.addAll(partialPlans);

}

long cutoffTime = System.currentTimeMillis() + maximumBalanceTime;

int rpCount = 0; // number of RegionPlans balanced so far

long totalRegPlanExecTime = 0;

if (plans !=null && !plans.isEmpty()) {

for (RegionPlan plan: plans) {

LOG.info("balance " + plan);

long balStartTime = System.currentTimeMillis();

//TODO: bulk assign

this.assignmentManager.balance(plan);//根据执行计划表的迁移内容

totalRegPlanExecTime += System.currentTimeMillis()-balStartTime;

rpCount++;

if (rpCount < plans.size() &&

// if performing next balance exceeds cutoff time, exit the loop

(System.currentTimeMillis() + (totalRegPlanExecTime / rpCount)) > cutoffTime) {

//TODO: After balance, there should not be a cutoff time (keeping it as a security net for now)

LOG.debug("No more balancing till next balance run; maximumBalanceTime=" +

maximumBalanceTime);

break;

}

if (this.cpHost !=null) {

try {

this.cpHost.postBalance(rpCount < plans.size() ? plans.subList(0, rpCount) : plans);

} catch (IOException ioe) {

// balancing already succeeded so don't change the result

LOG.error("Error invoking master coprocessor postBalance()", ioe);

}

// If LoadBalancer did not generate any plans, it means the cluster is already balanced.

// Return true indicating a success.

return true;

}

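getBalancerCutoffTime() 首先读取 hbase.balancer.max.balancing，读不到时得到 -1；如果结果是 -1，则改为读取 hbase.balancer.period，读不到时取默认值 300000。代码注释里的 “nonsense period” 指的是无意义（小于等于 0）的取值；不过从代码可以看到，balancerCutoffTime 在判断之前刚被赋值为 balancerPeriod，这个判断即使成立也只是把同一个值再赋一遍，实际上不会改变返回结果。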

/**
 * Returns the time budget for executing region plans in one balancer run.
 *
 * <p>The value of {@code hbase.balancer.max.balancing} is used when it is set to anything
 * other than -1; otherwise the value of {@code hbase.balancer.period} (default 300000) is
 * used. Non-positive values are returned as configured.
 *
 * @return the cutoff that {@code balance()} adds to {@code System.currentTimeMillis()}
 */
private int getBalancerCutoffTime() {
  int balancerCutoffTime = getConfiguration().getInt("hbase.balancer.max.balancing", -1);
  if (balancerCutoffTime == -1) {
    // No time period set so fall back to the balancer period.
    // The former "nonsense period" guard re-assigned this very value when it was <= 0,
    // so it could never change the result and has been removed.
    balancerCutoffTime = getConfiguration().getInt("hbase.balancer.period", 300000);
  }
  return balancerCutoffTime;
}

HBase 中 load balance 的策略是可插拔的，开发者可以根据自己业务的需求开发自己的 load balance 策略。在 HBase 中，这是通过 LoadBalancer 接口实现的，具体使用哪个策略由配置项 hbase.master.loadbalancer.class 决定，默认使用 StochasticLoadBalancer。该负载均衡器的主要逻辑都在 balanceCluster() 方法中。

public synchronized List<RegionPlan> balanceCluster(Map<ServerName,

List<HRegionInfo>> clusterState) {

//首先生成master regionserver的region的plan

List<RegionPlan> plans = balanceMasterRegions(clusterState);

//如果plan和clusterState不为空，并且只有一个region，直接返回

if (plans !=null || clusterState == null || clusterState.size() <= 1) {

return plans;

}

//如果集群中包含masterserver并且集群的数量<=2,返回null

if (masterServerName !=null && clusterState.containsKey(masterServerName)) {

if (clusterState.size() <= 2) {

return null;

}

//否则在集群中移除masterserver

clusterState = new HashMap<ServerName, List<HRegionInfo>>(clusterState);

clusterState.remove(masterServerName);

}

// On clusters with lots of HFileLinks or lots of reference files,

// instantiating the storefile infos can be quite expensive.

// Allow turning this feature off if the locality cost is not going to

// be used in any computations.

RegionLocationFinder finder = null;

if (this.localityCost !=null && this.localityCost.getMultiplier() > 0) {

finder = this.regionFinder;

}

//The clusterState that is given to this method contains the state

//of all the regions in the table(s) (that's true today)

// Keep track of servers to iterate through them.

Cluster cluster = new Cluster(clusterState,loads, finder, rackManager);

if (!needsBalance(cluster)) {

return null;

}

long startTime = EnvironmentEdgeManager.currentTime();

initCosts(cluster);

//1. 生成一个虚拟的集群cluster，方便计算计算当前状态的开销，其中clusterState是表的状态

double currentCost = computeCost(cluster, Double.MAX_VALUE);

double initCost = currentCost;

double newCost = currentCost;

//设置循环的次数和cluster的region server的总数和region总数有关。最大值mapSteps为1000000

long computedMaxSteps = Math.min(this.maxSteps,

((long)cluster.numRegions * (long)this.stepsPerRegion * (long)cluster.numServers));

// Perform a stochastic walk to see if we can get a good fit.

long step;

for (step = 0; step < computedMaxSteps; step++) {

int generatorIdx =RANDOM.nextInt(candidateGenerators.length);

CandidateGenerator p = candidateGenerators[generatorIdx];

Cluster.Action action = p.generate(cluster);

if (action.type == Type.NULL) {

continue;

}

cluster.doAction(action);

updateCostsWithAction(cluster, action);

newCost = computeCost(cluster, currentCost);

// Should this be kept?

if (newCost < currentCost) {

currentCost = newCost;

} else {

// Put things back the way they were before.

// TODO: undo by remembering old values

Action undoAction = action.undoAction();

cluster.doAction(undoAction);

updateCostsWithAction(cluster, undoAction);

}

if (EnvironmentEdgeManager.currentTime() - startTime >

maxRunningTime) {

break;

}

long endTime = EnvironmentEdgeManager.currentTime();

metricsBalancer.balanceCluster(endTime - startTime);

if (initCost > currentCost) {

plans = createRegionPlans(cluster);

if (LOG.isDebugEnabled()) {

LOG.debug("Finished computing new load balance plan. Computation took "

+ (endTime - startTime) + "ms to try " + step

+ " different iterations. Found a solution that moves "

+ plans.size() + " regions; Going from a computed cost of "

+ initCost + " to a new cost of " + currentCost);

}

return plans;

}

if (LOG.isDebugEnabled()) {

LOG.debug("Could not find a better load balance plan. Tried "

+ step + " different configurations in " + (endTime - startTime)

+ "ms, and did not find anything with a computed cost less than " + initCost);

}

return null;

}

balancer 针对的是整个集群的 region 分布，而不是针对某个表的 region 分布。它只保证每个 regionserver 上分布的 region 数在平均 region 数的 0.8 到 1.2 倍之间。

avg = 整个集群的总region数/regionserver个数

min = floor(avg*(1-0.2))

max=ceiling(avg*(1+0.2))

即所有regionserver上的regions个数都在min和max之间的话，就不会执行balancer。

/**
 * Decides whether the balancer should run for the given cluster.
 *
 * <p>Returns {@code false} when fewer than {@code MIN_SERVER_BALANCE} servers are present.
 * Returns {@code true} when {@code areSomeRegionReplicasColocated} reports true. Otherwise
 * the answer depends on whether the most and least loaded servers stay inside the band
 * {@code [floor(avg * (1 - slop)), ceil(avg * (1 + slop))]} around the average load.
 *
 * @param c the cluster to inspect
 * @return {@code true} when balancing is needed, {@code false} otherwise
 */
protected boolean needsBalance(Cluster c) {
  ClusterLoadState loadState = new ClusterLoadState(c.clusterState);

  // Too few servers: nothing to do.
  if (loadState.getNumServers() < MIN_SERVER_BALANCE) {
    if (LOG.isDebugEnabled()) {
      LOG.debug("Not running balancer because only " + loadState.getNumServers()
          + " active regionserver(s)");
    }
    return false;
  }

  if (areSomeRegionReplicasColocated(c)) {
    return true;
  }

  // Check if we even need to do any load balancing
  // HBASE-3681 check sloppiness first
  float average = loadState.getLoadAverage(); // also used for logging
  int lowerBound = (int) Math.floor(average * (1 - slop));
  int upperBound = (int) Math.ceil(average * (1 + slop));

  // Balanced means both extremes lie inside the band.
  boolean withinBounds =
      loadState.getMaxLoad() <= upperBound && loadState.getMinLoad() >= lowerBound;
  if (!withinBounds) {
    return true;
  }

  NavigableMap<ServerAndLoad, List<HRegionInfo>> byLoad = loadState.getServersByLoad();
  if (LOG.isTraceEnabled()) {
    // If nothing to balance, then don't say anything unless trace-level logging.
    LOG.trace("Skipping load balancing because balanced cluster; " +
        "servers=" + loadState.getNumServers() +
        " regions=" + loadState.getNumRegions() + " average=" + average +
        " mostloaded=" + byLoad.lastKey().getLoad() +
        " leastloaded=" + byLoad.firstKey().getLoad());
  }
  return false;
}

每一步随机选取一个 CandidateGenerator 生成候选操作，然后从几个角度计算执行该操作后集群的 cost，各项 (权重 * cost 值) 相加就是总得分。如果新计算的 cost 小于 currentCost，说明这次 region 的交换或者迁移是有效的，就把新 cost 赋值给 currentCost；否则撤销该操作，回退到之前的状态。每次迭代都基于上一次的结果，最多迭代 computedMaxSteps 次。computedMaxSteps 与集群的 region server 总数和 region 总数有关，上限为 maxSteps（1000000）。

// Create the locality-based generator first: the array below stores whatever reference
// localityCandidateGenerator holds at the moment the array is built.
localityCandidateGenerator = new LocalityBasedCandidateGenerator(services);

// The generators that balanceCluster() picks from at random on every step.
candidateGenerators = new CandidateGenerator[] {
  new RandomCandidateGenerator(),
  new LoadCandidateGenerator(),
  localityCandidateGenerator,
  new RegionReplicaRackCandidateGenerator(),
};

/**

* This is the main cost function. It will compute a cost associated with a proposed cluster

* state. All different costs will be combined with their multipliers to produce a double cost.

*这个最主要的cost方法，它会计算目标集群状态相应的 cost。所有不同的cost会同它的权重相结合瀍河一个双重的cost

* @param cluster The state of the cluster

* @param previousCost the previous cost. This is used as an early out.之前的cost

* @return a double of a cost associated with the proposed cluster state. This cost is an

* aggregate of all individual cost functions.

protected double computeCost(Cluster cluster, double previousCost) {

double total = 0;

for (CostFunction c:costFunctions) {

if (c.getMultiplier() <= 0) {//如果权重小于0，则不计算该function的cost

continue;

}

total += c.getMultiplier() * c.cost();

if (total > previousCost) {

return total;

}

return total;

}

// Build the individual cost functions first: the costFunctions array below stores the
// references these fields hold at the moment the array is built.
localityCost = new LocalityCostFunction(conf, services);

regionReplicaHostCostFunction = new RegionReplicaHostCostFunction(conf);
regionReplicaRackCostFunction = new RegionReplicaRackCostFunction(conf);

regionLoadFunctions = new CostFromRegionLoadFunction[] {
  new ReadRequestCostFunction(conf),
  new WriteRequestCostFunction(conf),
  new MemstoreSizeCostFunction(conf),
  new StoreFileCostFunction(conf)
};

// All cost functions that computeCost() sums, each weighted by its multiplier.
costFunctions = new CostFunction[]{
  new RegionCountSkewCostFunction(conf),
  new MoveCostFunction(conf),
  localityCost,
  new TableSkewCostFunction(conf),
  regionReplicaHostCostFunction,
  regionReplicaRackCostFunction,
  regionLoadFunctions[0],
  regionLoadFunctions[1],
  regionLoadFunctions[2],
  regionLoadFunctions[3],
};

0 0