HBase 0.94.8 Split Source Code Analysis

1. Initiating an HBase split
1.1 HBaseAdmin.split
  /**
   * Split a table or an individual region.
   * Asynchronous operation.
   *
   * @param tableNameOrRegionName table or region to split
   * @param splitPoint the explicit position to split on
   * @throws IOException if a remote or network exception occurs
   * @throws InterruptedException interrupt exception occurred
   */
  public void split(final byte [] tableNameOrRegionName,
      final byte [] splitPoint) throws IOException, InterruptedException {
    CatalogTracker ct = getCatalogTracker();
    try {
      // If tableNameOrRegionName is a region name, this yields its
      // Pair<HRegionInfo, ServerName>; otherwise it is null.
      Pair<HRegionInfo, ServerName> regionServerPair
        = getRegion(tableNameOrRegionName, ct);
      if (regionServerPair != null) {
        if (regionServerPair.getSecond() == null) {
          throw new NoServerForRegionException(Bytes.toStringBinary(tableNameOrRegionName));
        } else {
          // Split a single region -- the method analyzed in 1.2 below.
          split(regionServerPair.getSecond(), regionServerPair.getFirst(), splitPoint);
        }
      } else {
        // tableNameOrRegionName is a table name, so we take this branch.
        final String tableName = tableNameString(tableNameOrRegionName, ct);
        // Fetch the HRegionInfo and hosting ServerName of every region of the table.
        List<Pair<HRegionInfo, ServerName>> pairs =
          MetaReader.getTableRegionsAndLocations(ct, tableName);
        // If splitPoint is null, every region is split; otherwise only the
        // region containing splitPoint is split.
        for (Pair<HRegionInfo, ServerName> pair: pairs) {
          // May not be a server for a particular row
          if (pair.getSecond() == null) continue;
          HRegionInfo r = pair.getFirst();
          // check for parents
          if (r.isSplitParent()) continue;
          // if a split point given, only split that particular region
          if (splitPoint != null && !r.containsRow(splitPoint)) continue;
          // call out to region server to do split now
          split(pair.getSecond(), pair.getFirst(), splitPoint);
        }
      }
    } finally {
      cleanupCatalogTracker(ct);
    }
  }
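For reference, a minimal client-side sketch of how this API is driven (the table name test_table and the row key row5000 are made up for illustration):

  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.hbase.HBaseConfiguration;
  import org.apache.hadoop.hbase.client.HBaseAdmin;
  import org.apache.hadoop.hbase.util.Bytes;

  public class SplitExample {
    public static void main(String[] args) throws Exception {
      Configuration conf = HBaseConfiguration.create();
      HBaseAdmin admin = new HBaseAdmin(conf);
      try {
        // Table name with no split point: every region of the table is split.
        admin.split(Bytes.toBytes("test_table"), null);
        // Table name plus an explicit split point: only the region containing
        // "row5000" is split, at exactly that row.
        admin.split(Bytes.toBytes("test_table"), Bytes.toBytes("row5000"));
      } finally {
        admin.close();
      }
    }
  }

Remember the operation is asynchronous: the call returns once the request reaches the regionserver, not when the split completes.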
1.2 HBaseAdmin.split
  // The private overload called by the method above.
  private void split(final ServerName sn, final HRegionInfo hri,
      byte[] splitPoint) throws IOException {
    // Get an RPC proxy to the HRegionServer hosting the region.
    HRegionInterface rs =
      this.connection.getHRegionConnection(sn.getHostname(), sn.getPort());
    // Call HRegionServer.splitRegion to split the region on the server side.
    rs.splitRegion(hri, splitPoint);
  }

1.3 HRegionServer.splitRegion — split the region on the server side
  public void splitRegion(HRegionInfo regionInfo, byte[] splitPoint)
      throws NotServingRegionException, IOException {
    checkOpen(); // check that the server and HDFS are usable
    HRegion region = getRegion(regionInfo.getRegionName()); // look the region up by name
    // Flush the cache. The flush is skipped when the cache is empty, the region
    // is closed, a flush is already in progress, or writes are disabled.
    region.flushcache();
    region.forceSplit(splitPoint); // record the explicit split point
    // region.checkSplit() computes the split point, then the split is handed
    // to the CompactSplitThread.
    compactSplitThread.requestSplit(region, region.checkSplit());
  }
  
2. Determining the split point

2.1 HRegion.checkSplit
  public byte[] checkSplit() {
    // Can't split ROOT/META
    if (this.regionInfo.isMetaTable()) {
      if (shouldForceSplit()) {
        LOG.warn("Cannot split root/meta regions in HBase 0.20 and above");
      }
      return null;
    }
    if (!splitPolicy.shouldSplit()) {
      return null;
    }
    byte[] ret = splitPolicy.getSplitPoint();
    if (ret != null) {
      try {
        checkRow(ret, "calculated split");
      } catch (IOException e) {
        LOG.error("Ignoring invalid split", e);
        return null;
      }
    }
    return ret;
  }
  
2.2 RegionSplitPolicy.getSplitPoint — how the split point is actually chosen
  // If an explicit split point was set on the region, return it. Otherwise,
  // return the midkey of the largest store as the split point.
  protected byte[] getSplitPoint() {
    byte[] explicitSplitPoint = this.region.getExplicitSplitPoint();
    if (explicitSplitPoint != null) {
      return explicitSplitPoint;
    }
    Map<byte[], Store> stores = region.getStores();
    byte[] splitPointFromLargestStore = null;
    long largestStoreSize = 0;
    for (Store s : stores.values()) {
      byte[] splitPoint = s.getSplitPoint();
      long storeSize = s.getSize();
      if (splitPoint != null && largestStoreSize < storeSize) {
        splitPointFromLargestStore = splitPoint;
        largestStoreSize = storeSize;
      }
    }
    return splitPointFromLargestStore;
  }
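getSplitPoint() is protected precisely so that split policies can override it; a table can plug in its own policy (in 0.94, via the hbase.regionserver.region.split.policy setting or per-table descriptor). As a hedged sketch — PrefixSplitPolicy and its 8-byte prefix are invented for illustration, not part of HBase — a custom policy could truncate the midkey so that rows sharing a prefix always stay in one region:

  import org.apache.hadoop.hbase.regionserver.ConstantSizeRegionSplitPolicy;

  public class PrefixSplitPolicy extends ConstantSizeRegionSplitPolicy {
    private static final int PREFIX_LEN = 8; // invented prefix length

    @Override
    protected byte[] getSplitPoint() {
      // Explicit split point, or the largest store's midkey, as computed above.
      byte[] point = super.getSplitPoint();
      if (point != null && point.length > PREFIX_LEN) {
        byte[] prefix = new byte[PREFIX_LEN];
        System.arraycopy(point, 0, prefix, 0, PREFIX_LEN);
        return prefix; // keep all rows sharing the 8-byte prefix together
      }
      return point;
    }
  }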
  
3. Executing the split

3.1 CompactSplitThread.requestSplit

  public synchronized void requestSplit(final HRegion r, byte[] midKey) {
    if (midKey == null) {
      LOG.debug("Region " + r.getRegionNameAsString() +
        " not splittable because midkey=null");
      return;
    }
    try {
      this.splits.execute(new SplitRequest(r, midKey, this.server));
      if (LOG.isDebugEnabled()) {
        LOG.debug("Split requested for " + r + ".  " + this);
      }
    } catch (RejectedExecutionException ree) {
      LOG.info("Could not execute split for " + r, ree);
    }
  }
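The splits field is a small, bounded thread pool (a single split thread by default), so a burst of requests can be rejected rather than queued without limit, which is why the RejectedExecutionException is caught and merely logged. A self-contained sketch of the same submit-and-catch pattern (plain JDK code, not HBase's):

  import java.util.concurrent.ArrayBlockingQueue;
  import java.util.concurrent.RejectedExecutionException;
  import java.util.concurrent.ThreadPoolExecutor;
  import java.util.concurrent.TimeUnit;

  public class BoundedRequestPool {
    public static void main(String[] args) {
      // One worker, one queued slot: further concurrent requests are rejected.
      ThreadPoolExecutor splits = new ThreadPoolExecutor(
          1, 1, 30, TimeUnit.SECONDS, new ArrayBlockingQueue<Runnable>(1));
      Runnable request = new Runnable() {
        public void run() {
          try { Thread.sleep(1000); } catch (InterruptedException ignored) { }
        }
      };
      for (int i = 0; i < 5; i++) {
        try {
          splits.execute(request); // like this.splits.execute(new SplitRequest(...))
          System.out.println("split requested");
        } catch (RejectedExecutionException ree) {
          System.out.println("could not execute split: " + ree); // logged, not fatal
        }
      }
      splits.shutdown();
    }
  }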
  
3.2 SplitRequest.run 
  public void run() {
    if (this.server.isStopping() || this.server.isStopped()) {
      LOG.debug("Skipping split because server is stopping=" +
        this.server.isStopping() + " or stopped=" + this.server.isStopped());
      return;
    }
    try {
      final long startTime = System.currentTimeMillis();
      SplitTransaction st = new SplitTransaction(parent, midKey);
      // If prepare does not return true, for some reason -- logged inside in
      // the prepare call -- we are not ready to split just now. Just return.
      // SplitTransaction.prepare() initializes the two daughter regions held by
      // the SplitTransaction and performs sanity checks, e.g. that the split row
      // actually falls inside the parent region.
      if (!st.prepare()) return;
      try {
        st.execute(this.server, this.server);
        this.server.getMetrics().incrementSplitSuccessCount();
      } catch (Exception e) {
        if (this.server.isStopping() || this.server.isStopped()) {
          LOG.info(
              "Skip rollback/cleanup of failed split of "
                  + parent.getRegionNameAsString() + " because server is"
                  + (this.server.isStopping() ? " stopping" : " stopped"), e);
          return;
        }
        try {
          LOG.info("Running rollback/cleanup of failed split of " +
            parent.getRegionNameAsString() + "; " + e.getMessage(), e);
          if (st.rollback(this.server, this.server)) {
            LOG.info("Successful rollback of failed split of " +
              parent.getRegionNameAsString());
            this.server.getMetrics().incrementSplitFailureCount();
          } else {
            this.server.abort("Abort; we got an error after point-of-no-return");
          }
        } catch (RuntimeException ee) {
          ........
  }

3.3 SplitTransaction.execute
  /**
   * Run the transaction.
   * @param server Hosting server instance.  Can be null when testing (won't try
   * and update in zk if a null server)
   * @param services Used to online/offline regions.
   * @throws IOException If thrown, transaction failed. Call {@link #rollback(Server, RegionServerServices)}
   * @return Regions created
   * @see #rollback(Server, RegionServerServices)
   */
  public PairOfSameType<HRegion> execute(final Server server,
      final RegionServerServices services)
  throws IOException {
    // createDaughters: create the split temp dir, transition the region's zk state,
    // close the parent region and stop its stores; create the daughter dirs and
    // place references to the parent's store files in them; create daughter
    // regions A and B, register them in zk, and mark the parent HRI offline.
    PairOfSameType<HRegion> regions = createDaughters(server, services);
    openDaughters(server, services, regions.getFirst(), regions.getSecond());
    transitionZKNode(server, services, regions.getFirst(), regions.getSecond());
    return regions;
  }

3.3.1 SplitTransaction.createDaughters — create the two daughter regions (takes the parent region's write lock while closing it)
  /**
   * Prepare the regions and region files.
   * @param server Hosting server instance.  Can be null when testing (won't try
   * and update in zk if a null server)
   * @param services Used to online/offline regions.
   * @throws IOException If thrown, transaction failed. Call {@link #rollback(Server, RegionServerServices)}
   * @return Regions created
   */
  /* package */PairOfSameType<HRegion> createDaughters(final Server server,
      final RegionServerServices services) throws IOException {
    LOG.info("Starting split of region " + this.parent);
    if ((server != null && server.isStopped()) ||
        (services != null && services.isStopping())) {
      throw new IOException("Server is stopped or stopping");
    }
    assert !this.parent.lock.writeLock().isHeldByCurrentThread(): "Unsafe to hold write lock while performing RPCs";
    // Coprocessor callback -- this is what triggers BaseRegionObserver.preSplit.
    if (this.parent.getCoprocessorHost() != null) {
      this.parent.getCoprocessorHost().preSplit();
    }
    // If true, no cluster to write meta edits to or to update znodes in.
    boolean testing = server == null? true:
      server.getConfiguration().getBoolean("hbase.testing.nocluster", false);
    this.fileSplitTimeout = testing ? this.fileSplitTimeout :
      server.getConfiguration().getLong("hbase.regionserver.fileSplitTimeout",
          this.fileSplitTimeout);
    // Set ephemeral SPLITTING znode up in zk.  Mocked servers sometimes don't
    // have zookeeper so don't do zk stuff if server or zookeeper is null
    if (server != null && server.getZooKeeper() != null) {
      try {
        // 1. Create an ephemeral SPLITTING znode for this region in zk.
        createNodeSplitting(server.getZooKeeper(),
          this.parent.getRegionInfo(), server.getServerName());
      } catch (KeeperException e) {
        throw new IOException("Failed creating SPLITTING znode on " +
          this.parent.getRegionNameAsString(), e);
      }
    }
    // Progress is journaled in:
    //   private final List<JournalEntry> journal = new ArrayList<JournalEntry>();
    this.journal.add(JournalEntry.SET_SPLITTING_IN_ZK);
    if (server != null && server.getZooKeeper() != null) {
      try {
        // Transition node from SPLITTING to SPLITTING after creating the split node.
        // Master will get the callback for node change only if the transition is successful.
        // Note that if the transition fails then the rollback will delete the created znode
        // TODO : May be we can add some new state to znode and handle the new state incase of success/failure
        // 2. Wait until the master has seen this region transition to SPLITTING.
        this.znodeVersion = transitionNodeSplitting(server.getZooKeeper(),
            this.parent.getRegionInfo(), server.getServerName(), -1);
      } catch (KeeperException e) {
        throw new IOException("Failed setting SPLITTING znode on "
            + this.parent.getRegionNameAsString(), e);
      }
    }
    // 3. Create the splits directory.
    createSplitDir(this.parent.getFilesystem(), this.splitdir);
    this.journal.add(JournalEntry.CREATE_SPLIT_DIR);
    List<StoreFile> hstoreFilesToSplit = null;
    Exception exceptionToThrow = null;
    try {
      // 4. Wait for the region's flushes and compactions to finish, then close it.
      hstoreFilesToSplit = this.parent.close(false);
    } catch (Exception e) {
      exceptionToThrow = e;
    }
    if (exceptionToThrow == null && hstoreFilesToSplit == null) {
      // The region was closed by a concurrent thread.  We can't continue
      // with the split, instead we must just abandon the split.  If we
      // reopen or split this could cause problems because the region has
      // probably already been moved to a different server, or is in the
      // process of moving to a different server.
      exceptionToThrow = closedByOtherException;
    }
    if (exceptionToThrow != closedByOtherException) {
      this.journal.add(JournalEntry.CLOSED_PARENT_REGION);
    }
    if (exceptionToThrow != null) {
      if (exceptionToThrow instanceof IOException) throw (IOException)exceptionToThrow;
      throw new IOException(exceptionToThrow);
    }
    if (!testing) {
      // 5. Remove the region from the HRegionServer's online set.
      services.removeFromOnlineRegions(this.parent.getRegionInfo().getEncodedName());
    }
    this.journal.add(JournalEntry.OFFLINED_PARENT);
    // TODO: If splitStoreFiles were multithreaded would we complete steps in
    // less elapsed time?  St.Ack 20100920
    //
    // splitStoreFiles creates daughter region dirs under the parent splits dir
    // Nothing to unroll here if failure -- clean up of CREATE_SPLIT_DIR will
    // clean this up.
    // 6. Split the store files: a thread pool of StoreFileSplitter tasks splits
    //    every HFile (StoreFile) of the region at the split row, writing a
    //    reference file for each half under the corresponding daughter's directory.
    splitStoreFiles(this.splitdir, hstoreFilesToSplit);
    // Log to the journal that we are creating region A, the first daughter
    // region.  We could fail halfway through.  If we do, we could have left
    // stuff in fs that needs cleanup -- a storefile or two.  Thats why we
    // add entry to journal BEFORE rather than AFTER the change.
    // 7. Create the two daughter regions: each daughter's regioninfo is derived
    //    from the parent's and written to HDFS; the parent is offlined in .META. below.
    this.journal.add(JournalEntry.STARTED_REGION_A_CREATION);
    HRegion a = createDaughterRegion(this.hri_a, this.parent.rsServices);
    // Ditto
    this.journal.add(JournalEntry.STARTED_REGION_B_CREATION);
    HRegion b = createDaughterRegion(this.hri_b, this.parent.rsServices);
    // This is the point of no return.  Adding subsequent edits to .META. as we
    // do below when we do the daughter opens adding each to .META. can fail in
    // various interesting ways the most interesting of which is a timeout
    // BUT the edits all go through (See HBASE-3872).  IF we reach the PONR
    // then subsequent failures need to crash out this regionserver; the
    // server shutdown processing should be able to fix-up the incomplete split.
    // The offlined parent will have the daughters as extra columns.  If
    // we leave the daughter regions in place and do not remove them when we
    // crash out, then they will have their references to the parent in place
    // still and the server shutdown fixup of .META. will point to these
    // regions.
    // We should add the PONR JournalEntry before offlineParentInMeta, so even if
    // offlineParentInMeta times out, this will cause the regionserver to exit, and
    // then the master's ServerShutdownHandler will fix the daughters & avoid data
    // loss. (See HBASE-4562.)
    this.journal.add(JournalEntry.PONR);
    // Edit parent in meta.  Offlines parent region and adds splita and splitb.
    if (!testing) {
      MetaEditor.offlineParentInMeta(server.getCatalogTracker(),
        this.parent.getRegionInfo(), a.getRegionInfo(), b.getRegionInfo());
    }
    return new PairOfSameType<HRegion>(a, b);
  }
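The journal is what makes rollback possible: each completed step is recorded, so SplitRequest.run can undo exactly the recorded steps in reverse order. Once JournalEntry.PONR has been journaled, rollback is refused and the regionserver aborts instead, leaving cleanup to the master. A simplified, self-contained model of the pattern (not the real SplitTransaction code; the undo actions are sketched as comments):

  import java.util.ArrayDeque;
  import java.util.Deque;

  public class JournaledTransaction {
    enum JournalEntry {
      SET_SPLITTING_IN_ZK, CREATE_SPLIT_DIR, CLOSED_PARENT_REGION, OFFLINED_PARENT
    }

    private final Deque<JournalEntry> journal = new ArrayDeque<JournalEntry>();

    void step(JournalEntry e) {
      // ... perform the side effect for this step ...
      journal.push(e); // record progress so rollback knows what was done
    }

    void rollback() {
      while (!journal.isEmpty()) {
        switch (journal.pop()) {           // undo in reverse order of execution
          case OFFLINED_PARENT:      /* re-add the parent to the online set */ break;
          case CLOSED_PARENT_REGION: /* reopen the parent region */            break;
          case CREATE_SPLIT_DIR:     /* delete the splits directory */         break;
          case SET_SPLITTING_IN_ZK:  /* delete the SPLITTING znode */          break;
        }
      }
    }
  }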
  
3.3.2 SplitTransaction.openDaughters — open the two daughter regions
 
  /**
   * Perform time consuming opening of the daughter regions.
   * @param server Hosting server instance.  Can be null when testing (won't try
   * and update in zk if a null server)
   * @param services Used to online/offline regions.
   * @param a first daughter region
   * @param b second daughter region
   * @throws IOException If thrown, transaction failed. Call {@link #rollback(Server, RegionServerServices)}
   */
  /* package */void openDaughters(final Server server,
      final RegionServerServices services, HRegion a, HRegion b)
      throws IOException {
    boolean stopped = server != null && server.isStopped();
    boolean stopping = services != null && services.isStopping();
    // TODO: Is this check needed here?
    if (stopped || stopping) {
      LOG.info("Not opening daughters " +
          b.getRegionInfo().getRegionNameAsString() +
          " and " +
          a.getRegionInfo().getRegionNameAsString() +
          " because stopping=" + stopping + ", stopped=" + stopped);
    } else {
      // Open daughters in parallel.
      // Each DaughterOpener ends up calling HRegion.openHRegion; the actual
      // initialization happens in HRegion.initializeRegionInternals.
      DaughterOpener aOpener = new DaughterOpener(server, a);
      DaughterOpener bOpener = new DaughterOpener(server, b);
      aOpener.start();
      bOpener.start();
      try {
        aOpener.join();
        bOpener.join();
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        throw new IOException("Interrupted " + e.getMessage());
      }
      if (aOpener.getException() != null) {
        throw new IOException("Failed " +
          aOpener.getName(), aOpener.getException());
      }
      if (bOpener.getException() != null) {
        throw new IOException("Failed " +
          bOpener.getName(), bOpener.getException());
      }
      if (services != null) {
        try {
          // add 2nd daughter first (see HBASE-4335)
          services.postOpenDeployTasks(b, server.getCatalogTracker(), true);
          // Should add it to OnlineRegions
          services.addToOnlineRegions(b);
          services.postOpenDeployTasks(a, server.getCatalogTracker(), true);
          services.addToOnlineRegions(a);
        } catch (KeeperException ke) {
          throw new IOException(ke);
        }
      }
    }
  }
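DaughterOpener (a thread subclass in the real code) exists because Thread.run cannot throw checked exceptions: each opener stashes any failure so the caller can join() both threads and then rethrow. A self-contained sketch of that pattern (CapturingOpener is an invented stand-in):

  public class CapturingOpener extends Thread {
    private volatile Throwable exception;

    public CapturingOpener(String name) {
      super(name);
    }

    @Override
    public void run() {
      try {
        // ... open one daughter region; the real code ends up in HRegion.openHRegion ...
      } catch (Throwable t) {
        this.exception = t; // stash it; the caller checks after join()
      }
    }

    public Throwable getException() {
      return this.exception;
    }
  }

The caller starts both openers, joins both, and only then inspects getException() on each, exactly as openDaughters does above.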




a) DaughterOpener opens the region (it calls openDaughterRegion, which ultimately calls HRegion.openHRegion):
   1) Writes the .regioninfo file to HDFS so the region can be recovered if .META. is lost.
   2) Initializes its HStores, mainly via the loadStoreFiles function: it fetches the paths
      and files from HDFS and constructs one StoreFile object per file. For each reference
      file it creates a HalfStoreFileReader that reads the corresponding store file of the
      parent region. In other words, what the daughter region currently stores are only
      reference files pointing at the parent's files, so all reads of those stores are
      served from the parent region (see the sketch after this list).
b) services.addToOnlineRegions adds the daughter region to the regionserver's online-region list, and postOpenDeployTasks adds it to the meta table.
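What a reference file means in practice: the daughter has no data of its own yet, and a HalfStoreFileReader serves reads from the parent's file, restricted to the half on one side of the split key. A deliberately simplified model of that idea (a NavigableMap stands in for the parent's HFile; this is not the real HalfStoreFileReader API):

  import java.util.NavigableMap;

  public class HalfReader {
    enum Range { TOP, BOTTOM } // TOP = keys >= splitKey, BOTTOM = keys < splitKey

    private final NavigableMap<String, String> parentFile; // stands in for the parent's HFile
    private final String splitKey;
    private final Range range;

    HalfReader(NavigableMap<String, String> parentFile, String splitKey, Range range) {
      this.parentFile = parentFile;
      this.splitKey = splitKey;
      this.range = range;
    }

    String get(String key) {
      boolean inTop = key.compareTo(splitKey) >= 0;
      if ((range == Range.TOP) != inTop) {
        return null; // the key falls in the other daughter's half
      }
      return parentFile.get(key); // every hit is served from the parent's data
    }
  }

Once a daughter later compacts, the referenced data is rewritten into its own files, and only then can the parent region be cleaned up.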

3.3.3 HRegion.openHRegion
  /**
   * Open HRegion.
   * Calls initialize and sets sequenceid.
   * @param reporter
   * @return Returns <code>this</code>
   * @throws IOException
   */
  protected HRegion openHRegion(final CancelableProgressable reporter)
  throws IOException {
    checkCompressionCodecs();
    // Initialize the region:
    // 1. checkRegionInfoOnFilesystem writes the HRegionInfo to the .regioninfo file
    // 2. cleanupTempDir clears the old region temp directory
    // 3. initializes the HRegion's stores and loads their hfiles
    // 4. replays the recovered.edits files: each KeyValue read is routed to its
    //    store and applied, recovering the HRegion's state
    long seqid = initialize(reporter);
    if (this.log != null) {
      this.log.setSequenceNumber(seqid);
    }
    return this;
  }

  
3.4 SplitTransaction.transitionZKNode — update the znode state and wait for the master to finish the split

  /**
   * Finish off split transaction, transition the zknode
   * @param server Hosting server instance.  Can be null when testing (won't try
   * and update in zk if a null server)
   * @param services Used to online/offline regions.
   * @param a first daughter region
   * @param b second daughter region
   * @throws IOException If thrown, transaction failed. Call {@link #rollback(Server, RegionServerServices)}
   */
  /* package */void transitionZKNode(final Server server,
      final RegionServerServices services, HRegion a, HRegion b)
      throws IOException {
    // Tell master about split by updating zk.  If we fail, abort.
    if (server != null && server.getZooKeeper() != null) {
      try {
        this.znodeVersion = transitionNodeSplit(server.getZooKeeper(),
          parent.getRegionInfo(), a.getRegionInfo(), b.getRegionInfo(),
          server.getServerName(), this.znodeVersion);
        int spins = 0;
        // Now wait for the master to process the split. We know it's done
        // when the znode is deleted. The reason we keep tickling the znode is
        // that it's possible for the master to miss an event.
        do {
          if (spins % 10 == 0) {
            LOG.debug("Still waiting on the master to process the split for " +
                this.parent.getRegionInfo().getEncodedName());
          }
          Thread.sleep(100);
          // When this returns -1 it means the znode doesn't exist
          this.znodeVersion = tickleNodeSplit(server.getZooKeeper(),
            parent.getRegionInfo(), a.getRegionInfo(), b.getRegionInfo(),
            server.getServerName(), this.znodeVersion);
          spins++;
        } while (this.znodeVersion != -1 && !server.isStopped()
            && !services.isStopping());
      } catch (Exception e) {
        if (e instanceof InterruptedException) {
          Thread.currentThread().interrupt();
        }
        throw new IOException("Failed telling master about split", e);
      }
    }
    // Coprocessor callback
    if (this.parent.getCoprocessorHost() != null) {
      this.parent.getCoprocessorHost().postSplit(a, b);
    }
    // Leaving here, the splitdir with its dross will be in place but since the
    // split was successful, just leave it; it'll be cleaned when parent is
    // deleted and cleaned up.
  }
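The do/while above is a poll-with-tickle loop: the regionserver repeatedly re-touches the znode so the master cannot permanently miss the watch event, and the loop exits when the znode is gone (version -1) or the server shuts down. A self-contained sketch of the shape of that loop (Tickler is an invented stand-in for tickleNodeSplit):

  public class TickleWait {
    interface Tickler {
      int tickle() throws Exception; // returns -1 once the watched znode is gone
    }

    static void waitForMaster(Tickler t) throws Exception {
      int spins = 0;
      int version = 0;
      do {
        if (spins % 10 == 0) {
          System.out.println("Still waiting on the master to process the split...");
        }
        Thread.sleep(100);
        version = t.tickle(); // re-touch the node so a missed watch event is re-fired
        spins++;
      } while (version != -1);
    }
  }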
