HBase 0.96 AggregateImplementation and AggregationClient


I am not sure why, but HBase 0.96 only defines the AggregateService RPC protocol; it ships neither a concrete server-side implementation nor a client-side class for invoking it. Both classes appear again in the official 0.98 release, so I copied them from 0.98, and they work on 0.96 (tested and passing).


import java.io.IOException;import java.nio.ByteBuffer;import java.util.ArrayList;import java.util.List;import java.util.NavigableSet;import org.apache.commons.logging.Log;import org.apache.commons.logging.LogFactory;import org.apache.hadoop.classification.InterfaceAudience;import org.apache.hadoop.hbase.Cell;import org.apache.hadoop.hbase.Coprocessor;import org.apache.hadoop.hbase.CoprocessorEnvironment;import org.apache.hadoop.hbase.client.Scan;import org.apache.hadoop.hbase.coprocessor.ColumnInterpreter;import org.apache.hadoop.hbase.coprocessor.CoprocessorException;import org.apache.hadoop.hbase.coprocessor.CoprocessorService;import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;import org.apache.hadoop.hbase.protobuf.ProtobufUtil;import org.apache.hadoop.hbase.protobuf.ResponseConverter;import org.apache.hadoop.hbase.protobuf.generated.AggregateProtos.AggregateRequest;import org.apache.hadoop.hbase.protobuf.generated.AggregateProtos.AggregateResponse;import org.apache.hadoop.hbase.protobuf.generated.AggregateProtos.AggregateService;import org.apache.hadoop.hbase.regionserver.InternalScanner;import com.google.protobuf.ByteString;import com.google.protobuf.Message;import com.google.protobuf.RpcCallback;import com.google.protobuf.RpcController;import com.google.protobuf.Service;/** * A concrete AggregateProtocol implementation. Its system level coprocessor * that computes the aggregate function at a region level. * {@link ColumnInterpreter} is used to interpret column value. This class is * parameterized with the following (these are the types with which the {@link ColumnInterpreter} * is parameterized, and for more description on these, refer to {@link ColumnInterpreter}): * @param <T> Cell value data type * @param <S> Promoted data type * @param <P> PB message that is used to transport initializer specific bytes * @param <Q> PB message that is used to transport Cell (<T>) instance * @param <R> PB message that is used to transport Promoted (<S>) instance */@InterfaceAudience.Privatepublic class AggregateImplementation<T, S, P extends Message, Q extends Message, R extends Message> extends AggregateService implements CoprocessorService, Coprocessor {  protected static final Log log = LogFactory.getLog(AggregateImplementation.class);  private RegionCoprocessorEnvironment env;  /**   * Gives the maximum for a given combination of column qualifier and column   * family, in the given row range as defined in the Scan object. In its   * current implementation, it takes one column family and one column qualifier   * (if provided). In case of null column qualifier, maximum value for the   * entire column family will be returned.   */  @Override  public void getMax(RpcController controller, AggregateRequest request,      RpcCallback<AggregateResponse> done) {    InternalScanner scanner = null;    AggregateResponse response = null;    T max = null;    try {      ColumnInterpreter<T, S, P, Q, R> ci = constructColumnInterpreterFromRequest(request);      T temp;      Scan scan = ProtobufUtil.toScan(request.getScan());      scanner = env.getRegion().getScanner(scan);      List<Cell> results = new ArrayList<Cell>();      byte[] colFamily = scan.getFamilies()[0];      NavigableSet<byte[]> qualifiers = scan.getFamilyMap().get(colFamily);      byte[] qualifier = null;      if (qualifiers != null && !qualifiers.isEmpty()) {        qualifier = qualifiers.pollFirst();      }      // qualifier can be null.      
boolean hasMoreRows = false;      do {        hasMoreRows = scanner.next(results);        for (Cell kv : results) {          temp = ci.getValue(colFamily, qualifier, kv);          max = (max == null || (temp != null && ci.compare(temp, max) > 0)) ? temp : max;        }        results.clear();      } while (hasMoreRows);      if (max != null) {        AggregateResponse.Builder builder = AggregateResponse.newBuilder();        builder.addFirstPart(ci.getProtoForCellType(max).toByteString());        response = builder.build();      }    } catch (IOException e) {      ResponseConverter.setControllerException(controller, e);    } finally {      if (scanner != null) {        try {          scanner.close();        } catch (IOException ignored) {}      }    }    log.info("Maximum from this region is "        + env.getRegion().getRegionNameAsString() + ": " + max);    done.run(response);  }  /**   * Gives the minimum for a given combination of column qualifier and column   * family, in the given row range as defined in the Scan object. In its   * current implementation, it takes one column family and one column qualifier   * (if provided). In case of null column qualifier, minimum value for the   * entire column family will be returned.   */  @Override  public void getMin(RpcController controller, AggregateRequest request,      RpcCallback<AggregateResponse> done) {    AggregateResponse response = null;    InternalScanner scanner = null;    T min = null;    try {      ColumnInterpreter<T, S, P, Q, R> ci = constructColumnInterpreterFromRequest(request);      T temp;      Scan scan = ProtobufUtil.toScan(request.getScan());      scanner = env.getRegion().getScanner(scan);      List<Cell> results = new ArrayList<Cell>();      byte[] colFamily = scan.getFamilies()[0];      NavigableSet<byte[]> qualifiers = scan.getFamilyMap().get(colFamily);      byte[] qualifier = null;      if (qualifiers != null && !qualifiers.isEmpty()) {        qualifier = qualifiers.pollFirst();      }      boolean hasMoreRows = false;      do {        hasMoreRows = scanner.next(results);        for (Cell kv : results) {          temp = ci.getValue(colFamily, qualifier, kv);          min = (min == null || (temp != null && ci.compare(temp, min) < 0)) ? temp : min;        }        results.clear();      } while (hasMoreRows);      if (min != null) {        response = AggregateResponse.newBuilder().addFirstPart(           ci.getProtoForCellType(min).toByteString()).build();      }    } catch (IOException e) {      ResponseConverter.setControllerException(controller, e);    } finally {      if (scanner != null) {        try {          scanner.close();        } catch (IOException ignored) {}      }    }    log.info("Minimum from this region is "        + env.getRegion().getRegionNameAsString() + ": " + min);    done.run(response);  }  /**   * Gives the sum for a given combination of column qualifier and column   * family, in the given row range as defined in the Scan object. In its   * current implementation, it takes one column family and one column qualifier   * (if provided). In case of null column qualifier, sum for the entire column   * family will be returned.   
*/  @Override  public void getSum(RpcController controller, AggregateRequest request,      RpcCallback<AggregateResponse> done) {    AggregateResponse response = null;    InternalScanner scanner = null;    long sum = 0l;    try {      ColumnInterpreter<T, S, P, Q, R> ci = constructColumnInterpreterFromRequest(request);      S sumVal = null;      T temp;      Scan scan = ProtobufUtil.toScan(request.getScan());      scanner = env.getRegion().getScanner(scan);      byte[] colFamily = scan.getFamilies()[0];      NavigableSet<byte[]> qualifiers = scan.getFamilyMap().get(colFamily);      byte[] qualifier = null;      if (qualifiers != null && !qualifiers.isEmpty()) {        qualifier = qualifiers.pollFirst();      }      List<Cell> results = new ArrayList<Cell>();      boolean hasMoreRows = false;      do {        hasMoreRows = scanner.next(results);        for (Cell kv : results) {          temp = ci.getValue(colFamily, qualifier, kv);          if (temp != null)            sumVal = ci.add(sumVal, ci.castToReturnType(temp));        }        results.clear();      } while (hasMoreRows);      if (sumVal != null) {        response = AggregateResponse.newBuilder().addFirstPart(           ci.getProtoForPromotedType(sumVal).toByteString()).build();      }    } catch (IOException e) {      ResponseConverter.setControllerException(controller, e);    } finally {      if (scanner != null) {        try {          scanner.close();        } catch (IOException ignored) {}      }    }    log.debug("Sum from this region is "        + env.getRegion().getRegionNameAsString() + ": " + sum);    done.run(response);  }  /**   * Gives the row count for the given column family and column qualifier, in   * the given row range as defined in the Scan object.   * @throws IOException   */  @Override  public void getRowNum(RpcController controller, AggregateRequest request,      RpcCallback<AggregateResponse> done) {    AggregateResponse response = null;    long counter = 0l;    List<Cell> results = new ArrayList<Cell>();    InternalScanner scanner = null;    try {      Scan scan = ProtobufUtil.toScan(request.getScan());      byte[][] colFamilies = scan.getFamilies();      byte[] colFamily = colFamilies != null ? colFamilies[0] : null;      NavigableSet<byte[]> qualifiers = colFamilies != null ?          
scan.getFamilyMap().get(colFamily) : null;      byte[] qualifier = null;      if (qualifiers != null && !qualifiers.isEmpty()) {        qualifier = qualifiers.pollFirst();      }      if (scan.getFilter() == null && qualifier == null)        scan.setFilter(new FirstKeyOnlyFilter());      scanner = env.getRegion().getScanner(scan);      boolean hasMoreRows = false;      do {        hasMoreRows = scanner.next(results);        if (results.size() > 0) {          counter++;        }        results.clear();      } while (hasMoreRows);      ByteBuffer bb = ByteBuffer.allocate(8).putLong(counter);      bb.rewind();      response = AggregateResponse.newBuilder().addFirstPart(           ByteString.copyFrom(bb)).build();    } catch (IOException e) {      ResponseConverter.setControllerException(controller, e);    } finally {      if (scanner != null) {        try {          scanner.close();        } catch (IOException ignored) {}      }    }    log.info("Row counter from this region is "        + env.getRegion().getRegionNameAsString() + ": " + counter);    done.run(response);  }  /**   * Gives a Pair with first object as Sum and second object as row count,   * computed for a given combination of column qualifier and column family in   * the given row range as defined in the Scan object. In its current   * implementation, it takes one column family and one column qualifier (if   * provided). In case of null column qualifier, an aggregate sum over all the   * entire column family will be returned.   * <p>   * The average is computed in   * AggregationClient#avg(byte[], ColumnInterpreter, Scan) by   * processing results from all regions, so its "ok" to pass sum and a Long   * type.   */  @Override  public void getAvg(RpcController controller, AggregateRequest request,      RpcCallback<AggregateResponse> done) {    AggregateResponse response = null;    InternalScanner scanner = null;    try {      ColumnInterpreter<T, S, P, Q, R> ci = constructColumnInterpreterFromRequest(request);      S sumVal = null;      Long rowCountVal = 0l;      Scan scan = ProtobufUtil.toScan(request.getScan());      scanner = env.getRegion().getScanner(scan);      byte[] colFamily = scan.getFamilies()[0];      NavigableSet<byte[]> qualifiers = scan.getFamilyMap().get(colFamily);      byte[] qualifier = null;      if (qualifiers != null && !qualifiers.isEmpty()) {        qualifier = qualifiers.pollFirst();      }      List<Cell> results = new ArrayList<Cell>();      boolean hasMoreRows = false;          do {        results.clear();        hasMoreRows = scanner.next(results);        for (Cell kv : results) {          sumVal = ci.add(sumVal, ci.castToReturnType(ci.getValue(colFamily,              qualifier, kv)));        }        rowCountVal++;      } while (hasMoreRows);      if (sumVal != null) {        ByteString first = ci.getProtoForPromotedType(sumVal).toByteString();        AggregateResponse.Builder pair = AggregateResponse.newBuilder();        pair.addFirstPart(first);        ByteBuffer bb = ByteBuffer.allocate(8).putLong(rowCountVal);        bb.rewind();        pair.setSecondPart(ByteString.copyFrom(bb));        response = pair.build();      }    } catch (IOException e) {      ResponseConverter.setControllerException(controller, e);    } finally {      if (scanner != null) {        try {          scanner.close();        } catch (IOException ignored) {}      }    }    done.run(response);  }  /**   * Gives a Pair with first object a List containing Sum and sum of squares,   * and the second object as row count. 
It is computed for a given combination of   * column qualifier and column family in the given row range as defined in the   * Scan object. In its current implementation, it takes one column family and   * one column qualifier (if provided). The idea is get the value of variance first:   * the average of the squares less the square of the average a standard   * deviation is square root of variance.   */  @Override  public void getStd(RpcController controller, AggregateRequest request,      RpcCallback<AggregateResponse> done) {    InternalScanner scanner = null;    AggregateResponse response = null;    try {      ColumnInterpreter<T, S, P, Q, R> ci = constructColumnInterpreterFromRequest(request);      S sumVal = null, sumSqVal = null, tempVal = null;      long rowCountVal = 0l;      Scan scan = ProtobufUtil.toScan(request.getScan());      scanner = env.getRegion().getScanner(scan);      byte[] colFamily = scan.getFamilies()[0];      NavigableSet<byte[]> qualifiers = scan.getFamilyMap().get(colFamily);      byte[] qualifier = null;      if (qualifiers != null && !qualifiers.isEmpty()) {        qualifier = qualifiers.pollFirst();      }      List<Cell> results = new ArrayList<Cell>();      boolean hasMoreRows = false;          do {        tempVal = null;        hasMoreRows = scanner.next(results);        for (Cell kv : results) {          tempVal = ci.add(tempVal, ci.castToReturnType(ci.getValue(colFamily,              qualifier, kv)));        }        results.clear();        sumVal = ci.add(sumVal, tempVal);        sumSqVal = ci.add(sumSqVal, ci.multiply(tempVal, tempVal));        rowCountVal++;      } while (hasMoreRows);      if (sumVal != null) {        ByteString first_sumVal = ci.getProtoForPromotedType(sumVal).toByteString();        ByteString first_sumSqVal = ci.getProtoForPromotedType(sumSqVal).toByteString();        AggregateResponse.Builder pair = AggregateResponse.newBuilder();        pair.addFirstPart(first_sumVal);        pair.addFirstPart(first_sumSqVal);        ByteBuffer bb = ByteBuffer.allocate(8).putLong(rowCountVal);        bb.rewind();        pair.setSecondPart(ByteString.copyFrom(bb));        response = pair.build();      }    } catch (IOException e) {      ResponseConverter.setControllerException(controller, e);    } finally {      if (scanner != null) {        try {          scanner.close();        } catch (IOException ignored) {}      }    }    done.run(response);  }  /**   * Gives a List containing sum of values and sum of weights.   * It is computed for the combination of column   * family and column qualifier(s) in the given row range as defined in the   * Scan object. In its current implementation, it takes one column family and   * two column qualifiers. The first qualifier is for values column and    * the second qualifier (optional) is for weight column.   
*/  @Override  public void getMedian(RpcController controller, AggregateRequest request,      RpcCallback<AggregateResponse> done) {    AggregateResponse response = null;    InternalScanner scanner = null;    try {      ColumnInterpreter<T, S, P, Q, R> ci = constructColumnInterpreterFromRequest(request);      S sumVal = null, sumWeights = null, tempVal = null, tempWeight = null;      Scan scan = ProtobufUtil.toScan(request.getScan());      scanner = env.getRegion().getScanner(scan);      byte[] colFamily = scan.getFamilies()[0];      NavigableSet<byte[]> qualifiers = scan.getFamilyMap().get(colFamily);      byte[] valQualifier = null, weightQualifier = null;      if (qualifiers != null && !qualifiers.isEmpty()) {        valQualifier = qualifiers.pollFirst();        // if weighted median is requested, get qualifier for the weight column        weightQualifier = qualifiers.pollLast();      }      List<Cell> results = new ArrayList<Cell>();      boolean hasMoreRows = false;          do {        tempVal = null;        tempWeight = null;        hasMoreRows = scanner.next(results);        for (Cell kv : results) {          tempVal = ci.add(tempVal, ci.castToReturnType(ci.getValue(colFamily,              valQualifier, kv)));          if (weightQualifier != null) {            tempWeight = ci.add(tempWeight,                ci.castToReturnType(ci.getValue(colFamily, weightQualifier, kv)));          }        }        results.clear();        sumVal = ci.add(sumVal, tempVal);        sumWeights = ci.add(sumWeights, tempWeight);      } while (hasMoreRows);      ByteString first_sumVal = ci.getProtoForPromotedType(sumVal).toByteString();      S s = sumWeights == null ? ci.castToReturnType(ci.getMinValue()) : sumWeights;      ByteString first_sumWeights = ci.getProtoForPromotedType(s).toByteString();      AggregateResponse.Builder pair = AggregateResponse.newBuilder();      pair.addFirstPart(first_sumVal);      pair.addFirstPart(first_sumWeights);       response = pair.build();    } catch (IOException e) {      ResponseConverter.setControllerException(controller, e);    } finally {      if (scanner != null) {        try {          scanner.close();        } catch (IOException ignored) {}      }    }    done.run(response);  }  @SuppressWarnings("unchecked")  ColumnInterpreter<T,S,P,Q,R> constructColumnInterpreterFromRequest(      AggregateRequest request) throws IOException {    String className = request.getInterpreterClassName();    Class<?> cls;    try {      cls = Class.forName(className);      ColumnInterpreter<T,S,P,Q,R> ci = (ColumnInterpreter<T, S, P, Q, R>) cls.newInstance();      if (request.hasInterpreterSpecificBytes()) {        ByteString b = request.getInterpreterSpecificBytes();        P initMsg = ProtobufUtil.getParsedGenericInstance(ci.getClass(), 2, b);        ci.initialize(initMsg);      }      return ci;    } catch (ClassNotFoundException e) {      throw new IOException(e);    } catch (InstantiationException e) {      throw new IOException(e);    } catch (IllegalAccessException e) {      throw new IOException(e);    }  }  @Override  public Service getService() {    return this;  }  /**   * Stores a reference to the coprocessor environment provided by the   * {@link org.apache.hadoop.hbase.regionserver.RegionCoprocessorHost} from the region where this   * coprocessor is loaded.  Since this is a coprocessor endpoint, it always expects to be loaded   * on a table region, so always expects this to be an instance of   * {@link RegionCoprocessorEnvironment}.   
* @param env the environment provided by the coprocessor host   * @throws IOException if the provided environment is not an instance of   * {@code RegionCoprocessorEnvironment}   */  @Override  public void start(CoprocessorEnvironment env) throws IOException {    if (env instanceof RegionCoprocessorEnvironment) {      this.env = (RegionCoprocessorEnvironment)env;    } else {      throw new CoprocessorException("Must be loaded on a table region!");    }  }  @Override  public void stop(CoprocessorEnvironment env) throws IOException {    // nothing to do  }  }

import java.io.IOException;import java.nio.ByteBuffer;import java.util.ArrayList;import java.util.List;import java.util.Map;import java.util.NavigableMap;import java.util.NavigableSet;import java.util.TreeMap;import java.util.concurrent.atomic.AtomicLong;import org.apache.commons.logging.Log;import org.apache.commons.logging.LogFactory;import org.apache.hadoop.classification.InterfaceAudience;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.hbase.Cell;import org.apache.hadoop.hbase.HConstants;import org.apache.hadoop.hbase.TableName;import org.apache.hadoop.hbase.client.HTable;import org.apache.hadoop.hbase.client.Result;import org.apache.hadoop.hbase.client.ResultScanner;import org.apache.hadoop.hbase.client.Scan;import org.apache.hadoop.hbase.client.coprocessor.Batch;import org.apache.hadoop.hbase.coprocessor.ColumnInterpreter;import org.apache.hadoop.hbase.ipc.BlockingRpcCallback;import org.apache.hadoop.hbase.ipc.ServerRpcController;import org.apache.hadoop.hbase.protobuf.ProtobufUtil;import org.apache.hadoop.hbase.protobuf.generated.AggregateProtos.AggregateRequest;import org.apache.hadoop.hbase.protobuf.generated.AggregateProtos.AggregateResponse;import org.apache.hadoop.hbase.protobuf.generated.AggregateProtos.AggregateService;import org.apache.hadoop.hbase.util.Bytes;import org.apache.hadoop.hbase.util.Pair;import org.apache.hadoop.yarn.webapp.hamlet.HamletSpec.Q;import com.google.protobuf.ByteString;import com.google.protobuf.Message;/** * This client class is for invoking the aggregate functions deployed on the * Region Server side via the AggregateService. This class will implement the * supporting functionality for summing/processing the individual results * obtained from the AggregateService for each region. * <p> * This will serve as the client side handler for invoking the aggregate * functions. * <ul> * For all aggregate functions, * <li>start row < end row is an essential condition (if they are not * {@link HConstants#EMPTY_BYTE_ARRAY}) * <li>Column family can't be null. In case where multiple families are * provided, an IOException will be thrown. An optional column qualifier can * also be defined. * <li>For methods to find maximum, minimum, sum, rowcount, it returns the * parameter type. For average and std, it returns a double value. For row * count, it returns a long value. */@InterfaceAudience.Privatepublic class AggregationClient {private static final Log log = LogFactory.getLog(AggregationClient.class);Configuration conf;/** * Constructor with Conf object *  * @param cfg */public AggregationClient(Configuration cfg) {this.conf = cfg;}/** * It gives the maximum value of a column for a given column family for the * given range. In case qualifier is null, a max of all values for the given * family is returned. *  * @param tableName * @param ci * @param scan * @return max val <R> * @throws Throwable *             The caller is supposed to handle the exception as they are *             thrown & propagated to it. */public <R, S, P extends Message, Q extends Message, T extends Message> R max(final TableName tableName,final ColumnInterpreter<R, S, P, Q, T> ci, final Scan scan)throws Throwable {HTable table = null;try {table = new HTable(conf, tableName);return max(table, ci, scan);} finally {if (table != null) {table.close();}}}/** * It gives the maximum value of a column for a given column family for the * given range. In case qualifier is null, a max of all values for the given * family is returned. 
*  * @param table * @param ci * @param scan * @return max val <R> * @throws Throwable *             The caller is supposed to handle the exception as they are *             thrown & propagated to it. */public <R, S, P extends Message, Q extends Message, T extends Message> R max(final HTable table, final ColumnInterpreter<R, S, P, Q, T> ci,final Scan scan) throws Throwable {final AggregateRequest requestArg = validateArgAndGetPB(scan, ci, false);class MaxCallBack implements Batch.Callback<R> {R max = null;R getMax() {return max;}@Overridepublic synchronized void update(byte[] region, byte[] row, R result) {max = (max == null || (result != null && ci.compare(max, result) < 0)) ? result : max;}}MaxCallBack aMaxCallBack = new MaxCallBack();table.coprocessorService(AggregateService.class, scan.getStartRow(),scan.getStopRow(), new Batch.Call<AggregateService, R>() {@Overridepublic R call(AggregateService instance) throws IOException {ServerRpcController controller = new ServerRpcController();BlockingRpcCallback<AggregateResponse> rpcCallback = new BlockingRpcCallback<AggregateResponse>();instance.getMax(controller, requestArg, rpcCallback);AggregateResponse response = rpcCallback.get();if (controller.failedOnException()) {throw controller.getFailedOn();}if (response.getFirstPartCount() > 0) {ByteString b = response.getFirstPart(0);Q q = ProtobufUtil.getParsedGenericInstance(ci.getClass(), 3, b);return ci.getCellValueFromProto(q);}return null;}}, aMaxCallBack);return aMaxCallBack.getMax();}/* * @param scan *  * @param canFamilyBeAbsent whether column family can be absent in familyMap * of scan */private void validateParameters(Scan scan, boolean canFamilyBeAbsent)throws IOException {if (scan == null|| (Bytes.equals(scan.getStartRow(), scan.getStopRow()) && !Bytes.equals(scan.getStartRow(), HConstants.EMPTY_START_ROW))|| ((Bytes.compareTo(scan.getStartRow(), scan.getStopRow()) > 0) && !Bytes.equals(scan.getStopRow(), HConstants.EMPTY_END_ROW))) {throw new IOException("Agg client Exception: Startrow should be smaller than Stoprow");} else if (!canFamilyBeAbsent) {if (scan.getFamilyMap().size() != 1) {throw new IOException("There must be only one family.");}}}/** * It gives the minimum value of a column for a given column family for the * given range. In case qualifier is null, a min of all values for the given * family is returned. *  * @param tableName * @param ci * @param scan * @return min val <R> * @throws Throwable */public <R, S, P extends Message, Q extends Message, T extends Message> R min(final TableName tableName,final ColumnInterpreter<R, S, P, Q, T> ci, final Scan scan)throws Throwable {HTable table = null;try {table = new HTable(conf, tableName);return min(table, ci, scan);} finally {if (table != null) {table.close();}}}/** * It gives the minimum value of a column for a given column family for the * given range. In case qualifier is null, a min of all values for the given * family is returned. *  * @param table * @param ci * @param scan * @return min val <R> * @throws Throwable */public <R, S, P extends Message, Q extends Message, T extends Message> R min(final HTable table, final ColumnInterpreter<R, S, P, Q, T> ci,final Scan scan) throws Throwable {final AggregateRequest requestArg = validateArgAndGetPB(scan, ci, false);class MinCallBack implements Batch.Callback<R> {private R min = null;public R getMinimum() {return min;}@Overridepublic synchronized void update(byte[] region, byte[] row, R result) {min = (min == null || (result != null && ci.compare(result, min) < 0)) ? 
result : min;}}MinCallBack minCallBack = new MinCallBack();table.coprocessorService(AggregateService.class, scan.getStartRow(),scan.getStopRow(), new Batch.Call<AggregateService, R>() {@Overridepublic R call(AggregateService instance) throws IOException {ServerRpcController controller = new ServerRpcController();BlockingRpcCallback<AggregateResponse> rpcCallback = new BlockingRpcCallback<AggregateResponse>();instance.getMin(controller, requestArg, rpcCallback);AggregateResponse response = rpcCallback.get();if (controller.failedOnException()) {throw controller.getFailedOn();}if (response.getFirstPartCount() > 0) {ByteString b = response.getFirstPart(0);Q q = ProtobufUtil.getParsedGenericInstance(ci.getClass(), 3, b);return ci.getCellValueFromProto(q);}return null;}}, minCallBack);log.debug("Min fom all regions is: " + minCallBack.getMinimum());return minCallBack.getMinimum();}/** * It gives the row count, by summing up the individual results obtained * from regions. In case the qualifier is null, FirstKeyValueFilter is used * to optimised the operation. In case qualifier is provided, I can't use * the filter as it may set the flag to skip to next row, but the value read * is not of the given filter: in this case, this particular row will not be * counted ==> an error. *  * @param tableName * @param ci * @param scan * @return <R, S> * @throws Throwable */public <R, S, P extends Message, Q extends Message, T extends Message> long rowCount(final TableName tableName,final ColumnInterpreter<R, S, P, Q, T> ci, final Scan scan)throws Throwable {HTable table = null;try {table = new HTable(conf, tableName);return rowCount(table, ci, scan);} finally {if (table != null) {table.close();}}}/** * It gives the row count, by summing up the individual results obtained * from regions. In case the qualifier is null, FirstKeyValueFilter is used * to optimised the operation. In case qualifier is provided, I can't use * the filter as it may set the flag to skip to next row, but the value read * is not of the given filter: in this case, this particular row will not be * counted ==> an error. *  * @param table * @param ci * @param scan * @return <R, S> * @throws Throwable */public <R, S, P extends Message, Q extends Message, T extends Message> long rowCount(final HTable table, final ColumnInterpreter<R, S, P, Q, T> ci,final Scan scan) throws Throwable {final AggregateRequest requestArg = validateArgAndGetPB(scan, ci, true);class RowNumCallback implements Batch.Callback<Long> {private final AtomicLong rowCountL = new AtomicLong(0);public long getRowNumCount() {return rowCountL.get();}@Overridepublic void update(byte[] region, byte[] row, Long result) {rowCountL.addAndGet(result.longValue());}}RowNumCallback rowNum = new RowNumCallback();table.coprocessorService(AggregateService.class, scan.getStartRow(),scan.getStopRow(), new Batch.Call<AggregateService, Long>() {@Overridepublic Long call(AggregateService instance)throws IOException {ServerRpcController controller = new ServerRpcController();BlockingRpcCallback<AggregateResponse> rpcCallback = new BlockingRpcCallback<AggregateResponse>();instance.getRowNum(controller, requestArg, rpcCallback);AggregateResponse response = rpcCallback.get();if (controller.failedOnException()) {throw controller.getFailedOn();}byte[] bytes = getBytesFromResponse(response.getFirstPart(0));ByteBuffer bb = ByteBuffer.allocate(8).put(bytes);bb.rewind();return bb.getLong();}}, rowNum);return rowNum.getRowNumCount();}/** * It sums up the value returned from various regions. 
In case qualifier is * null, summation of all the column qualifiers in the given family is done. *  * @param tableName * @param ci * @param scan * @return sum <S> * @throws Throwable */public <R, S, P extends Message, Q extends Message, T extends Message> S sum(final TableName tableName,final ColumnInterpreter<R, S, P, Q, T> ci, final Scan scan)throws Throwable {HTable table = null;try {table = new HTable(conf, tableName);return sum(table, ci, scan);} finally {if (table != null) {table.close();}}}/** * It sums up the value returned from various regions. In case qualifier is * null, summation of all the column qualifiers in the given family is done. *  * @param table * @param ci * @param scan * @return sum <S> * @throws Throwable */public <R, S, P extends Message, Q extends Message, T extends Message> S sum(final HTable table, final ColumnInterpreter<R, S, P, Q, T> ci,final Scan scan) throws Throwable {final AggregateRequest requestArg = validateArgAndGetPB(scan, ci, false);class SumCallBack implements Batch.Callback<S> {S sumVal = null;public S getSumResult() {return sumVal;}@Overridepublic synchronized void update(byte[] region, byte[] row, S result) {sumVal = ci.add(sumVal, result);}}SumCallBack sumCallBack = new SumCallBack();table.coprocessorService(AggregateService.class, scan.getStartRow(),scan.getStopRow(), new Batch.Call<AggregateService, S>() {@Overridepublic S call(AggregateService instance) throws IOException {ServerRpcController controller = new ServerRpcController();BlockingRpcCallback<AggregateResponse> rpcCallback = new BlockingRpcCallback<AggregateResponse>();instance.getSum(controller, requestArg, rpcCallback);AggregateResponse response = rpcCallback.get();if (controller.failedOnException()) {throw controller.getFailedOn();}if (response.getFirstPartCount() == 0) {return null;}ByteString b = response.getFirstPart(0);T t = ProtobufUtil.getParsedGenericInstance(ci.getClass(), 4, b);S s = ci.getPromotedValueFromProto(t);return s;}}, sumCallBack);return sumCallBack.getSumResult();}/** * It computes average while fetching sum and row count from all the * corresponding regions. Approach is to compute a global sum of region * level sum and rowcount and then compute the average. *  * @param tableName * @param scan * @throws Throwable */private <R, S, P extends Message, Q extends Message, T extends Message> Pair<S, Long> getAvgArgs(final TableName tableName,final ColumnInterpreter<R, S, P, Q, T> ci, final Scan scan)throws Throwable {HTable table = null;try {table = new HTable(conf, tableName);return getAvgArgs(table, ci, scan);} finally {if (table != null) {table.close();}}}/** * It computes average while fetching sum and row count from all the * corresponding regions. Approach is to compute a global sum of region * level sum and rowcount and then compute the average. 
*  * @param table * @param scan * @throws Throwable */private <R, S, P extends Message, Q extends Message, T extends Message> Pair<S, Long> getAvgArgs(final HTable table, final ColumnInterpreter<R, S, P, Q, T> ci,final Scan scan) throws Throwable {final AggregateRequest requestArg = validateArgAndGetPB(scan, ci, false);class AvgCallBack implements Batch.Callback<Pair<S, Long>> {S sum = null;Long rowCount = 0l;public Pair<S, Long> getAvgArgs() {return new Pair<S, Long>(sum, rowCount);}@Overridepublic synchronized void update(byte[] region, byte[] row,Pair<S, Long> result) {sum = ci.add(sum, result.getFirst());rowCount += result.getSecond();}}AvgCallBack avgCallBack = new AvgCallBack();table.coprocessorService(AggregateService.class, scan.getStartRow(),scan.getStopRow(),new Batch.Call<AggregateService, Pair<S, Long>>() {@Overridepublic Pair<S, Long> call(AggregateService instance)throws IOException {ServerRpcController controller = new ServerRpcController();BlockingRpcCallback<AggregateResponse> rpcCallback = new BlockingRpcCallback<AggregateResponse>();instance.getAvg(controller, requestArg, rpcCallback);AggregateResponse response = rpcCallback.get();if (controller.failedOnException()) {throw controller.getFailedOn();}Pair<S, Long> pair = new Pair<S, Long>(null, 0L);if (response.getFirstPartCount() == 0) {return pair;}ByteString b = response.getFirstPart(0);T t = ProtobufUtil.getParsedGenericInstance(ci.getClass(), 4, b);S s = ci.getPromotedValueFromProto(t);pair.setFirst(s);ByteBuffer bb = ByteBuffer.allocate(8).put(getBytesFromResponse(response.getSecondPart()));bb.rewind();pair.setSecond(bb.getLong());return pair;}}, avgCallBack);return avgCallBack.getAvgArgs();}/** * This is the client side interface/handle for calling the average method * for a given cf-cq combination. It was necessary to add one more call * stack as its return type should be a decimal value, irrespective of what * columninterpreter says. So, this methods collects the necessary * parameters to compute the average and returs the double value. *  * @param tableName * @param ci * @param scan * @return <R, S> * @throws Throwable */public <R, S, P extends Message, Q extends Message, T extends Message> double avg(final TableName tableName,final ColumnInterpreter<R, S, P, Q, T> ci, Scan scan)throws Throwable {Pair<S, Long> p = getAvgArgs(tableName, ci, scan);return ci.divideForAvg(p.getFirst(), p.getSecond());}/** * This is the client side interface/handle for calling the average method * for a given cf-cq combination. It was necessary to add one more call * stack as its return type should be a decimal value, irrespective of what * columninterpreter says. So, this methods collects the necessary * parameters to compute the average and returs the double value. *  * @param table * @param ci * @param scan * @return <R, S> * @throws Throwable */public <R, S, P extends Message, Q extends Message, T extends Message> double avg(final HTable table, final ColumnInterpreter<R, S, P, Q, T> ci,Scan scan) throws Throwable {Pair<S, Long> p = getAvgArgs(table, ci, scan);return ci.divideForAvg(p.getFirst(), p.getSecond());}/** * It computes a global standard deviation for a given column and its value. * Standard deviation is square root of (average of squares - * average*average). From individual regions, it obtains sum, square sum and * number of rows. With these, the above values are computed to get the * global std. 
*  * @param table * @param scan * @return standard deviations * @throws Throwable */private <R, S, P extends Message, Q extends Message, T extends Message> Pair<List<S>, Long> getStdArgs(final HTable table, final ColumnInterpreter<R, S, P, Q, T> ci,final Scan scan) throws Throwable {final AggregateRequest requestArg = validateArgAndGetPB(scan, ci, false);class StdCallback implements Batch.Callback<Pair<List<S>, Long>> {long rowCountVal = 0l;S sumVal = null, sumSqVal = null;public Pair<List<S>, Long> getStdParams() {List<S> l = new ArrayList<S>();l.add(sumVal);l.add(sumSqVal);Pair<List<S>, Long> p = new Pair<List<S>, Long>(l, rowCountVal);return p;}@Overridepublic synchronized void update(byte[] region, byte[] row,Pair<List<S>, Long> result) {if (result.getFirst().size() > 0) {sumVal = ci.add(sumVal, result.getFirst().get(0));sumSqVal = ci.add(sumSqVal, result.getFirst().get(1));rowCountVal += result.getSecond();}}}StdCallback stdCallback = new StdCallback();table.coprocessorService(AggregateService.class, scan.getStartRow(),scan.getStopRow(),new Batch.Call<AggregateService, Pair<List<S>, Long>>() {@Overridepublic Pair<List<S>, Long> call(AggregateService instance)throws IOException {ServerRpcController controller = new ServerRpcController();BlockingRpcCallback<AggregateResponse> rpcCallback = new BlockingRpcCallback<AggregateResponse>();instance.getStd(controller, requestArg, rpcCallback);AggregateResponse response = rpcCallback.get();if (controller.failedOnException()) {throw controller.getFailedOn();}Pair<List<S>, Long> pair = new Pair<List<S>, Long>(new ArrayList<S>(), 0L);if (response.getFirstPartCount() == 0) {return pair;}List<S> list = new ArrayList<S>();for (int i = 0; i < response.getFirstPartCount(); i++) {ByteString b = response.getFirstPart(i);T t = ProtobufUtil.getParsedGenericInstance(ci.getClass(), 4, b);S s = ci.getPromotedValueFromProto(t);list.add(s);}pair.setFirst(list);ByteBuffer bb = ByteBuffer.allocate(8).put(getBytesFromResponse(response.getSecondPart()));bb.rewind();pair.setSecond(bb.getLong());return pair;}}, stdCallback);return stdCallback.getStdParams();}/** * This is the client side interface/handle for calling the std method for a * given cf-cq combination. It was necessary to add one more call stack as * its return type should be a decimal value, irrespective of what * columninterpreter says. So, this methods collects the necessary * parameters to compute the std and returns the double value. *  * @param tableName * @param ci * @param scan * @return <R, S> * @throws Throwable */public <R, S, P extends Message, Q extends Message, T extends Message> double std(final TableName tableName, ColumnInterpreter<R, S, P, Q, T> ci,Scan scan) throws Throwable {HTable table = null;try {table = new HTable(conf, tableName);return std(table, ci, scan);} finally {if (table != null) {table.close();}}}/** * This is the client side interface/handle for calling the std method for a * given cf-cq combination. It was necessary to add one more call stack as * its return type should be a decimal value, irrespective of what * columninterpreter says. So, this methods collects the necessary * parameters to compute the std and returns the double value. 
*  * @param table * @param ci * @param scan * @return <R, S> * @throws Throwable */public <R, S, P extends Message, Q extends Message, T extends Message> double std(final HTable table, ColumnInterpreter<R, S, P, Q, T> ci, Scan scan)throws Throwable {Pair<List<S>, Long> p = getStdArgs(table, ci, scan);double res = 0d;double avg = ci.divideForAvg(p.getFirst().get(0), p.getSecond());double avgOfSumSq = ci.divideForAvg(p.getFirst().get(1), p.getSecond());res = avgOfSumSq - (avg) * (avg); // varianceres = Math.pow(res, 0.5);return res;}/** * It helps locate the region with median for a given column whose weight is * specified in an optional column. From individual regions, it obtains sum * of values and sum of weights. *  * @param table * @param ci * @param scan * @return pair whose first element is a map between start row of the region *         and (sum of values, sum of weights) for the region, the second *         element is (sum of values, sum of weights) for all the regions *         chosen * @throws Throwable */private <R, S, P extends Message, Q extends Message, T extends Message> Pair<NavigableMap<byte[], List<S>>, List<S>> getMedianArgs(final HTable table, final ColumnInterpreter<R, S, P, Q, T> ci,final Scan scan) throws Throwable {final AggregateRequest requestArg = validateArgAndGetPB(scan, ci, false);final NavigableMap<byte[], List<S>> map = new TreeMap<byte[], List<S>>(Bytes.BYTES_COMPARATOR);class StdCallback implements Batch.Callback<List<S>> {S sumVal = null, sumWeights = null;public Pair<NavigableMap<byte[], List<S>>, List<S>> getMedianParams() {List<S> l = new ArrayList<S>();l.add(sumVal);l.add(sumWeights);Pair<NavigableMap<byte[], List<S>>, List<S>> p = new Pair<NavigableMap<byte[], List<S>>, List<S>>(map, l);return p;}@Overridepublic synchronized void update(byte[] region, byte[] row,List<S> result) {map.put(row, result);sumVal = ci.add(sumVal, result.get(0));sumWeights = ci.add(sumWeights, result.get(1));}}StdCallback stdCallback = new StdCallback();table.coprocessorService(AggregateService.class, scan.getStartRow(),scan.getStopRow(), new Batch.Call<AggregateService, List<S>>() {@Overridepublic List<S> call(AggregateService instance)throws IOException {ServerRpcController controller = new ServerRpcController();BlockingRpcCallback<AggregateResponse> rpcCallback = new BlockingRpcCallback<AggregateResponse>();instance.getMedian(controller, requestArg, rpcCallback);AggregateResponse response = rpcCallback.get();if (controller.failedOnException()) {throw controller.getFailedOn();}List<S> list = new ArrayList<S>();for (int i = 0; i < response.getFirstPartCount(); i++) {ByteString b = response.getFirstPart(i);T t = ProtobufUtil.getParsedGenericInstance(ci.getClass(), 4, b);S s = ci.getPromotedValueFromProto(t);list.add(s);}return list;}}, stdCallback);return stdCallback.getMedianParams();}/** * This is the client side interface/handler for calling the median method * for a given cf-cq combination. This method collects the necessary * parameters to compute the median and returns the median. 
*  * @param tableName * @param ci * @param scan * @return R the median * @throws Throwable */public <R, S, P extends Message, Q extends Message, T extends Message> R median(final TableName tableName, ColumnInterpreter<R, S, P, Q, T> ci,Scan scan) throws Throwable {HTable table = null;try {table = new HTable(conf, tableName);return median(table, ci, scan);} finally {if (table != null) {table.close();}}}/** * This is the client side interface/handler for calling the median method * for a given cf-cq combination. This method collects the necessary * parameters to compute the median and returns the median. *  * @param table * @param ci * @param scan * @return R the median * @throws Throwable */public <R, S, P extends Message, Q extends Message, T extends Message> R median(final HTable table, ColumnInterpreter<R, S, P, Q, T> ci, Scan scan)throws Throwable {Pair<NavigableMap<byte[], List<S>>, List<S>> p = getMedianArgs(table,ci, scan);byte[] startRow = null;byte[] colFamily = scan.getFamilies()[0];NavigableSet<byte[]> quals = scan.getFamilyMap().get(colFamily);NavigableMap<byte[], List<S>> map = p.getFirst();S sumVal = p.getSecond().get(0);S sumWeights = p.getSecond().get(1);double halfSumVal = ci.divideForAvg(sumVal, 2L);double movingSumVal = 0;boolean weighted = false;if (quals.size() > 1) {weighted = true;halfSumVal = ci.divideForAvg(sumWeights, 2L);}for (Map.Entry<byte[], List<S>> entry : map.entrySet()) {S s = weighted ? entry.getValue().get(1) : entry.getValue().get(0);double newSumVal = movingSumVal + ci.divideForAvg(s, 1L);if (newSumVal > halfSumVal)break; // we found the region with the medianmovingSumVal = newSumVal;startRow = entry.getKey();}// scan the region with median and find itScan scan2 = new Scan(scan);// inherit stop row from method parameterif (startRow != null)scan2.setStartRow(startRow);ResultScanner scanner = null;try {int cacheSize = scan2.getCaching();if (!scan2.getCacheBlocks() || scan2.getCaching() < 2) {scan2.setCacheBlocks(true);cacheSize = 5;scan2.setCaching(cacheSize);}scanner = table.getScanner(scan2);Result[] results = null;byte[] qualifier = quals.pollFirst();// qualifier for the weight columnbyte[] weightQualifier = weighted ? 
quals.pollLast() : qualifier;R value = null;do {results = scanner.next(cacheSize);if (results != null && results.length > 0) {for (int i = 0; i < results.length; i++) {Result r = results[i];// retrieve weightCell kv = r.getColumnLatest(colFamily, weightQualifier);R newValue = ci.getValue(colFamily, weightQualifier, kv);S s = ci.castToReturnType(newValue);double newSumVal = movingSumVal+ ci.divideForAvg(s, 1L);// see if we have moved past the medianif (newSumVal > halfSumVal) {return value;}movingSumVal = newSumVal;kv = r.getColumnLatest(colFamily, qualifier);value = ci.getValue(colFamily, qualifier, kv);}}} while (results != null && results.length > 0);} finally {if (scanner != null) {scanner.close();}}return null;}<R, S, P extends Message, Q extends Message, T extends Message> AggregateRequest validateArgAndGetPB(Scan scan, ColumnInterpreter<R, S, P, Q, T> ci,boolean canFamilyBeAbsent) throws IOException {validateParameters(scan, canFamilyBeAbsent);final AggregateRequest.Builder requestBuilder = AggregateRequest.newBuilder();requestBuilder.setInterpreterClassName(ci.getClass().getCanonicalName());P columnInterpreterSpecificData = null;if ((columnInterpreterSpecificData = ci.getRequestData()) != null) {requestBuilder.setInterpreterSpecificBytes(columnInterpreterSpecificData.toByteString());}requestBuilder.setScan(ProtobufUtil.toScan(scan));return requestBuilder.build();}byte[] getBytesFromResponse(ByteString response) {ByteBuffer bb = response.asReadOnlyByteBuffer();bb.rewind();byte[] bytes;if (bb.hasArray()) {bytes = bb.array();} else {bytes = response.toByteArray();}return bytes;}}

How to use these classes:

1. First, package these two classes into a jar and upload it to HDFS. For example, I uploaded mine to hdfs://master68:8020/sharelib/aggregate.jar.
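The upload is usually done with the hadoop command-line client, but it can also be done programmatically. A minimal sketch, assuming the jar was built locally at /tmp/aggregate.jar (hypothetical path; the wrapper class name UploadAggregateJar is mine) and the same namenode address as above:

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class UploadAggregateJar {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // connect to the same namenode that will be referenced in the coprocessor jar path
    FileSystem fs = FileSystem.get(URI.create("hdfs://master68:8020"), conf);
    // local path to the jar built from the two classes -- adjust to where you built it
    fs.copyFromLocalFile(new Path("/tmp/aggregate.jar"),
        new Path("/sharelib/aggregate.jar"));
    fs.close();
  }
}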

2. Attach the coprocessor to the target table; here I assume it is attached to the userinfo table:

public static void main(String[] args) throws Exception {
  byte[] tableName = Bytes.toBytes("userinfo");
  Configuration conf = HBaseConfiguration.create();
  HBaseAdmin admin = new HBaseAdmin(conf);
  // the table must be disabled before its descriptor can be modified
  admin.disableTable(tableName);
  HTableDescriptor htd = admin.getTableDescriptor(tableName);
  // register the endpoint: class name, jar location on HDFS, priority, no extra arguments
  htd.addCoprocessor(AggregateImplementation.class.getName(),
      new Path("hdfs://master68:8020/sharelib/aggregate.jar"), 1001, null);
  //htd.removeCoprocessor(RowCountEndpoint.class.getName());
  admin.modifyTable(tableName, htd);
  admin.enableTable(tableName);
  admin.close();
}
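Once the table is re-enabled, it is worth confirming that the endpoint was actually registered before running any aggregations, since a wrong jar path or class name may keep the regions from opening cleanly. A minimal check, assuming the same userinfo table and the backported AggregateImplementation class from above (the wrapper class name CheckCoprocessor is mine):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.util.Bytes;

public class CheckCoprocessor {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    HBaseAdmin admin = new HBaseAdmin(conf);
    try {
      HTableDescriptor htd = admin.getTableDescriptor(Bytes.toBytes("userinfo"));
      // hasCoprocessor() matches against the fully qualified class name
      System.out.println("aggregate endpoint registered: "
          + htd.hasCoprocessor(AggregateImplementation.class.getName()));
    } finally {
      admin.close();
    }
  }
}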
3. Write a small test class to try it out:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.coprocessor.LongColumnInterpreter;

public class TestAggregationClient {

  private static Configuration hbaseConfig = null;

  static {
    hbaseConfig = HBaseConfiguration.create();
  }

  public static void main(String[] args) throws Throwable {
    AggregationClient client = new AggregationClient(hbaseConfig);
    final Scan scan = new Scan();
    long count = client.rowCount(TableName.valueOf("userinfo"),
        new LongColumnInterpreter(), scan);
    System.out.println(count);
  }
}
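The client exposes the other aggregate endpoints in the same way. The sketch below shows max, min, sum and avg calls against the same userinfo table; it assumes a column info:age (hypothetical family/qualifier, adjust to your schema) whose values are 8-byte longs written with Bytes.toBytes(long), which is what LongColumnInterpreter expects:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.coprocessor.LongColumnInterpreter;
import org.apache.hadoop.hbase.util.Bytes;

public class MoreAggregations {
  public static void main(String[] args) throws Throwable {
    Configuration conf = HBaseConfiguration.create();
    AggregationClient client = new AggregationClient(conf);
    TableName table = TableName.valueOf("userinfo");

    // max/min/sum/avg require exactly one column family on the Scan;
    // the qualifier is optional, but without it every column in the family is aggregated
    Scan scan = new Scan();
    scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("age"));

    LongColumnInterpreter ci = new LongColumnInterpreter();
    Long max = client.max(table, ci, scan);   // largest value in the row range
    Long min = client.min(table, ci, scan);   // smallest value
    Long sum = client.sum(table, ci, scan);   // sum of all values
    double avg = client.avg(table, ci, scan); // sum / row count, returned as a double

    System.out.println("max=" + max + " min=" + min + " sum=" + sum + " avg=" + avg);
  }
}

Each call fans out one RPC per region covered by the scan range and merges the per-region results on the client, which is exactly what the Batch.Call / Batch.Callback pairs inside AggregationClient above implement.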


