写hive的udf函数

来源：互联网 | 发布：淘宝店铺突然没流量 | 编辑：程序博客网 | 时间：2024/06/05 22:30

最近感受到了Hive的UDF函数的强大威力：不仅可以使用很多已有的UDF函数，还可以自己定义符合业务场景的UDF函数。下面就说一下如何写UDF/UDAF/UDTF函数，算是一个入门介绍吧。

First, you need to create a new class that extends UDF, with one or more methods named evaluate.

[html] view plain copy
 
package com.example.hive.udf;  
  
import org.apache.hadoop.hive.ql.exec.UDF;  
import org.apache.hadoop.io.Text;  
  
public final class Lower extends UDF {  
  public Text evaluate(final Text s) {  
    if (s == null) { return null; }  
    return new Text(s.toString().toLowerCase());  
  }  
}

After compiling your code to a jar, you need to add this to the hive classpath.

[html] view plain copy
-- Add the compiled UDF jar to the Hive classpath.
add jar my_jar.jar;  

Once Hive is started up with your jars in the classpath, the final step is to register your function:

[html] view plain copy
 
-- Register the UDF class under the temporary function name my_lower.
create temporary function my_lower as 'com.example.hive.udf.Lower';

上面主要描述了实现一个UDF的过程：首先自然是实现一个UDF类，然后编译为jar并加入到Hive的classpath中，最后创建一个临时函数（temporary function），以便在Hive中调用。

下面这个表格可以更加清晰地看出UDF/UDAF/UDTF之间的区别（简单来说：UDF是一行输入对应一行输出；UDAF是多行输入聚合为一行输出；UDTF是一行输入可以产生多行输出）：

Show几个例子：

1） UDF （参考：http://svn.apache.org/repos/asf/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/udf/）

[html] view plain copy
package org.apache.hadoop.hive.contrib.udf.example;  
  
import org.apache.hadoop.hive.ql.exec.UDF;  
  
/**  
 * UDFExampleAdd.  
 *  
 */  
public class UDFExampleAdd extends UDF {  
  
  public Integer evaluate(Integer... a) {  
    int total = 0;  
    for (Integer element : a) {  
      if (element != null) {  
        total += element;  
      }  
    }  
    return total;  
  }  
  
  public Double evaluate(Double... a) {  
    double total = 0;  
    for (Double element : a) {  
      if (element != null) {  
        total += element;  
      }  
    }  
    return total;  
  }  
  
}  

2）UDAF（http://svn.apache.org/repos/asf/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/udaf/）

[html] view plain copy
package org.apache.hadoop.hive.contrib.udaf.example;  
  
import org.apache.hadoop.hive.ql.exec.UDAF;  
import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;  
  
/**  
 * This is a simple UDAF that calculates average.  
 *   
 * It should be very easy to follow and can be used as an example for writing  
 * new UDAFs.  
 *   
 * Note that Hive internally uses a different mechanism (called GenericUDAF) to  
 * implement built-in aggregation functions, which are harder to program but  
 * more efficient.  
 *   
 */  
public final class UDAFExampleAvg extends UDAF {  
  
  /**  
   * The internal state of an aggregation for average.  
   *   
   * Note that this is only needed if the internal state cannot be represented  
   * by a primitive.  
   *   
   * The internal state can also contains fields with types like  
   * ArrayList<String> and HashMap<String,Double> if needed.  
   */  
  public static class UDAFAvgState {  
    private long mCount;  
    private double mSum;  
  }  
  
  /**  
   * The actual class for doing the aggregation. Hive will automatically look  
   * for all internal classes of the UDAF that implements UDAFEvaluator.  
   */  
  public static class UDAFExampleAvgEvaluator implements UDAFEvaluator {  
  
    UDAFAvgState state;  
  
    public UDAFExampleAvgEvaluator() {  
      super();  
      state = new UDAFAvgState();  
      init();  
    }  
  
    /**  
     * Reset the state of the aggregation.  
     */  
    public void init() {  
      state.mSum = 0;  
      state.mCount = 0;  
    }  
  
    /**  
     * Iterate through one row of original data.  
     *   
     * The number and type of arguments need to the same as we call this UDAF  
     * from Hive command line.  
     *   
     * This function should always return true.  
     */  
    public boolean iterate(Double o) {  
      if (o != null) {  
        state.mSum += o;  
        state.mCount++;  
      }  
      return true;  
    }  
  
    /**  
     * Terminate a partial aggregation and return the state. If the state is a  
     * primitive, just return primitive Java classes like Integer or String.  
     */  
    public UDAFAvgState terminatePartial() {  
      // This is SQL standard - average of zero items should be null.  
      return state.mCount == 0 ? null : state;  
    }  
  
    /**  
     * Merge with a partial aggregation.  
     *   
     * This function should always have a single argument which has the same  
     * type as the return value of terminatePartial().  
     */  
    public boolean merge(UDAFAvgState o) {  
      if (o != null) {  
        state.mSum += o.mSum;  
        state.mCount += o.mCount;  
      }  
      return true;  
    }  
  
    /**  
     * Terminates the aggregation and return the final result.  
     */  
    public Double terminate() {  
      // This is SQL standard - average of zero items should be null.  
      return state.mCount == 0 ? null : Double.valueOf(state.mSum  
          / state.mCount);  
    }  
  }  
  
  private UDAFExampleAvg() {  
    // prevent instantiation  
  }  
  
}  

3）UDTF（http://svn.apache.org/repos/asf/hive/trunk/contrib/src/java/org/apache/hadoop/hive/contrib/udtf/）

[html] view plain copy
package org.apache.hadoop.hive.contrib.udtf.example;  
  
import java.util.ArrayList;  
import java.util.List;  
  
import org.apache.hadoop.hive.ql.exec.Description;  
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;  
import org.apache.hadoop.hive.ql.metadata.HiveException;  
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;  
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;  
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;  
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;  
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;  
  
/**  
 * GenericUDTFExplode2.  
 *  
 */  
@Description(name = "explode2",  
    value = "_FUNC_(a) - like explode, but outputs two identical columns (for testing purposes)")  
public class GenericUDTFExplode2 extends GenericUDTF {  
  
  ListObjectInspector listOI = null;  
  
  @Override  
  public void close() throws HiveException {  
  }  
  
  @Override  
  public StructObjectInspector initialize(ObjectInspector[] args)  
      throws UDFArgumentException {  
  
    if (args.length != 1) {  
      throw new UDFArgumentException("explode() takes only one argument");  
    }  
  
    if (args[0].getCategory() != ObjectInspector.Category.LIST) {  
      throw new UDFArgumentException("explode() takes an array as a parameter");  
    }  
    listOI = (ListObjectInspector) args[0];  
  
    ArrayList<String> fieldNames = new ArrayList<String>();  
    ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();  
    fieldNames.add("col1");  
    fieldNames.add("col2");  
    fieldOIs.add(listOI.getListElementObjectInspector());  
    fieldOIs.add(listOI.getListElementObjectInspector());  
    return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames,  
        fieldOIs);  
  }  
  
  Object forwardObj[] = new Object[2];  
  
  @Override  
  public void process(Object[] o) throws HiveException {  
  
    List<?> list = listOI.getList(o[0]);  
    for (Object r : list) {  
      forwardObj[0] = r;  
      forwardObj[1] = r;  
      forward(forwardObj);  
    }  
  }  
  
  @Override  
  public String toString() {  
    return "explode";  
  }  
}  

阅读全文

0 0