Hive GenericUDF Example 1


           Compared with a plain UDF, a generic UDF (GenericUDF) supports complex types (such as list and struct) for both input and output.

           Here is a small example.

           The whereme table in Hive holds the itineraries of several people:

A       2013-10-10 8:00:00      home
A       2013-10-10 10:00:00     Super Market
A       2013-10-10 12:00:00     KFC
A       2013-10-10 15:00:00     school
A       2013-10-10 20:00:00     home
A       2013-10-15 8:00:00      home
A       2013-10-15 10:00:00     park
A       2013-10-15 12:00:00     home
A       2013-10-15 15:30:00     bank
A       2013-10-15 19:00:00     home
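The post does not show the DDL for whereme. A minimal sketch, assuming a tab-delimited text file and the column names used by the step-3 query (people, time, place); the local file path is hypothetical:

hive> create table whereme(people string, time string, place string)
    > row format delimited fields terminated by '\t';
hive> load data local inpath '/tmp/whereme.txt' into table whereme;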

           The query should pair each stop with the next stop of the same person on the same day, producing the following result:

A       2013-10-10      08:00:00        home            10:00:00        Super Market
A       2013-10-10      10:00:00        Super Market    12:00:00        KFC
A       2013-10-10      12:00:00        KFC             15:00:00        school
A       2013-10-10      15:00:00        school          20:00:00        home
A       2013-10-15      08:00:00        home            10:00:00        park
A       2013-10-15      10:00:00        park            12:00:00        home
A       2013-10-15      12:00:00        home            15:30:00        bank
A       2013-10-15      15:30:00        bank            19:00:00        home

           1. Write the GenericUDF.

package com.wz.udf;

import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.lazy.LazyString;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.io.Text;

public class helloGenericUDF extends GenericUDF {
    // ObjectInspectors for the three input columns
    private ObjectInspector peopleObj;
    private ObjectInspector timeObj;
    private ObjectInspector placeObj;

    // state carried over from the previous row
    private String strPreTime = "";
    private String strPrePlace = "";
    private String strPrePeople = "";

    // 1. validate the input types; 2. define the output type
    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        if (arguments.length != 3) {
            throw new UDFArgumentLengthException("helloGenericUDF expects exactly 3 arguments");
        }
        peopleObj = arguments[0];
        timeObj = arguments[1];
        placeObj = arguments[2];

        // define the output struct: six string fields
        ArrayList<String> structFieldNames = new ArrayList<String>();
        ArrayList<ObjectInspector> structFieldObjectInspectors = new ArrayList<ObjectInspector>();
        structFieldNames.add("people");
        structFieldNames.add("day");
        structFieldNames.add("from_time");
        structFieldNames.add("from_place");
        structFieldNames.add("to_time");
        structFieldNames.add("to_place");
        for (int i = 0; i < structFieldNames.size(); i++) {
            structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
        }
        return ObjectInspectorFactory.getStandardStructObjectInspector(structFieldNames, structFieldObjectInspectors);
    }

    // called once per row
    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        LazyString lazyPeople = (LazyString) arguments[0].get();
        String strPeople = ((StringObjectInspector) peopleObj).getPrimitiveJavaObject(lazyPeople);
        LazyString lazyTime = (LazyString) arguments[1].get();
        String strTime = ((StringObjectInspector) timeObj).getPrimitiveJavaObject(lazyTime);
        LazyString lazyPlace = (LazyString) arguments[2].get();
        String strPlace = ((StringObjectInspector) placeObj).getPrimitiveJavaObject(lazyPlace);

        Object[] e = new Object[6];
        try {
            if (strPrePeople.equals(strPeople) && IsSameDay(strTime)) {
                // same person and same day: pair the previous stop with the current one
                e[0] = new Text(strPeople);
                e[1] = new Text(GetYearMonthDay(strTime));
                e[2] = new Text(GetTime(strPreTime));
                e[3] = new Text(strPrePlace);
                e[4] = new Text(GetTime(strTime));
                e[5] = new Text(strPlace);
            } else {
                // first record of a person or day: there is no previous stop yet
                e[0] = new Text(strPeople);
                e[1] = new Text(GetYearMonthDay(strTime));
                e[2] = new Text("null");
                e[3] = new Text("null");
                e[4] = new Text(GetTime(strTime));
                e[5] = new Text(strPlace);
            }
        } catch (ParseException ex) {
            throw new HiveException("Cannot parse time value: " + strTime, ex);
        }
        strPrePeople = strPeople;
        strPreTime = strTime;
        strPrePlace = strPlace;
        return e;
    }

    @Override
    public String getDisplayString(String[] children) {
        assert (children.length > 0);
        StringBuilder sb = new StringBuilder();
        sb.append("helloGenericUDF(");
        sb.append(children[0]);
        sb.append(")");
        return sb.toString();
    }

    // whether the current record falls on the same day as the previous one
    private boolean IsSameDay(String strTime) throws ParseException {
        if (strPreTime.isEmpty()) {
            return false;
        }
        String curDay = GetYearMonthDay(strTime);
        String preDay = GetYearMonthDay(strPreTime);
        return curDay.equals(preDay);
    }

    // extract the "yyyy-MM-dd" part of a timestamp
    private String GetYearMonthDay(String strTime) throws ParseException {
        DateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        Date curDate = df.parse(strTime);
        df = new SimpleDateFormat("yyyy-MM-dd");
        return df.format(curDate);
    }

    // extract the "HH:mm:ss" part of a timestamp
    private String GetTime(String strTime) throws ParseException {
        DateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        Date curDate = df.parse(strTime);
        df = new SimpleDateFormat("HH:mm:ss");
        return df.format(curDate);
    }
}
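Before the queries below can call hellogenericudf, the class has to be compiled against the Hive and Hadoop jars, packaged, and registered in the session. The post skips this step; a typical sketch looks as follows (the jar path is hypothetical):

hive> add jar /home/wangzhun/helloGenericUDF.jar;
hive> create temporary function hellogenericudf as 'com.wz.udf.helloGenericUDF';

Note that evaluate() carries state across rows (strPrePeople, strPreTime, strPrePlace), so the pairing is only correct when all records of one person reach a single mapper in timestamp order, as they do in this small single-file example.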

           2. Create two tables in Hive: tmpResult, whose single struct column holds the rows produced by the GenericUDF query, and whereresult, which stores the flattened final result.

hive> create table whereresult(people string,day string,from_time string,from_place string,to_time string,to_place string);
OK
Time taken: 0.287 seconds
hive> create table tmpResult(info struct<people:string,day:string,from_time:string,from_place:string,to_time:string,to_place:string>);
OK
Time taken: 0.074 seconds

           3. Run the GenericUDF query to obtain the final result.

hive> insert overwrite table tmpResult select hellogenericudf(whereme.people,whereme.time,whereme.place) from whereme;
hive> insert overwrite table whereresult select info.people,info.day,info.from_time,info.from_place,info.to_time,info.to_place from tmpResult where info.from_time<>'null';
Total MapReduce jobs = 2
Launching Job 1 out of 2
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_201312022129_0006, Tracking URL = http://localhost:50030/jobdetails.jsp?jobid=job_201312022129_0006
Kill Command = /home/wangzhun/hadoop/hadoop-0.20.2/bin/../bin/hadoop job  -Dmapred.job.tracker=localhost:9001 -kill job_201312022129_0006
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2013-12-02 22:48:40,733 Stage-1 map = 0%,  reduce = 0%
2013-12-02 22:48:49,825 Stage-1 map = 100%,  reduce = 0%
2013-12-02 22:48:52,869 Stage-1 map = 100%,  reduce = 100%
Ended Job = job_201312022129_0006
Ended Job = -383357832, job is filtered out (removed at runtime).
Moving data to: hdfs://localhost:9000/tmp/hive-root/hive_2013-12-02_22-48-24_406_2701579121398466034/-ext-10000
Loading data to table default.whereresult
Deleted hdfs://localhost:9000/user/hive/warehouse/whereresult
Table default.whereresult stats: [num_partitions: 0, num_files: 1, num_rows: 0, total_size: 346, raw_data_size: 0]
8 Rows loaded to whereresult
MapReduce Jobs Launched:
Job 0: Map: 1   HDFS Read: 420 HDFS Write: 346 SUCESS
Total MapReduce CPU Time Spent: 0 msec
OK
Time taken: 29.098 seconds
hive> select * from whereresult;
OK
A       2013-10-10      08:00:00        home            10:00:00        Super Market
A       2013-10-10      10:00:00        Super Market    12:00:00        KFC
A       2013-10-10      12:00:00        KFC             15:00:00        school
A       2013-10-10      15:00:00        school          20:00:00        home
A       2013-10-15      08:00:00        home            10:00:00        park
A       2013-10-15      10:00:00        park            12:00:00        home
A       2013-10-15      12:00:00        home            15:30:00        bank
A       2013-10-15      15:30:00        bank            19:00:00        home
Time taken: 0.105 seconds
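The tmpResult table exists only to stage the struct rows between the two inserts. On Hive versions that support struct field access through a subquery alias, the same result can be produced in a single statement; an untested sketch:

hive> insert overwrite table whereresult
    > select t.info.people, t.info.day, t.info.from_time, t.info.from_place, t.info.to_time, t.info.to_place
    > from (select hellogenericudf(people, time, place) as info from whereme) t
    > where t.info.from_time <> 'null';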


         