18. UDF Programming in Hive



Based on the course's walkthrough of writing a custom UDF, work through the example, summarize the development steps, and complete the extra requirement, as follows: 1) Following the course's UDF programming example, complete the exercise and summarize the steps for developing a UDF, with commented code; the emphasis is on a clear line of reasoning. 2) Write a custom UDF that strips the double quotes from the data fields shown below, and test it.

  1. Create a new Maven project.
  2. Configure pom.xml.

<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <hive.version>0.13.1</hive.version>
    <hadoop.version>2.5.0</hadoop.version>
</properties>

<dependencies>
    <dependency>
        <groupId>jdk.tools</groupId>
        <artifactId>jdk.tools</artifactId>
        <version>1.7</version>
        <scope>system</scope>
        <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
    </dependency>
    <!-- Hadoop client -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <!-- Hive client -->
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-jdbc</artifactId>
        <version>${hive.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-exec</artifactId>
        <version>${hive.version}</version>
    </dependency>
</dependencies>
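Note that jdk.tools is a system-scope dependency resolved from ${JAVA_HOME}/lib/tools.jar, so JAVA_HOME must point at a JDK (1.7 here) rather than a JRE. Once the UDF class below compiles, running mvn package builds the jar that gets registered in Hive in the final step (here /home/strsfilter.jar).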

  3. Write the UDF class.

package com.hyhc.hiveudf;

import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

// A UDF operates on a single row and produces a single row.
// A custom UDF must extend UDF and implement at least one evaluate() method.
// evaluate() is not declared in the UDF base class itself, but Hive checks
// (via reflection) that the class provides one.
public class StrFilter extends UDF {

    private Text result = new Text();

    public Text evaluate(Text str) {
        try {
            if (str.toString().indexOf("/") != -1) {
                // Input contains "/" (e.g. a request field such as
                // "GET /index.html HTTP/1.1"): strip the quotes, then return
                // the substring between the first space at or after index 2
                // and the first space at or after index 4 (here, the path).
                String str2 = StringFilter(str.toString());
                result.set(str2.substring(str2.indexOf(" ", 2), str2.indexOf(" ", 4)).trim());
                return result;
            } else {
                // No "/": just strip the quotes and append ":".
                result.set(StringFilter(str.toString()) + ":");
                return result;
            }
        } catch (Exception e) {
            // Malformed (or null) input yields NULL in the query result.
            return null;
        }
    }

    // Strip special characters from the input; here only double quotes.
    public static String StringFilter(String str) throws PatternSyntaxException {
        // A broader filter could use a character class such as:
        // String regEx = "[`~!@#$%^&*()+=|{}':;',\\[\\].<>/?~!@¥%……&*()——+|{}【】#;:”“’。,、?]";
        String regEx = "[\"]";
        Pattern p = Pattern.compile(regEx);
        Matcher m = p.matcher(str);
        return m.replaceAll("").trim();
    }
}
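To sanity-check the logic before deploying, the class can be driven locally. This small test is not from the original post, and the sample values are hypothetical, shaped like the quoted fields the table below parses:

package com.hyhc.hiveudf;

import org.apache.hadoop.io.Text;

// Minimal local smoke test for StrFilter.
public class StrFilterTest {
    public static void main(String[] args) {
        StrFilter udf = new StrFilter();
        // Contains "/": quotes stripped, then the substring between the
        // located spaces is returned -> prints /index.html
        System.out.println(udf.evaluate(new Text("\"GET /index.html HTTP/1.1\"")));
        // No "/": quotes stripped and ":" appended -> prints 200:
        System.out.println(udf.evaluate(new Text("\"200\"")));
    }
}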

 

  4. Create the table and load the data.

CREATE TABLE logs (
  host STRING,
  identity STRING,
  time STRING,
  request STRING,
  status STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
  "input.regex" = "(\".*?\") (\".*?\") (\".*?\") (\".*?\") (\".*?\")",
  "output.format.string" = "%1$s %2$s %3$s %4$s %5$s")
STORED AS TEXTFILE;
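The source data appeared in the original post only as a screenshot. A line of /home/access.log that this regex parses consists of five quoted, space-separated fields, for example (hypothetical values):

"192.168.1.100" "-" "31/Aug/2015:00:04:37 +0800" "GET /index.html HTTP/1.1" "200"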

Load the data and run a quick query to confirm the table parses:

load data local inpath '/home/access.log' into table logs;
select host, request from logs limit 100;


  5. Register the UDF in Hive.

add jar /home/strsfilter.jar;
create temporary function strfilter as 'com.hyhc.hiveudf.StrFilter';

Check the result.
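For example, applying the UDF to the request column (a sketch; it assumes request holds quoted strings like "GET /index.html HTTP/1.1", in which case the bare path comes back):

select strfilter(request) from logs limit 10;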
