18. UDF Programming in Hive



Based on the course's walkthrough of writing a custom UDF, work through the example, summarize the development steps, and complete the extra requirement, as follows: 1) Following the course's UDF programming example, complete the exercise and summarize the steps for developing a UDF, with commented code; the emphasis is on a clear line of reasoning. 2) Write a custom UDF that strips the double quotes from the data fields shown below, and test it.

  1. Create a new Maven project.
  2. Configure pom.xml.

<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <hive.version>0.13.1</hive.version>
    <hadoop.version>2.5.0</hadoop.version>
</properties>

<dependencies>
    <dependency>
        <groupId>jdk.tools</groupId>
        <artifactId>jdk.tools</artifactId>
        <version>1.7</version>
        <scope>system</scope>
        <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
    </dependency>
    <!-- Hadoop client -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <!-- Hive client -->
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-jdbc</artifactId>
        <version>${hive.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-exec</artifactId>
        <version>${hive.version}</version>
    </dependency>
</dependencies>
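Note that jdk.tools is a system-scope dependency resolved from ${JAVA_HOME}/lib/tools.jar, so JAVA_HOME must point at a JDK (1.7 here) rather than a JRE. Once the UDF class below compiles, running mvn package builds the jar that gets registered in Hive in the final step (here /home/strsfilter.jar).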

  3. Write the UDF class.

package com.hyhc.hiveudf;

import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

// A UDF operates on a single row and produces a single row.
// A custom UDF must extend UDF and implement at least one evaluate() method.
// evaluate() is not declared in the UDF base class itself, but Hive checks
// (via reflection) that the class provides one.
public class StrFilter extends UDF {

    private Text result = new Text();

    public Text evaluate(Text str) {
        try {
            if (str.toString().indexOf("/") != -1) {
                // Input contains "/" (e.g. a request field such as
                // "GET /index.html HTTP/1.1"): strip the quotes, then return
                // the substring between the first space at or after index 2
                // and the first space at or after index 4 (here, the path).
                String str2 = StringFilter(str.toString());
                result.set(str2.substring(str2.indexOf(" ", 2), str2.indexOf(" ", 4)).trim());
                return result;
            } else {
                // No "/": just strip the quotes and append ":".
                result.set(StringFilter(str.toString()) + ":");
                return result;
            }
        } catch (Exception e) {
            // Malformed (or null) input yields NULL in the query result.
            return null;
        }
    }

    // Strip special characters from the input; here only double quotes.
    public static String StringFilter(String str) throws PatternSyntaxException {
        // A broader filter could use a character class such as:
        // String regEx = "[`~!@#$%^&*()+=|{}':;',\\[\\].<>/?~!@¥%……&*()——+|{}【】#;:”“’。,、?]";
        String regEx = "[\"]";
        Pattern p = Pattern.compile(regEx);
        Matcher m = p.matcher(str);
        return m.replaceAll("").trim();
    }
}
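To sanity-check the logic before deploying, the class can be driven locally. This small test is not from the original post, and the sample values are hypothetical, shaped like the quoted fields the table below parses:

package com.hyhc.hiveudf;

import org.apache.hadoop.io.Text;

// Minimal local smoke test for StrFilter.
public class StrFilterTest {
    public static void main(String[] args) {
        StrFilter udf = new StrFilter();
        // Contains "/": quotes stripped, then the substring between the
        // located spaces is returned -> prints /index.html
        System.out.println(udf.evaluate(new Text("\"GET /index.html HTTP/1.1\"")));
        // No "/": quotes stripped and ":" appended -> prints 200:
        System.out.println(udf.evaluate(new Text("\"200\"")));
    }
}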

 

  4. Create the table and load the data.

CREATE TABLE logs (
  host STRING,
  identity STRING,
  time STRING,
  request STRING,
  status STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
  "input.regex" = "(\".*?\") (\".*?\") (\".*?\") (\".*?\") (\".*?\")",
  "output.format.string" = "%1$s %2$s %3$s %4$s %5$s")
STORED AS TEXTFILE;
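The source data appeared in the original post only as a screenshot. A line of /home/access.log that this regex parses consists of five quoted, space-separated fields, for example (hypothetical values):

"192.168.1.100" "-" "31/Aug/2015:00:04:37 +0800" "GET /index.html HTTP/1.1" "200"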

Load the data and run a quick query to confirm the table parses:

load data local inpath '/home/access.log' into table logs;
select host, request from logs limit 100;


  5. Register the UDF in Hive.

add jar /home/strsfilter.jar;
create temporary function strfilter as 'com.hyhc.hiveudf.StrFilter';

Check the result.
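For example, applying the UDF to the request column (a sketch; it assumes request holds quoted strings like "GET /index.html HTTP/1.1", in which case the bare path comes back):

select strfilter(request) from logs limit 10;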
