Parsing Hive input with a custom InputFormat (input transform)


Using Hive's SerDe/InputFormat extension mechanism, store and query the following data in Hive:

 

0^^Hadoop^^America^^5000|8000|12000|level8^^male
1^^Spark^^America^^8000|10000|15000|level9^^famale
2^^Flink^^America^^7000|8000|13000|level10^^male
3^^Hadoop^^America^^9000|11000|12000|level10^^famale
4^^Spark^^America^^10000|11000|12000|level12^^male
5^^Flink^^America^^11000|12000|18000|level18^^famale
6^^Hadoop^^America^^15000|16000|19000|level16^^male
7^^Spark^^America^^18000|19000|20000|level20^^male
8^^Flink^^America^^15000|16000|19000|level19^^male

 

Goal: implement a custom InputFormat that decodes the records, giving flexible cleansing of the raw Hive data:

1. Split the fields on ^^

2. Further split the salary field on |
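
For example, the first line 0^^Hadoop^^America^^5000|8000|12000|level8^^male should be cleaned into eight fields: 0, Hadoop, America, 5000, 8000, 12000, level8, male.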

 

Implementation steps:

1. Location of the source data:

root@master:/usr/local/IMF_testdata/hivestudy# ls

  employeesinputformat.txt  IMFInputFormat2.jar

 

2. View the file contents:

root@master:/usr/local/IMF_testdata/hivestudy# cat employeesinputformat.txt

0^^Hadoop^^America^^5000|8000|12000|level8^^male
1^^Spark^^America^^8000|10000|15000|level9^^famale
2^^Flink^^America^^7000|8000|13000|level10^^male
3^^Hadoop^^America^^9000|11000|12000|level10^^famale
4^^Spark^^America^^10000|11000|12000|level12^^male
5^^Flink^^America^^11000|12000|18000|level18^^famale
6^^Hadoop^^America^^15000|16000|19000|level16^^male
7^^Spark^^America^^18000|19000|20000|level20^^male
8^^Flink^^America^^15000|16000|19000|level19^^male

 

3. Develop the InputFormat code (full source appended below) and export it as IMFInputFormat2.jar.

The code parses each line of text with a regular expression:

String patternhive = "^(.*)\\^\\^(.*)\\^\\^(.*)\\^\\^(.*)\\|(.*)\\|(.*)\\|(.*)\\^\\^(.*)";

The line is split on ^^ and |, the capture groups are extracted in order, and the group values are then joined back into a single string with the "\001" (Ctrl-A) separator.

Problem: when the fields were joined with "\t", the rows imported into Hive came back as NULL.

Solution: join the fields with "\001" instead; the data then loads into Hive correctly. (The table below does not declare a ROW FORMAT, so Hive falls back to LazySimpleSerDe, whose default field delimiter is \001; a tab-joined record therefore lands in a single column and the remaining columns deserialize as NULL.)
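
As a quick illustration of this parsing step, the regular expression can be exercised against one sample line in isolation. This is a minimal standalone sketch; the IMFRegexDemo class is made up for illustration and is not part of the project (the project's record reader additionally prefixes group 7 with "IMF"):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Hypothetical demo class (not part of the project): shows how the regex splits one
// raw line into eight groups and how the groups are re-joined with the \001 separator.
public class IMFRegexDemo {
    public static void main(String[] args) {
        String patternhive = "^(.*)\\^\\^(.*)\\^\\^(.*)\\^\\^(.*)\\|(.*)\\|(.*)\\|(.*)\\^\\^(.*)";
        String line = "0^^Hadoop^^America^^5000|8000|12000|level8^^male";
        Matcher m = Pattern.compile(patternhive).matcher(line);
        if (m.find()) {
            StringBuilder sb = new StringBuilder();
            for (int i = 1; i <= m.groupCount(); i++) {
                if (i > 1) {
                    sb.append('\001'); // Hive's default field delimiter (Ctrl-A)
                }
                sb.append(m.group(i));
            }
            // Printed with the delimiter made visible as '|':
            // 0|Hadoop|America|5000|8000|12000|level8|male
            System.out.println(sb.toString().replace('\001', '|'));
        }
    }
}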

 

 

 

4. Operations in Hive:

Drop the old table:

drop table employee_inputformat;

 

Add the jar:

add jar /usr/local/IMF_testdata/hivestudy/IMFInputFormat2.jar;

 

Create the table:

CREATE TABLE employee_InputFormat(
  userid INT, name STRING, address STRING,
  salarys1 INT, salarys2 INT, salarys3 INT, salarys4 STRING, gendre STRING)
STORED AS
  INPUTFORMAT 'com.dt.spark.hive.IMFInputFormat'
  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';

 

Load the data:

LOAD DATA LOCAL INPATH '/usr/local/IMF_testdata/hivestudy/employeesinputformat.txt' INTO TABLE employee_InputFormat;

 

Query the data:

select * from employee_InputFormat;
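
Given the transformation in IMFRecordReader (appended below), which joins the regex groups with \001 and prepends "IMF" to group 7, the first input line should come back from this query as: userid=0, name=Hadoop, address=America, salarys1=5000, salarys2=8000, salarys3=12000, salarys4=IMFlevel8, gendre=male.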

 

 

 

5. The results are as follows:

hive> desc formatted employee_inputformat;
OK
# col_name              data_type               comment

userid                  int
name                    string
address                 string
salarys1                int
salarys2                int
salarys3                int
salarys4                string
gendre                  string

# Detailed Table Information
Database:               default
Owner:                  root
CreateTime:             Sun Dec 11 20:47:21 CST 2016
LastAccessTime:         UNKNOWN
Protect Mode:           None
Retention:              0
Location:               hdfs://master:9000/user/hive/warehouse/employee_inputformat
Table Type:             MANAGED_TABLE
Table Parameters:
        COLUMN_STATS_ACCURATE   true
        numFiles                1
        totalSize               467
        transient_lastDdlTime   1481460441

# Storage Information
SerDe Library:          org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
InputFormat:            com.dt.spark.hive.IMFInputFormat
OutputFormat:           org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
Compressed:             No
Num Buckets:            -1
Bucket Columns:         []
Sort Columns:           []
Storage Desc Params:
        serialization.format    1
Time taken: 0.111 seconds, Fetched: 36 row(s)

hive>

 

 

Appendix: source code of IMFInputFormat:

 

 

package com.dt.spark.hive;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

// Custom InputFormat: split computation and compression handling are inherited from
// TextInputFormat; only getRecordReader is overridden so that each split is read
// through IMFRecordReader, which rewrites the raw lines before Hive sees them.
public class IMFInputFormat extends TextInputFormat implements JobConfigurable {

    public RecordReader<LongWritable, Text> getRecordReader(
            InputSplit genericSplit, JobConf job, Reporter reporter) throws IOException {
        reporter.setStatus(genericSplit.toString());
        return new IMFRecordReader((FileSplit) genericSplit, job);
    }
}

 

Source code of IMFRecordReader:

package com.dt.spark.hive;

import java.io.IOException;
import java.io.InputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.util.LineReader;

public class IMFRecordReader implements RecordReader<LongWritable, Text> {

    private CompressionCodecFactory compressionCodecs = null;
    private long start;
    private long pos;
    private long end;
    private LineReader lineReader;
    int maxLineLength;

    public IMFRecordReader(FileSplit inputSplit, Configuration job) throws IOException {
        maxLineLength = job.getInt("mapred.IMFRecordReader.maxlength", Integer.MAX_VALUE);
        start = inputSplit.getStart();
        end = start + inputSplit.getLength();
        final Path file = inputSplit.getPath();
        compressionCodecs = new CompressionCodecFactory(job);
        final CompressionCodec codec = compressionCodecs.getCodec(file);

        // Open the file and seek to the start of the split.
        FileSystem fs = file.getFileSystem(job);
        FSDataInputStream fileIn = fs.open(file);
        boolean skipFirstLine = false;
        if (codec != null) {
            lineReader = new LineReader(codec.createInputStream(fileIn), job);
            end = Long.MAX_VALUE;
        } else {
            if (start != 0) {
                skipFirstLine = true;
                --start;
                fileIn.seek(start);
            }
            lineReader = new LineReader(fileIn, job);
        }
        if (skipFirstLine) {
            // The previous split owns the partial first line; skip past it.
            start += lineReader.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
        }
        this.pos = start;
    }

    public IMFRecordReader(InputStream in, long offset, long endOffset, int maxLineLength) {
        this.maxLineLength = maxLineLength;
        this.lineReader = new LineReader(in);
        this.start = offset;
        this.pos = offset;
        this.end = endOffset;
    }

    public IMFRecordReader(InputStream in, long offset, long endOffset, Configuration job) throws IOException {
        this.maxLineLength = job.getInt("mapred.IMFRecordReader.maxlength", Integer.MAX_VALUE);
        this.lineReader = new LineReader(in, job);
        this.start = offset;
        this.pos = offset;
        this.end = endOffset;
    }

    public LongWritable createKey() {
        return new LongWritable();
    }

    public Text createValue() {
        return new Text();
    }

    /**
     * Reads the next record in the split and extracts the useful fields from the
     * raw employee line.
     *
     * @param key   key of the record, which maps to the byte offset of the record's line
     * @param value the record in text format
     * @return true if a record existed, false otherwise
     * @throws IOException
     */
    public synchronized boolean next(LongWritable key, Text value) throws IOException {
        // Stay within the split.
        while (pos < end) {
            key.set(pos);
            int newSize = lineReader.readLine(value, maxLineLength,
                    Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));

            if (newSize == 0)
                return false;

            // Parse the raw line: fields separated by ^^, with the salary field further split by |.
            String patternhive = "^(.*)\\^\\^(.*)\\^\\^(.*)\\^\\^(.*)\\|(.*)\\|(.*)\\|(.*)\\^\\^(.*)";
            Pattern phive = Pattern.compile(patternhive);
            String strhive = value.toString();
            Matcher mhive = phive.matcher(strhive);
            String resultstr = "defaultisblank";
            while (mhive.find()) {
                // Re-join the groups with \001, Hive's default field delimiter,
                // and tag group 7 (the level) with an "IMF" prefix.
                resultstr = mhive.group(1) + "\001" + mhive.group(2) + "\001" + mhive.group(3) + "\001"
                        + mhive.group(4) + "\001" + mhive.group(5) + "\001" + mhive.group(6) + "\001"
                        + "IMF" + mhive.group(7) + "\001" + mhive.group(8);
            }

            // Lines that did not match the pattern are skipped; matching lines are emitted.
            if (resultstr != null && !resultstr.equals("defaultisblank")) {
                value.set(resultstr);
                pos += newSize;
                if (newSize < maxLineLength)
                    return true;
            }
        }

        return false;
    }

    public float getProgress() {
        if (start == end) {
            return 0.0f;
        } else {
            return Math.min(1.0f, (pos - start) / (float) (end - start));
        }
    }

    public synchronized long getPos() throws IOException {
        return pos;
    }

    public synchronized void close() throws IOException {
        if (lineReader != null)
            lineReader.close();
    }
}

 

 

 

 

 
