Hadoop之自定义格式分隔文件测试笔记
来源:互联网 发布:java导入excel poi 编辑:程序博客网 时间:2024/05/12 16:15
功能
通过重写FileInputFormat类的getSplits方法,按自定义的起止标记(配置项xmlinput.start/xmlinput.end)把输入文件切分成多个分片,每个分片恰好对应一条记录;这里使用xml格式文件作为样例。
注意事项
注:仅测试,非工业版本!仅仅演示如何按自定义格式分隔文件!
XmlInputFormat类
package boa.hadoop.xml;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Input format that cuts a file into one split per record, where a record is
 * the byte range from a configured start tag up to and including the next
 * configured end tag (configuration keys {@link #START_TAG_KEY} and
 * {@link #END_TAG_KEY}). Records are read by {@link XmlRecordReader}.
 *
 * <p>Demo code: every input file is scanned byte by byte while computing the
 * splits, and nested start tags are not supported.
 *
 * @author zhangdapeng
 *
 *         2016-01-23
 */
public class XmlInputFormat extends FileInputFormat<LongWritable, Text> {

    private static final Logger log = LoggerFactory.getLogger(XmlInputFormat.class);

    public static final String START_TAG_KEY = "xmlinput.start";
    public static final String END_TAG_KEY = "xmlinput.end";

    /**
     * Creates the reader for one record split.
     *
     * @param split   the split to read; must be a {@link FileSplit}
     * @param context task context supplying the job configuration
     * @return a reader positioned at the start of the split
     * @throws UncheckedIOException if the underlying file cannot be opened
     *         (previously this was logged and {@code null} was returned,
     *         which only deferred the failure to a NullPointerException)
     */
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
        try {
            return new XmlRecordReader((FileSplit) split, context.getConfiguration());
        } catch (IOException ioe) {
            throw new UncheckedIOException("Error while creating XmlRecordReader", ioe);
        }
    }

    /**
     * Computes the default file splits and refines each of them into one
     * split per start-tag/end-tag block.
     *
     * <p>A block is attributed to the default split in which its start tag
     * begins; its end tag may lie beyond that default split's end.
     *
     * @param job the job whose input paths and configuration are used
     * @return one {@link FileSplit} per complete block found
     * @throws IOException if a file cannot be opened or read
     * @throws IllegalArgumentException if a tag is not configured or is empty
     */
    @Override
    public List<InputSplit> getSplits(JobContext job) throws IOException {
        List<InputSplit> defaultSplits = super.getSplits(job);
        Configuration conf = job.getConfiguration();
        byte[] startTag = requiredTag(conf, START_TAG_KEY);
        byte[] endTag = requiredTag(conf, END_TAG_KEY);

        List<InputSplit> result = new ArrayList<>();
        for (InputSplit genericSplit : defaultSplits) {
            FileSplit fileSplit = (FileSplit) genericSplit;
            Path file = fileSplit.getPath();
            // Scan exactly the byte range of this default split. Scanning from
            // offset 0 for every split would emit the same blocks repeatedly
            // for files that are larger than one default split.
            long scanStart = fileSplit.getStart();
            long scanEnd = scanStart + fileSplit.getLength();
            FileSystem fs = file.getFileSystem(conf);
            try (FSDataInputStream in = fs.open(file)) {
                in.seek(scanStart);
                while (in.getPos() < scanEnd) {
                    if (!readUntilMatch(in, startTag, false, scanEnd)) {
                        break; // no further start tag begins inside this range
                    }
                    long blockStart = in.getPos() - startTag.length;
                    if (!readUntilMatch(in, endTag, true, scanEnd)) {
                        break; // unterminated block: end of file reached
                    }
                    long blockEnd = in.getPos();
                    result.add(new FileSplit(file, blockStart, blockEnd - blockStart, fileSplit.getLocations()));
                    log.info("Adding split start = {} end = {}", blockStart, blockEnd);
                }
            }
        }
        return result;
    }

    /** Reads a mandatory, non-empty tag from the configuration as UTF-8 bytes. */
    private static byte[] requiredTag(Configuration conf, String key) {
        String tag = conf.get(key);
        if (tag == null || tag.isEmpty()) {
            throw new IllegalArgumentException("Configuration property '" + key + "' must be set to a non-empty tag");
        }
        return tag.getBytes(StandardCharsets.UTF_8);
    }

    /**
     * Consumes bytes from {@code in} until {@code match} has been read completely.
     *
     * @param in          stream to read from
     * @param match       byte sequence to look for; must not be empty
     * @param withinBlock when {@code false} the search gives up once the stream
     *                    position reaches {@code end} and no partial match is
     *                    in progress; when {@code true} it only stops at end of file
     * @param end         absolute stop offset used when {@code withinBlock} is false
     * @return {@code true} if the sequence was found, in which case the stream is
     *         positioned just after it
     * @throws IOException if reading fails
     */
    private static boolean readUntilMatch(FSDataInputStream in, byte[] match, boolean withinBlock, long end)
            throws IOException {
        int matched = 0;
        while (true) {
            int b = in.read();
            if (b == -1) {
                return false; // end of file
            }
            // read() yields 0..255 while byte is signed, so compare as bytes;
            // otherwise tags containing non-ASCII characters could never match.
            if ((byte) b == match[matched]) {
                matched++;
                if (matched >= match.length) {
                    return true;
                }
            } else {
                // The byte that broke a partial match may itself start a new one.
                matched = ((byte) b == match[0]) ? 1 : 0;
            }
            // see if we've passed the stop point
            if (!withinBlock && matched == 0 && in.getPos() >= end) {
                return false;
            }
        }
    }

    /** Delegates to the superclass and logs the answer at debug level. */
    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        boolean splitable = super.isSplitable(context, file);
        log.debug("isSplitable({}) = {}", file, splitable);
        return splitable;
    }
}
XmlRecordReader类:
package boa.hadoop.xml;

import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/**
 * XMLRecordReader class to read through a given xml document to output xml
 * blocks as records as specified by the start tag and end tag.
 *
 * <p>The key of each record is the stream position just after the end tag; the
 * value is the block's bytes from the start tag through the end tag.
 */
public class XmlRecordReader extends RecordReader<LongWritable, Text> {

    public static final String START_TAG_KEY = "xmlinput.start";
    public static final String END_TAG_KEY = "xmlinput.end";

    private final byte[] startTag;
    private final byte[] endTag;
    private final long start;
    private final long end;
    private final FSDataInputStream fsin;
    private final DataOutputBuffer buffer = new DataOutputBuffer();
    private LongWritable currentKey;
    private Text currentValue;
    FileSplit split = null;

    /**
     * Opens the split's file and seeks to the start of the split.
     *
     * @param split the byte range to read
     * @param conf  configuration holding {@link #START_TAG_KEY} and {@link #END_TAG_KEY}
     * @throws IOException if the file cannot be opened or positioned
     * @throws IllegalArgumentException if a tag is not configured or is empty
     */
    public XmlRecordReader(FileSplit split, Configuration conf) throws IOException {
        this.split = split;
        startTag = requiredTag(conf, START_TAG_KEY);
        endTag = requiredTag(conf, END_TAG_KEY);

        start = split.getStart();
        end = start + split.getLength();
        Path file = split.getPath();
        FileSystem fs = file.getFileSystem(conf);
        FSDataInputStream in = fs.open(file);
        try {
            in.seek(start);
        } catch (IOException | RuntimeException e) {
            // Do not leak the stream when positioning fails.
            try {
                in.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
        fsin = in;
    }

    /** Reads a mandatory, non-empty tag from the configuration as UTF-8 bytes. */
    private static byte[] requiredTag(Configuration conf, String key) {
        String tag = conf.get(key);
        if (tag == null || tag.isEmpty()) {
            throw new IllegalArgumentException("Configuration property '" + key + "' must be set to a non-empty tag");
        }
        return tag.getBytes(StandardCharsets.UTF_8);
    }

    /**
     * Reads the next start-tag/end-tag block into {@code key} and {@code value}.
     *
     * @return {@code true} if a complete block was read, {@code false} if the
     *         split is exhausted or the block is not terminated before end of file
     */
    private boolean next(LongWritable key, Text value) throws IOException {
        if (fsin.getPos() >= end || !readUntilMatch(startTag, false)) {
            return false;
        }
        try {
            buffer.write(startTag);
            if (!readUntilMatch(endTag, true)) {
                return false;
            }
            key.set(fsin.getPos());
            value.set(buffer.getData(), 0, buffer.getLength());
            return true;
        } finally {
            // The buffer is reused for the next record.
            buffer.reset();
        }
    }

    @Override
    public void close() throws IOException {
        fsin.close();
    }

    /**
     * @return the fraction of the split consumed so far, in the range 0..1;
     *         0 for an empty split
     */
    @Override
    public float getProgress() throws IOException {
        if (end == start) {
            return 0.0f; // avoid 0/0 (NaN) on an empty split
        }
        // Reading an end tag may run past the split end, so clamp to 1.
        return Math.min(1.0f, (fsin.getPos() - start) / (float) (end - start));
    }

    /**
     * Consumes bytes until {@code match} has been read completely.
     *
     * @param match       byte sequence to look for; must not be empty
     * @param withinBlock when {@code true} every byte read is appended to the
     *                    record buffer and the search only stops at end of file;
     *                    when {@code false} the search gives up once the split
     *                    end is reached and no partial match is in progress
     * @return {@code true} if the sequence was found
     * @throws IOException if reading fails
     */
    private boolean readUntilMatch(byte[] match, boolean withinBlock) throws IOException {
        int matched = 0;
        while (true) {
            int b = fsin.read();
            if (b == -1) {
                return false; // end of file
            }
            if (withinBlock) {
                buffer.write(b);
            }
            // read() yields 0..255 while byte is signed, so compare as bytes;
            // otherwise tags containing non-ASCII characters could never match.
            if ((byte) b == match[matched]) {
                matched++;
                if (matched >= match.length) {
                    return true;
                }
            } else {
                // The byte that broke a partial match may itself start a new one.
                matched = ((byte) b == match[0]) ? 1 : 0;
            }
            // see if we've passed the stop point
            if (!withinBlock && matched == 0 && fsin.getPos() >= end) {
                return false;
            }
        }
    }

    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        return currentKey;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return currentValue;
    }

    /** Intentionally empty: the stream is already opened and positioned by the constructor. */
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    }

    /** Advances to the next record, allocating fresh key and value objects for it. */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        currentKey = new LongWritable();
        currentValue = new Text();
        return next(currentKey, currentValue);
    }
}
HadoopPropertyXMLMapReduce类:
package boa.hadoop.xml;

import static javax.xml.stream.XMLStreamConstants.CHARACTERS;
import static javax.xml.stream.XMLStreamConstants.START_ELEMENT;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Map-only job that reads {@code <property>} blocks from a Hadoop
 * configuration file through {@link XmlInputFormat} and writes one
 * name/value pair per block.
 *
 * @author zhangdapeng
 *
 *         2016-01-23
 */
public final class HadoopPropertyXMLMapReduce {

    private static final Logger log = LoggerFactory.getLogger(HadoopPropertyXMLMapReduce.class);

    /** Input used by {@link #main(String...)} when no first argument is given. */
    private static final String DEFAULT_INPUT = "/home/hadoop/workspace/core-site.xml";
    /** Output directory used by {@link #main(String...)} when no second argument is given. */
    private static final String DEFAULT_OUTPUT = "/home/hadoop/workspace/output";

    /** Parses one {@code <property>} block and emits its trimmed name and value. */
    public static class Map extends Mapper<LongWritable, Text, Text, Text> {

        /** Created once per mapper instead of once per record. */
        private final XMLInputFactory xmlInputFactory = newXmlInputFactory();

        /** Builds a StAX factory with DTDs and external entities switched off. */
        private static XMLInputFactory newXmlInputFactory() {
            XMLInputFactory factory = XMLInputFactory.newInstance();
            factory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
            factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
            return factory;
        }

        /**
         * @param key     record key supplied by the input format (not used)
         * @param value   one XML block
         * @param context sink for the (name, value) pair
         * @throws IOException          if writing the output fails
         * @throws InterruptedException if writing the output is interrupted
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String document = value.toString();
            log.debug("'{}'", document);

            StringBuilder propertyName = new StringBuilder();
            StringBuilder propertyValue = new StringBuilder();
            try {
                // Encode explicitly: the platform default charset may not be UTF-8.
                XMLStreamReader reader = xmlInputFactory.createXMLStreamReader(
                        new ByteArrayInputStream(document.getBytes(StandardCharsets.UTF_8)));
                try {
                    String currentElement = "";
                    while (reader.hasNext()) {
                        int code = reader.next();
                        switch (code) {
                        case START_ELEMENT:
                            currentElement = reader.getLocalName();
                            break;
                        case CHARACTERS:
                            if (currentElement.equalsIgnoreCase("name")) {
                                propertyName.append(reader.getText());
                            } else if (currentElement.equalsIgnoreCase("value")) {
                                propertyValue.append(reader.getText());
                            }
                            break;
                        default:
                            break;
                        }
                    }
                } finally {
                    reader.close();
                }
            } catch (XMLStreamException | RuntimeException e) {
                // Best effort: a record that cannot be parsed is logged and skipped.
                log.error("Error processing '{}'", document, e);
                return;
            }

            // Written outside the parsing try block so that output failures are
            // propagated instead of being logged and swallowed.
            context.write(new Text(propertyName.toString().trim()), new Text(propertyValue.toString().trim()));
        }
    }

    /**
     * Runs the job.
     *
     * @param args optional: {@code args[0]} is the input path and {@code args[1]}
     *             the output directory; built-in defaults are used for missing ones
     */
    public static void main(String... args) throws Exception {
        String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String output = args.length > 1 ? args[1] : DEFAULT_OUTPUT;
        runJob(input, output);
    }

    /**
     * Configures and runs the map-only job, blocking until it completes.
     *
     * <p>An existing {@code output} directory is deleted recursively first.
     *
     * @param input  path of the XML file to read
     * @param output directory to write the result to
     */
    public static void runJob(String input, String output) throws Exception {
        Configuration conf = new Configuration();
        conf.set("key.value.separator.in.input.line", " ");
        conf.set(XmlInputFormat.START_TAG_KEY, "<property>");
        conf.set(XmlInputFormat.END_TAG_KEY, "</property>");

        Job job = Job.getInstance(conf);
        job.setJarByClass(HadoopPropertyXMLMapReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setMapperClass(Map.class);
        job.setInputFormatClass(XmlInputFormat.class);
        job.setNumReduceTasks(0);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(input));
        Path outPath = new Path(output);
        FileOutputFormat.setOutputPath(job, outPath);
        outPath.getFileSystem(conf).delete(outPath, true);

        if (!job.waitForCompletion(true)) {
            log.error("Job failed: input = {}, output = {}", input, output);
        }
    }
}
Maven
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>boa</groupId>
  <artifactId>hadoop</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>hadoop</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <maven.compiler.compilerVersion>1.8</maven.compiler.compilerVersion>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.7.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>2.7.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.7.1</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <!-- Builds an additional "jar-with-dependencies" artifact during the package phase. -->
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <archive>
            <manifest>
              <!-- Must be the fully-qualified name of the class that contains the jar's main method. -->
              <mainClass>boa.hadoop.xml.HadoopPropertyXMLMapReduce</mainClass>
            </manifest>
            <manifestEntries>
              <Class-Path>.</Class-Path>
            </manifestEntries>
          </archive>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id> <!-- this is used for inheritance merges -->
            <phase>package</phase> <!-- run the jar merge during the package phase -->
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
测试文件:
<?xml version="1.0"?><?xml-stylesheet type="text/xsl" href="configuration.xsl"?><configuration> <property> <name>fs.default.name</name> <value>hdfs://localhost:8020</value> </property> <property> <name>hadoop.tmp.dir</name> <value>/var/lib/hadoop-0.20/cache/${user.name}</value> </property> <!-- OOZIE proxy user setting --> <property> <name>hadoop.proxyuser.oozie.hosts</name> <value>*</value> </property> <property> <name>hadoop.proxyuser.oozie.groups</name> <value>*</value> </property></configuration>
结果
1 0
- Hadoop之自定义格式分隔文件测试笔记
- hadoop自定义文件输出格式
- hadoop自定义文件的输入格式
- Hadoop学习笔记之<输入格式>
- 自定义hadoop map/reduce输入文件切割InputFormat 更改输入value的分隔符
- 自定义hadoop map/reduce输入文件切割InputFormat 更改输入value的分隔符
- Hadoop Pig、Hive 自定义输入输出分隔符
- Hadoop Pig、Hive 自定义输入输出分隔符
- hadoop自定义输出格式
- Hadoop笔记之自定义分组实现
- Hadoop之回收站同名文件测试
- Hadoop学习笔记之测试hadoop运行情况
- hadoop文件压缩格式
- Hadoop & Hadoop Streaming 自定义输出格式
- hadoop 之 输出格式
- Hadoop自定义读取文件
- Hadoop 笔记之创建自定义分区---手机流量统计
- Hadoop之Combiner与自定义Combiner(笔记8)
- Unix时间戳和北京时间相互转换
- Android基于XMPP Smack openfire 开发的聊天室(二) 【聊天信息、成员】
- linx性能监控
- c#之转义字符
- Android基于XMPP Smack openfire 开发的聊天室(三) 【新旧记录、踢人】
- Hadoop之自定义格式分隔文件测试笔记
- 快慢指针-----Linked List Cycle II
- Android基于XMPP Smack openfire 开发的聊天室(四) 【创建房间、表单;报文】
- Java 高级类(下) —— 内部类和匿名类
- 实用的第三方插件
- Android基于XMPP Smack openfire 开发的聊天室(五) 【邀请、被邀请】
- 看源代码库
- Leetcode 326. Power of Three
- Android基于XMPP Smack openfire 开发的聊天室(六) 【加入房间、权限错误】