基于HIVE文件格式的map reduce代码编写
来源:互联网 发布:淘宝分销商招商方案 编辑:程序博客网 时间:2024/05/22 01:35
更清晰的code格式版本可以移步:http://hugh-wangp.iteye.com/blog/1405804
我们的数据绝大多数都是在HIVE上,对HIVE的SEQUENCEFILE和RCFILE的存储格式都有利用,为了满足HIVE的数据开放,hiveclient的方式就比较单一,直接访问HIVE生成的HDFS数据也是一种必要途径,所以本文整理测试了如何编写基于TEXTFILE、SEQUENCEFILE、RCFILE的数据的mapreduce的代码。以wordcount的逻辑展示3种MR的代码。
其实只要知道MAP的输入格式是什么,就知道如何在MAP中处理数据;只要知道REDUCE(也可能只有MAP)的输出格式,就知道如何把处理结果转成输出格式。
如下代码片段是运行一个MR的最简单的配置:定义job、配置job、运行job
//map/reduce的job配置类,向hadoop框架描述map-reduce执行的工作JobConf conf = new JobConf(WordCountRC.class);//设置一个用户定义的job名称conf.setJobName("WordCountRC");//为job的输出数据设置Key类conf.setOutputKeyClass(Text.class);//为job输出设置value类conf.setOutputValueClass(IntWritable.class);//为job设置Mapper类conf.setMapperClass(MapClass.class);//为job设置Combiner类conf.setCombinerClass(Reduce.class);//为job设置Reduce类conf.setReducerClass(Reduce.class);//为map-reduce任务设置InputFormat实现类conf.setInputFormat(RCFileInputFormat.class);//为map-reduce任务设置OutputFormat实现类conf.setOutputFormat(TextOutputFormat.class);//为map-reduce job设置路径数组作为输入列表FileInputFormat.setInputPaths(conf, newPath(args[0]));//为map-reduce job设置路径数组作为输出列表FileOutputFormat.setOutputPath(conf, newPath(args[1]));//运行一个jobJobClient.runJob(conf);
当我们确定了输入输出格式,接下来就是在实现map和reduce函数时,首先对输入格式做相应的处理,然后处理具体的业务逻辑,最后把处理后的数据转成既定的输出格式。
代码1:textfile版wordcount
代码2:sequencefile版wordcountimportjava.io.IOException; importjava.util.Iterator; importjava.util.StringTokenizer; importorg.apache.hadoop.fs.Path; importorg.apache.hadoop.io.IntWritable; importorg.apache.hadoop.io.LongWritable; importorg.apache.hadoop.io.Text; importorg.apache.hadoop.mapred.FileInputFormat; importorg.apache.hadoop.mapred.FileOutputFormat; importorg.apache.hadoop.mapred.JobClient; importorg.apache.hadoop.mapred.JobConf; importorg.apache.hadoop.mapred.MapReduceBase; importorg.apache.hadoop.mapred.Mapper; importorg.apache.hadoop.mapred.OutputCollector; importorg.apache.hadoop.mapred.Reducer; importorg.apache.hadoop.mapred.Reporter; publicclass WordCountTxt{ public static class MapClass extends MapReduceBase implements Mapper<LongWritable,Text, Text, IntWritable> { private final static IntWritable one = new IntWritable(1); private Text word = new Text(); @Override public void map(LongWritablekey, Text value, OutputCollector<Text,IntWritable> output, Reporter reporter) throws IOException{ Stringline = value.toString(); StringTokenizeritr = new StringTokenizer(line); while (itr.hasMoreTokens()){ word.set(itr.nextToken()); word, output.collect( one); } } } public static class Reduce extends MapReduceBase implements Reducer<Text,IntWritable, Text, IntWritable>{ @Override public void reduce(Textkey, Iterator<IntWritable>values, OutputCollector<Text, IntWritable>output, Reporter reporter) throws IOException{ int sum= 0; while (values.hasNext()){ sum += values.next().get(); } output.collect(key, new IntWritable(sum)); } } public static void main(String[]args) throws Exception{ JobConf conf = new JobConf(WordCountTxt.class); conf.setJobName( "wordcount");conf.setOutputKeyClass(Text. class);conf.setOutputValueClass(IntWritable. class);conf.setMapperClass(MapClass. class);conf.setCombinerClass(Reduce. class);conf.setReducerClass(Reduce. class);setInputPaths(conf, FileInputFormat. new Path(args[0])); setOutputPath(conf, FileOutputFormat. 
new Path(args[1])); runJob(conf); JobClient. } }
importjava.io.IOException; importjava.util.Iterator; importjava.util.StringTokenizer; importorg.apache.hadoop.fs.Path; importorg.apache.hadoop.io.IntWritable; importorg.apache.hadoop.io.Text; importorg.apache.hadoop.mapred.FileInputFormat; importorg.apache.hadoop.mapred.FileOutputFormat; importorg.apache.hadoop.mapred.JobClient; importorg.apache.hadoop.mapred.JobConf; importorg.apache.hadoop.mapred.MapReduceBase; importorg.apache.hadoop.mapred.Mapper; importorg.apache.hadoop.mapred.OutputCollector; importorg.apache.hadoop.mapred.Reducer; importorg.apache.hadoop.mapred.Reporter; importorg.apache.hadoop.mapred.SequenceFileAsTextInputF ormat; importorg.apache.hadoop.mapred.TextOutputFormat; publicclass WordCountSeq{ public static class MapClass extends MapReduceBase implements Mapper<Text,Text, Text, IntWritable> { private final static IntWritable one = new IntWritable(1); private Text word = new Text(); @Override public void map(Textkey, Text value, OutputCollector<Text,IntWritable> output, Reporter reporter) throws IOException{ Stringline = value.toString(); StringTokenizeritr = new StringTokenizer(line); while (itr.hasMoreTokens()){ word.set(itr.nextToken()); word, output.collect( one); } } } public static class Reduce extends MapReduceBase implements Reducer<Text,IntWritable, Text, IntWritable>{ @Override public void reduce(Textkey, Iterator<IntWritable>values, OutputCollector<Text, IntWritable>output, Reporter reporter) throws IOException{ int sum= 0; while (values.hasNext()){ sum += values.next().get(); } output.collect(key, new IntWritable(sum)); } } public static void main(String[]args) throws IOException{ // TODO Auto-generated methodstub JobConf conf = new JobConf(WordCountSeq.class); "wordcount"); conf.setJobName( class); conf.setOutputKeyClass(Text. class); conf.setOutputValueClass(IntWritable. class); conf.setMapperClass(MapClass. class); conf.setCombinerClass(Reduce. class); conf.setReducerClass(Reduce. 
class); conf.setInputFormat(SequenceFileAsTextInputF ormat. class); conf.setOutputFormat(TextOutputFormat. setInputPaths(conf, FileInputFormat. new Path(args[0])); setOutputPath(conf, FileOutputFormat. new Path(args[1])); runJob(conf); JobClient. } }
代码3:rcfile版wordcount
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.RCFileInputFormat;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;
public class WordCountRC {
public static class MapClass
extends MapReduceBase implementsMapper<LongWritable, BytesRefArrayWritable, Text,IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word =new Text();
@Override
public void map(LongWritable key, BytesRefArrayWritablevalue,
OutputCollector<Text, IntWritable>output, Reporter reporter)
throws IOException {
Text txt = new Text();
txt.set(value.get(0).getData(), value.get(0).getStart(),value.get(0).getLength());
String[] result = txt.toString().split("\\s");
for(int i=0; i < result.length; i++){
word.set(result[i]);
output.collect(word,one);
}
}
}
public static class Reduce