A WordCount program whose MapReduce job takes a SequenceFile as input


A MapReduce job can accept SequenceFile input by configuring the job's input format class. See the code below:


package hadoop;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileAsTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WCseq {

    public static class Map extends Mapper<Text, Text, Text, IntWritable> {
        // Note: the key is of type Text here, because
        // SequenceFileAsTextInputFormat converts both key and value to Text
        @Override
        protected void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
            System.out.println("key:" + key.toString() + "    " + "value:" + value);
            String[] input = value.toString().split(" ");
            for (String s : input) {
                context.write(new Text(s), new IntWritable(1));
            }
        }
    }

    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    private static Path inputPath = new Path("/user/root/in-seqf/seq1");
    private static Path outputPath = new Path("out-seqf");

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

        Job job = new Job(conf, "WCseq");
        job.setJarByClass(WCseq.class);

        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // This is the key setting: read SequenceFile records and
        // present them to the mapper as Text key/value pairs
        job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.addInputPath(job, inputPath);

        // Delete any previous output directory so the job can be rerun
        FileSystem fs = FileSystem.get(conf);
        fs.delete(outputPath, true);
        FileOutputFormat.setOutputPath(job, outputPath);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
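For reference, an input file like /user/root/in-seqf/seq1 can be produced with a small SequenceFile.Writer program. The original post does not show how its input was generated; the sketch below is an assumption for illustration (the class name SeqWriter, the sample lines, and the choice of IntWritable keys are all hypothetical), consistent with the record keys 0, 1, 2, ... seen in the output further down.

package hadoop;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SeqWriter {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Hypothetical: matches the inputPath used by WCseq above
        Path path = new Path("/user/root/in-seqf/seq1");
        // Sample records; each record key is the line number, each value a line of text
        String[] lines = {"hello world", "bye world", "hello hadoop", "bye hadoop"};
        SequenceFile.Writer writer = null;
        try {
            writer = SequenceFile.createWriter(fs, conf, path,
                    IntWritable.class, Text.class);
            for (int i = 0; i < lines.length; i++) {
                writer.append(new IntWritable(i), new Text(lines[i]));
            }
        } finally {
            if (writer != null) {
                writer.close();
            }
        }
    }
}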

Testing shows that the key received by the map function is the record key stored in the SequenceFile (here, the line number), rendered as Text, rather than the byte offset that plain text input would provide. The key/value pairs received by the map function are:

key:0    value:hello world
key:1    value:bye world
key:2    value:hello hadoop
key:3    value:bye hadoop
key:4    value:hello world
key:5    value:bye world
key:6    value:hello hadoop
key:7    value:bye hadoop
key:8    value:hello world
key:9    value:bye world
key:10    value:hello hadoop
key:11    value:bye hadoop
key:12    value:hello world
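These record keys and values can also be checked outside of MapReduce: the hadoop fs -text command deserializes a SequenceFile to readable text (unlike -cat, which dumps raw bytes). Assuming the input path above:

hadoop fs -text /user/root/in-seqf/seq1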


