mapreduce编程练习(一)简单的练习 WordCount

来源:互联网 发布:软件存储权限 编辑:程序博客网 时间:2024/06/12 13:14

入门训练:WordCount

  问题描述:对一个或多个输入文件中的单词进行计数统计,比如一个文件的输入文件如下

 

输出格式:



运行代码实例:


package hadoopLearn;import java.io.IOException;import java.net.URI;import java.util.StringTokenizer;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.conf.Configured;import org.apache.hadoop.fs.FileSystem;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.DoubleWritable;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner;import org.apache.hadoop.yarn.webapp.hamlet.Hamlet.P;public class WordCount extends Configured implements Tool {private static double count = 0; public static class CountMapper extends Mapper<LongWritable, Text, Text, LongWritable>{private Text word = new Text();private LongWritable one = new LongWritable(1);@Overrideprotected void map(LongWritable key,Text value,Mapper<LongWritable, Text, Text, LongWritable>.Context context)throws IOException,InterruptedException{System.out.println("line pos:" + key.toString());String line = value.toString();StringTokenizer tokenizer = new StringTokenizer(line);while (tokenizer.hasMoreElements()) {count ++;word.set(tokenizer.nextToken());context.write(word, one);}}}public static class CountReducer extends Reducer<Text, LongWritable, Text, DoubleWritable>{private DoubleWritable result = new DoubleWritable();@Overrideprotected void reduce(Text key, Iterable<LongWritable> values,Reducer<Text, LongWritable, Text, DoubleWritable>.Context context) throws IOException, InterruptedException {int sum = 0;for(LongWritable v : values){sum += v.get();}result.set(sum);context.write(key, result);}}static FileSystem fs = null;static Configuration conf=null;public static void init() throws Exception{//读取classpath下的xxx-site.xml 配置文件,并解析其内容,封装到conf对象中conf = new Configuration();//也可以在代码中对conf中的配置信息进行手动设置,会覆盖掉配置文件中的读取的值conf.set("fs.defaultFS", "hdfs://192.168.41.136:9000/");//根据配置信息,去获取一个具体文件系统的客户端操作实例对象 fs = FileSystem.get(new URI("hdfs://192.168.41.136:9000/"),conf,"hadoop");}public int run(String[] args) throws Exception {Job job = Job.getInstance(getConf(),"WordCount");job.setJarByClass(WordCount.class);job.setMapperClass(CountMapper.class);job.setReducerClass(CountReducer.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(LongWritable.class);Path in = new Path("/WordCount/input");if(fs.exists(in)){FileInputFormat.addInputPath(job, in);}else{System.out.println("输入文件不存在!");}Path os = new Path("/WordCount/output");int flage = 0;if(fs.exists(os)){System.out.println("输出文件已经存在!重新新建路径!"); fs.delete(os, true);   FileOutputFormat.setOutputPath(job, os); flage = job.waitForCompletion(false) ? 0:1;}else{FileOutputFormat.setOutputPath(job, os);flage = job.waitForCompletion(false) ? 0:1;}return  flage;}public static void main(String[] args) throws Exception {init();int res = ToolRunner.run(new WordCount(), args);System.exit(res);}}