WordCount经典编程

来源:互联网 发布:工程预结算软件 编辑:程序博客网 时间:2024/06/15 22:41

WordCount原理:
MapReduce任务被初始化为一个Job,每个Job又可以分为两个阶段:map阶段和reduce阶段。这两个阶段分别用两个函数表示,即map函数和reduce函数。

/**
 * WordCount.java
 * com.xiaoma.hadoop.mr
 * Copyright (c) 2017, 小马同学. All rights reserved.
 *
 * @author 小马同学
 */
package com.xiaoma.hadoop.mr;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * MapReduce job that counts how many times each whitespace-separated word
 * occurs in the input.
 *
 * <p>Usage: {@code WordCount <input> <output>}. If the output path already
 * exists it is deleted recursively before the job is submitted.
 *
 * @author 小马同学
 * @Date 2017-05-16
 */
public class WordCount extends Configured implements Tool {

    /**
     * Mapper: emits {@code (word, 1)} for every token of each input value.
     */
    public static class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        // Writable instances are reused across map() calls to avoid per-record allocation.
        private final Text word = new Text();
        private final LongWritable one = new LongWritable(1);

        /**
         * Tokenizes one input value and writes {@code (token, 1)} per token.
         *
         * @param key     input key supplied by the framework; not used here
         * @param value   the text to tokenize
         * @param context sink for the emitted pairs
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if writing to the context is interrupted
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // StringTokenizer splits on any run of whitespace and never yields an
            // empty token, so blank lines and repeated spaces no longer produce a
            // bogus empty-string "word" (which split(" ") did).
            StringTokenizer tokens = new StringTokenizer(value.toString());
            while (tokens.hasMoreTokens()) {
                word.set(tokens.nextToken());
                context.write(word, one);
            }
        }
    }

    /**
     * Reducer: sums the counts received for a word and emits {@code (word, total)}.
     * Input and output types are identical, so it is also used as the combiner.
     */
    public static class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

        // Reused output value.
        private final LongWritable valueout = new LongWritable();

        /**
         * Adds up all values for {@code key} and writes the total.
         *
         * @param key     the word
         * @param values  partial counts for that word
         * @param context sink for the emitted pair
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if writing to the context is interrupted
         */
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0L;
            for (LongWritable num : values) {
                sum += num.get();
            }
            valueout.set(sum);
            context.write(key, valueout);
        }
    }

    /**
     * Configures and runs the word-count job.
     *
     * @param args exactly two elements: the input path and the output path
     * @return 0 if the job completed successfully, 1 on bad arguments or job failure
     * @throws Exception propagated from job setup, file-system access or job execution
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            // Usage errors belong on stderr; the message text itself is unchanged.
            System.err.println("必须输入两个参数,<input> <output>");
            return 1;
        }
        Path inputPath = new Path(args[0]);
        Path outputPath = new Path(args[1]);

        Job job = Job.getInstance(getConf(), "wordcount");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordCountMapper.class);
        // Summing is associative and commutative, so the reducer can pre-aggregate
        // map output as a combiner without changing the final result.
        job.setCombinerClass(WordCountReducer.class);
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        // Resolve the file system from the output path itself rather than the
        // configured default, so a fully qualified output URI on another file
        // system is checked and deleted in the right place.
        FileSystem fs = outputPath.getFileSystem(getConf());
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
            System.out.println("out put delete finish");
        }

        // Submit the job and block until it finishes, printing progress.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Command-line entry point; delegates to {@link ToolRunner} and exits with
     * the value returned by {@link #run(String[])}.
     *
     * @param args command-line arguments, passed through to {@link ToolRunner}
     * @throws Exception propagated from {@link ToolRunner#run}
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new WordCount(), args);
        System.exit(res);
    }
}
原创粉丝点击