A Simple MapReduce Example on Hadoop 1.x: WordCount


1. Preface

I came across this example in my old notes today, so I'm sharing it here. **The requirement: count how many times each distinct word appears in the files under the /hello directory on HDFS, and output the result.**

MapReduce is a distributed computing model with two main phases, Map and Reduce. The user only needs to implement the map() and reduce() functions; data is passed between the two phases as key/value pairs.

(Figure: the MapReduce process)
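To make the key/value flow concrete, here is a small worked example; the two-line, tab-separated input file is hypothetical:

```
Input file (words separated by tabs):
hello	world
hello	hadoop

map() output (k2, v2), one pair per word:
<hello,1> <world,1> <hello,1> <hadoop,1>

After shuffle/sort, reduce() receives (k2, list of v2):
<hadoop,[1]> <hello,[1,1]> <world,[1]>

reduce() output (k3, v3):
hadoop	1
hello	2
world	1
```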

2. Code

```java
import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class WordCountApp {

    static final String INPUT_PATH = "hdfs://hadoop1:9000/hello";
    static final String OUT_PATH = "hdfs://hadoop1:9000/out";

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        final Job job = new Job(conf, WordCountApp.class.getSimpleName());

        // If the output path already exists, delete it
        FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
        Path outPath = new Path(OUT_PATH);
        if (fileSystem.exists(outPath)) {
            fileSystem.delete(outPath, true);
        }

        // Specify the input directory
        FileInputFormat.setInputPaths(job, INPUT_PATH);
        // Specify the class that parses the input data
        job.setInputFormatClass(TextInputFormat.class);

        // Specify the custom Mapper class
        job.setMapperClass(MyMapper.class);
        // Specify the Mapper output <k,v> types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Partitioning
        job.setPartitionerClass(HashPartitioner.class);
        // Set the number of reduce tasks
        job.setNumReduceTasks(1);

        // Specify the custom Reducer class
        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Specify the output path
        FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
        // Specify the output format class
        job.setOutputFormatClass(TextOutputFormat.class);

        // Submit the whole job to the JobTracker
        job.waitForCompletion(true);
    }

    /**
     * k1    the byte offset at which each line starts
     * v1    the text content of each line
     * k2    each word in a line
     * v2    the count for each word in a line, always the constant 1
     * @author mademin
     */
    static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable k1, Text v1,
                Mapper<LongWritable, Text, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            // Split each line of text on the tab character
            String[] splited = v1.toString().split("\t");
            for (String word : splited) {
                context.write(new Text(word), new LongWritable(1L));
            }
        }
    }

    /**
     * k2    each word emitted by the mapper
     * v2    the counts emitted for that word
     * k3    each distinct word in the whole input
     * v3    the total number of occurrences of that word in the whole input
     * @author mademin
     */
    static class MyReduce extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text k2, Iterable<LongWritable> v2s,
                Reducer<Text, LongWritable, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            long sum = 0L;
            for (LongWritable v2 : v2s) {
                sum += v2.get();
            }
            context.write(k2, new LongWritable(sum));
        }
    }
}
```
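A sketch of how you might run the job, assuming the class is packaged into a jar named wordcount.jar and that a local, tab-separated file words.txt serves as input (the file and jar names are illustrative, not from the original post):

```
# upload the input file to the /hello path on HDFS
hadoop fs -put words.txt /hello
# submit the job; the program deletes /out first if it already exists
hadoop jar wordcount.jar WordCountApp
# inspect the output of the single reducer
hadoop fs -cat /out/part-r-00000
```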