Developing a WordCount Application with MapReduce


package com.hadoop.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Developing a WordCount application with MapReduce
 */
public class WordCountApp {

    /**
     * Map: reads the input file line by line
     * Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
     */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        LongWritable one = new LongWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // each incoming line of data
            String line = value.toString();
            // split the line on the given delimiter
            String[] words = line.split(" ");
            for (String word : words) {
                // emit the map output through the context
                context.write(new Text(word), one);
            }
        }
    }

    /**
     * Reduce: merges the intermediate results
     */
    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long sum = 0;
            for (LongWritable value : values) {
                // sum up the number of occurrences of this key
                sum += value.get();
            }
            // write out the final count
            context.write(key, new LongWritable(sum));
        }
    }

    /**
     * Driver: wires together all the information for the MapReduce job
     * @param args
     */
    public static void main(String[] args) throws Exception {
        // create the Configuration
        Configuration configuration = new Configuration();

        // clean up the output directory if it already exists
        Path outputPath = new Path(args[1]);
        FileSystem fileSystem = FileSystem.get(configuration);
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
            System.out.println("output path already exists, deleted it");
        }

        // create the Job
        Job job = Job.getInstance(configuration, "wordcount");

        // set the job's main class
        job.setJarByClass(WordCountApp.class);

        // set the job's input path
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        // map-side settings
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // reduce-side settings
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // set the job's output path
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Package:           mvn clean package -DskipTests
     * Upload to server:  scp target/hadoop-train-1.0.jar hadoop@hadoop000:~/lib
     * Run:               hadoop jar /home/hadoop/lib/hadoop-train-1.0.jar com.hadoop.mapreduce.WordCountApp
     *                        hdfs://hadoop000:8020/hello.txt hdfs://hadoop000:8020/output/wc
     *
     * Create the script: vim wc_shell.sh
     *                        hadoop fs -rm -r /output/wc    # delete any previous output first
     *                        hadoop jar /home/hadoop/lib/hadoop-train-1.0.jar com.hadoop.mapreduce.WordCountApp
     *                            hdfs://hadoop000:8020/hello.txt hdfs://hadoop000:8020/output/wc
     * Make it executable: chmod u+x wc_shell.sh
     */
}
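The trailing comment describes collecting the delete-and-resubmit commands into a wc_shell.sh script. A minimal sketch of that script, assuming the same host name (hadoop000), jar location, and HDFS paths used in the comment above:

    #!/bin/bash
    # Remove the previous output directory first; the driver also deletes it, but this keeps reruns clean.
    hadoop fs -rm -r /output/wc
    # Submit the WordCount job: the first argument is the input file, the second the output directory.
    hadoop jar /home/hadoop/lib/hadoop-train-1.0.jar com.hadoop.mapreduce.WordCountApp \
        hdfs://hadoop000:8020/hello.txt hdfs://hadoop000:8020/output/wc

After chmod u+x wc_shell.sh, the job can be rerun at any time with ./wc_shell.sh.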