WordCount代码实现

来源：互联网发布：时尚笔记本电脑淘宝编辑：程序博客网时间：2024/06/06 04:58

导包

右击项目->Properties->Java Build Path->Libraries->Add Library->User Library->User Libraries->New (hadoop264)->Add External JARs

hadoop-2.6.4
——share
————hadoop
——————common
————————hadoop-common-2.6.4.jar
————————lib
——————————*.jar
——————hdfs
————————hadoop-hdfs-2.6.4.jar
————————lib
——————————*.jar
——————mapreduce
————————除了hadoop-mapreduce-examples-2.6.4.jar以外，都加
————————lib
——————————*.jar
——————yarn
————————除了*server*.jar不用，都加
————————lib
——————————*.jar

WordCountMapper.java

package mr.wcdemo;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Map stage of the word-count job: emits {@code <word, 1>} for every word found in an input line.
 *
 * <p>Type parameters of {@link Mapper}:
 * <ul>
 *   <li>KEYIN: by default the starting offset of the line read by the MR framework. Hadoop has
 *       its own, more compact serialization interface, so {@link LongWritable} is used
 *       instead of {@code Long}.</li>
 *   <li>VALUEIN: by default the content of the line that was read; {@link Text} is used
 *       instead of {@code String} for the same reason.</li>
 *   <li>KEYOUT: key of the data produced by the user logic, here the word ({@link Text}).</li>
 *   <li>VALUEOUT: value of the data produced by the user logic, here the occurrence count
 *       ({@link IntWritable}).</li>
 * </ul>
 *
 * @author yangzheng
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /** Constant count emitted with every word; shared to avoid one allocation per word. */
    private static final IntWritable ONE = new IntWritable(1);

    /**
     * Reusable output key. Reusing the Writable between {@code context.write} calls follows
     * the pattern of the official Hadoop WordCount example.
     */
    private final Text word = new Text();

    /**
     * Business logic of the map stage. The map task calls this method once per input line.
     *
     * <p>The line is tokenized on whitespace (space, tab, newline, carriage return, form feed).
     * Blank lines and runs of consecutive separators produce no output, so the empty string is
     * never counted as a word.
     *
     * @param key     key supplied by the framework for this line (unused)
     * @param value   the content of the line
     * @param context sink for the emitted {@code <word, 1>} pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Convert the text handed over by the map task into a String and tokenize it.
        StringTokenizer tokens = new StringTokenizer(value.toString());

        // Emit <word, 1>: using the word as the key lets the framework route all occurrences
        // of the same word to the same reduce task.
        while (tokens.hasMoreTokens()) {
            word.set(tokens.nextToken());
            context.write(word, ONE);
        }
    }
}

WordCountReducer.java

package mr.wcdemo;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Reduce stage of the word-count job: sums the counts received for each word.
 *
 * <p>KEYIN / VALUEIN match the output key/value types of the mapper ({@link Text} word,
 * {@link IntWritable} count). KEYOUT / VALUEOUT are the types of the result produced by the
 * reduce logic: KEYOUT is the word, VALUEOUT is the total number of occurrences of that word.
 *
 * @author yangzheng
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Adds up all values of one group and writes {@code <word, total>}.
     *
     * <p>Example groups: {@code <hello, 1> <hello, 1> <hello, 1> ...}, {@code <world, 1> ...},
     * {@code <yz, 1> ...}.
     *
     * @param key     the key shared by one group of identical-word key/value pairs
     * @param values  the counts belonging to that word
     * @param context sink for the aggregated result
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int total = 0;
        for (IntWritable partial : values) {
            total = total + partial.get();
        }

        IntWritable result = new IntWritable(total);
        context.write(key, result);
    }
}

WordCountDriver.java

package mr.wcdemo;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;/** * 相当于一个yarn集群的客户端 * 需要在此封装我们的mr程序的相关运行参数，指定jar包 * 最后提交给yarn * @author * */public class WordCountDriver {    public static void main(String[] args) throws Exception {        if (args == null || args.length == 0) {            args = new String[2];            args[0] = "hdfs://master:9000/wordcount/input/wordcount.txt";            args[1] = "hdfs://master:9000/wordcount/output8";        }        Configuration conf = new Configuration();        //设置的没有用!  ??????//      conf.set("HADOOP_USER_NAME", "hadoop");//      conf.set("dfs.permissions.enabled", "false");        /*conf.set("mapreduce.framework.name", "yarn");        conf.set("yarn.resoucemanager.hostname", "mini1");*/        Job job = Job.getInstance(conf);        /*job.setJar("/home/hadoop/wc.jar");*/        //指定本程序的jar包所在的本地路径        job.setJarByClass(WordCountDriver.class);        //指定本业务job要使用的mapper/Reducer业务类        job.setMapperClass(WordCountMapper.class);        job.setReducerClass(WordCountReducer.class);        //指定mapper输出数据的kv类型        job.setMapOutputKeyClass(Text.class);        job.setMapOutputValueClass(IntWritable.class);        //指定最终输出的数据的kv类型        job.setOutputKeyClass(Text.class);        job.setOutputValueClass(IntWritable.class);        //指定job的输入原始文件所在目录        FileInputFormat.setInputPaths(job, new Path(args[0]));        //指定job的输出结果所在目录        FileOutputFormat.setOutputPath(job, new Path(args[1]));        //将job中配置的相关参数，以及job所用的java类所在的jar包，提交给yarn去运行        /*job.submit();*/        boolean res = job.waitForCompletion(true);        System.exit(res?0:1);    }}

运行结果

在hdfs上创建/wordcount/input，并上传文件
将导出的jar包上传到CentOS中，执行：

hadoop jar wc.jar mr.wcdemo.WordCountDriver /wordcount/input /wordcount/output

这里写图片描述

解决报错

这里写图片描述
这个错误是由于用高版本JDK（1.8）编译的Java项目，在低版本的JRE（1.7）上运行导致的。
解决方法：
在Properties->Java Compiler中，将Compiler compliance level从1.8改成1.7，之后就可以运行了。

阅读全文

0 0