hadoop案例实现之WordCount (计算单词出现的频数)

来源:互联网 发布:打车软件有几种 编辑:程序博客网 时间:2024/06/06 01:17


一、编写java代码,实现map函数以及reduce函数

package com.paic.elis.test;import java.io.IOException;import java.util.StringTokenizer;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;public class WordCount {        public static class WordCountMap extends            Mapper<LongWritable, Text, Text, IntWritable> {        private final IntWritable one = new IntWritable(1);        private Text word = new Text();        public void map(LongWritable key, Text value, Context context)                throws IOException, InterruptedException {            String line = value.toString();            StringTokenizer token = new StringTokenizer(line);            while (token.hasMoreTokens()) {                word.set(token.nextToken());                context.write(word, one);            }        }    }    public static class WordCountReduce extends            Reducer<Text, IntWritable, Text, IntWritable> {        public void reduce(Text key, Iterable<IntWritable> values,                Context context) throws IOException, InterruptedException {            int sum = 0;            for (IntWritable val : values) {                sum += val.get();            }            context.write(key, new IntWritable(sum));        }    }    public static void main(String[] args) throws Exception {        Configuration conf = new Configuration();        Job job = new Job(conf);        job.setJarByClass(WordCount.class);        job.setJobName("wordcount");        job.setOutputKeyClass(Text.class);        
job.setOutputValueClass(IntWritable.class);        job.setMapperClass(WordCountMap.class);        job.setReducerClass(WordCountReduce.class);        job.setInputFormatClass(TextInputFormat.class);        job.setOutputFormatClass(TextOutputFormat.class);        FileInputFormat.addInputPath(job, new Path(args[0]));        FileOutputFormat.setOutputPath(job, new Path(args[1]));        job.waitForCompletion(true);    }}


二、打包成jar文件并上传到远程云主机。

如何打包,打包过程详见我的另一篇博客。

三、在远程主机操作。

1.通过 SSH 将文件上传到远程主机。
这里写图片描述

2.在 HDFS 中创建文件夹,并将 Linux 主机上的文件上传到 HDFS 中。

这里写图片描述

3.查看是否上传成功。

这里写图片描述

4.执行。

这里写图片描述

5.执行过程输出

这里写图片描述

这里写图片描述

6.查看结果:

这里写图片描述


ps:
file1.txt 以及 file2.txt如下所示:
这里写图片描述

1 0