Hadoop's "Hello World": WordCount


Hadoop version: 1.2.1; development IDE: Eclipse.


Write the map function by extending the Mapper class.

package com.cjh.hadoop;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordMapper extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    @Override
    protected void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // The key is the offset of the line within the file; the value is the line's content.
        // StringTokenizer (from java.util) splits the line into words on whitespace.
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, one);
        }
    }
}
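To see what the mapper emits, its loop can be run outside Hadoop. The following minimal sketch (the sample line is made up) applies the same StringTokenizer logic to one input line and prints the (word, 1) pairs the mapper would write to the context:

import java.util.StringTokenizer;

public class MapLogicDemo {
    public static void main(String[] args) {
        // One input "line", as the mapper receives it via value.toString().
        String line = "hello hadoop hello world";
        StringTokenizer itr = new StringTokenizer(line);
        while (itr.hasMoreTokens()) {
            // Each token becomes one (word, 1) pair.
            System.out.println("(" + itr.nextToken() + ", 1)");
        }
        // Prints: (hello, 1) (hadoop, 1) (hello, 1) (world, 1)
    }
}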

Write the reduce function by extending the Reducer class.

package com.cjh.hadoop;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable result = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // All counts emitted for the same word arrive together; sum them.
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}
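Between map and reduce, the framework's shuffle phase groups all values by key, so the reducer sees each word once, together with all of its 1s. A rough local simulation of that grouping and summing, in plain Java with made-up sample data:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ReduceLogicDemo {
    public static void main(String[] args) {
        // Words as the mappers might emit them, before grouping.
        String[] mapOutputKeys = {"hello", "hadoop", "hello", "world"};

        // The shuffle phase groups the emitted 1s by key.
        Map<String, List<Integer>> grouped = new HashMap<String, List<Integer>>();
        for (String word : mapOutputKeys) {
            if (!grouped.containsKey(word)) {
                grouped.put(word, new ArrayList<Integer>());
            }
            grouped.get(word).add(1);
        }

        // The reduce phase sums the values for each key.
        for (Map.Entry<String, List<Integer>> entry : grouped.entrySet()) {
            int sum = 0;
            for (int val : entry.getValue()) {
                sum += val;
            }
            System.out.println(entry.getKey() + "\t" + sum);
        }
    }
}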

Write the main function (the driver class).

package com.cjh.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordMain {

    public static void main(String[] args) throws Exception {
        // Load the Hadoop configuration files, such as core-site.xml.
        Configuration conf = new Configuration();
        // GenericOptionsParser parses the generic Hadoop options the user passed
        // and applies them to conf; the leftover arguments are the job's own.
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        // A MapReduce job; the second argument is the job's name.
        Job job = new Job(conf, "WordCount");
        // The driver class, used to locate the job's jar.
        job.setJarByClass(WordMain.class);
        // Set the mapper class.
        job.setMapperClass(WordMapper.class);
        // Set the combiner class (a local, map-side reduce).
        job.setCombinerClass(WordReducer.class);
        // Set the reducer class.
        job.setReducerClass(WordReducer.class);
        // Set the output key type.
        job.setOutputKeyClass(Text.class);
        // Set the output value type.
        job.setOutputValueClass(IntWritable.class);
        // Input path.
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        // Output path.
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        // Submit the job and wait for it to finish.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
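Because the driver runs args through GenericOptionsParser, generic Hadoop options can be mixed into the command line; they are applied to conf and stripped out before otherArgs is checked. An illustrative run (the -D property shown is the Hadoop 1.x name for the reducer count; treat the exact command as a sketch):

bin/hadoop jar wordcount.jar com.cjh.hadoop.WordMain -D mapred.reduce.tasks=2 /user/hadoop/input/test* /user/hadoop/output_wordcount

Here getRemainingArgs() returns only the two paths, so the usage check still passes.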


After the code is written, package it into a jar and copy it to the HADOOP_HOME directory. Run the MapReduce job with:

bin/hadoop jar wordcount.jar com.cjh.hadoop.WordMain /user/hadoop/input/test* /user/hadoop/output_wordcount

Once the job succeeds, view the output file with:

bin/hadoop fs -text /user/hadoop/output_wordcount/part-r-00000
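For reference, the jar can also be built without Eclipse. A minimal sketch, assuming the sources sit under com/cjh/hadoop and that hadoop-core-1.2.1.jar (the core jar shipped with Hadoop 1.2.1) is in the current directory:

# Compile the three classes against the Hadoop 1.2.1 core jar.
mkdir classes
javac -classpath hadoop-core-1.2.1.jar -d classes com/cjh/hadoop/*.java
# Package the compiled classes into the jar used above.
jar cvf wordcount.jar -C classes .

If the input directory does not exist yet, it can be created and filled first (the file names here are made up to match the test* pattern above):

bin/hadoop fs -mkdir /user/hadoop/input
bin/hadoop fs -put test1.txt test2.txt /user/hadoop/input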

Problems encountered:

1. The packages imported in the three classes must be correct; in particular, be careful not to mix the new org.apache.hadoop.mapreduce API used here with the old org.apache.hadoop.mapred API.

2. The output directory must not exist when the job is submitted; if it does, the job fails with an "output directory already exists" error. Delete a stale one first, as shown below.
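On the Hadoop 1.x shell, a leftover output directory can be removed with the following command (the path is the one used in the example above):

bin/hadoop fs -rmr /user/hadoop/output_wordcount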

