Hadoop下用MapReduce处理WordCount

来源：互联网 | 发布：发散和收敛定义知乎 | 编辑：程序博客网 | 时间：2024/05/18 16:17

WordCount 词频统计程序我使用 Eclipse 开发，一共有三个类：WCJob、WCMapper、WCReduce。WCJob 负责配置并提交作业；WCMapper 对输入的每一行文本进行切分（这里以空格作为分隔符），并为每个单词输出 (单词, 1)；WCReduce 再对同一个单词的计数进行累加。算法很简单，直接上代码。

WCJob.class

package com.jpf.wc;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver for the word-count MapReduce job.
 *
 * <p>Configures a {@link Job} that uses {@code WCMapper} and {@code WCReduce},
 * reads its input from {@code /wc/input}, writes its result to
 * {@code /wc/output} and blocks until the job has finished.
 *
 * <p>The cluster addresses, jar name and paths are hard-coded; adjust them to
 * the target environment before running.
 */
public class WCJob {

    /**
     * Submits the word-count job and waits for it to complete.
     *
     * <p>Prints {@code job success} on success. On failure a message is written
     * to standard error and the JVM exits with status 1 so that calling scripts
     * can detect the failure.
     *
     * @param args command-line arguments (unused)
     * @throws ClassNotFoundException propagated from {@code Job.waitForCompletion}
     * @throws InterruptedException   if the thread is interrupted while waiting for the job
     * @throws IOException            on file-system or job-submission errors
     */
    public static void main(String[] args) throws ClassNotFoundException, InterruptedException, IOException {
        // Client-side configuration: job jar, HDFS NameNode address and YARN ResourceManager host.
        Configuration conf = new Configuration();
        // NOTE(review): "mapred.jar" is the legacy name of the job-jar property, and
        // setJarByClass below may replace this value when WCJob is itself loaded
        // from a jar. Confirm which jar is meant to be shipped to the cluster.
        conf.set("mapred.jar", "loadforecast_utils.jar");
        conf.set("fs.defaultFS", "hdfs://192.168.56.101:8020");
        conf.set("yarn.resourcemanager.hostname", "192.168.56.103");

        Job job = Job.getInstance(conf);
        job.setJarByClass(WCJob.class);

        // Map phase: emits (word, 1) pairs.
        job.setMapperClass(WCMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Reduce phase: emits (word, total). Declare the final output types
        // explicitly so the job metadata matches what WCReduce writes instead
        // of relying on Hadoop's defaults.
        job.setReducerClass(WCReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path("/wc/input"));

        // Remove the output of a previous run, since the output directory
        // must not exist when the job is submitted.
        Path outpath = new Path("/wc/output");
        // Deliberately not closed: FileSystem.get may hand out a shared, cached
        // instance that the job submission below still needs.
        FileSystem fSystem = FileSystem.get(conf);
        if (fSystem.exists(outpath)) {
            fSystem.delete(outpath, true);
        }
        FileOutputFormat.setOutputPath(job, outpath);

        boolean flag = job.waitForCompletion(true);
        if (flag) {
            System.out.println("job success");
        } else {
            // Do not let a failed job look like a successful run.
            System.err.println("job failed");
            System.exit(1);
        }
    }
}

WCMapper.class

package com.jpf.wc;

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.StringUtils;

/**
 * Mapper of the word-count job.
 *
 * <p>Splits every input line on the space character and emits
 * {@code (word, 1)} for each non-empty token.
 */
public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /** Constant count emitted for every occurrence of a word. */
    private static final IntWritable ONE = new IntWritable(1);

    /**
     * Reusable output key. {@code context.write} is called once per token, so a
     * single instance is refilled instead of allocating a new {@code Text} for
     * every word.
     */
    private final Text word = new Text();

    /**
     * Tokenizes one input line and emits a count of 1 per word.
     *
     * @param key     the input key supplied by the input format (unused)
     * @param value   one line of input text
     * @param context sink for the {@code (word, 1)} pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        for (String token : StringUtils.split(line, ' ')) {
            // The split yields empty tokens for blank lines, leading spaces and
            // runs of consecutive spaces; those are not words and must not be counted.
            if (token.isEmpty()) {
                continue;
            }
            word.set(token);
            context.write(word, ONE);
        }
    }
}

WCReduce.class

package com.jpf.wc;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Reducer of the word-count job: adds up all counts received for one word
 * and emits the word together with that total.
 */
public class WCReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Sums the partial counts of a single word.
     *
     * @param text     the word being reduced
     * @param iterable all counts emitted for {@code text}
     * @param context  sink for the {@code (word, total)} pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text text, Iterable<IntWritable> iterable, Context context)
            throws IOException, InterruptedException {
        int total = 0;
        java.util.Iterator<IntWritable> counts = iterable.iterator();
        while (counts.hasNext()) {
            total = total + counts.next().get();
        }
        IntWritable result = new IntWritable(total);
        context.write(text, result);
    }
}

源码下载地址：http://download.csdn.net/download/qq_32969281/10044154

阅读全文

0 0