Word Count and Sensitive-Word Filtering with Chained MapReduce (ChainMapper/ChainReducer)

Map Mapper1

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Created by 张倩 on 2017/3/18.
 */
public class WCMapMapper1 extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable();
        // Split each input line on spaces and emit (word, 1) for every token.
        String[] arr = value.toString().split(" ");
        for (String s : arr) {
            keyOut.set(s);
            valueOut.set(1);
            context.write(keyOut, valueOut);
        }
    }
}

Mapper1 is the word-splitting (tokenization) stage.
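For example, a hypothetical input line "hello hadoop hello" would be split on spaces and emitted as the pairs (hello,1), (hadoop,1), (hello,1).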

MapMapper2

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Created by 张倩 on 2017/3/18.
 */
public class WCMapMapper2 extends Mapper<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void map(Text key, IntWritable value, Context context)
            throws IOException, InterruptedException {
        // Drop the sensitive word; everything else passes through unchanged.
        if (!key.toString().equals("falungong")) {
            context.write(key, value);
        }
    }
}
Mapper2 is the stage that filters out sensitive words.
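The sensitive word is hard-coded here. As a minimal sketch of a more flexible variant (not part of the original post), the banned words could instead be read from the job Configuration in setup(); the property name wc.banned.words below is an assumption, not an existing Hadoop setting:

import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ConfigurableFilterMapper extends Mapper<Text, IntWritable, Text, IntWritable> {
    private final Set<String> banned = new HashSet<>();

    @Override
    protected void setup(Context context) {
        // "wc.banned.words" is a hypothetical property name; it would be set on the
        // Configuration object passed to ChainMapper.addMapper(...).
        String list = context.getConfiguration().get("wc.banned.words", "");
        if (!list.isEmpty()) {
            banned.addAll(Arrays.asList(list.split(",")));
        }
    }

    @Override
    protected void map(Text key, IntWritable value, Context context)
            throws IOException, InterruptedException {
        // Pass through every word that is not on the banned list.
        if (!banned.contains(key.toString())) {
            context.write(key, value);
        }
    }
}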

Reduce

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Created by 张倩 on 2017/3/18.
 */
public class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;
        // Sum all the 1s emitted for this word.
        for (IntWritable iw : values) {
            count = count + iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}
The reduce stage iterates over the values for each key and adds them up to get the total count.
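For example, if the map chain emitted the word hadoop three times, the reducer receives (hadoop, [1, 1, 1]) and writes (hadoop, 3).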

Reducer Mapper

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Created by 张倩 on 2017/3/18.
 */
public class WCReducerMapper1 extends Mapper<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void map(Text key, IntWritable value, Context context)
            throws IOException, InterruptedException {
        // Only keep words whose total count is greater than 5.
        if (value.get() > 5) {
            context.write(key, value);
        }
    }
}
A Mapper is appended on the Reducer side to filter the results, keeping only words that appear more than 5 times.
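For example, a word counted 7 times is written to the final output, while a word counted exactly 5 times is dropped.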

ChainApp

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.chain.ChainMapper;
import org.apache.hadoop.mapreduce.lib.chain.ChainReducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Created by 张倩 on 2017/3/18.
 */
public class WCChainApp {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");   // run against the local filesystem

        Job job = Job.getInstance(conf);
        // Set the job's basic properties.
        job.setJobName("WCChainApp");
        job.setJarByClass(WCChainApp.class);
        job.setInputFormatClass(TextInputFormat.class);

        // Add Mapper1 (tokenizer) to the mapper chain.
        ChainMapper.addMapper(job, WCMapMapper1.class, LongWritable.class, Text.class,
                Text.class, IntWritable.class, conf);
        // Add Mapper2 (sensitive-word filter) to the mapper chain.
        ChainMapper.addMapper(job, WCMapMapper2.class, Text.class, IntWritable.class,
                Text.class, IntWritable.class, conf);
        // Set the Reducer of the reducer chain.
        ChainReducer.setReducer(job, WCReducer.class, Text.class, IntWritable.class,
                Text.class, IntWritable.class, conf);
        // Append the Reducer-side Mapper (count filter) to the reducer chain.
        ChainReducer.addMapper(job, WCReducerMapper1.class, Text.class, IntWritable.class,
                Text.class, IntWritable.class, conf);

        // Set the input and output paths.
        FileInputFormat.addInputPath(job, new Path("g:/comp/chain/"));
        FileOutputFormat.setOutputPath(job, new Path("g:/comp/chain/out"));

        // Set the number of reduce tasks.
        job.setNumReduceTasks(3);
        job.waitForCompletion(true);
    }
}
In the driver, the mappers and the reducer are wired up through the ChainMapper and ChainReducer classes instead of the usual job.setMapperClass/job.setReducerClass calls. With that, the chained job is complete!
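ChainMapper/ChainReducer implement the pipeline pattern [MAP+ / REDUCE MAP*]: one or more mappers run before the shuffle, and after the single reducer any number of additional mappers can run, with each stage's output fed directly into the next stage's input. Because fs.defaultFS is set to file:///, the job runs against the local filesystem and can be launched straight from the IDE; with setNumReduceTasks(3), the output directory will contain three result files (part-r-00000 through part-r-00002).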



