利用MapperReducer的链式编程进行单词统计和过滤敏感词
来源:互联网 发布:python的数据类型 编辑:程序博客网 时间:2024/06/06 09:27
Map Mapper1import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Mapper;import java.io.IOException;/** * Created by 张倩 on 2017/3/18. */public class WCMapMapper1 extends Mapper<LongWritable,Text,Text,IntWritable>{ protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { Text keyOut = new Text(); IntWritable valueOut = new IntWritable(); String[] arr = value.toString().split(" "); for(String s : arr){ keyOut.set(s); valueOut.set(1); context.write(keyOut,valueOut); } }}
Mapper1 是进行单词分割的阶段
MapMapper2import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Mapper;import java.io.IOException;/** * Created by 张倩 on 2017/3/18. */public class WCMapMapper2 extends Mapper<Text,IntWritable,Text,IntWritable> { protected void map(Text key, IntWritable value, Context context) throws IOException, InterruptedException { if(!key.toString().equals("falungong")){ context.write(key,value); } }}Mapper2 阶段是过滤敏感词
Reduceimport org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Reducer;import java.io.IOException;/** * Created by 张倩 on 2017/3/18. */public class WCReducer extends Reducer<Text,IntWritable,Text,IntWritable> { protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { int count = 0; for(IntWritable iw : values){ count = count + iw.get(); } context.write(key,new IntWritable(count)); }}Reduce阶段是对value进行迭代,算出总 的次数
Reducer Mapperimport org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Mapper;import java.io.IOException;/** * Created by 张倩 on 2017/3/18. */public class WCReducerMapper1 extends Mapper<Text,IntWritable,Text,IntWritable>{ protected void map(Text key, IntWritable value, Context context) throws IOException, InterruptedException { if(value.get()>5){ context.write(key,value); } }}在Reduce端追加一个Mapper,过滤单词个数小于5的
ChainAppimport org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.lib.chain.ChainMapper;import org.apache.hadoop.mapreduce.lib.chain.ChainReducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;/** * Created by 张倩 on 2017/3/18. */public class WCChainApp { public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); conf.set("fs.defaultFS","file:///"); Job job = Job.getInstance(conf); //设置job的属性 job.setJobName("WCChainApp"); job.setJarByClass(WCChainApp.class); job.setInputFormatClass(TextInputFormat.class); //在Mapper链条上增加Mapper1 ChainMapper.addMapper(job,WCMapMapper1.class, LongWritable.class, Text.class,Text.class, IntWritable.class,conf); //在Mapper链条上增加Mapper2 ChainMapper.addMapper(job,WCMapMapper2.class,Text.class,IntWritable.class,Text.class,IntWritable.class,conf); //在Reducer链条上设置Reducer ChainReducer.setReducer(job,WCReducer.class,Text.class,IntWritable.class,Text.class,IntWritable.class,conf); //在reducer链条上增加Reducer端的Mapper ChainReducer.addMapper(job,WCReducerMapper1.class,Text.class,IntWritable.class,Text.class,IntWritable.class,conf); //设置文件输入输出路径 FileInputFormat.addInputPath(job,new Path("g:/comp/chain/")); FileOutputFormat.setOutputPath(job,new Path("g:/comp/chain/out")); //设置reducer任务个数 job.setNumReduceTasks(3); job.waitForCompletion(true); }}App端设置Mapper和Reducer的属性的时候,用的是ChainMapper和ChainReducer这两个类。好啦,链式编程已完成!
1 0
- 利用MapperReducer的链式编程进行单词统计和过滤敏感词
- Java使用IKAnalyzer进行敏感词过滤
- 网站敏感词过滤的实现(附敏感词库)
- 利用PHP扩展trie_filter做敏感词过滤
- 利用PHP扩展trie_filter做中文敏感词过滤
- 使用php扩展trie_filter,利用词库,过滤敏感词
- 使用php扩展trie_filter,利用词库,过滤敏感词
- 使用过滤器实现敏感词的过滤
- 使用过滤器实现敏感词的过滤
- flex敏感词过滤用到的
- lua写的敏感词过滤
- Java服务端过滤敏感词的思路
- 高效的php过滤敏感词方法
- Java之敏感词的过滤
- 利用AC自动机进行关键字的提取和过滤
- 利用树统计单词出现的频率
- java进行文本单词的词频统计
- 过滤敏感词
- android 自定义控件
- Java
- 关于node-sass安装不上的问题
- React+webpack开发环境的搭建_0
- VMware9安装Ubuntu 12.10图文详细教程
- 利用MapperReducer的链式编程进行单词统计和过滤敏感词
- 【九度OJ】题目1199:找位置 解题报告
- python-函数学习总结
- docker下删除两个id相同的镜像
- 第二十六、Java面向对象之instanceof 关键字
- 鸟哥的linux私房菜学习笔记《二十九》用户信息传递
- Linux文件系统命令&文件权限
- Linux命令行配置jdk
- NIO入门笔记01