Mapreduce实例---倒排索引(含job串联)
来源:互联网 发布:sql查询学生成绩 编辑:程序博客网 时间:2024/05/22 15:13
一:问题介绍
统计每一个单词在各自文件中出现的总次数。
原始数据:
a.txt
hello tom
hello jerry
hello tom
b.txt
hello jerry
hello jerry
tom jerry
c.txt
hello jerry
hello tom
输出结果:
hello a.txt-->3 b.txt-->2 c.txt-->2
jerry a.txt-->1 b.txt-->3 c.txt-->1
tom a.txt-->2 b.txt-->1 c.txt-->1
二:代码
public class IndexStepOne {static class IndexStepOneMapper extends Mapper<LongWritable, Text, Text, IntWritable> {private Text k = new Text();private IntWritable v = new IntWritable(1);// 要从原始文档数据中输出 key: hello-->a.txt value:1@Overrideprotected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {FileSplit inputSplit = (FileSplit) context.getInputSplit();String fileName = inputSplit.getPath().getName();String line = value.toString();String[] words = line.split(" ");for (String word : words) {k.set(word + "-->" + fileName);context.write(k, v);}}}static class IndexStepOneReducer extends Reducer<Text, IntWritable, Text, IntWritable> {private IntWritable v = new IntWritable();// 拿到的数据 <hello-->a.txt,1> <hello-->a.txt,1> <hello-->a.txt,1>@Overrideprotected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {int count = 0;for (IntWritable value : values) {count += value.get();}v.set(count);context.write(key, v);}}public static void main(String[] args) throws Exception {Configuration conf = new Configuration();Job job = Job.getInstance(conf);job.setJarByClass(IndexStepOne.class);job.setMapperClass(IndexStepOneMapper.class);job.setReducerClass(IndexStepOneReducer.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(IntWritable.class);FileInputFormat.setInputPaths(job, new Path(args[0]));FileOutputFormat.setOutputPath(job, new Path(args[1]));boolean res = job.waitForCompletion(true);System.exit(res ? 0 : 1);}}
public class IndexStepTwo {static class IndexStepTwoMapper extends Mapper<LongWritable, Text, Text, Text> {private Text k = new Text();private Text v = new Text();@Overrideprotected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {String line = value.toString();String[] split = line.split("\t");String[] wordAndFile = split[0].split("-->");k.set(wordAndFile[0]);v.set(wordAndFile[1] + "-->" + split[1]);// <hello,a.txt-->3>context.write(k, v);}}static class IndexStepTwoReducer extends Reducer<Text, Text, Text, Text> {private Text v = new Text();// 拿到的数据: <hello,a.txt-->3> <hello,b.txt-->2> <hello,c.txt-->1>@Overrideprotected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {StringBuilder sb = new StringBuilder();for (Text value : values) {sb.append(value).append(" ");}v.set(sb.toString());context.write(key, v);}}public static void main(String[] args) throws Exception {Configuration conf = new Configuration();Job job = Job.getInstance(conf);job.setJarByClass(IndexStepTwo.class);job.setMapperClass(IndexStepTwoMapper.class);job.setReducerClass(IndexStepTwoReducer.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(Text.class);FileInputFormat.setInputPaths(job, new Path(args[0]));FileOutputFormat.setOutputPath(job, new Path(args[1]));boolean res = job.waitForCompletion(true);System.exit(res ? 0 : 1);}}
/** * 简单的job串联可以使用jobControll来实现 * 更复杂的job的调度可以考虑用shell脚本来写,或者干脆用现成的任务调度工具oozie来做 */public class OnceSubmitClient {public static void main(String[] args) throws Exception {// 构造第一个阶段的基本job对象job1Configuration conf1 = new Configuration();Job job1 = Job.getInstance(conf1, "indexStepOne");job1.setJarByClass(OnceSubmitClient.class);job1.setMapperClass(IndexStepOneMapper.class);job1.setReducerClass(IndexStepOneReducer.class);job1.setOutputKeyClass(Text.class);job1.setOutputValueClass(IntWritable.class);FileInputFormat.setInputPaths(job1, new Path(args[0]));FileOutputFormat.setOutputPath(job1, new Path(args[1]));// ControlledJob是基本job的封装ControlledJob controlledJob1 = new ControlledJob(conf1);// 将job1封装到controlledJob1中去controlledJob1.setJob(job1);// 构造第2个阶段的基本job对象job2Configuration conf2 = new Configuration();Job job2 = Job.getInstance(conf2, "indexsteptwo");job2.setJarByClass(OnceSubmitClient.class);job2.setMapperClass(IndexStepTwoMapper.class);job2.setReducerClass(IndexStepTwoReducer.class);job2.setOutputKeyClass(Text.class);job2.setOutputValueClass(Text.class);// 第2个job的输入数据是第1个job的输出结果FileInputFormat.setInputPaths(job2, new Path(args[1]));FileOutputFormat.setOutputPath(job2, new Path(args[2]));ControlledJob controlledJob2 = new ControlledJob(conf2);// 将job2封装到controlledJob2中去controlledJob2.setJob(job2);// 先构造一个job控制器JobControl jobControl = new JobControl("index");// 指定两个job之间的依赖关系controlledJob2.addDependingJob(controlledJob1);// 向job控制器中添加job-->controlled jobjobControl.addJob(controlledJob1);jobControl.addJob(controlledJob2);// 创建一个线程去启动jobControlThread thread = new Thread(jobControl);thread.start();// 如果job没有运行完,主线程就等待while (!jobControl.allFinished()) {Thread.sleep(500);}int succeed = jobControl.getSuccessfulJobList().size();System.exit(succeed == 2 ? 0 : 1);/*0正常退出,1异常退出*/}}
三:运行
因为实现了多job串联执行,所以直接运行OnceSubmitClient类就行了。
注意:job串联执行在客户端是看不到运行过程的,可以登录Yarn控制平台(http://ip地址:8088)观察。
1 0
- Mapreduce实例---倒排索引(含job串联)
- MapReduce实例----倒排索引
- MapReduce--倒排索引
- mapreduce--倒排索引
- MapReduce编程实例之倒排索引 1
- mapreduce实现倒排索引
- MapReduce实现倒排索引
- mapreduce实现倒排索引
- MapReduce倒排索引概要
- MapReduce实战--倒排索引
- MapReduce倒排索引实现
- MapReduce实现倒排索引
- 倒排索引的分布式实现(MapReduce程序)
- mapreduce系列(6)---倒排索引的建立
- Hadoop2.7.3 mapreduce(四)倒排索引的实现
- mapreduce文档倒排索引例程
- MapReduce 编程之 倒排索引
- mapreduce在倒排索引中练习
- redis 接口和命令机制
- 数组指针函数
- Remoting 外网访问和内网访问解决方案
- Linux安装Oracle 11g R2安装过程中遇到的报错及解决办法
- Oracle 使用druid连接池,发生“违反协议”异常
- Mapreduce实例---倒排索引(含job串联)
- java调用tcp socket接口
- 实训三+四
- 在线编辑 思考题
- 多线程之lock和synchronized的区别
- 关于正则表达式自己的
- freemarker.template.TemplateException: Expected string. column.nullable evaluated instead to freemar
- 管道命令(pipe)
- QPixmap显示图片