MapReduce实例---倒排索引(含job串联)

来源:互联网 发布:sql查询学生成绩 编辑:程序博客网 时间:2024/05/22 15:13

一:问题介绍

统计每一个单词在每个文件中分别出现的总次数,即为这些文件建立倒排索引。


原始数据:

a.txt
hello tom
hello jerry
hello tom

b.txt
hello jerry
hello jerry
tom jerry

c.txt
hello jerry
hello tom


输出结果:
hello        a.txt-->3        b.txt-->2        c.txt-->2
jerry         a.txt-->1        b.txt-->3        c.txt-->1
tom          a.txt-->2        b.txt-->1        c.txt-->1



二:代码

public class IndexStepOne {static class IndexStepOneMapper extends Mapper<LongWritable, Text, Text, IntWritable> {private Text k = new Text();private IntWritable v = new IntWritable(1);// 要从原始文档数据中输出 key: hello-->a.txt    value:1@Overrideprotected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {FileSplit inputSplit = (FileSplit) context.getInputSplit();String fileName = inputSplit.getPath().getName();String line = value.toString();String[] words = line.split(" ");for (String word : words) {k.set(word + "-->" + fileName);context.write(k, v);}}}static class IndexStepOneReducer extends Reducer<Text, IntWritable, Text, IntWritable> {private IntWritable v = new IntWritable();// 拿到的数据 <hello-->a.txt,1> <hello-->a.txt,1> <hello-->a.txt,1>@Overrideprotected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {int count = 0;for (IntWritable value : values) {count += value.get();}v.set(count);context.write(key, v);}}public static void main(String[] args) throws Exception {Configuration conf = new Configuration();Job job = Job.getInstance(conf);job.setJarByClass(IndexStepOne.class);job.setMapperClass(IndexStepOneMapper.class);job.setReducerClass(IndexStepOneReducer.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(IntWritable.class);FileInputFormat.setInputPaths(job, new Path(args[0]));FileOutputFormat.setOutputPath(job, new Path(args[1]));boolean res = job.waitForCompletion(true);System.exit(res ? 0 : 1);}}

public class IndexStepTwo {static class IndexStepTwoMapper extends Mapper<LongWritable, Text, Text, Text> {private Text k = new Text();private Text v = new Text();@Overrideprotected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {String line = value.toString();String[] split = line.split("\t");String[] wordAndFile = split[0].split("-->");k.set(wordAndFile[0]);v.set(wordAndFile[1] + "-->" + split[1]);// <hello,a.txt-->3>context.write(k, v);}}static class IndexStepTwoReducer extends Reducer<Text, Text, Text, Text> {private Text v = new Text();// 拿到的数据: <hello,a.txt-->3> <hello,b.txt-->2> <hello,c.txt-->1>@Overrideprotected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {StringBuilder sb = new StringBuilder();for (Text value : values) {sb.append(value).append(" ");}v.set(sb.toString());context.write(key, v);}}public static void main(String[] args) throws Exception {Configuration conf = new Configuration();Job job = Job.getInstance(conf);job.setJarByClass(IndexStepTwo.class);job.setMapperClass(IndexStepTwoMapper.class);job.setReducerClass(IndexStepTwoReducer.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(Text.class);FileInputFormat.setInputPaths(job, new Path(args[0]));FileOutputFormat.setOutputPath(job, new Path(args[1]));boolean res = job.waitForCompletion(true);System.exit(res ? 0 : 1);}}

/** * 简单的job串联可以使用jobControll来实现 * 更复杂的job的调度可以考虑用shell脚本来写,或者干脆用现成的任务调度工具oozie来做 */public class OnceSubmitClient {public static void main(String[] args) throws Exception {// 构造第一个阶段的基本job对象job1Configuration conf1 = new Configuration();Job job1 = Job.getInstance(conf1, "indexStepOne");job1.setJarByClass(OnceSubmitClient.class);job1.setMapperClass(IndexStepOneMapper.class);job1.setReducerClass(IndexStepOneReducer.class);job1.setOutputKeyClass(Text.class);job1.setOutputValueClass(IntWritable.class);FileInputFormat.setInputPaths(job1, new Path(args[0]));FileOutputFormat.setOutputPath(job1, new Path(args[1]));// ControlledJob是基本job的封装ControlledJob controlledJob1 = new ControlledJob(conf1);// 将job1封装到controlledJob1中去controlledJob1.setJob(job1);// 构造第2个阶段的基本job对象job2Configuration conf2 = new Configuration();Job job2 = Job.getInstance(conf2, "indexsteptwo");job2.setJarByClass(OnceSubmitClient.class);job2.setMapperClass(IndexStepTwoMapper.class);job2.setReducerClass(IndexStepTwoReducer.class);job2.setOutputKeyClass(Text.class);job2.setOutputValueClass(Text.class);// 第2个job的输入数据是第1个job的输出结果FileInputFormat.setInputPaths(job2, new Path(args[1]));FileOutputFormat.setOutputPath(job2, new Path(args[2]));ControlledJob controlledJob2 = new ControlledJob(conf2);// 将job2封装到controlledJob2中去controlledJob2.setJob(job2);// 先构造一个job控制器JobControl jobControl = new JobControl("index");// 指定两个job之间的依赖关系controlledJob2.addDependingJob(controlledJob1);// 向job控制器中添加job-->controlled jobjobControl.addJob(controlledJob1);jobControl.addJob(controlledJob2);// 创建一个线程去启动jobControlThread thread = new Thread(jobControl);thread.start();// 如果job没有运行完,主线程就等待while (!jobControl.allFinished()) {Thread.sleep(500);}int succeed = jobControl.getSuccessfulJobList().size();System.exit(succeed == 2 ? 0 : 1);/*0正常退出,1异常退出*/}}


三:运行

因为实现了多job串联执行,所以直接运行OnceSubmitClient类就行了。

注意:job串联执行时,客户端控制台看不到各个job的运行过程,可以登录YARN的Web管理界面(http://<ResourceManager的IP地址>:8088)查看各个job的运行状态。


1 0