mapreduce系列(6)---倒排索引的建立
来源:互联网 发布:提额神器软件 编辑:程序博客网 时间:2024/06/05 11:31
一、概述
假如我们有三个文件:
a.txt、b.txt、c.txt,内容分别如下:

a.txt:
tian jun
li lei
han meimei
li lei
han meimei

b.txt:
li lei
han meimei
tian jun
gege jiejie
tian jun
gege jiejie

c.txt:
gege jiejie
han meimei
tian jun
han meimei
tian jun
统计出每个词在每篇文章中出现的次数,这就是倒排索引了,效果如下:
gege b.txt-->2,c.txt-->1
han a.txt-->2,b.txt-->1,c.txt-->2
jiejie b.txt-->2,c.txt-->1
jun c.txt-->2,b.txt-->2,a.txt-->1
lei b.txt-->1,a.txt-->2
li a.txt-->2,b.txt-->1
meimei a.txt-->2,b.txt-->1,c.txt-->2
tian b.txt-->2,c.txt-->2,a.txt-->1
思路分析:
在 MR 程序中,数据是按相同的 key 进行归并的。抓住这一点,我们可以把某个词加上它所属的文件名拼接成一个 key(例如 word--filename),先统计出每个词在每个文件中出现的次数,这离我们需要的结果已经很近了。但是只用一个 MR 很难直接得到最终的格式,所以在这个基础上再跑第二个 MR:把第一步输出的 key 拆开,以单词作为新的 key,以"文件名-->次数"作为 value,再按单词归并、拼接。这样通过两个 MR 就可以实现我们的需求。
二、代码实现
inverIndexStepOne.java
package inverIndex;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

/**
 * Step one of the inverted-index build: counts how often each word occurs in
 * each input file.
 *
 * <p>Every output record has the key {@code word--filename} and the number of
 * occurrences as its value. The second job ({@code inverIndexStepTwo}) reads
 * this output and regroups it by word.
 *
 * Created by tianjun on 2017/3/20.
 */
public class inverIndexStepOne {

    /** Input directory used when no path is given on the command line. */
    private static final String DEFAULT_INPUT = "hdfs://mini01:9000/input/index";

    /** Output directory used when no path is given on the command line. */
    private static final String DEFAULT_OUTPUT = "hdfs://mini01:9000/wc/index/stepone";

    /**
     * Emits {@code (word--filename, 1)} for every word of every input line.
     */
    static class InverIndexStepOneMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        // Reused for every record to avoid per-record allocations.
        final Text k = new Text();
        final IntWritable v = new IntWritable(1);

        /** Name of the file this mapper is reading; resolved once in {@link #setup}. */
        private String filename;

        /**
         * Resolves the name of the file behind the current input split once,
         * instead of repeating the lookup for every record.
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            filename = inputSplit.getPath().getName();
        }

        /**
         * Splits the line into words and writes one {@code (word--filename, 1)}
         * pair per word.
         *
         * @param key     offset of the line in the file (unused)
         * @param value   the line of text
         * @param context sink for the emitted pairs
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Split on any run of whitespace: a plain split(" ") yields empty
            // tokens for blank lines or repeated spaces, and leaves tabs inside
            // words, which would corrupt the tab-separated output of this job.
            for (String word : value.toString().split("\\s+")) {
                if (word.isEmpty()) {
                    // Leading whitespace or a blank line produces one empty token.
                    continue;
                }
                k.set(word + "--" + filename);
                context.write(k, v);
            }
        }
    }

    /**
     * Sums the counts of one {@code word--filename} key. Because it only adds
     * its inputs it is also registered as the job's combiner.
     */
    static class InverIndexStepOneReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        // Reused for every key to avoid per-key allocations.
        private final IntWritable total = new IntWritable();

        /**
         * Writes {@code (key, sum of values)}.
         *
         * @param key     a {@code word--filename} key
         * @param values  partial counts for that key
         * @param context sink for the summed pair
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            total.set(count);
            context.write(key, total);
        }
    }

    /**
     * Configures and submits the job, then exits with status 0 on success and
     * 1 on failure.
     *
     * @param args optional: {@code args[0]} is the input path and
     *             {@code args[1]} the output path; each falls back to the
     *             built-in default when absent
     */
    public static void main(String[] args)
            throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        String os = System.getProperty("os.name").toLowerCase();
        if (os.contains("windows")) {
            System.setProperty("HADOOP_USER_NAME", "root");
        }

        String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String output = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        Configuration conf = new Configuration();
        conf.set("mapreduce.framework.name", "yarn");
        conf.set("yarn.resourcemanager.hostname", "mini01");
        conf.set("fs.defaultFS", "hdfs://mini01:9000/");
        // Local mode is the default when the three settings above are omitted;
        // to force it, use mapreduce.framework.name=local,
        // mapreduce.jobtracker.address=local and fs.defaultFS=file:///.

        Job wcjob = Job.getInstance(conf);

        // Per the original author's note, setJarByClass does not work when the
        // job is submitted from a local copy, so the jar is named explicitly.
        wcjob.setJar("F:/myWorkPlace/java/dubbo/demo/dubbo-demo/mr-demo1/target/mr.demo-1.0-SNAPSHOT.jar");

        wcjob.setMapperClass(InverIndexStepOneMapper.class);
        wcjob.setReducerClass(InverIndexStepOneReducer.class);
        // The reducer is a plain sum, so it can pre-aggregate on the map side.
        wcjob.setCombinerClass(InverIndexStepOneReducer.class);

        // Key/value types emitted by the mapper.
        wcjob.setMapOutputKeyClass(Text.class);
        wcjob.setMapOutputValueClass(IntWritable.class);

        // Key/value types emitted by the reducer.
        wcjob.setOutputKeyClass(Text.class);
        wcjob.setOutputValueClass(IntWritable.class);

        // No InputFormat is set, so the default TextInputFormat is used.

        // Remove a previous run's output; the job fails if the directory exists.
        Path outputPath = new Path(output);
        FileSystem fs = FileSystem.get(outputPath.toUri(), conf, "root");
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        // Where the data to process lives.
        FileInputFormat.setInputPaths(wcjob, new Path(input));
        // Where the result is stored.
        FileOutputFormat.setOutputPath(wcjob, outputPath);

        boolean res = wcjob.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
inverIndexStepTwo.java
package inverIndex;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

/**
 * Step two of the inverted-index build: regroups the per-file word counts by
 * word.
 *
 * <p>Each input line is expected to look like {@code word--filename<TAB>count}.
 * The job writes one line per word whose value is a comma-separated list of
 * {@code filename-->count} entries.
 *
 * Created by tianjun on 2017/3/20.
 */
public class inverIndexStepTwo {

    /** Input directory used when no path is given on the command line. */
    private static final String DEFAULT_INPUT = "hdfs://mini01:9000/wc/index/stepone";

    /** Output directory used when no path is given on the command line. */
    private static final String DEFAULT_OUTPUT = "hdfs://mini01:9000/wc/index/steptwo";

    /**
     * Turns {@code word--filename<TAB>count} into the pair
     * {@code (word, filename-->count)}.
     */
    static class inverIndexStepTwoMapper extends Mapper<LongWritable, Text, Text, Text> {

        // Reused for every record to avoid per-record allocations.
        final Text k = new Text();
        final Text v = new Text();

        /**
         * Parses one input line and emits it keyed by word. Lines that do not
         * have the expected shape are counted and skipped instead of failing
         * the task.
         *
         * @param key     offset of the line in the file (unused)
         * @param value   a line of the form {@code word--filename<TAB>count}
         * @param context sink for the emitted pair
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();

            // The count follows the last tab; the word--filename key precedes it.
            int tab = line.lastIndexOf('\t');
            // The key is cut at the LAST "--" so that a word which itself
            // contains "--" is kept whole. NOTE(review): this assumes file
            // names contain no "--" - confirm for the data set.
            int sep = tab < 0 ? -1 : line.lastIndexOf("--", tab - 2);

            if (sep <= 0) {
                // No tab, no "--" before it, or an empty word: not a usable record.
                context.getCounter("inverIndexStepTwo", "MALFORMED_LINES").increment(1);
                return;
            }

            k.set(line.substring(0, sep));
            v.set(line.substring(sep + 2, tab) + "-->" + line.substring(tab + 1));
            context.write(k, v);
        }
    }

    /**
     * Joins all {@code filename-->count} entries of one word with commas.
     */
    static class inverIndexStepTwoReducer extends Reducer<Text, Text, Text, Text> {

        // Reused for every key to avoid per-key allocations.
        private final Text joined = new Text();

        /**
         * Writes {@code (word, entry1,entry2,...)}; the entries appear in the
         * order in which they are iterated.
         *
         * @param key     the word
         * @param values  the {@code filename-->count} entries of that word
         * @param context sink for the joined pair
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            StringBuilder sb = new StringBuilder();
            for (Text value : values) {
                if (sb.length() != 0) {
                    sb.append(",");
                }
                sb.append(value.toString());
            }
            joined.set(sb.toString());
            context.write(key, joined);
        }
    }

    /**
     * Configures and submits the job, then exits with status 0 on success and
     * 1 on failure.
     *
     * @param args optional: {@code args[0]} is the input path (the output of
     *             step one) and {@code args[1]} the output path; each falls
     *             back to the built-in default when absent
     */
    public static void main(String[] args)
            throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        String os = System.getProperty("os.name").toLowerCase();
        if (os.contains("windows")) {
            System.setProperty("HADOOP_USER_NAME", "root");
        }

        String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String output = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        Configuration conf = new Configuration();
        conf.set("mapreduce.framework.name", "yarn");
        conf.set("yarn.resourcemanager.hostname", "mini01");
        conf.set("fs.defaultFS", "hdfs://mini01:9000/");
        // Local mode is the default when the three settings above are omitted;
        // to force it, use mapreduce.framework.name=local,
        // mapreduce.jobtracker.address=local and fs.defaultFS=file:///.

        Job wcjob = Job.getInstance(conf);

        // Per the original author's note, setJarByClass does not work when the
        // job is submitted from a local copy, so the jar is named explicitly.
        wcjob.setJar("F:/myWorkPlace/java/dubbo/demo/dubbo-demo/mr-demo1/target/mr.demo-1.0-SNAPSHOT.jar");

        wcjob.setMapperClass(inverIndexStepTwoMapper.class);
        wcjob.setReducerClass(inverIndexStepTwoReducer.class);

        // Key/value types emitted by the mapper.
        wcjob.setMapOutputKeyClass(Text.class);
        wcjob.setMapOutputValueClass(Text.class);

        // Key/value types emitted by the reducer.
        wcjob.setOutputKeyClass(Text.class);
        wcjob.setOutputValueClass(Text.class);

        // No InputFormat is set, so the default TextInputFormat is used.

        // Remove a previous run's output; the job fails if the directory exists.
        Path outputPath = new Path(output);
        FileSystem fs = FileSystem.get(outputPath.toUri(), conf, "root");
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        // Where the data to process lives (the output of step one).
        FileInputFormat.setInputPaths(wcjob, new Path(input));
        // Where the result is stored.
        FileOutputFormat.setOutputPath(wcjob, outputPath);

        boolean res = wcjob.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
这样就可以计算出上述的需求
0 0
- mapreduce系列(6)---倒排索引的建立
- Mapreduce实现倒排索引建立
- 倒排索引的建立
- MapReduce--倒排索引
- mapreduce--倒排索引
- 倒排索引的分布式实现(MapReduce程序)
- Hadoop2.7.3 mapreduce(四)倒排索引的实现
- MapReduce 倒排索引的实现
- 倒排文件索引(Inverted File Index)的建立
- 倒排文件索引(Inverted File Index)的建立
- 倒排文件索引(Inverted File Index)的建立
- 倒排文件索引(Inverted File Index)的建立
- 倒排文件索引(Inverted File Index)的建立
- 倒排文件索引(Inverted File Index)的建立
- mapreduce实现倒排索引
- MapReduce实现倒排索引
- mapreduce实现倒排索引
- MapReduce倒排索引概要
- HTML:禁止鼠标拖动、禁止内容复制等
- Android布局优化
- 凯撒密码
- 我也来到这里了
- iOS开发笔记--添加cell动画
- mapreduce系列(6)---倒排索引的建立
- 初来乍到
- SeekBar 拖动条
- [转载]ORA-00942 表或视图不存在 问题的解决
- webstorm js中文乱码
- Java排序总结
- mysql 格式化日期
- zabbix_配置
- MySQL索引原理