hadoop学习笔记(倒排索引)
来源:互联网 发布:gcc怎么用windows 编辑:程序博客网 时间:2024/06/05 13:31
本文在 Eclipse 环境中运行 Hadoop 程序;创建工程以及环境搭建的步骤,请参照我之前写的几篇博文:http://blog.csdn.net/xiaoyu_2011/article/category/3148479
这次编写倒排索引程序遇到几个小问题,故做个笔记。
DFS文件结构如下:
还有源码:
InvertedIndex.java
import java.io.IOException;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class InvertedIndex {
public static class InvertedIndexMapper extends Mapper<Object,Text,Text,Text>{
private Text keyInfo = new Text();
private Text valueInfo = new Text();
private FileSplit split;
public void map(Object key,Text value,Context context) throws IOException,InterruptedException{
split = (FileSplit)context.getInputSplit();
StringTokenizer itr = new StringTokenizer(value.toString());
while(itr.hasMoreTokens()){
keyInfo.set(itr.nextToken()+":"+split.getPath().toString());
valueInfo.set("1");
context.write(keyInfo, valueInfo);
}
}
}
public static class InvertedIndexCombiner extends Reducer<Text,Text,Text,Text>{
private Text info = new Text();
public void reduce(Text key,Iterable<Text> values,Context context) throws IOException,InterruptedException{
int sum = 0;
for(Text value:values){
sum += Integer.parseInt(value.toString());
}
int splitIndex = key.toString().indexOf(":");
info.set(key.toString().substring(splitIndex+1)+":"+sum);
key.set(key.toString().substring(0,splitIndex));
context.write(key, info);
}
}
public static class InvertedIndexReducer extends Reducer<Text,Text,Text,Text>{
private Text result = new Text();
public void reduce(Text key,Iterable<Text> values,Context context) throws IOException,InterruptedException{
String fileList = new String();
for(Text value:values){
fileList += value.toString()+";";
}
result.set(fileList);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception{
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf,args).getRemainingArgs();
if(otherArgs.length!=2){
System.err.println("Usage:invertedindex<in><out>");
System.exit(2);
}
Job job = new Job(conf,"InvertedIndex");
job.setJarByClass(InvertedIndex.class);
job.setMapperClass(InvertedIndexMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setCombinerClass(InvertedIndexCombiner.class);
job.setReducerClass(InvertedIndexReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true)?0:1);
}
}
- hadoop学习笔记(倒排索引)
- hadoop 学习笔记之倒排索引
- hadoop学习-倒排索引
- hadoop学习-倒排索引
- hadoop倒排索引---学习
- hadoop学习笔记-6-倒排索引InverseIndex
- hadoop学习笔记-6-倒排索引InverseIndex
- 倒排索引学习笔记
- 学习Hadoop第十七课(倒排索引)
- hadoop 倒排索引
- hadoop 倒排索引
- hadoop倒排索引
- hadoop 倒排索引
- hadoop 倒排索引
- hadoop倒排索引
- hadoop倒排索引
- elasticsearch学习笔记-倒排索引
- hadoop实现倒排索引
- Robot Motion
- BZOJ1202 [HNOI2005]狡猾的商人(并查集)
- 判断ip的正则表达式
- JNDI
- PrefixHeader.pch 引入的头文件信息先后顺序很重要
- hadoop学习笔记(倒排索引)
- Linux下开启mysql数据库的远程访问权限
- 黑马程序员---流程控制,函数,数组
- oracle分组查询
- iOS里面的委托
- Linux多进程编程
- LeetCode 题解(129): Majority Element II
- 基于Maven的Mybatis+spring+springMVC框架整合(mapper代理方式)
- 单片机串口收发数据