倒排索引
来源:互联网 发布:网易云音乐推荐算法 编辑:程序博客网 时间:2024/06/05 05:30
创建两个文件数据,并上传到hdfs
data
file editfile fileview search
data2
abcaaedit editview filefile1 file1
要求:列出单词所在的文件目录和每个单词的个数
思路:
多个输入文件怎么弄? 使用addInputPath(),循环或单个添加
在map函数怎么知道单词的路径? 通过context.getInputSplit()获取FileSplit,这样就可以使用getPath()啦
map用什么作为k, v ? 用文件名和路径拼接作为k , 1作为v
package invartde;

import java.io.IOException;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Inverted index over multiple HDFS input files.
 *
 * <p>For every token in every input file, emits the key {@code word:filePath}
 * with the count 1; the reducer (also used as combiner) sums the counts, so
 * the final output lists, per (word, file) pair, how often the word occurs
 * in that file.
 */
public class Inverted {

  /** Tokenizes each input line and emits ("word:filePath", 1) per token. */
  public static class Map extends Mapper<Object, Text, Text, IntWritable> {

    // Reused constant value; avoids allocating a new IntWritable per token.
    private static final IntWritable ONE = new IntWritable(1);

    @Override
    protected void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      // The InputSplit identifies which file this record came from —
      // that path becomes part of the output key.
      FileSplit split = (FileSplit) context.getInputSplit();
      String path = split.getPath().toString();

      StringTokenizer tokens = new StringTokenizer(value.toString());
      while (tokens.hasMoreTokens()) {
        context.write(new Text(tokens.nextToken() + ":" + path), ONE);
      }
    }
  }

  /**
   * Sums the counts for each (word, file) key. Pure summation, so it is
   * also safe to register as the combiner.
   */
  public static class Re extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      int count = 0;
      for (IntWritable v : values) {
        count += v.get();
      }
      // Declare the checked exceptions instead of swallowing them:
      // a failed write must fail the task, not be silently logged.
      context.write(key, new IntWritable(count));
    }
  }

  /**
   * Configures and submits the job. The output directory is timestamped so
   * repeated runs never collide with an existing HDFS path.
   */
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String time =
        DateTimeFormatter.ofPattern("yyyyMMddHHmmss").format(LocalDateTime.now());
    // Demo setup: fixed input paths; real deployments would use the CLI args.
    args = new String[] {"/input/data", "/input/data2", "/output/" + time};

    Job job = Job.getInstance(conf);
    job.setJarByClass(Inverted.class);
    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setCombinerClass(Re.class);
    job.setReducerClass(Re.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    // Multiple inputs: addInputPath() may be called once per file/directory.
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileInputFormat.addInputPath(job, new Path(args[1]));
    FileOutputFormat.setOutputPath(job, new Path(args[2]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
输出结果:
file1:hdfs://localhost:9000/input/data2	2
file:hdfs://localhost:9000/input/data	3
file:hdfs://localhost:9000/input/data2	1
search:hdfs://localhost:9000/input/data	1
view:hdfs://localhost:9000/input/data	1
view:hdfs://localhost:9000/input/data2	1
上面结果单词出现了重复, 若要将重复的合并成一个,可以自定义key类型
package test1;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Inverted index that merges duplicate words into a single output record.
 *
 * <p>Map emits ("word--path", "1"); the combiner pre-aggregates per file into
 * ("word", "path;count"); the reducer concatenates the per-file postings and
 * appends the total count, producing one line per distinct word.
 */
public class Test1 {

  /**
   * Composite (word, path) key helper.
   *
   * <p>{@link #write(DataOutput)} and {@link #readFields(DataInput)} are exact
   * mirrors using writeUTF/readUTF, as Hadoop's Writable contract requires.
   */
  public static class DataBean implements WritableComparable<DataBean> {
    public String word;
    public String path;

    /** No-arg constructor required by Hadoop's reflective instantiation. */
    public DataBean() {}

    public DataBean(String word, String path) {
      this.word = word;
      this.path = path;
    }

    public String getWord() {
      return word;
    }

    public void setWord(String word) {
      this.word = word;
    }

    public String getPath() {
      return path;
    }

    public void setPath(String path) {
      this.path = path;
    }

    @Override
    public void write(DataOutput out) throws IOException {
      out.writeUTF(word);
      out.writeUTF(path);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
      // BUG FIX: must mirror write(). The original used in.readLine(), which
      // cannot parse writeUTF's length-prefixed modified-UTF-8 encoding and
      // would corrupt every deserialized bean.
      word = in.readUTF();
      path = in.readUTF();
    }

    @Override
    public int compareTo(DataBean o) {
      return word.compareTo(o.word);
    }

    // equals/hashCode kept consistent with compareTo (ordering by word only),
    // as the Comparable contract recommends.
    @Override
    public boolean equals(Object obj) {
      if (this == obj) {
        return true;
      }
      if (!(obj instanceof DataBean)) {
        return false;
      }
      return word.equals(((DataBean) obj).word);
    }

    @Override
    public int hashCode() {
      return word.hashCode();
    }

    @Override
    public String toString() {
      return word;
    }
  }

  /** Emits ("word--path", "1") for every token of every input line. */
  public static class Map extends Mapper<LongWritable, Text, Text, Text> {

    private static final Text ONE = new Text("1");

    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      // The InputSplit identifies the source file for this record.
      FileSplit split = (FileSplit) context.getInputSplit();
      String path = split.getPath().toString();

      StringTokenizer tokens = new StringTokenizer(value.toString());
      while (tokens.hasMoreTokens()) {
        DataBean bean = new DataBean(tokens.nextToken(), path);
        context.write(new Text(bean.word + "--" + bean.path), ONE);
      }
    }
  }

  /**
   * Pre-aggregates on the map side: sums the "1" values for each
   * ("word--path") key and re-keys the record as ("word", "path;count")
   * so the reducer groups by word alone.
   */
  public static class Combiner extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      for (Text v : values) {
        sum += Integer.parseInt(v.toString());
      }
      // Split the composite key back into word and path. "--" cannot occur
      // inside an hdfs:// path, so the split is unambiguous.
      String[] parts = key.toString().split("--");
      context.write(new Text(parts[0]), new Text(parts[1] + ";" + sum));
    }
  }

  /**
   * Concatenates the per-file "path;count" postings for each word and
   * appends the word's total occurrence count.
   */
  public static class Re extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      // StringBuilder instead of String += in the loop (O(n) vs O(n^2)).
      StringBuilder postings = new StringBuilder();
      for (Text v : values) {
        String posting = v.toString();
        postings.append(posting);
        sum += Integer.parseInt(posting.split(";")[1]);
      }
      context.write(key, new Text(postings + "总数:" + sum));
    }
  }

  /** Configures and submits the job with a timestamped output directory. */
  public static void main(String[] args) throws Exception {
    String time =
        DateTimeFormatter.ofPattern("yyyyMMddHHmmss").format(LocalDateTime.now());
    // Demo setup: fixed input paths; real deployments would use the CLI args.
    args = new String[] {"/input/data", "/input/data2", "/output/" + time};

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    // setJarByClass was missing: without it the job jar cannot be located
    // when submitting to a real cluster.
    job.setJarByClass(Test1.class);
    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setCombinerClass(Combiner.class);
    job.setReducerClass(Re.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileInputFormat.addInputPath(job, new Path(args[1]));
    FileOutputFormat.setOutputPath(job, new Path(args[2]));
    // Propagate success/failure through the exit code.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
结果:
aa	hdfs://localhost:9000/input/data2;1总数:1
abc	hdfs://localhost:9000/input/data2;1总数:1
edit	hdfs://localhost:9000/input/data2;2hdfs://localhost:9000/input/data;1总数:3
file	hdfs://localhost:9000/input/data;3hdfs://localhost:9000/input/data2;1总数:4
file1	hdfs://localhost:9000/input/data2;2总数:2
search	hdfs://localhost:9000/input/data;1总数:1
view	hdfs://localhost:9000/input/data;1hdfs://localhost:9000/input/data2;1总数:2
阅读全文
0 0
- 倒排索引原理
- 倒排索引
- 倒排索引
- 倒排索引
- 索引 倒排
- 倒排索引
- 倒排索引
- hadoop 倒排索引
- 倒排索引技术
- 什么是倒排索引?
- 倒排索引求子串
- 倒排索引
- 倒排索引
- 倒排索引
- 倒排索引
- 倒排索引
- 倒排索引
- 倒排索引
- 基于Bmob,环信easeUI的校园二手交易市场——用户管理(含头像),数据管理,文件管理(图片管理),聊天功能的开发
- 看你的linux编译系统是32位还是64位最简单的方法以及不同数据类型占用字节个数
- 一些基础的数论知识
- 【Linux C 多线程编程】互斥锁与条件变量
- 八小时实现迷你版vuejs之二:vuejs 架构
- 倒排索引
- 李开复给中国大学生的七封公开信(其六)
- uboot学习(1)
- 【fjsd 1261】整数拆分
- [NOIP2017模拟]游戏
- Python 操作MongoDB数据库
- NOIP2016 Day2
- 一 统计学习理论前奏:大数定理的发展脉络
- bzoj3231[Sdoi2008]递归数列