hadoop demo 倒排索引
来源:互联网 发布:nginx 子域名映射端口 编辑:程序博客网 时间:2024/06/05 01:07
通过map、combine、reduce实现倒排索引
package com.asin.hdp.inverted;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Builds an inverted index over three local text files using a
 * map / combine / reduce pipeline.
 *
 * <p>Pipeline contract (as visible in this file):
 * <ul>
 *   <li>map:     ("fileName\tword", "1") for every whitespace token</li>
 *   <li>combine: sums the "1"s per (file, word) and REWRITES the key,
 *                emitting ("word", "fileName:count")</li>
 *   <li>reduce:  concatenates all "fileName:count" entries per word,
 *                tab-separated (with a trailing tab)</li>
 * </ul>
 *
 * <p>NOTE(review): correctness here depends on the combiner running exactly
 * once per (file, word) group. Hadoop only treats a combiner as an optional
 * optimization — it may run zero or multiple times — so this classic demo
 * layout is fragile on a real cluster; confirm before reusing in production.
 */
public class InvertedIndexCombine {

    /**
     * Job driver: wires up mapper/combiner/reducer and the hard-coded
     * local input/output paths, then exits with 0 on success, 1 on failure.
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(InvertedIndexCombine.class);
        job.setMapperClass(invertedMapper.class);
        job.setCombinerClass(invertedCombine.class);
        job.setReducerClass(invertedReduce.class);
        // Mapper and reducer share the same (Text, Text) output types,
        // so a single pair of declarations covers both stages.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path("e:/a.txt"));
        FileInputFormat.addInputPath(job, new Path("e:/b.txt"));
        FileInputFormat.addInputPath(job, new Path("e:/c.txt"));
        FileOutputFormat.setOutputPath(job, new Path("e:/outputCombine"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Tokenizes each input line on spaces and emits
     * ("fileName\tword", "1") so the combiner can count per (file, word).
     */
    public static class invertedMapper extends Mapper<LongWritable, Text, Text, Text> {

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            FileSplit split = (FileSplit) context.getInputSplit();
            // Path.getName() already returns only the final path component,
            // so no directory-prefix stripping is needed.
            String name = split.getPath().getName();
            StringTokenizer token = new StringTokenizer(value.toString(), " ");
            while (token.hasMoreTokens()) {
                context.write(new Text(name + "\t" + token.nextToken()), new Text("1"));
            }
        }
    }

    /**
     * Sums the "1" counts for each "fileName\tword" key, then rewrites the
     * key: emits ("word", "fileName:count"). See the class-level note on why
     * rewriting the key inside a combiner is fragile.
     */
    public static class invertedCombine extends Reducer<Text, Text, Text, Text> {

        @Override
        protected void reduce(Text key, Iterable<Text> values,
                Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // split[0] = file name, split[1] = word (key format set by the mapper).
            String[] split = key.toString().split("\t");
            int sum = 0;
            for (Text text : values) {
                sum += Integer.parseInt(text.toString());
            }
            context.write(new Text(split[1]), new Text(split[0] + ":" + sum));
        }
    }

    /**
     * Concatenates every "fileName:count" posting for a word into one
     * tab-separated line (with a trailing tab, matching the demo output).
     */
    public static class invertedReduce extends Reducer<Text, Text, Text, Text> {

        @Override
        protected void reduce(Text key, Iterable<Text> values,
                Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // StringBuilder avoids O(n^2) String += accumulation.
            StringBuilder postings = new StringBuilder();
            for (Text text : values) {
                postings.append(text).append("\t");
            }
            // The incoming key can be written directly; copying it into a
            // fresh Text added nothing.
            context.write(key, new Text(postings.toString()));
        }
    }
}
执行结果
A	b.txt:1
Asia	c.txt:1
CFP	a.txt:1	b.txt:1
China's	b.txt:2	c.txt:1	a.txt:1
County	a.txt:1
East	c.txt:1
Every	c.txt:1
Friday	b.txt:1	a.txt:1
Hebei	c.txt:1	b.txt:1
Heilongjiang	b.txt:1	a.txt:1
阅读全文
0 0
- hadoop demo 倒排索引
- hadoop 倒排索引
- hadoop 倒排索引
- hadoop倒排索引
- hadoop 倒排索引
- hadoop 倒排索引
- hadoop倒排索引
- hadoop倒排索引
- hadoop学习-倒排索引
- hadoop学习-倒排索引
- hadoop倒排索引---学习
- hadoop实现倒排索引
- Hadoop之倒排索引
- Hadoop之倒排索引
- Hadoop之倒排索引
- hadoop之倒排索引
- hadoop之倒排索引
- hadoop实现简单的倒排索引
- ubuntu 之 安装TFTP server
- Spring MVC 自学杂记(七) -- 去掉静态资源的拦截
- Fisher r-z变换,z-score标准化与常用标准化
- 零基础学TensorFlow(一):virtualenv及TensorFlow环境搭建
- Python笔记1——list,tuple,dict,set,不可变对象
- hadoop demo 倒排索引
- 自适应直方图均衡——adapthisteq
- linux使用国内镜像源
- Firebreath基础教程之三:使用插件
- mysql in和exists的效率
- JAVA断言使用
- QT学习路四
- C++ main()函数及其参数
- HttpUrlConnection与HttpClient的认识(三)-HttpClient的使用