hadoop demo 倒排索引

来源：互联网　发布：nginx 子域名映射端口　编辑：程序博客网　时间：2024/06/05 01:07

通过map、combine、reduce实现倒排索引

package com.asin.hdp.inverted;

import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * MapReduce job that builds an inverted index: for every word it lists the
 * files the word occurs in together with the per-file occurrence count.
 *
 * <p>Data flow:
 * <ul>
 *   <li>map: {@code word -> "file:1"} for every space-separated token;</li>
 *   <li>combine: {@code word -> "file:count"}, one record per file, counts summed;</li>
 *   <li>reduce: {@code word -> "fileA:n<TAB>fileB:m<TAB>"}.</li>
 * </ul>
 *
 * <p>The map output key is the word alone and the combiner keeps both the key
 * and the {@code file:count} value format unchanged. This matters because
 * Hadoop treats a combiner as an optional optimisation that may be applied
 * zero, one or several times, and because partitioning is done on the map
 * output key: all postings of a word must share that key to reach the same
 * reducer.
 */
public class InvertedIndexCombine {

    /** Input files used when no paths are given on the command line. */
    private static final String[] DEFAULT_INPUTS = {"e:/a.txt", "e:/b.txt", "e:/c.txt"};

    /** Output directory used when no paths are given on the command line. */
    private static final String DEFAULT_OUTPUT = "e:/outputCombine";

    /** Separates the file name from its count inside a posting ("file:count"). */
    private static final char COUNT_SEPARATOR = ':';

    /**
     * Configures and runs the job, then exits with status 0 on success and 1 on failure.
     *
     * @param args optional paths: {@code <input>... <output>}. When fewer than
     *             two arguments are supplied the built-in default paths are used.
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(InvertedIndexCombine.class);

        job.setMapperClass(invertedMapper.class);
        job.setCombinerClass(invertedCombine.class);
        job.setReducerClass(invertedReduce.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        if (args.length >= 2) {
            // Last argument is the output directory, everything before it is input.
            for (int i = 0; i < args.length - 1; i++) {
                FileInputFormat.addInputPath(job, new Path(args[i]));
            }
            FileOutputFormat.setOutputPath(job, new Path(args[args.length - 1]));
        } else {
            for (String input : DEFAULT_INPUTS) {
                FileInputFormat.addInputPath(job, new Path(input));
            }
            FileOutputFormat.setOutputPath(job, new Path(DEFAULT_OUTPUT));
        }

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Sums the counts of {@code "file:count"} postings per file.
     *
     * <p>The file name is everything before the last separator, so file names
     * that themselves contain the separator are handled. Files are returned in
     * the order in which they were first seen.
     *
     * @param postings values of the form {@code file:count}
     * @return file name mapped to its summed count
     * @throws IllegalArgumentException if a posting has no separator or a non-numeric count
     */
    private static Map<String, Long> sumPerFile(Iterable<Text> postings) {
        Map<String, Long> totals = new LinkedHashMap<String, Long>();
        for (Text posting : postings) {
            String entry = posting.toString();
            int sep = entry.lastIndexOf(COUNT_SEPARATOR);
            if (sep < 0) {
                throw new IllegalArgumentException(
                        "Malformed posting (expected file:count): " + entry);
            }
            String file = entry.substring(0, sep);
            long count = Long.parseLong(entry.substring(sep + 1));
            Long previous = totals.get(file);
            totals.put(file, previous == null ? count : previous + count);
        }
        return totals;
    }

    /** Emits {@code word -> "file:1"} for every space-separated token of the input line. */
    public static class invertedMapper extends Mapper<LongWritable, Text, Text, Text> {

        // Reused across calls; context.write serialises the contents immediately.
        private final Text word = new Text();
        private final Text posting = new Text();

        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            FileSplit split = (FileSplit) context.getInputSplit();
            // getName() already yields only the final path component (the file name).
            String name = split.getPath().getName();
            posting.set(name + COUNT_SEPARATOR + "1");

            StringTokenizer token = new StringTokenizer(value.toString(), " ");
            while (token.hasMoreTokens()) {
                word.set(token.nextToken());
                context.write(word, posting);
            }
        }
    }

    /**
     * Pre-aggregates postings on the map side: one {@code word -> "file:count"}
     * record per file. Input and output share the same key and value format, so
     * the result is correct whether this runs zero, one or many times.
     */
    public static class invertedCombine extends Reducer<Text, Text, Text, Text> {

        private final Text posting = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            for (Map.Entry<String, Long> entry : sumPerFile(values).entrySet()) {
                posting.set(entry.getKey() + COUNT_SEPARATOR + entry.getValue());
                context.write(key, posting);
            }
        }
    }

    /**
     * Produces the final index line for a word: every {@code file:count}
     * posting followed by a tab (the trailing tab of the original output
     * format is kept).
     */
    public static class invertedReduce extends Reducer<Text, Text, Text, Text> {

        @Override
        protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            StringBuilder line = new StringBuilder();
            // Counts are summed again here because the combiner may not have run,
            // or may have produced several partial counts for the same file.
            for (Map.Entry<String, Long> entry : sumPerFile(values).entrySet()) {
                line.append(entry.getKey())
                    .append(COUNT_SEPARATOR)
                    .append(entry.getValue())
                    .append('\t');
            }
            context.write(key, new Text(line.toString()));
        }
    }
}

执行结果

A	b.txt:1
Asia	c.txt:1
CFP	a.txt:1	b.txt:1
China's	b.txt:2	c.txt:1	a.txt:1
County	a.txt:1
East	c.txt:1
Every	c.txt:1
Friday	b.txt:1	a.txt:1
Hebei	c.txt:1	b.txt:1
Heilongjiang	b.txt:1	a.txt:1

阅读全文

0 0