Hadoop下进行反向索引(Inverted Index)操作

来源：互联网 | 发布：李健老婆孟小蓓知乎 | 编辑：程序博客网 | 时间：2024/05/29 09:13

参考资料：
代码参考1：http://www.pudn.com/downloads212/sourcecode/unix_linux/detail999273.html
理论参考2：http://zhangyu8374.javaeye.com/blog/86307，http://nything.javaeye.com/blog/411787

在eclipse下创建map/reduce项目InvertedIndex，然后将参考1中的src目录拷贝到项目目录下替换原有src目录。

在本地创建文件夹IndexTest并在里面创建3个文件，每个文件中的内容如下。
    * T0 = "it is what it is"
    * T1 = "what is it"
    * T2 = "it is a banana"
其中T0，T1，T2分别是文件名，后面为文件内容。将IndexTest文件夹上传到DFS中。然后运行反向索引程序。

最后输出结果为（每个单词后面跟着若干个“(文件名, 该单词在文件中的位置)”对，位置从 1 开始计数）：
a     (T2, 3)
banana     (T2, 4)
is     (T2, 2) (T0, 2) (T0, 5) (T1, 2)
it     (T1, 3) (T2, 1) (T0, 1) (T0, 4)
what     (T0, 3) (T1, 1)

代码清单：
InvertedIndex.java

/* * To change this template, choose Tools | Templates * and open the template in the editor. */package pa4;import java.io.IOException;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.FileSystem;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.Mapper;import org.apache.hadoop.mapreduce.Reducer;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import org.apache.hadoop.util.GenericOptionsParser;/** * * @author Ming */public class InvertedIndex { public static class TokenizerMapper extends Mapper<Text, ValuePair, Text, ValuePair> {@Overridepublic void map(Text key, ValuePair value, Context context) throws IOException, InterruptedException {// TokenInputFormat has generate (word, (fileID, wordPosition))// so mapper just spill it to reducer key.set(key.toString().toLowerCase()); context.write(key, value);} } public static class IndexReducer extends Reducer<Text, ValuePair, Text, Text> {private Text postings = new Text();@Overridepublic void reduce(Text key, Iterable<ValuePair> values,Context context) throws IOException, InterruptedException { String list = ""; for (ValuePair val : values) {list += " " + val.toString(); } postings.set(list); context.write(key, postings);} } public static void main(String[] args) throws Exception {Configuration conf = new Configuration();String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();if (otherArgs.length != 2) { System.err.println("Usage: InvertedIndex <in-dir> <out-dir>"); System.exit(2);}// remove the old output dirFileSystem.get(conf).delete(new Path(otherArgs[1]), true);Job job = new Job(conf, "Inverted 
Indexer");job.setJarByClass(InvertedIndex.class);job.setInputFormatClass(TokenInputFormat.class);job.setMapperClass(InvertedIndex.TokenizerMapper.class);//job.setCombinerClass(InvertedIndex.IndexReducer.class);job.setReducerClass(InvertedIndex.IndexReducer.class);job.setMapOutputKeyClass(Text.class);job.setMapOutputValueClass(ValuePair.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(Text.class);FileInputFormat.addInputPath(job, new Path(otherArgs[0]));FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));System.exit(job.waitForCompletion(true) ? 0 : 1); }}

TokenInputFormat.java

package pa4;import java.io.IOException;import org.apache.hadoop.fs.FileSystem;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.input.FileSplit;import org.apache.hadoop.mapreduce.InputSplit;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.FSDataInputStream;import org.apache.hadoop.mapreduce.JobContext;import org.apache.hadoop.mapreduce.RecordReader;import org.apache.hadoop.mapreduce.TaskAttemptContext;import org.apache.hadoop.mapreduce.TaskAttemptID;import org.apache.hadoop.util.LineReader;import java.util.StringTokenizer;public class TokenInputFormat extends FileInputFormat<Text, ValuePair> { /** * Don't allow the files to be split! */ @Override protected boolean isSplitable(JobContext ctx, Path filename) {// ensure the input files are not splittable!return false; } /** * Just return the record reader * key is the docno */ public RecordReader<Text, ValuePair> createRecordReader(InputSplit split, TaskAttemptContext ctx) throws IOException, InterruptedException {return new TokenRecordReader(); } public static class TokenRecordReader extends RecordReader<Text, ValuePair> {private long start;private long pos;private long end;private LineReader in;private int maxLineLength;private Text line;private Text key = null;private ValuePair value = null;private StringTokenizer tokens = null;private int tokenPos = 0;private String fileID = "0";// input file id that appears in inverted indexpublic void initialize(InputSplit genericSplit,TaskAttemptContext context) throws IOException { FileSplit split = (FileSplit) genericSplit; Configuration job = context.getConfiguration(); this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE); start = split.getStart(); end = start + split.getLength(); final Path file = split.getPath(); // Assume file name is an integer of file ID fileID = file.getName(); FileSystem fs = 
file.getFileSystem(job); FSDataInputStream fileIn = fs.open(split.getPath()); in = new LineReader(fileIn, job); this.pos = start; line = new Text(); key = new Text(); value = new ValuePair();}public boolean nextKeyValue() throws IOException { boolean splitEnds = false; while (tokens == null || !tokens.hasMoreTokens()) {int lineSize = in.readLine(line, maxLineLength,Math.max((int) Math.min(Integer.MAX_VALUE, end - pos),maxLineLength));if (lineSize == 0) { splitEnds = true; break;}pos += lineSize;tokens = new StringTokenizer(line.toString(), " /t/n/r/f,.;<>-?///!'/":=*{}()$[]"); } if (splitEnds) {key = null;value = null;line = null;tokens = null;return false; } elsereturn true;}@Overridepublic Text getCurrentKey() { key.set(tokens.nextToken()); tokenPos ++; return key;}@Overridepublic ValuePair getCurrentValue() { value.set(fileID, tokenPos); return value;}/** * Get the progress within the split */public float getProgress() { if (start == end) {return 0.0f; } else {return Math.min(1.0f, (pos - start) / (float) (end - start)); }}public synchronized void close() throws IOException { if (in != null) {in.close(); }} } public static void main(String[] args) throws IOException {String fn = args[0];Configuration conf = new Configuration();FileSplit split = new FileSplit(new Path(fn), 0, 10000000, null);TokenRecordReader irr = new TokenRecordReader();TaskAttemptContext ctx = new TaskAttemptContext(conf,new TaskAttemptID("hello", 12, true, 12, 12));irr.initialize(split, ctx);while (irr.nextKeyValue()) { System.out.println(irr.getCurrentKey() + ": " + irr.getCurrentValue());} }}

ValuePair.java

package pa4;/* * To change this template, choose Tools | Templates * and open the template in the editor. */import java.io.*;import org.apache.hadoop.io.*;/** * * @author Ming */public class ValuePair implements WritableComparable<ValuePair> { private Text one; private IntWritable two; public void set(Text first, IntWritable second) {one = first;two = second; } public void set(String first, int second) {one.set(first);two.set(second); } public ValuePair() {set(new Text(), new IntWritable()); } public ValuePair(Text first, IntWritable second) {set(first, second); } public ValuePair(String first, int second) {set(first, second); } public Text getFirst() {return one; } public IntWritable getSecond() {return two; } @Override public void write(DataOutput out) throws IOException {one.write(out);two.write(out); } @Override public void readFields(DataInput in) throws IOException {one.readFields(in);two.readFields(in); } @Override public int hashCode() {return one.hashCode(); } @Override public boolean equals(Object o) {if (o instanceof ValuePair) { ValuePair tp = (ValuePair)o; return one.equals(tp.one);}return false; } @Override public String toString() {return "(" + one + ", " + two + ")"; } @Override public int compareTo(ValuePair tp) {int cmp = one.compareTo(tp.one);if (cmp != 0) { return cmp;}return two.compareTo(tp.two); } public static class Comparator extends WritableComparator {private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();private static final IntWritable.Comparator INT_COMPARATOR = new IntWritable.Comparator();public Comparator() { super(ValuePair.class);}@Overridepublic int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { try {int oneL1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1);int oneL2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2);int cmp = TEXT_COMPARATOR.compare(b1, s1, oneL1, b2, s2, oneL2);if (cmp != 0) { return cmp;}return INT_COMPARATOR.compare(b1, s1+oneL1, l1-oneL1,b2, s2+oneL2, 
l2-oneL2); } catch (IOException e) {throw new IllegalArgumentException(e); }}@Overridepublic int compare(WritableComparable a, WritableComparable b) { if (a instanceof ValuePair && b instanceof ValuePair) {return ((ValuePair) a).compareTo((ValuePair) b); } return super.compare(a, b);} } static {WritableComparator.define(ValuePair.class, new Comparator()); }}