The Inverted Index Algorithm for Files and Its Hadoop Implementation


What is an inverted index of files?

Simply put, it is an algorithm used by search engines. With an inverted index, you can quickly look up the list of documents that contain a given word. An inverted index consists mainly of two parts: the "words" and, for each word, the "inverted file" (posting list) recording where it appears.

For a more detailed explanation, this blog post covers it well: http://blog.csdn.net/hguisu/article/details/7962350
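Before moving to Hadoop, a minimal in-memory sketch may help make the structure concrete (the class and method names below are illustrative and are not part of the Hadoop job that follows):

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;

public class SimpleInvertedIndex
{
    // word -> (document -> occurrence count): the posting list for each word
    private final Map<String, Map<String, Integer>> index = new HashMap<>();

    // add one document's text to the index
    public void add(String doc, String text)
    {
        StringTokenizer stk = new StringTokenizer(text);
        while (stk.hasMoreTokens())
        {
            index.computeIfAbsent(stk.nextToken(), w -> new HashMap<>()).merge(doc, 1, Integer::sum);
        }
    }

    // documents containing the word, with per-document counts
    public Map<String, Integer> lookup(String word)
    {
        return index.getOrDefault(word, Collections.emptyMap());
    }
}

Looking up a word is then a single map access; the MapReduce job below reproduces exactly this structure, but distributed over many files.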

1. MapReduce design approach

The whole job consists of three stages: map, combiner, and reduce. The key/value types of each stage are listed in the table below:


Stage      InputKey   InputValue   OutputKey   OutputValue
Map        Object     Text         Text        Text
Combiner   Text       Text         Text        Text
Reduce     Text       Text         Text        Text


The input files are read with the default TextInputFormat, so each map call receives one line of text: the input key is the byte offset of the line (a LongWritable, received here as Object) and the input value is the line content. The three stages then work as follows:

Map: tokenize the content of each line; the output key is "word:document" and the output value is the occurrence count, here the Text value "1";

Combiner: for each input key, parse the values as ints and sum them, and move the document name from the key into the value; the output key is "word" and the output value is "document:count;…";

Reduce: for each input key, split each value at the colon, extract and sum the occurrence counts while counting the number of documents, and compute the average number of occurrences per document; the output key is "word<TAB>average count" and the output value is "document:count;…".
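As a concrete illustration (the document names and counts are made up), the records passing between the stages might look like this:

Map output:      ("江湖:doc1", "1"), ("江湖:doc1", "1"), ("江湖:doc2", "1")
Combiner output: ("江湖", "doc1:2"), ("江湖", "doc2:1")
Reduce output:   ("江湖\t1.50", "doc1:2;doc2:1;")

The word appears 3 times across 2 documents, so the average is 3 / 2 = 1.50.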

2. MapReduce code fragments

The Map code is as follows:

public static class Map extends Mapper<Object, Text, Text, Text>
{
    private Text valueInfo = new Text();
    private Text keyInfo = new Text();
    private FileSplit split;

    public void map(Object key, Text value, Context context) throws IOException, InterruptedException
    {
        split = (FileSplit) context.getInputSplit();
        StringTokenizer stk = new StringTokenizer(value.toString()); // tokenize the line
        while (stk.hasMoreElements()) // more tokens remain
        {
            String name = split.getPath().getName();  // file name of the current split
            int splitIndex = name.indexOf(".");       // position of the dot in the file name
            keyInfo.set(stk.nextToken() + ":" + name.substring(0, splitIndex)); // key = word:filename (extension stripped)
            valueInfo.set("1");                       // output value is "1"
            context.write(keyInfo, valueInfo);        // write to the context
        }
    }
}

The Combiner code is as follows:

public static class Combiner extends Reducer<Text, Text, Text, Text>
{
    Text info = new Text();

    public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException
    {
        int sum = 0;
        for (Text value : values)
        {
            sum += Integer.parseInt(value.toString()); // sum the occurrences of this word in this document
        }
        int splitIndex = key.toString().indexOf(":");                    // position of the colon in the key
        info.set(key.toString().substring(splitIndex + 1) + ":" + sum);  // value = document:count
        key.set(key.toString().substring(0, splitIndex));                // key = word
        context.write(key, info);                                        // write to the context
    }
}

The Reduce code is as follows:

public static class Reduce extends Reducer<Text, Text, Text, Text>
{
    private Text result = new Text();

    public void reduce(Text key, Iterable<Text> values, Context contex) throws IOException, InterruptedException
    {
        String fileList = new String();
        double sum = 0, cnt = 0;
        for (Text value : values)
        {
            cnt++;                               // count how many documents the word appears in
            fileList += value.toString() + ";";  // separate document:count pairs with semicolons
            int splitIndex = value.toString().indexOf(":");
            sum += Integer.parseInt(value.toString().substring(splitIndex + 1)); // accumulate total occurrences
        }
        sum /= cnt;                              // average occurrences per document
        result.set(fileList);                    // set the output value
        key.set(key.toString() + '\t' + String.format("%.2f", sum)); // set the output key
        contex.write(key, result);               // write to the context
    }
}

The final output key here is "word<TAB>average count" and the value is "document:count;…".

Development environment: IntelliJ IDEA + Maven + Java 1.8

The inverted index was built over a collection of wuxia novels; a screenshot of the entry for "江湖" in the output file is shown below:


The complete code is as follows:

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class InvertedIndex
{
    private static Configuration conf2 = null;
    static
    {
        conf2 = HBaseConfiguration.create();
    }

    // Write one cell into HBase: rowKey -> family:qualifier = value
    public static void addData(String tableName, String rowKey, String family,
                               String qualifier, String value) throws Exception
    {
        try
        {
            HTable table = new HTable(conf2, tableName);
            Put put = new Put(Bytes.toBytes(rowKey));
            put.add(Bytes.toBytes(family), Bytes.toBytes(qualifier), Bytes.toBytes(value));
            table.put(put);
            System.out.println("insert success!");
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
    }

    public static class Map extends Mapper<Object, Text, Text, Text>
    {
        private Text valueInfo = new Text();
        private Text keyInfo = new Text();
        private FileSplit split;

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException
        {
            split = (FileSplit) context.getInputSplit();
            StringTokenizer stk = new StringTokenizer(value.toString()); // tokenize the line
            while (stk.hasMoreElements())
            {
                String name = split.getPath().getName();  // file name of the current split
                int splitIndex = name.indexOf(".");       // position of the dot in the file name
                keyInfo.set(stk.nextToken() + ":" + name.substring(0, splitIndex)); // word:filename (extension stripped)
                valueInfo.set("1");
                context.write(keyInfo, valueInfo);
            }
        }
    }

    public static class Combiner extends Reducer<Text, Text, Text, Text>
    {
        Text info = new Text();

        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException
        {
            int sum = 0;
            for (Text value : values)
            {
                sum += Integer.parseInt(value.toString()); // occurrences of this word in this document
            }
            int splitIndex = key.toString().indexOf(":");
            info.set(key.toString().substring(splitIndex + 1) + ":" + sum); // value = document:count
            key.set(key.toString().substring(0, splitIndex));               // key = word
            context.write(key, info);
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text>
    {
        private Text result = new Text();

        public void reduce(Text key, Iterable<Text> values, Context contex) throws IOException, InterruptedException
        {
            // Build the document list
            String fileList = new String();
            double sum = 0, cnt = 0;
            for (Text value : values)
            {
                cnt++;
                fileList += value.toString() + ";";
                int splitIndex = value.toString().indexOf(":");
                sum += Integer.parseInt(value.toString().substring(splitIndex + 1));
            }
            sum /= cnt; // average occurrences per document
            result.set(fileList);
            //key.set(key.toString() + '\t' + String.format("%.2f", sum));
            try
            {
                // store the average count in HBase instead of appending it to the key
                addData("test", key.toString(), "BigData", "aveNum", String.format("%.2f", sum));
            }
            catch (Exception e)
            {
                e.printStackTrace();
            }
            contex.write(key, result);
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException
    {
        Configuration conf = new Configuration();      // configuration object
        Job job = new Job(conf, "InvertedIndex");      // create the job
        job.setJarByClass(InvertedIndex.class);        // main class of the job jar
        job.setMapperClass(Map.class);                 // mapper
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setCombinerClass(Combiner.class);          // combiner
        job.setReducerClass(Reduce.class);             // reducer
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        //FileInputFormat.addInputPath(job, new Path("/data/wuxia_novels/"));
        //FileOutputFormat.setOutputPath(job, new Path("/user/2016st28/exp2/"));
        FileInputFormat.addInputPath(job, new Path("/input/exp2/"));     // input path
        FileOutputFormat.setOutputPath(job, new Path("/output/test/"));  // output path
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
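One practical note: addData assumes that the HBase table "test" with column family "BigData" already exists, otherwise the put fails. A minimal sketch of creating it up front, using the same older HBase client API style as the job above (the class name CreateIndexTable is just for illustration):

import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.HBaseAdmin;

public class CreateIndexTable
{
    public static void main(String[] args) throws Exception
    {
        // Create the table the job writes to, if it does not exist yet.
        HBaseAdmin admin = new HBaseAdmin(HBaseConfiguration.create());
        if (!admin.tableExists("test"))
        {
            HTableDescriptor desc = new HTableDescriptor(TableName.valueOf("test"));
            desc.addFamily(new HColumnDescriptor("BigData"));
            admin.createTable(desc);
        }
        admin.close();
    }
}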

