Mapreduce实现倒排索引建立

来源：互联网发布：报税软件编辑：程序博客网时间：2024/06/08 10:37

需求：有大量的文本（文档、网页），需要建立搜索索引
举个例子吧
a文件内容
hello hadoop
hello hdfs
hello mapreduce
hi hive

b文件内容
hi hadoop
hi hdfs
hi mapreduce
hi hive
c文件。。。
最后要求建立索引
输出到一个文件中显示每个单词在每个小文件中出现的次数
如：hi在a文件出现了1次，b文件出现了4次
hi a.txt 1: b.txt 4

解决思路：使用mapreduce分两个阶段进行
第一个阶段：统计每个单词在各自的文件中出现的次数，不同的文件分成不同的行。
如：
hadoop--a.txt 1
hadoop--b.txt 1
…
第二个阶段：以第一阶段的输出作为输入，按单词进行合并，把同一个单词在各个文件中出现的次数拼接到同一行。如：hi a.txt 1: b.txt 4。

程序实现：第一阶段

/** * 统计每个单词在各个文件中出现的次数    分行给出 * @author 12706 * */public class ReverseIndexStepOne {    static class ReverseIndexStepOneMapper extends Mapper<LongWritable, Text, Text, IntWritable>{        Text k = new Text();        IntWritable v = new IntWritable(1);        //传进来的数据hello tom jerry        @Override        protected void map(LongWritable key, Text value, Context context)                throws IOException, InterruptedException {            //获取文件名            FileSplit fs = (FileSplit)context.getInputSplit();            String fileName = fs.getPath().getName();            //获取一行数据如:hello tom jerry            String line = value.toString();            String[] words = line.split("\t");            for (String word : words) {                //以为<word--文件名,1>的形式写出到输出收集器                k.set(word+"--"+fileName);                context.write(k, v);            }        }    }    static class ReverseIndexStepOneReducer extends Reducer<Text, IntWritable, Text, IntWritable>{        //传进来的数据是<word--文件名,values(多个InIntWritable值为1)>        @Override        protected void reduce(Text key, Iterable<IntWritable> values,                Context context) throws IOException, InterruptedException {            int count = 0;            for (IntWritable value : values) {                count += value.get();            }            //以<word--file,数量>写出去            context.write(key, new IntWritable(count));        }    }    public static void main(String[] args) throws Exception {        Configuration conf = new Configuration();        Job job = Job.getInstance(conf);        //jar包位置        job.setJarByClass(ReverseIndexStepOne.class);        job.setMapperClass(ReverseIndexStepOneMapper.class);        job.setReducerClass(ReverseIndexStepOneReducer.class);        job.setMapOutputKeyClass(Text.class);        job.setMapOutputValueClass(IntWritable.class);        //设置最终输出类型        job.setOutputKeyClass(Text.class);        job.setOutputValueClass(IntWritable.class);        
FileInputFormat.setInputPaths(job, new Path(args[0]));        FileOutputFormat.setOutputPath(job, new Path(args[1]));        boolean ex = job.waitForCompletion(true);        System.exit(ex?0:1);     }}

测试第一阶段：
将工程打包上传到集群
创建输入文件夹
/reverse/input
创建文件reverse1.data、reverse2.data并编辑内容作为测试
将文件上传到input文件夹下
执行程序，查看输出文件内容

[root@mini1 ~]# hadoop fs -mkdir -p /reverse/input
[root@mini1 ~]# vi reverse2.data
hi      hadoop
hi      hdfs
hi      mapreduce
hi      hive
hello   spark
[root@mini1 ~]# vi reverse1.data
hello   hadoop
hello   hdfs
hello   mapreduce
hello   hive
[root@mini1 ~]# hadoop fs -put reverse1.data reverse2.data /reverse/input/
[root@mini1 ~]# hadoop jar reverse.jar com.scu.hadoop.sreverse.ReverseIndexStepOne /reverse/input /reverse/output
[root@mini1 ~]# hadoop fs -cat /reverse/output/part-r-00000
hadoop--reverse1.data   1
hadoop--reverse2.data   1
hdfs--reverse1.data     1
hdfs--reverse2.data     1
hello--reverse1.data    4
hello--reverse2.data    1
hi--reverse2.data       4
hive--reverse1.data     1
hive--reverse2.data     1
mapreduce--reverse1.data        1
mapreduce--reverse2.data        1
spark--reverse2.data    1

程序实现：第二阶段

public class ReverIndexStepTwo {    /*     * 传进来的数据：统计每个单词在各个文件中出现的次数 一个单词就是一行     * 最终输出形式 ：单词       文件1 文件1中次数  。。。文件n  文件n中出现次数     *  hadoop--reverse1.data   1        hadoop--reverse2.data   1        hdfs--reverse1.data     1        hdfs--reverse2.data     1        hello--reverse1.data    4        hello--reverse2.data    1     */    static class ReverIndexStepTwoMapper extends Mapper<LongWritable, Text, Text, Text>{        Text k = new Text();        @Override        protected void map(LongWritable key, Text value,Context context)                throws IOException, InterruptedException {            //获得一行数据            String line = value.toString();            //切分            String[] word_file_counts = line.split("--");            //输出形式<hadoop,reverse1.data   1>            context.write(new Text(word_file_counts[0]), new Text(word_file_counts[1]+":"));        }    }    static class ReverIndexStepTwoReducer extends Reducer<Text, Text, Text, Text>{        //进来的形式<hadoop,reverse1.data   1reverse2.data   1>        @Override        protected void reduce(Text key, Iterable<Text> values, Context context)                throws IOException, InterruptedException {            StringBuffer sb = new StringBuffer();            for (Text value : values) {                //字符串拼接作为kv中的value输出                sb.append(value);            }            context.write(key, new Text(sb.toString()));        }    }    public static void main(String[] args) throws Exception {        Configuration conf = new Configuration();        Job job = Job.getInstance(conf);        //jar包位置        job.setJarByClass(ReverIndexStepTwo.class);        job.setMapperClass(ReverIndexStepTwoMapper.class);        job.setReducerClass(ReverIndexStepTwoReducer.class);        job.setMapOutputKeyClass(Text.class);        job.setMapOutputValueClass(Text.class);        //设置最终输出类型        job.setOutputKeyClass(Text.class);        job.setOutputValueClass(Text.class);        FileInputFormat.setInputPaths(job, new 
Path(args[0]));        FileOutputFormat.setOutputPath(job, new Path(args[1]));        boolean ex = job.waitForCompletion(true);        System.exit(ex?0:1);     }}

第二阶段测试：
将工程打包上传到集群
运行程序查看输出（第一阶段的输出文件作为第二阶段的输入）

[root@mini1 ~]# hadoop jar reverse.jar com.scu.hadoop.sreverse.ReverIndexStepTwo /reverse/output/part-r-00000 /reverse/output2
[root@mini1 ~]# hadoop fs -ls /reverse/output2
Found 2 items
-rw-r--r--   2 root supergroup          0 2017-10-16 23:35 /reverse/output2/_SUCCESS
-rw-r--r--   2 root supergroup        241 2017-10-16 23:35 /reverse/output2/part-r-00000
[root@mini1 ~]# hadoop fs -cat /reverse/output2/part-r-00000
hadoop  reverse2.data   1:reverse1.data 1:
hdfs    reverse2.data   1:reverse1.data 1:
hello   reverse2.data   1:reverse1.data 4:
hi      reverse2.data   4:
hive    reverse2.data   1:reverse1.data 1:
mapreduce       reverse2.data   1:reverse1.data 1:
spark   reverse2.data   1:

阅读全文

0 0