MapReduce Series (6) --- Building an Inverted Index


1. Overview

Suppose we have three files, a.txt, b.txt, and c.txt, with the following contents:

a.txt:
tian jun
li lei
han meimei
li lei
han meimei

b.txt:
li lei
han meimei
tian jun
gege jiejie
tian jun
gege jiejie

c.txt:
gege jiejie
han meimei
tian jun
han meimei
tian jun

Counting how many times each word appears in each file gives us the inverted index. The expected result looks like this:

gege    b.txt-->2,c.txt-->1
han     a.txt-->2,b.txt-->1,c.txt-->2
jiejie  b.txt-->2,c.txt-->1
jun     c.txt-->2,b.txt-->2,a.txt-->1
lei     b.txt-->1,a.txt-->2
li      a.txt-->2,b.txt-->1
meimei  a.txt-->2,b.txt-->1,c.txt-->2
tian    b.txt-->2,c.txt-->2,a.txt-->1

Approach:
In a MapReduce program, records with the same key are grouped together in the reducer. Building on that, we can concatenate each word with the name of the file it came from to form the map output key ("word--filename"), which already gets us very close to the result we need. A single MR job, however, cannot produce the final layout directly, so we run a second job that swaps key and value (and rewrites the format of the first job's key); two chained MR jobs are enough to meet the requirement, as illustrated below.
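For example, every occurrence of "tian" in a.txt makes the first job's mapper emit the key "tian--a.txt" with the value 1, and the first reducer sums those occurrences into a record such as:

tian--a.txt	1

The second job's mapper splits this line on "--": "tian" becomes the key, and the rest of the line, rewritten as "a.txt-->1", becomes the value. The second reducer therefore receives all the per-file counts of one word together and only has to join them with commas.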

2. Code Implementation

inverIndexStepOne.java

package inverIndex;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

/**
 * Created by tianjun on 2017/3/20.
 */
public class inverIndexStepOne {

    static class InverIndexStepOneMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        Text k = new Text();
        IntWritable v = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] words = line.split(" ");
            // the input split tells us which file the current line came from
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            String filename = inputSplit.getPath().getName();
            for (String word : words) {
                k.set(word + "--" + filename);
                context.write(k, v);
            }
        }
    }

    static class InverIndexStepOneReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // sum the occurrences of one "word--filename" key
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            context.write(key, new IntWritable(count));
        }
    }

    public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {

        String os = System.getProperty("os.name").toLowerCase();
        if (os.contains("windows")) {
            System.setProperty("HADOOP_USER_NAME", "root");
        }

        Configuration conf = new Configuration();
        conf.set("mapreduce.framework.name", "yarn");
        conf.set("yarn.resourcemanager.hostname", "mini01");
        conf.set("fs.defaultFS", "hdfs://mini01:9000/");
//        local mode is the default
//        conf.set("mapreduce.framework.name", "local");
//        conf.set("mapreduce.jobtracker.address", "local");
//        conf.set("fs.defaultFS", "file:///");

        Job wcjob = Job.getInstance(conf);

        wcjob.setJar("F:/myWorkPlace/java/dubbo/demo/dubbo-demo/mr-demo1/target/mr.demo-1.0-SNAPSHOT.jar");
        // picking the jar up from the local classpath does not work here, so setJar is used instead of setJarByClass
//        wcjob.setJarByClass(Rjoin.class);

        wcjob.setMapperClass(InverIndexStepOneMapper.class);
        wcjob.setReducerClass(InverIndexStepOneReducer.class);

        // output key/value types of our Mapper
        wcjob.setMapOutputKeyClass(Text.class);
        wcjob.setMapOutputValueClass(IntWritable.class);

        // output key/value types of our Reducer
        wcjob.setOutputKeyClass(Text.class);
        wcjob.setOutputValueClass(IntWritable.class);

        // if no InputFormat is set, TextInputFormat is used by default
//        wcjob.setInputFormatClass(CombineFileInputFormat.class);
//        CombineFileInputFormat.setMaxInputSplitSize(wcjob, 4194304);
//        CombineFileInputFormat.setMinInputSplitSize(wcjob, 2097152);

        FileSystem fs = FileSystem.get(new URI("hdfs://mini01:9000"), new Configuration(), "root");
        Path path = new Path("hdfs://mini01:9000/wc/index/stepone");
        if (fs.exists(path)) {
            fs.delete(path, true);
        }

        // location of the input data
        FileInputFormat.setInputPaths(wcjob, new Path("hdfs://mini01:9000/input/index"));
        // location where the results are saved
        FileOutputFormat.setOutputPath(wcjob, new Path("hdfs://mini01:9000/wc/index/stepone"));

        boolean res = wcjob.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
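For the three sample files above, the first job's output directory should contain records of the form "word--filename", a tab, and the count, roughly as follows (with the default single reducer, keys come out sorted):

gege--b.txt	2
gege--c.txt	1
han--a.txt	2
han--b.txt	1
han--c.txt	2
jiejie--b.txt	2
jiejie--c.txt	1
jun--a.txt	1
jun--b.txt	2
jun--c.txt	2
lei--a.txt	2
lei--b.txt	1
li--a.txt	2
li--b.txt	1
meimei--a.txt	2
meimei--b.txt	1
meimei--c.txt	2
tian--a.txt	1
tian--b.txt	2
tian--c.txt	2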

inverIndexStepTwo.java

package inverIndex;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

/**
 * Created by tianjun on 2017/3/20.
 */
public class inverIndexStepTwo {

    static class inverIndexStepTwoMapper extends Mapper<LongWritable, Text, Text, Text> {

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // each input line looks like "word--filename\tcount"
            String line = value.toString();
            String[] word_file = line.split("--");
            // rewrite "filename\tcount" as "filename-->count"
            String temp = word_file[1].replace("\t", "-->");
            context.write(new Text(word_file[0]), new Text(temp));
        }
    }

    static class inverIndexStepTwoReducer extends Reducer<Text, Text, Text, Text> {

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // concatenate all "filename-->count" entries for one word
            StringBuffer sb = new StringBuffer();
            for (Text value : values) {
                if (sb.length() != 0) {
                    sb.append(",");
                }
                sb.append(value.toString());
            }
            context.write(key, new Text(sb.toString()));
        }
    }

    public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {

        String os = System.getProperty("os.name").toLowerCase();
        if (os.contains("windows")) {
            System.setProperty("HADOOP_USER_NAME", "root");
        }

        Configuration conf = new Configuration();
        conf.set("mapreduce.framework.name", "yarn");
        conf.set("yarn.resourcemanager.hostname", "mini01");
        conf.set("fs.defaultFS", "hdfs://mini01:9000/");
//        local mode is the default
//        conf.set("mapreduce.framework.name", "local");
//        conf.set("mapreduce.jobtracker.address", "local");
//        conf.set("fs.defaultFS", "file:///");

        Job wcjob = Job.getInstance(conf);

        wcjob.setJar("F:/myWorkPlace/java/dubbo/demo/dubbo-demo/mr-demo1/target/mr.demo-1.0-SNAPSHOT.jar");
        // picking the jar up from the local classpath does not work here, so setJar is used instead of setJarByClass
//        wcjob.setJarByClass(Rjoin.class);

        wcjob.setMapperClass(inverIndexStepTwoMapper.class);
        wcjob.setReducerClass(inverIndexStepTwoReducer.class);

        // output key/value types of our Mapper
        wcjob.setMapOutputKeyClass(Text.class);
        wcjob.setMapOutputValueClass(Text.class);

        // output key/value types of our Reducer
        wcjob.setOutputKeyClass(Text.class);
        wcjob.setOutputValueClass(Text.class);

        // if no InputFormat is set, TextInputFormat is used by default
//        wcjob.setInputFormatClass(CombineFileInputFormat.class);
//        CombineFileInputFormat.setMaxInputSplitSize(wcjob, 4194304);
//        CombineFileInputFormat.setMinInputSplitSize(wcjob, 2097152);

        FileSystem fs = FileSystem.get(new URI("hdfs://mini01:9000"), new Configuration(), "root");
        Path path = new Path("hdfs://mini01:9000/wc/index/steptwo");
        if (fs.exists(path)) {
            fs.delete(path, true);
        }

        // the input of step two is the output of step one
//        FileInputFormat.setInputPaths(wcjob, new Path("hdfs://mini01:9000/input/index"));
        FileInputFormat.setInputPaths(wcjob, new Path("hdfs://mini01:9000/wc/index/stepone"));
        // location where the final inverted index is saved
//        FileOutputFormat.setOutputPath(wcjob, new Path("hdfs://mini01:9000/wc/index/stepone"));
        FileOutputFormat.setOutputPath(wcjob, new Path("hdfs://mini01:9000/wc/index/steptwo"));

        boolean res = wcjob.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
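The two classes above are submitted as two separate jobs. If you prefer to launch them from a single driver, a minimal sketch is shown below; it reuses the mapper and reducer classes from the two steps but runs in local mode, and the class name InverIndexDriver as well as the relative paths input/index, output/stepone, and output/steptwo are assumptions made here, not part of the original code.

package inverIndex;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Hypothetical driver (not in the original post) that runs the two steps
 * back to back in local mode; step two starts only if step one succeeds.
 * The output directories must not exist before running.
 */
public class InverIndexDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration(); // local mode by default

        Job stepOne = Job.getInstance(conf, "inverted-index-step-one");
        stepOne.setJarByClass(InverIndexDriver.class);
        stepOne.setMapperClass(inverIndexStepOne.InverIndexStepOneMapper.class);
        stepOne.setReducerClass(inverIndexStepOne.InverIndexStepOneReducer.class);
        stepOne.setMapOutputKeyClass(Text.class);
        stepOne.setMapOutputValueClass(IntWritable.class);
        stepOne.setOutputKeyClass(Text.class);
        stepOne.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(stepOne, new Path("input/index"));
        FileOutputFormat.setOutputPath(stepOne, new Path("output/stepone"));

        if (!stepOne.waitForCompletion(true)) {
            System.exit(1); // do not start step two if step one failed
        }

        Job stepTwo = Job.getInstance(conf, "inverted-index-step-two");
        stepTwo.setJarByClass(InverIndexDriver.class);
        stepTwo.setMapperClass(inverIndexStepTwo.inverIndexStepTwoMapper.class);
        stepTwo.setReducerClass(inverIndexStepTwo.inverIndexStepTwoReducer.class);
        stepTwo.setMapOutputKeyClass(Text.class);
        stepTwo.setMapOutputValueClass(Text.class);
        stepTwo.setOutputKeyClass(Text.class);
        stepTwo.setOutputValueClass(Text.class);
        // step two reads the output of step one
        FileInputFormat.setInputPaths(stepTwo, new Path("output/stepone"));
        FileOutputFormat.setOutputPath(stepTwo, new Path("output/steptwo"));

        System.exit(stepTwo.waitForCompletion(true) ? 0 : 1);
    }
}

The only essential point is that step two must not start until step one has finished writing its output.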

Running these two jobs in sequence produces the inverted index shown above.
