The Third MapReduce Program ---- Inverted Index (inverseindex)


The job is split into two steps: step one counts how many times each word occurs in each file, and step two regroups those per-file counts by word to produce the final inverted index.
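Sketched from the code below, the key/value flow through the two jobs looks like this (file names and counts taken from the test run further down):

Step 1 map:    (offset, "hello hello ...")        ->  <hello--a.txt, 1>
Step 1 reduce: <hello--a.txt, {1,1,1}>            ->  <hello--a.txt, 3>
Step 2 map:    (offset, "hello--a.txt\t3")        ->  <hello, a.txt--3>
Step 2 reduce: <hello, {a.txt--3, b.txt--2, ...}> ->  <hello, "a.txt--3 b.txt--2 ...">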

Code for step one

package club.drguo.mapreduce.inverseindex;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Step one of building the inverted index.
 *
 * @author guo
 */
// club.drguo.mapreduce.inverseindex.InverseIndexStepOne
public class InverseIndexStepOne {
	// Input:  LongWritable (byte offset of the line), Text (the whole line)
	// Output: Text ("hello--a.txt"), LongWritable (occurrence count)
	public static class InverseIndexStepOneMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
		private Text k = new Text();
		private LongWritable v = new LongWritable();

		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			// Read one line of input
			String line = value.toString();
			// Split it into words
			String[] words = StringUtils.split(line, " ");
			// We first need to know which file each word came from (word--x.txt),
			// so fetch the input split this mapper call is processing
			FileSplit inputSplit = (FileSplit) context.getInputSplit();
			// Get the file name from the split's path
			String fileName = inputSplit.getPath().getName();
			// Emit kv pairs of the form <hello--a.txt, 1>
			for (String word : words) {
				k.set(word + "--" + fileName);
				v.set(1);
				context.write(k, v);
			}
		}
	}

	public static class InverseIndexStepOneReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
		private LongWritable v = new LongWritable();

		// Incoming data: <hello--a.txt, {1,1,1...}>
		@Override
		protected void reduce(Text key, Iterable<LongWritable> values, Context context)
				throws IOException, InterruptedException {
			// Accumulate over the values
			long count = 0;
			for (LongWritable value : values) {
				count += value.get();
			}
			v.set(count);
			context.write(key, v); // the key stays as it came in (hello--a.txt)
		}
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job_stepOne = Job.getInstance(conf);
		job_stepOne.setJarByClass(InverseIndexStepOne.class);

		job_stepOne.setMapperClass(InverseIndexStepOneMapper.class);
		job_stepOne.setReducerClass(InverseIndexStepOneReducer.class);

		job_stepOne.setOutputKeyClass(Text.class);
		job_stepOne.setOutputValueClass(LongWritable.class);

		FileInputFormat.setInputPaths(job_stepOne, new Path(args[0]));

		// If the output directory already exists, delete it first
		FileSystem fileSystem = FileSystem.get(conf);
		Path output = new Path(args[1]);
		if (fileSystem.exists(output)) {
			fileSystem.delete(output, true); // true = delete recursively
		}
		FileOutputFormat.setOutputPath(job_stepOne, output);

		System.exit(job_stepOne.waitForCompletion(true) ? 0 : 1);
	}
}
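Because step one's reduce is a plain sum, which is associative and commutative, the same reducer class could also be registered as a combiner to pre-aggregate map output and shrink the shuffle. This is an optional addition, not part of the original code; a one-line sketch for main():

// Optional: reuse the reducer as a combiner (valid here because its
// input and output types match and summation can be applied partially).
job_stepOne.setCombinerClass(InverseIndexStepOneReducer.class);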

After exporting the jar, upload the test data and run step one:

guo@guo:~$ hdfs dfs -mkdir /data/inverseindex
guo@guo:~$ hdfs dfs -put /home/guo/a.txt b.txt c.txt /data/inverseindex
guo@guo:~$ hdfs dfs -ls /data/inverseindex
Found 3 items
-rw-r--r--   1 guo supergroup         35 2016-03-20 16:11 /data/inverseindex/a.txt
-rw-r--r--   1 guo supergroup         37 2016-03-20 16:11 /data/inverseindex/b.txt
-rw-r--r--   1 guo supergroup         38 2016-03-20 16:11 /data/inverseindex/c.txt
guo@guo:~$ hadoop jar /home/guo/inverseindex.jar club.drguo.mapreduce.inverseindex.InverseIndexStepOne /data/inverseindex /data/output/inverseindex
Check the result:

guo@guo:~$ hdfs dfs -cat /data/output/inverseindex/*
hadoop--b.txt	1
hadoop--c.txt	3
hello--a.txt	3
hello--b.txt	2
hello--c.txt	1
map--a.txt	1
map--b.txt	1
map--c.txt	1
reduce--a.txt	1
reduce--b.txt	2
reduce--c.txt	1
world--a.txt	1
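Note the separator in this output: the default TextOutputFormat writes each record as the key, a tab, then the value. Step two's mapper relies on exactly that, splitting each line on "\t" to recover the word--file key and the count.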

Code for step two

package club.drguo.mapreduce.inverseindex;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// club.drguo.mapreduce.inverseindex.InverseIndexStepTwo
public class InverseIndexStepTwo {
	// K: byte offset of the line  V: one line of step-one output, e.g. "hello--a.txt<TAB>3"
	public static class InverseIndexStepTwoMapper extends Mapper<LongWritable, Text, Text, Text> {
		private Text k = new Text();
		private Text v = new Text();

		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			String line = value.toString();
			// Split out the fields
			String[] strings = StringUtils.split(line, "\t");
			String wordAndFile = strings[0];
			long count = Long.parseLong(strings[1]);
			String[] wordAndFileName = StringUtils.split(wordAndFile, "--");
			String word = wordAndFileName[0];
			String fileName = wordAndFileName[1];
			// Emit the word as key and file--count as value: <hello, a.txt--3>
			k.set(word);
			v.set(fileName + "--" + count);
			context.write(k, v);
		}
	}

	public static class InverseIndexStepTwoReducer extends Reducer<Text, Text, Text, Text> {
		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			// Incoming data: <hello, {a.txt--3, b.txt--2, ...}>
			String result = "";
			for (Text value : values) {
				result += value + " ";
			}
			// Output  K: hello  V: a.txt--3 b.txt--2 ...
			context.write(key, new Text(result));
		}
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job_stepTwo = Job.getInstance(conf);
		job_stepTwo.setJarByClass(InverseIndexStepTwo.class);

		job_stepTwo.setMapperClass(InverseIndexStepTwoMapper.class);
		job_stepTwo.setReducerClass(InverseIndexStepTwoReducer.class);

		job_stepTwo.setOutputKeyClass(Text.class);
		job_stepTwo.setOutputValueClass(Text.class);

		FileInputFormat.setInputPaths(job_stepTwo, new Path(args[0]));

		// If the output directory already exists, delete it first
		FileSystem fileSystem = FileSystem.get(conf);
		Path output = new Path(args[1]);
		if (fileSystem.exists(output)) {
			fileSystem.delete(output, true); // true = delete recursively
		}
		FileOutputFormat.setOutputPath(job_stepTwo, output);

		System.exit(job_stepTwo.waitForCompletion(true) ? 0 : 1);
	}
}
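One detail worth noting: the reducer concatenates with result += value + " ", which allocates a new String on every iteration. A hedged alternative for the loop, using StringBuilder (behaviorally equivalent, including the trailing space):

// Alternative reducer body: StringBuilder avoids repeated String
// allocation when a word appears in many files.
StringBuilder result = new StringBuilder();
for (Text value : values) {
	result.append(value.toString()).append(' ');
}
context.write(key, new Text(result.toString()));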

After exporting the jar, run step two, using step one's output as the input:

guo@guo:~$ hadoop jar /home/guo/inverseindex2.jar club.drguo.mapreduce.inverseindex.InverseIndexStepTwo /data/output/inverseindex /data/output/inverseindex2
Check the result:

guo@guo:~$ hdfs dfs -cat /data/output/inverseindex2/*
hadoop	c.txt--3 b.txt--1 
hello	c.txt--1 b.txt--2 a.txt--3 
map	c.txt--1 b.txt--1 a.txt--1 
reduce	c.txt--1 b.txt--2 a.txt--1 
world	a.txt--1 
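The index can now answer "which files contain word X, and how often" with a single line scan. A minimal lookup sketch, assuming the final output has been copied locally (the class name IndexLookup and the file name index.txt are hypothetical, e.g. after hdfs dfs -get /data/output/inverseindex2/part-r-00000 index.txt):

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

public class IndexLookup {
	public static void main(String[] args) throws IOException {
		String word = args[0]; // e.g. "hello"
		try (BufferedReader reader = new BufferedReader(new FileReader("index.txt"))) {
			String line;
			while ((line = reader.readLine()) != null) {
				// Each line is "word<TAB>file--count file--count ..."
				String[] parts = line.split("\t", 2);
				if (parts[0].equals(word)) {
					System.out.println(word + " appears in: " + parts[1]);
					return;
				}
			}
		}
		System.out.println(word + " not found");
	}
}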

