Inverted Index


Create two data files and upload them to HDFS (a small upload sketch follows the file contents).
data

file edit
file file
view search

data2

abc
aa
edit edit
view file
file1 file1
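
The upload itself is normally done from the command line with hdfs dfs -put data data2 /input/. As a minimal sketch of the same step from Java, assuming the NameNode address hdfs://localhost:9000 that appears in the job output later and local files named data and data2:

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class Upload {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Connect to the (assumed) local HDFS instance
        FileSystem fs = FileSystem.get(URI.create("hdfs://localhost:9000"), conf);
        fs.mkdirs(new Path("/input"));
        // Copy the two local files into /input
        fs.copyFromLocalFile(new Path("data"), new Path("/input/data"));
        fs.copyFromLocalFile(new Path("data2"), new Path("/input/data2"));
        fs.close();
    }
}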

Requirement: for each word, list the file(s) it appears in and the number of times it occurs.

Approach:
How do we handle multiple input files? Call addInputPath(), either once per path or in a loop.
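
For example, a minimal sketch of both styles, using the same paths that the job below hard-codes in main():

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class MultiInputSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());

        // Option 1: one addInputPath() call per path, e.g. in a loop
        for (String in : new String[]{"/input/data", "/input/data2"}) {
            FileInputFormat.addInputPath(job, new Path(in));
        }

        // Option 2: a single call taking a comma-separated list of paths
        // FileInputFormat.addInputPaths(job, "/input/data,/input/data2");
    }
}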

How does the map function know which file a word came from? Cast context.getInputSplit() to FileSplit, and then you can call getPath().

What should the map use as key and value? Concatenate the word with the file path as the key, and use 1 as the value.
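
To make that key/value choice concrete, here is a small standalone sketch (not part of the job) that prints what the mapper will emit for the first line of /input/data, assuming the split path hdfs://localhost:9000/input/data:

import java.util.StringTokenizer;

public class MapOutputSketch {
    public static void main(String[] args) {
        String path = "hdfs://localhost:9000/input/data";
        // First line of the "data" file
        StringTokenizer token = new StringTokenizer("file edit");
        while (token.hasMoreTokens()) {
            // Each word becomes the key "word:path" with a count of 1
            System.out.println(token.nextToken() + ":" + path + "\t" + 1);
        }
        // Prints:
        // file:hdfs://localhost:9000/input/data	1
        // edit:hdfs://localhost:9000/input/data	1
    }
}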

package invartde;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Inverted {

    public static class Map extends Mapper<Object, Text, Text, IntWritable> {
        FileSplit split;

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // The input split tells us which file the current record came from
            split = (FileSplit) context.getInputSplit();
            String path = split.getPath().toString();
            String line = value.toString();
            StringTokenizer token = new StringTokenizer(line);
            while (token.hasMoreTokens()) {
                String word = token.nextToken();
                // Key is "word:path", value is a count of 1
                context.write(new Text(word + ":" + path), new IntWritable(1));
            }
        }
    }

    public static class Re extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum the 1s for each "word:path" key
            int count = 0;
            for (IntWritable v : values) {
                count += v.get();
            }
            context.write(key, new IntWritable(count));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String time = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
        args = new String[]{"/input/data", "/input/data2", "/output/" + time};

        Job job = Job.getInstance(conf);
        job.setJarByClass(Inverted.class);
        job.setMapperClass(Map.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setCombinerClass(Re.class);
        job.setReducerClass(Re.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Multiple input files: add each path separately
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileInputFormat.addInputPath(job, new Path(args[1]));
        FileOutputFormat.setOutputPath(job, new Path(args[2]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Output:

file1:hdfs://localhost:9000/input/data2	2
file:hdfs://localhost:9000/input/data	3
file:hdfs://localhost:9000/input/data2	1
search:hdfs://localhost:9000/input/data	1
view:hdfs://localhost:9000/input/data	1
view:hdfs://localhost:9000/input/data2	1

In the output above the same word appears on several lines, once per file. To merge those duplicates into a single line per word, you can define a custom key type. Note that in the program below the custom DataBean is defined as a WritableComparable, but the job still shuffles Text keys of the form "word--path"; the actual merging is done by the combiner (which re-keys by word) and the reducer (which concatenates the per-file counts).

package test1;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Test1 {

    // Custom key type holding a word and the path of the file it came from
    public static class DataBean implements WritableComparable<DataBean> {
        public String word;
        public String path;

        public DataBean() {
        }

        public DataBean(String word, String path) {
            this.word = word;
            this.path = path;
        }

        public String getWord() {
            return word;
        }

        public void setWord(String word) {
            this.word = word;
        }

        public String getPath() {
            return path;
        }

        public void setPath(String path) {
            this.path = path;
        }

        @Override
        public void write(DataOutput out) throws IOException {
            out.writeUTF(word);
            out.writeUTF(path);
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            // Must read the fields back with the same calls used in write()
            word = in.readUTF();
            path = in.readUTF();
        }

        @Override
        public int compareTo(DataBean o) {
            return word.compareTo(o.word);
        }

        @Override
        public String toString() {
            return word;
        }
    }

    public static class Map extends Mapper<LongWritable, Text, Text, Text> {
        FileSplit split;

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            split = (FileSplit) context.getInputSplit();
            String path = split.getPath().toString();
            StringTokenizer t = new StringTokenizer(value.toString());
            while (t.hasMoreTokens()) {
                String name = t.nextToken();
                DataBean bean = new DataBean(name, path);
                // Key is "word--path" so the combiner can count per (word, file) pair
                context.write(new Text(bean.word + "--" + bean.path), new Text("1"));
            }
        }
    }

    // Combiner: sums the 1s for each (word, file) pair and re-keys by word only.
    // Note: this relies on the combiner running for every map output group,
    // which Hadoop does not guarantee in general.
    public static class Combiner extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (Text i : values) {
                sum += Integer.parseInt(i.toString());
            }
            String[] str = key.toString().split("--");
            context.write(new Text(str[0]), new Text(str[1] + ";" + sum));
        }
    }

    // Reducer: concatenates the per-file "path;count" entries and appends the total
    public static class Re extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            String str = "";
            for (Text i : values) {
                str += i.toString();
                String s = i.toString().split(";")[1];
                sum += Integer.parseInt(s);
            }
            // "总数" means "total count"
            context.write(key, new Text(str + "总数:" + sum));
        }
    }

    public static void main(String[] args) throws Exception {
        String time = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
        args = new String[]{"/input/data", "/input/data2", "/output/" + time};

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(Test1.class);
        job.setMapperClass(Map.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setCombinerClass(Combiner.class);
        job.setReducerClass(Re.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileInputFormat.addInputPath(job, new Path(args[1]));
        FileOutputFormat.setOutputPath(job, new Path(args[2]));

        job.waitForCompletion(true);
    }
}

Result:

aa	hdfs://localhost:9000/input/data2;1总数:1
abc	hdfs://localhost:9000/input/data2;1总数:1
edit	hdfs://localhost:9000/input/data2;2hdfs://localhost:9000/input/data;1总数:3
file	hdfs://localhost:9000/input/data;3hdfs://localhost:9000/input/data2;1总数:4
file1	hdfs://localhost:9000/input/data2;2总数:2
search	hdfs://localhost:9000/input/data;1总数:1
view	hdfs://localhost:9000/input/data;1hdfs://localhost:9000/input/data2;1总数:2