MapReduce 练习三 文件倒排

来源:互联网 发布:linux使用gdb 编辑:程序博客网 时间:2024/06/06 13:12
/*** * MapReduce 练习 *  文件倒排 */public class Inverted {    public static class InvertedMapper extends Mapper<LongWritable,Text,Text,Text>{        private FileSplit fileSplit;        private Text keyInfo=new Text();        private Text valueInfo=new Text();        //单词:地址,词频        @Override        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {            //获得<key,value>对,所属的FileSplit对象            fileSplit = (FileSplit) context.getInputSplit();            //获得所属文件名            String name = fileSplit.getPath().getName();            //StringTokenizer是用来把字符串截取成一个个标记或者单词,默认空格,多个空格(\t\n\r)等            StringTokenizer str=new StringTokenizer(value.toString());            //key:单词和filename  value:词频            while(str.hasMoreTokens()){                keyInfo.set(str.nextToken()+":"+name);                valueInfo.set("1");                context.write(keyInfo,valueInfo);            }        }    }    //Combiner    public static class InvertedComber extends Reducer<Text,Text,Text,Text>{        private Text valueInfo=new Text();        //单词 filename:词频        @Override        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {            int count=0;            //统计词频            for (Text v:values){                count+=Integer.parseInt(v.toString());            }            //key以:分数组  -- key:单词 filename            String[] split = key.toString().split(":");            valueInfo.set(split[1]+" : "+count);            context.write(new Text(split[0]),valueInfo);        }    }    //Reduce    public static class InvertedReduce extends  Reducer<Text,Text,Text,Text>{        @Override        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {            //生成文档列表            String file=new String();            for (Text v:values){                file+=v.toString()+";";            }            context.write(key,new Text(file));        }    }    public static void main(String [] args) throws IOException, ClassNotFoundException, InterruptedException {        Configuration conf=new Configuration();        Job job= Job.getInstance(conf,"Inverted");        job.setJarByClass(Inverted.class);        //实现map函数,根据输入的<key,value> 生成中间结果        job.setMapperClass(InvertedMapper.class);        job.setCombinerClass(InvertedComber.class);        job.setReducerClass(InvertedReduce.class);        //指定map输出的<key,value>类型        job.setMapOutputKeyClass(Text.class);        job.setMapOutputValueClass(Text.class);        //指定最终输出的<key,value>类型        job.setOutputKeyClass(Text.class);        job.setOutputValueClass(Text.class);        //路径        FileInputFormat.setInputPaths(job,new Path(args[0]));        FileOutputFormat.setOutputPath(job,new Path(args[1]));        System.exit(job.waitForCompletion(true)?0:1);    }
原创粉丝点击