MapReduce Applications


1. Word Count

  • Count the frequency of each word in a file; the output must be sorted alphabetically by word
  • Each word and its frequency occupy one line, with whitespace between the word and the frequency

Design approach:
Split the file content into words, gather all identical words together, then count how many times each word appears and output the result. The counting can be parallelized: all occurrences of the same word are sent to one machine, which computes that word's frequency.
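As a rough illustration (the sample text below is made up, not from the original), a single input line flows through the job like this:

    input line:     hello world hello hadoop
    map output:     <hello,1> <world,1> <hello,1> <hadoop,1>
    after shuffle:  <hadoop,[1]> <hello,[1,1]> <world,[1]>
    reduce output:  hadoop 1, hello 2, world 1   (keys arrive in alphabetical order)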

package hadoop;

/*
 * Count the frequency of each word in a file; the output is sorted
 * alphabetically by word. Each word and its frequency occupy one line,
 * separated by whitespace.
 */
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

    // Mapper with input type <Object, Text> and output type <Text, IntWritable>
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        // "one" represents a single occurrence of a word
        private final static IntWritable one = new IntWritable(1);
        // "word" stores the current token
        private Text word = new Text();

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // tokenize the input line into words
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                // store the token in word and emit <word, 1>
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    // Reducer with input type <Text, IntWritable> and output type <Text, IntWritable>
    public static class IntSumReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        // "result" holds the frequency of a word
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            // sum the values of the received <key, value-list> pair
            for (IntWritable val : values) {
                sum += val.get();
            }
            // store the frequency in result and collect it
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // validate the command-line arguments
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "word count");
        // configure the job's classes
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReduce.class);
        job.setReducerClass(IntSumReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
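Assuming the class has been packaged into a jar (the jar name and HDFS paths below are placeholders, not from the original text), the job would typically be submitted and its result inspected like this:

    hadoop jar wordcount.jar hadoop.WordCount /user/hadoop/input /user/hadoop/output
    hdfs dfs -cat /user/hadoop/output/part-r-00000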

2. Data Deduplication

  • Deduplication applies the idea of parallelism to data filtering; counting the number of distinct values in a large data set and extracting access locations from website logs both involve data deduplication
  • Data deduplication means that data appearing more than once in the original input appears only once in the output file

Design approach:
1. In the map phase, copy the input value to the key of the output (the output value can be anything)
2. In the shuffle phase, the map output <key, value> pairs are grouped into <key, value-list> and handed to reduce
3. In the reduce phase, copy the input key directly to the output key and emit it (with the output value set to empty)

All records with the same value are sent to the same reduce task, so each distinct value is written to the final output exactly once.
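For instance (a made-up sample, not from the original text), if the input file contains

    2012-3-1 a
    2012-3-2 b
    2012-3-1 a
    2012-3-3 c

then every copy of "2012-3-1 a" is shuffled to the same reduce call, and the output keeps each distinct line exactly once:

    2012-3-1 a
    2012-3-2 b
    2012-3-3 c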

package hadoop;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class Dedup {

    // map copies the input value to the output key and emits it directly
    public static class Map extends Mapper<Object, Text, Text, Text> {
        private static Text line = new Text();

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            line = value;
            context.write(line, new Text(""));
        }
    }

    // reduce copies the input key to the output key, sets the value to empty, and emits it
    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, new Text(""));
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: dedup <in> <out>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "Data Deduplication");
        job.setJarByClass(Dedup.class);
        job.setMapperClass(Map.class);
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

3. Sorting

  • Each line of the input file contains one number, i.e., one data item. Each line of the output must contain two numbers separated by whitespace: the first is the rank and the second is the original data item

Design approach:
1. Use MapReduce's default sort: if the key is of type IntWritable (which wraps an int), MapReduce sorts keys in numerical order; if the key is of type Text (which wraps a String), MapReduce sorts keys in lexicographic order
2. In map, convert the data read in to IntWritable and emit it as the key, so the framework performs the sort
3. The sort does not need the map output to be merged, so no Combiner is required
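As a small worked example (made-up numbers), if the input lines are 32, 654, 32 and 15, the map phase emits <32,1>, <654,1>, <32,1>, <15,1>; after shuffle the reducer sees <15,[1]>, <32,[1,1]>, <654,[1]> in ascending key order, and the output is:

    1 15
    2 32
    3 32
    4 654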

package hadoop;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class Sort {

    // map converts the input value to IntWritable and emits it as the output key
    public static class Map extends Mapper<Object, Text, IntWritable, IntWritable> {
        private static IntWritable data = new IntWritable();

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            data.set(Integer.parseInt(line));
            context.write(data, new IntWritable(1));
        }
    }

    // reduce copies the input key to the output value;
    // the number of elements in value-list determines how many times the key is emitted;
    // the global counter linenum records the rank of the key (starting at 1)
    public static class Reduce extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
        private static IntWritable linenum = new IntWritable(1);

        public void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            for (IntWritable val : values) {
                context.write(linenum, key);
                linenum = new IntWritable(linenum.get() + 1);
            }
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: sort <in> <out>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "Sort");
        job.setJarByClass(Sort.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
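One caveat not spelled out above: the global counter linenum only produces a correct global rank when all keys go to a single reduce task (with several reducers, each would restart its own count). A minimal way to enforce this in the driver, using the standard Job API, is:

    job.setNumReduceTasks(1);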

Reference: 《Hadoop实战》 (Hadoop in Action)