MapReduce实现基本SQL操作的原理-join和group by，以及Distinct

来源：互联网发布：mac air wi-fi连电视上编辑：程序博客网时间：2024/06/05 15:46

感谢作者做的那么清晰易懂

http://blog.csdn.net/sn_zzy/article/details/43446027

Group By原理
map阶段
把需要group by的多个字段组合变成一个key
reduce阶段
对组合的新key进行count
distinct原理
select dealid, count(distinct uid) num from order group by dealid;
map阶段
按照dealid+uid作为一个key进行map,然后对partition key进行shuffle
reduce阶段
把原来map中的key拆开，把dealid作为新的key，然后对新的key进行reduce

package mapreducelearn;

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * Word-count job that illustrates how a SQL {@code GROUP BY ... COUNT(*)} maps
 * onto MapReduce: the mapper emits the grouping key (here, a word) together
 * with a partial count, and the reducer sums the partial counts per key.
 */
public class GroupByWordCount {

    /**
     * Tokenizes each input value on whitespace and emits one
     * {@code (word, occurrences-in-this-value)} pair per distinct word.
     *
     * <p>Example: for the input value {@code "java hello java"} the mapper
     * emits {@code (java, 2)} and {@code (hello, 1)}. The emission order
     * follows {@link HashMap} iteration order and is therefore unspecified.
     */
    public static class TokenizerMapper extends
            Mapper<Object, Text, Text, IntWritable> {

        /** Reusable output key; refilled before every {@code context.write}. */
        private final Text word = new Text();

        /** Reusable output value; refilled before every {@code context.write}. */
        private final IntWritable count = new IntWritable();

        /**
         * Counts the tokens of a single input value and writes the counts.
         *
         * @param key     input key; not used by this mapper (presumably the
         *                byte offset of the line when the default text input
         *                format is in effect -- confirm against the job setup)
         * @param value   the text to tokenize
         * @param context sink for the {@code (word, count)} pairs
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if writing to the context is interrupted
         */
        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Key the per-value tally by immutable String. Using the reusable,
            // mutable Text instance as the map key would let every later
            // word.set(...) rewrite the keys already stored in the map, so all
            // emitted keys would end up being the last token.
            Map<String, Integer> counts = new HashMap<String, Integer>();
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                String token = itr.nextToken();
                Integer seen = counts.get(token);
                counts.put(token, seen == null ? 1 : seen + 1);
            }

            for (Map.Entry<String, Integer> entry : counts.entrySet()) {
                word.set(entry.getKey());
                count.set(entry.getValue());
                context.write(word, count);
            }
        }
    }

    /**
     * Writable carrying a type tag plus a payload string.
     *
     * <p>Not referenced by the job configured in {@link #main(String[])}.
     */
    static class UserAndPostWritable implements Writable {

        /** Record type: "U" denotes a user, "P" denotes a post. */
        private String type;

        /** Payload associated with the record. */
        private String data;

        /** No-arg constructor; leaves both fields {@code null}. */
        public UserAndPostWritable() {
        }

        /**
         * @param type record type tag ("U" for user, "P" for post)
         * @param data payload string
         */
        public UserAndPostWritable(String type, String data) {
            this.type = type;
            this.data = data;
        }

        /** @return the record type tag */
        public String getType() {
            return type;
        }

        /** @param type the record type tag to store */
        public void setType(String type) {
            this.type = type;
        }

        /** @return the payload string */
        public String getData() {
            return data;
        }

        /** @param data the payload string to store */
        public void setData(String data) {
            this.data = data;
        }

        /**
         * Reads {@code type} then {@code data}, in the same order
         * {@link #write(DataOutput)} emits them.
         *
         * @param input stream to read from
         * @throws IOException if reading fails
         */
        @Override
        public void readFields(DataInput input) throws IOException {
            type = input.readUTF();
            data = input.readUTF();
        }

        /**
         * Writes {@code type} then {@code data} as modified-UTF-8 strings.
         *
         * <p>NOTE(review): both fields are expected to be non-null here; an
         * instance created with the no-arg constructor must have them set
         * before being serialized.
         *
         * @param output stream to write to
         * @throws IOException if writing fails
         */
        @Override
        public void write(DataOutput output) throws IOException {
            output.writeUTF(type);
            output.writeUTF(data);
        }
    }

    /**
     * Sums all integer values that share a key and emits {@code (key, sum)}.
     * Registered in {@link #main(String[])} as both the combiner and the
     * reducer.
     *
     * <p>Example: the input {@code (java, [2, 1])} produces {@code (java, 3)}.
     */
    public static class IntSumReducer extends
            Reducer<Text, IntWritable, Text, IntWritable> {

        /** Reusable output value; overwritten on every call to reduce. */
        private final IntWritable result = new IntWritable();

        /**
         * @param key     the word being aggregated
         * @param values  partial counts for {@code key}
         * @param context sink for the {@code (key, total)} pair
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if writing to the context is interrupted
         */
        @Override
        public void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    /**
     * Configures and runs the job.
     *
     * <p>After generic Hadoop options are stripped, the remaining arguments
     * are one or more input paths followed by exactly one output path. The
     * JVM exits with status 2 on a usage error, 0 when the job succeeds and
     * 1 when it fails.
     *
     * @param args command-line arguments
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Left from the original author, presumably for running locally on
        // Windows -- confirm before enabling:
        // System.setProperty("hadoop.home.dir",
        // "D:/linux/hadoop-2.6.4/hadoop-2.6.4");
        String[] otherArgs = new GenericOptionsParser(conf, args)
                .getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("Usage: wordcount <in> [<in>...] <out>");
            System.exit(2);
        }

        // Job.getInstance replaces the deprecated Job(Configuration, String)
        // constructor.
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(GroupByWordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Every argument except the last is an input path.
        int outputIndex = otherArgs.length - 1;
        for (int i = 0; i < outputIndex; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[outputIndex]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

阅读全文

0 0