Hadoop中的自定义分区

来源:互联网 发布:游戏数据分析师薪酬 编辑:程序博客网 时间:2024/06/01 10:30

笔者在学习Hadoop大数据实践时了解到,可以通过继承Partitioner类来自定义分区,把map输出的结果按照key划分到不同的reduce任务中进行汇总。默认情况下只有一个reduce任务,所有结果都落在同一个分区里;通过自定义Partitioner并设置多个reduce任务,就可以把结果划分到不同的分区,方便快捷。需要注意的是,自定义分区必须通过hadoop jar的方式来运行。下面通过一个例子来说明如何使用Partitioner。

有一个sort.txt文件,其中每一行的两个数字分别表示长和宽。需要按照面积的大小进行排序,所以这里还要自定义排序规则;排序完成后,再根据是正方形还是长方形,将其输送到不同的reduce中进行汇总计算。

首先是sort.txt文件,其中用tab来分隔。

1	1
9	9
4	5
7	8

然后是自定义排序

class RectangleWritable implements WritableComparable  //实现了接口{    int length,width;    public RectangleWritable()    {        super();    }    public RectangleWritable(int length, int width)    {        super();        this.length = length;        this.width = width;    }    public int getLength()    {        return length;    }    public int getWidth()    {        return width;    }    public void setLength(int length)    {        this.length = length;    }    public void setWidth(int width)    {        this.width = width;    }    public int compareTo(Object o)  //自定义排序规则    {  //就是按照长*宽得到的长方形的面积大小来进行排序        RectangleWritable to = (RectangleWritable)o;        if(this.getLength() * this.getWidth() > to.getLength() * to.getWidth())            return 1;        else if(this.getLength() * this.getWidth() < to.getLength() * to.getWidth())            return -1;        else             return 0;    }    public void write(DataOutput out) throws IOException  //序列化,保证序列化和反序列化的时候,长宽的顺序不能搞乱    {        out.writeInt(length);        out.writeInt(width);    }    public void readFields(DataInput in) throws IOException  //反序列化    {        this.length = in.readInt();        this.width = in.readInt();    }}
然后是map代码

public static class MyMapper extends Mapper<LongWritable,Text,RectangleWritable,NullWritable>    {        public void map(LongWritable k1,Text v1,Context context) throws IOException, InterruptedException        {            String[] splits = v1.toString().split("\t");            RectangleWritable k2 = new RectangleWritable(Integer.parseInt(splits[0]),Integer.parseInt(splits[1]));            context.write(k2,NullWritable.get());        }    }


那么在完成map之后,就需要通过自定义分区,将正方形放置到part-r-00000文件中,将长方形放置到part-r-00001文件中,新建两个不同的分区。

class MyPartitioner extends Partitioner<RectangleWritable,NullWritable>{    public int getPartition(RectangleWritable k2, NullWritable v2,                            int numReduceTasks)   //这里的两个参数分别是key和value    {        if(k2.getLength() == k2.getWidth())   //正方形在任务0中汇总            return 0;        else                        //长方形在任务1中汇总            return 1;    }}
接下来是reduce汇总程序:

public static class MyReducer extends Reducer<RectangleWritable,NullWritable,IntWritable,IntWritable>    {        public void reduce(RectangleWritable k2,Iterable<NullWritable> v2s,Context context) throws IOException, InterruptedException        {            context.write(new IntWritable(k2.getLength()),new IntWritable(k2.getWidth()));        }    }

然后是main代码,可以在此写入自定义划分

public static void main(String[] args) throws Throwable,URISyntaxException    {        //String uri = args[0];        Configuration conf = new Configuration();        /*FileSystem fs = FileSystem.get(new URI(uri),conf);        if(fs.exists(new Path(args[1])))    //先判断输出路径是否已经存在,如果存在,那么就将循环地删除此文件路径            fs.delete(new Path(args[1]),true);*/        Job job = Job.getInstance(conf,"RectangleSort");        job.setJarByClass(RectangleSort.class);        job.setMapperClass(RectangleSort.MyMapper.class);        job.setMapOutputKeyClass(RectangleWritable.class);        job.setMapOutputValueClass(NullWritable.class);        job.setReducerClass(RectangleSort.MyReducer.class);        job.setOutputKeyClass(IntWritable.class);        job.setOutputValueClass(IntWritable.class);        FileInputFormat.addInputPath(job,new Path(args[0]));        FileOutputFormat.setOutputPath(job,new Path(args[1]));        job.setPartitionerClass(MyPartitioner.class);  //使用自定义分区,用来对map之后的任务进行汇总        job.setNumReduceTasks(2);     //设置两个不同的reduce任务        Boolean b = job.waitForCompletion(true);        if(!b)        {            System.err.println("failed");        }        else            System.out.println("finished!");    }
注意,自定义分区必须用hadoop jar的形式来运行,运行结果如下:
sunwangdongMacBook-Pro:sbin sunwangdong$ hdfs dfs -ls /data/rectangle/out
17/07/15 14:34:10 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 3 items
-rw-r--r--   1 sunwangdong supergroup          0 2017-07-15 14:03 /data/rectangle/out/_SUCCESS
-rw-r--r--   1 sunwangdong supergroup          8 2017-07-15 14:03 /data/rectangle/out/part-r-00000
-rw-r--r--   1 sunwangdong supergroup          8 2017-07-15 14:03 /data/rectangle/out/part-r-00001

可以看到输出目录中出现了part-r-00000和part-r-00001两个文件,分别存放正方形和长方形。

sunwangdongMacBook-Pro:sbin sunwangdong$ hdfs dfs -cat /data/rectangle/out/p*0
17/07/15 14:35:12 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
1	1
9	9

在part-r-00000中存放的就是正方形,而在part-r-00001中存放的就是长方形

sunwangdongMacBook-Pro:sbin sunwangdong$ hdfs dfs -cat /data/rectangle/out/p*1
17/07/15 14:35:56 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
4	5
7	8

完整代码如下:

package com.sunwangdong.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.URISyntaxException;

/**
 * Created by sunwangdong on 2017/7/2.
 *
 * <p>Sorts rectangles by area and writes squares and non-square rectangles
 * to separate reduce outputs by means of a custom partitioner.
 */
public class RectangleSort
{
    /**
     * Turns each input line of the form "length width" into a
     * {@link RectangleWritable} key with a {@link NullWritable} value.
     */
    public static class MyMapper extends Mapper<LongWritable,Text,RectangleWritable,NullWritable>
    {
        /**
         * @param k1      the input key supplied by the input format (not used here)
         * @param v1      one line of input text
         * @param context the context used to emit the rectangle key
         * @throws IOException if the line does not hold two integer fields
         */
        @Override
        public void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException
        {
            String line = v1.toString().trim();
            if (line.isEmpty())
            {
                // A blank line carries no rectangle; skip it instead of failing the task.
                return;
            }

            // The sample file is tab-separated; any run of whitespace is accepted as well.
            String[] splits = line.split("\\s+");
            if (splits.length < 2)
            {
                throw new IOException("Expected two fields (length, width) but got: " + line);
            }

            int length;
            int width;
            try
            {
                length = Integer.parseInt(splits[0]);
                width = Integer.parseInt(splits[1]);
            }
            catch (NumberFormatException e)
            {
                // Keep the cause and include the offending line so the bad record can be found.
                throw new IOException("Length and width must be integers: " + line, e);
            }

            context.write(new RectangleWritable(length, width), NullWritable.get());
        }
    }

    /**
     * Writes every rectangle it receives as a (length, width) pair.
     */
    public static class MyReducer extends Reducer<RectangleWritable,NullWritable,IntWritable,IntWritable>
    {
        /**
         * @param k2      the rectangle key of the current group
         * @param v2s     one placeholder value per map output record in the group
         * @param context the context used to emit (length, width)
         */
        @Override
        public void reduce(RectangleWritable k2, Iterable<NullWritable> v2s, Context context) throws IOException, InterruptedException
        {
            // Writing once per value (rather than once per call) keeps duplicate
            // input records in the sorted output.
            for (NullWritable ignored : v2s)
            {
                context.write(new IntWritable(k2.getLength()), new IntWritable(k2.getWidth()));
            }
        }
    }

    /**
     * Configures and runs the RectangleSort job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     *             (NOTE(review): Hadoop normally requires that the output path does not exist yet)
     * @throws IOException            if the job cannot be created or submitted
     * @throws InterruptedException   if waiting for the job is interrupted
     * @throws ClassNotFoundException if a job class cannot be loaded
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException
    {
        if (args.length < 2)
        {
            System.err.println("Usage: RectangleSort <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "RectangleSort");
        job.setJarByClass(RectangleSort.class);

        job.setMapperClass(RectangleSort.MyMapper.class);
        job.setMapOutputKeyClass(RectangleWritable.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setReducerClass(RectangleSort.MyReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Use the custom partitioner to decide which reduce task receives each key.
        job.setPartitionerClass(MyPartitioner.class);
        // Two reduce tasks: one output file per partition.
        job.setNumReduceTasks(2);

        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded)
        {
            System.err.println("failed");
            // Report the failure to the calling shell instead of exiting with status 0.
            System.exit(1);
        }
        System.out.println("finished!");
    }
}

/**
 * A rectangle described by an integer length and width, used as a MapReduce key.
 *
 * <p>Keys are ordered by area (length * width). Rectangles with the same area are
 * further ordered by length and then by width, so two different rectangles never
 * compare as equal and are never merged into the same reduce group.
 */
class RectangleWritable implements WritableComparable<RectangleWritable>
{
    private int length;
    private int width;

    /** No-argument constructor; the fields are populated later by {@link #readFields}. */
    public RectangleWritable()
    {
    }

    /**
     * @param length the length of the rectangle
     * @param width  the width of the rectangle
     */
    public RectangleWritable(int length, int width)
    {
        this.length = length;
        this.width = width;
    }

    public int getLength()
    {
        return length;
    }

    public int getWidth()
    {
        return width;
    }

    public void setLength(int length)
    {
        this.length = length;
    }

    public void setWidth(int width)
    {
        this.width = width;
    }

    /** Computes the area in {@code long} arithmetic so large sides cannot overflow. */
    private long area()
    {
        return (long) length * width;
    }

    /**
     * Custom sort rule: smaller area first; ties are broken by length, then width.
     *
     * @param other the rectangle to compare against
     * @return a negative, zero or positive value when this rectangle sorts before,
     *         equal to, or after {@code other}
     */
    @Override
    public int compareTo(RectangleWritable other)
    {
        int byArea = Long.compare(area(), other.area());
        if (byArea != 0)
        {
            return byArea;
        }
        int byLength = Integer.compare(length, other.length);
        if (byLength != 0)
        {
            return byLength;
        }
        return Integer.compare(width, other.width);
    }

    /** Two rectangles are equal when both length and width match (consistent with compareTo). */
    @Override
    public boolean equals(Object obj)
    {
        if (this == obj)
        {
            return true;
        }
        if (!(obj instanceof RectangleWritable))
        {
            return false;
        }
        RectangleWritable other = (RectangleWritable) obj;
        return length == other.length && width == other.width;
    }

    @Override
    public int hashCode()
    {
        return 31 * length + width;
    }

    /**
     * Serializes the rectangle. The field order (length, then width) must match
     * the order used in {@link #readFields}.
     */
    @Override
    public void write(DataOutput out) throws IOException
    {
        out.writeInt(length);
        out.writeInt(width);
    }

    /** Deserializes the rectangle, reading the fields in the order they were written. */
    @Override
    public void readFields(DataInput in) throws IOException
    {
        this.length = in.readInt();
        this.width = in.readInt();
    }
}

/**
 * Routes squares to partition 0 and all other rectangles to partition 1.
 */
class MyPartitioner extends Partitioner<RectangleWritable,NullWritable>
{
    private static final int SQUARE_PARTITION = 0;
    private static final int RECTANGLE_PARTITION = 1;

    /**
     * @param k2             the map output key (the rectangle)
     * @param v2             the map output value (not used here)
     * @param numReduceTasks the number of partitions available
     * @return 0 for a square, 1 for any other rectangle; always 0 when fewer
     *         than two partitions are available
     */
    @Override
    public int getPartition(RectangleWritable k2, NullWritable v2,
                            int numReduceTasks)
    {
        if (numReduceTasks < 2)
        {
            // Only partition 0 exists, so index 1 would be out of range.
            return SQUARE_PARTITION;
        }
        if (k2.getLength() == k2.getWidth())
        {
            return SQUARE_PARTITION;       // squares are collected by task 0
        }
        return RECTANGLE_PARTITION;        // other rectangles are collected by task 1
    }
}