Hadoop Custom Data Types and Custom Sorting


The first thing to understand is that in the Hadoop MapReduce framework a key must implement the WritableComparable interface, while a value only needs to implement the Writable interface. The two custom data types below illustrate this.

Suppose we need to compute per-port traffic statistics from a traffic log file. For this we define a traffic class.

package definyType;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class LiuliangTongji implements Writable {

    long upPackNum, downPackNum, upPayLoad, downPayLoad;

    public LiuliangTongji() {
    }

    @Override
    public String toString() {
        return "LiuliangTongji [upPackNum=" + upPackNum + "\tdownPackNum="
                + downPackNum + "\tupPayLoad=" + upPayLoad + "\tdownPayLoad="
                + downPayLoad + "]";
    }

    public LiuliangTongji(String upPackNum, String downPackNum,
            String upPayLoad, String downPayLoad) {
        super();
        this.upPackNum = Long.parseLong(upPackNum);
        this.downPackNum = Long.parseLong(downPackNum);
        this.upPayLoad = Long.parseLong(upPayLoad);
        this.downPayLoad = Long.parseLong(downPayLoad);
    }

    // Deserialization: read the fields back in the same order they were written
    @Override
    public void readFields(DataInput in) throws IOException {
        this.upPackNum = in.readLong();
        this.downPackNum = in.readLong();
        this.upPayLoad = in.readLong();
        this.downPayLoad = in.readLong();
    }

    // Serialization: write the fields to the output stream
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upPackNum);
        out.writeLong(downPackNum);
        out.writeLong(upPayLoad);
        out.writeLong(downPayLoad);
    }
}
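To see what the Writable contract buys us, here is a minimal, hypothetical round-trip sketch (not part of the original article): it writes a LiuliangTongji to a byte array with write() and reads it back with readFields(), which is essentially what Hadoop does when it moves values between map and reduce. It also shows why the no-argument constructor is required: the framework first creates an empty instance and then fills it via readFields().

package definyType;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

// Hypothetical helper, only to illustrate the Writable round trip.
public class WritableRoundTrip {
    public static void main(String[] args) throws Exception {
        LiuliangTongji original = new LiuliangTongji("1", "2", "30", "40");

        // Serialize with write(), just as Hadoop does before shuffling the value
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Deserialize with readFields() into a fresh, no-arg constructed object
        LiuliangTongji copy = new LiuliangTongji();
        copy.readFields(new DataInputStream(
                new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(copy); // fields match the original
    }
}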

The MapReduce job that produces and consumes these key-value pairs is designed as follows:

package definyType;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class LiuliangCount extends Configured implements Tool {

    // The four type parameters of Mapper matter: k1/v1 are the types read from
    // the input file, k2/v2 are the intermediate types handed to the Reducer.
    public static class Map extends
            Mapper<LongWritable, Text, Text, LiuliangTongji> {
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // key/value here come from the framework reading the input file;
            // map() turns them into intermediate key-value pairs for the Reducer
            String[] splits = value.toString().split("\t");
            LiuliangTongji lilTj = new LiuliangTongji(splits[1], splits[2],
                    splits[3], splits[4]);
            Text key2 = new Text(splits[0]);
            context.write(key2, lilTj);
        }
    }

    public static class Reduce extends
            Reducer<Text, LiuliangTongji, Text, LiuliangTongji> {
        public void reduce(Text key, Iterable<LiuliangTongji> values,
                Context context) throws IOException, InterruptedException {
            long upPackNum = 0L, downPackNum = 0L, upPayLoad = 0L, downPayLoad = 0L;
            for (LiuliangTongji val : values) {
                upPackNum += val.upPackNum;
                downPackNum += val.downPackNum;
                upPayLoad += val.upPayLoad;
                downPayLoad += val.downPayLoad;
            }
            LiuliangTongji v3 = new LiuliangTongji(upPackNum + "", downPackNum + "",
                    upPayLoad + "", downPayLoad + "");
            context.write(key, v3);
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path outpath = new Path(args[1]);
        FileSystem fileSystem = FileSystem.get(new URI(args[1]), conf);
        if (fileSystem.exists(outpath))
            fileSystem.delete(outpath, true);
        Job job = new Job(conf, "LiuliangCount");
        job.setJarByClass(LiuliangCount.class); // needed when submitting the job as a jar
        FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, outpath);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LiuliangTongji.class);
        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int ret = ToolRunner.run(new LiuliangCount(), args);
        System.exit(ret);
    }
}
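Because the reduce step only sums the four counters, the same Reduce class could in principle also be registered as a combiner to pre-aggregate map output locally. This is an optional optimization, not something the code above does; the sketch below shows the single extra line one would add in run():

// Optional: pre-aggregate on the map side. This works here because the sums are
// associative and Reduce's input/output types match the map output types.
job.setCombinerClass(Reduce.class);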

Now suppose we want to sort rectangles by area. In that case the key has to be a custom type, and that type must implement the WritableComparable interface, as in the following code:

package keySortedDemo;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class RectangleSort {

    static final String Input_Path = "hdfs://localhost:9000/user/huruzun/input1/data1";
    static final String Output_Path = "hdfs://localhost:9000/user/huruzun/output";

    public static void main(String[] args) throws IOException,
            URISyntaxException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        FileSystem fileSystem = FileSystem.get(new URI(Input_Path), conf);
        Path outpath = new Path(Output_Path);
        if (fileSystem.exists(outpath)) {
            fileSystem.delete(outpath, true);
        }
        Job job = new Job(conf, "RectangleSort");
        job.setJarByClass(RectangleSort.class);
        FileInputFormat.setInputPaths(job, Input_Path);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(RectangleWritable.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);
        FileOutputFormat.setOutputPath(job, new Path(Output_Path));
        job.setOutputFormatClass(TextOutputFormat.class);
        // Send records to different reduce tasks according to our own partitioning
        // strategy; the job must be submitted as a jar, otherwise running it
        // directly fails at runtime.
        job.setPartitionerClass(MyPatitioner.class);
        job.setNumReduceTasks(2);
        job.waitForCompletion(true);
    }

    static class MyMapper extends
            Mapper<LongWritable, Text, RectangleWritable, NullWritable> {
        protected void map(LongWritable k1, Text v1, Context context)
                throws IOException, InterruptedException {
            String[] splits = v1.toString().split("\t");
            RectangleWritable k2 = new RectangleWritable(
                    Integer.parseInt(splits[0]), Integer.parseInt(splits[1]));
            context.write(k2, NullWritable.get());
        }
    }

    // Because rectangles with the same area compare as equal, each reduce group
    // keeps only one of them.
    static class MyReducer extends
            Reducer<RectangleWritable, NullWritable, IntWritable, IntWritable> {
        protected void reduce(RectangleWritable k2, Iterable<NullWritable> v2s,
                Context context) throws IOException, InterruptedException {
            context.write(new IntWritable(k2.getLength()),
                    new IntWritable(k2.getWidth()));
        }
    }
}

class RectangleWritable implements WritableComparable {

    int length, width;

    public RectangleWritable(int length, int width) {
        super();
        this.length = length;
        this.width = width;
    }

    public RectangleWritable() {
        super();
    }

    public int getLength() {
        return length;
    }

    public void setLength(int length) {
        this.length = length;
    }

    public int getWidth() {
        return width;
    }

    public void setWidth(int width) {
        this.width = width;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.length = in.readInt();
        this.width = in.readInt();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(length);
        out.writeInt(width);
    }

    // Sort by area: rectangles are compared by length * width
    @Override
    public int compareTo(Object o) {
        RectangleWritable to = (RectangleWritable) o;
        if (this.getLength() * this.getWidth() > to.getLength() * to.getWidth())
            return 1;
        else if (this.getLength() * this.getWidth() < to.getLength() * to.getWidth())
            return -1;
        else
            return 0;
    }
}

class MyPatitioner extends Partitioner<RectangleWritable, NullWritable> {
    @Override
    public int getPartition(RectangleWritable k2, NullWritable v2,
            int numReduceTask) {
        if (k2.getLength() == k2.getWidth())
            return 0; // squares go to this reduce task
        else
            return 1; // non-square rectangles go to this reduce task
    }
}
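As the comment on MyReducer notes, the reducer above writes each key group only once, so rectangles with the same area collapse into a single output record. If every rectangle should be kept, one possible variant (a sketch, not from the original article, relying on the fact that Hadoop re-fills the reused key object as the values iterator advances) is to emit inside the loop:

// Hypothetical variant of MyReducer that emits one record per input rectangle.
static class MyReducerKeepAll extends
        Reducer<RectangleWritable, NullWritable, IntWritable, IntWritable> {
    protected void reduce(RectangleWritable k2, Iterable<NullWritable> v2s,
            Context context) throws IOException, InterruptedException {
        for (NullWritable ignored : v2s) {
            // k2 is refreshed by the framework on every iteration, so equal-area
            // rectangles with different dimensions are all written out
            context.write(new IntWritable(k2.getLength()),
                    new IntWritable(k2.getWidth()));
        }
    }
}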

You may have noticed the MyPatitioner class, which extends the Partitioner class. In the earlier example we did not write a partitioner at all, so the framework fell back to its default, the hash-based partitioner.
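For reference, the default partitioner buckets keys by their hash code; it is essentially equivalent to the following sketch of org.apache.hadoop.mapreduce.lib.partition.HashPartitioner:

public class HashPartitioner<K, V> extends Partitioner<K, V> {
    // Mask off the sign bit so the result is never negative, then bucket by modulo
    public int getPartition(K key, V value, int numReduceTasks) {
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}

Note that this only works well if the key type has a sensible hashCode(); RectangleWritable above does not override hashCode(), which is one more reason a custom partitioner is used in this example.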

Also note that a job with a custom partitioner must be packaged and executed from the command line.
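Concretely, one would build the classes into a jar and submit it with the hadoop command, roughly like this (the jar name is made up for illustration; the input and output paths are hard-coded in the class, so no arguments are passed):

hadoop jar rectangle-sort.jar keySortedDemo.RectangleSort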


