Hadoop: Custom Data Types Implementing WritableComparable, with Partitioning and Sorting


http://blog.csdn.net/u014432433/article/details/51104026

1. In MapReduce programming, keys are mainly used for grouping and sorting. When Hadoop's built-in key types cannot meet these needs, or when a data type tailored to the use case would perform better, you can define a custom type by implementing the org.apache.hadoop.io.WritableComparable interface and use it as the key type of the MapReduce computation.

2. Custom Hadoop key types.
   1. Hadoop MapReduce key types are compared against one another; this comparison is what drives sorting.
   2. A Hadoop key type implements the WritableComparable<T> interface, which adds the compareTo() method on top of Writable.
      compareTo() returns a negative integer, zero, or a positive integer when this object is less than, equal to, or greater than the object it is compared with.
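
A minimal sketch of that pattern, using a hypothetical key type with a single long field (not part of the example that follows), might look like this:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

// Hypothetical key type: wraps one long value and sorts ascending by it.
public class LongKeyWritable implements WritableComparable<LongKeyWritable> {

    private long value;

    public LongKeyWritable() {}                  // Hadoop needs a no-arg constructor for deserialization

    public LongKeyWritable(long value) { this.value = value; }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(value);                    // serialize the field
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        value = in.readLong();                   // deserialize in the same order as write()
    }

    @Override
    public int compareTo(LongKeyWritable o) {
        // negative / 0 / positive => this key sorts before / same as / after o
        return Long.compare(this.value, o.value);
    }
}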


3. Example: first aggregate each phone number's upstream, downstream, and total traffic, then sort the result by total traffic.

Log file *.dat. Fields are tab-separated: timestamp, phone number, MAC address:operator, IP, optional visited URL, optional site type, upstream packets, downstream packets, upstream bytes, downstream bytes, status code. Because the URL and site-type fields may be missing, FlowCountMapper below indexes the traffic fields from the end of the record.

1363157985066	13726230503	00-FD-07-A4-72-B8:CMCC	120.196.100.82	i02.c.aliimg.com	24	27	2481	24681	200
1363157995052	13826544101	5C-0E-8B-C7-F1-E0:CMCC	120.197.40.4	4	0	264	0	200
1363157991076	13926435656	20-10-7A-28-CC-0A:CMCC	120.196.100.99	2	4	132	1512	200
1363154400022	13926251106	5C-0E-8B-8B-B1-50:CMCC	120.197.40.4	4	0	240	0	200
1363157993044	18211575961	94-71-AC-CD-E6-18:CMCC-EASY	120.196.100.99	iface.qiyi.com	视频网站	15	12	1527	2106	200
1363157995074	84138413	5C-0E-8B-8C-E8-20:7DaysInn	120.197.40.4	122.72.52.12	20	16	4116	1432	200
1363157993055	13560439658	C4-17-FE-BA-DE-D9:CMCC	120.196.100.99	18	15	1116	954	200
1363157995033	15920133257	5C-0E-8B-C7-BA-20:CMCC	120.197.40.4	sug.so.360.cn	信息安全	20	20	3156	2936	200
1363157983019	13719199419	68-A1-B7-03-07-B1:CMCC-EASY	120.196.100.82	4	0	240	0	200
1363157984041	13660577991	5C-0E-8B-92-5C-20:CMCC-EASY	120.197.40.4	s19.cnzz.com	站点统计	24	9	6960	690	200
1363157973098	15013685858	5C-0E-8B-C7-F7-90:CMCC	120.197.40.4	rank.ie.sogou.com	搜索引擎	28	27	3659	3538	200
1363157986029	15989002119	E8-99-C4-4E-93-E0:CMCC-EASY	120.196.100.99	www.umeng.com	站点统计	3	3	1938	180	200
1363157992093	13560439658	C4-17-FE-BA-DE-D9:CMCC	120.196.100.99	15	9	918	4938	200
1363157986041	13480253104	5C-0E-8B-C7-FC-80:CMCC-EASY	120.197.40.4	3	3	180	180	200
1363157984040	13602846565	5C-0E-8B-8B-B6-00:CMCC	120.197.40.4	2052.flash2-http.qq.com	综合门户	15	12	1938	2910	200
1363157995093	13922314466	00-FD-07-A2-EC-BA:CMCC	120.196.100.82	img.qfc.cn	12	12	3008	3720	200
1363157982040	13502468823	5C-0A-5B-6A-0B-D4:CMCC-EASY	120.196.100.99	y0.ifengimg.com	综合门户	57	102	7335	110349	200
1363157986072	18320173382	84-25-DB-4F-10-1A:CMCC-EASY	120.196.100.99	input.shouji.sogou.com	搜索引擎	21	18	9531	2412	200
1363157990043	13925057413	00-1F-64-E1-E6-9A:CMCC	120.196.100.55	t3.baidu.com	搜索引擎	69	63	11058	48243	200
1363157988072	13760778710	00-FD-07-A4-7B-08:CMCC	120.196.100.82	2	2	120	120	200
1363157985066	13726238888	00-FD-07-A4-72-B8:CMCC	120.196.100.82	i02.c.aliimg.com	24	27	2481	24681	200
1363157993055	13560436666	C4-17-FE-BA-DE-D9:CMCC	120.196.100.99	18	15	1116	954	200

FlowBean.java: the custom key type

package com.kevin.model;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class FlowBean implements WritableComparable<FlowBean> {

    private String phoneNbr;
    private long up_flow;
    private long d_flow;
    private long sum_flow;

    public void set(String phoneNbr, long up_flow, long d_flow) {
        this.phoneNbr = phoneNbr;
        this.up_flow = up_flow;
        this.d_flow = d_flow;
        this.sum_flow = up_flow + d_flow;
    }

    /**
     * Serialization: write the fields out as a byte stream.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.phoneNbr);
        out.writeLong(this.up_flow);
        out.writeLong(this.d_flow);
        out.writeLong(this.sum_flow);
    }

    /**
     * Deserialization: read the fields back from the byte stream.
     * The read order must match the order used in write().
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.phoneNbr = in.readUTF();
        this.up_flow = in.readLong();
        this.d_flow = in.readLong();
        this.sum_flow = in.readLong();
    }

    /**
     * Sort by total flow in descending order. Comparing the other bean's total
     * first gives the descending order while still returning 0 for equal totals,
     * as the compareTo() contract requires.
     */
    @Override
    public int compareTo(FlowBean o) {
        return Long.compare(o.getSum_flow(), this.sum_flow);
    }

    @Override
    public String toString() {
        return up_flow + "\t" + d_flow + "\t" + sum_flow;
    }

    public String getPhoneNbr() { return phoneNbr; }
    public void setPhoneNbr(String phoneNbr) { this.phoneNbr = phoneNbr; }
    public long getUp_flow() { return up_flow; }
    public void setUp_flow(long up_flow) { this.up_flow = up_flow; }
    public long getD_flow() { return d_flow; }
    public void setD_flow(long d_flow) { this.d_flow = d_flow; }
    public long getSum_flow() { return sum_flow; }
    public void setSum_flow(long sum_flow) { this.sum_flow = sum_flow; }
}
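
As a quick local check of write() and readFields() (a hypothetical standalone test, not part of the original post), the bean can be round-tripped through a byte stream:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

import com.kevin.model.FlowBean;

public class FlowBeanRoundTrip {
    public static void main(String[] args) throws Exception {
        FlowBean original = new FlowBean();
        original.set("13726230503", 2481, 24681);

        // Serialize the bean into a byte array
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Deserialize into a fresh bean and print it
        FlowBean copy = new FlowBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        // Expected output: 13726230503  2481  24681  27162
        System.out.println(copy.getPhoneNbr() + "\t" + copy);
    }
}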


AreaPartitioner.java: the partitioner. Extends Partitioner and overrides getPartition() so that records are grouped by phone-number prefix.

package com.kevin.partitioner;

import java.util.HashMap;

import org.apache.hadoop.mapreduce.Partitioner;

public class AreaPartitioner<KEY, VALUE> extends Partitioner<KEY, VALUE> {

    // Map each known phone-number prefix to a partition number.
    private static HashMap<String, Integer> areaMap = new HashMap<>();

    static {
        areaMap.put("136", 0);
        areaMap.put("137", 1);
        areaMap.put("138", 2);
        areaMap.put("139", 3);
    }

    @Override
    public int getPartition(KEY key, VALUE value, int numPartitions) {
        // Look up the partition by the first three digits of the phone number;
        // any prefix not in the map falls through to partition 4.
        Integer provinceCode = areaMap.get(key.toString().substring(0, 3));
        return provinceCode == null ? 4 : provinceCode;
    }
}
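
A quick way to see the partition assignment (a hypothetical standalone check, not part of the original post) is to call getPartition() directly; only key.toString() is used, so the value can be null:

import org.apache.hadoop.io.Text;

import com.kevin.model.FlowBean;
import com.kevin.partitioner.AreaPartitioner;

public class AreaPartitionerCheck {
    public static void main(String[] args) {
        AreaPartitioner<Text, FlowBean> partitioner = new AreaPartitioner<>();
        // "137..." is in areaMap -> partition 1; "135..." is not -> default partition 4
        System.out.println(partitioner.getPartition(new Text("13726230503"), null, 5)); // prints 1
        System.out.println(partitioner.getPartition(new Text("13560439658"), null, 5)); // prints 4
    }
}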


FlowCount.java: the job that aggregates traffic per phone number

package com.kevin.mapreducedemo2;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import com.kevin.model.FlowBean;
import com.kevin.partitioner.AreaPartitioner;

// Hadoop's own serialization mechanism differs from the JDK's: it is more compact.
public class FlowCount {

    public static class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {

        private FlowBean flowBean = new FlowBean();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Take one line of input
            String line = value.toString();
            // Split the fields on tabs
            String[] fields = StringUtils.split(line, "\t");
            // Pick out the fields we need
            String phoneNbr = fields[1];
            long up_flow = Long.parseLong(fields[fields.length - 3]);
            long d_flow = Long.parseLong(fields[fields.length - 2]);
            // Pack the data into a FlowBean and emit it keyed by phone number
            flowBean.set(phoneNbr, up_flow, d_flow);
            context.write(new Text(phoneNbr), flowBean);
        }
    }

    public static class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean> {

        private FlowBean flowBean = new FlowBean();

        @Override
        protected void reduce(Text key, Iterable<FlowBean> values, Context context)
                throws IOException, InterruptedException {
            // Sum the upstream and downstream traffic of all records for this phone number
            long up_flow_sum = 0;
            long d_flow_sum = 0;
            for (FlowBean bean : values) {
                up_flow_sum += bean.getUp_flow();
                d_flow_sum += bean.getD_flow();
            }
            flowBean.set(key.toString(), up_flow_sum, d_flow_sum);
            context.write(key, flowBean);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "flowjob");

        job.setJarByClass(FlowCount.class);
        job.setMapperClass(FlowCountMapper.class);
        job.setReducerClass(FlowCountReducer.class);

        /**
         * Plug in the custom partitioner: AreaPartitioner.
         */
        job.setPartitionerClass(AreaPartitioner.class);

        /**
         * Set the number of reduce tasks; it must match the number of partitions
         * returned by AreaPartitioner.
         * If there are more reduce tasks than partitions, the extra reducers just produce empty output files.
         * If there are fewer reduce tasks than partitions, the job fails, because some keys
         * have no reduce task to receive them.
         * (A single reduce task also works: every key is then sent to that one task.)
         * "reduce task" and "map task" refer to the reducer and mapper instances running on the cluster.
         */
        job.setNumReduceTasks(5);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.1.108:9000/flow_count_demo/data-files/"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.1.108:9000/flow_count_demo/data-out/"));

        job.waitForCompletion(true);
    }
}
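
To make the output format concrete: FlowCountReducer writes the phone number, a tab, and FlowBean.toString(), i.e. upstream, downstream, and total bytes separated by tabs. For example, 13560439658 appears twice in the sample log (1116 + 918 upstream bytes, 954 + 4938 downstream bytes), so its output line should read

13560439658	2034	5892	7926

and, because the prefix 135 is not in areaMap, it should land in the file written by reduce task 4 (part-r-00004).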

FlowCountSort.java: the job that sorts the result by total traffic

package com.kevin.mapreducedemo2;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import com.kevin.model.FlowBean;

public class FlowCountSort {

    public static class FlowCountSortMapper extends Mapper<LongWritable, Text, FlowBean, NullWritable> {

        private FlowBean bean = new FlowBean();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each input line is FlowCount's output: phone \t up_flow \t d_flow \t sum_flow
            String line = value.toString();
            String[] fields = StringUtils.split(line, "\t");

            String phoneNbr = fields[0];
            long up_flow = Long.parseLong(fields[1]);
            long d_flow = Long.parseLong(fields[2]);

            // Use the FlowBean itself as the key so the shuffle sorts by FlowBean.compareTo()
            bean.set(phoneNbr, up_flow, d_flow);
            context.write(bean, NullWritable.get());
        }
    }

    public static class FlowCountSortReducer extends Reducer<FlowBean, NullWritable, Text, FlowBean> {

        @Override
        protected void reduce(FlowBean bean, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            // Keys arrive already sorted by total flow; write them back out as phone \t flows
            context.write(new Text(bean.getPhoneNbr()), bean);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "flowjob");

        job.setJarByClass(FlowCountSort.class);
        job.setMapperClass(FlowCountSortMapper.class);
        job.setReducerClass(FlowCountSortReducer.class);

        job.setMapOutputKeyClass(FlowBean.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Read the output of the FlowCount job and write the sorted result to a new directory
        FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.1.108:9000/flow_count_demo/data-out/"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.1.108:9000/flow_count_demo/data-out2/"));

        job.waitForCompletion(true);
    }
}
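
The sorting in this job is done entirely by the shuffle, which orders the map output keys using FlowBean.compareTo(). A small local check of that ordering (a hypothetical standalone test, not part of the original post):

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import com.kevin.model.FlowBean;

public class FlowBeanSortCheck {
    public static void main(String[] args) {
        List<FlowBean> beans = new ArrayList<>();
        FlowBean a = new FlowBean(); a.set("13726230503", 2481, 24681);   // total 27162
        FlowBean b = new FlowBean(); b.set("13560439658", 2034, 5892);    // total 7926
        FlowBean c = new FlowBean(); c.set("13925057413", 11058, 48243);  // total 59301
        beans.add(a); beans.add(b); beans.add(c);

        // compareTo() orders by total flow in descending order
        Collections.sort(beans);
        for (FlowBean bean : beans) {
            System.out.println(bean.getPhoneNbr() + "\t" + bean);
        }
        // Expected order: 13925057413, 13726230503, 13560439658
    }
}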





