Hadoop之MapReduce里的Partitioner(笔记25)

来源:互联网 发布:程序员小品剧本 编辑:程序博客网 时间:2024/05/16 15:51
Partitioner决定map输出的每个key被划分到哪个分区(partition),不同的分区交给不同的reduce task处理;
分区逻辑由继承Partitioner抽象类的子类来实现(重写getPartition方法)

每写一段代码都会加深理解,程序里记录了自己的理解

FlowBean类源码:

[java] view plain copy
 
 在CODE上查看代码片派生到我的代码片
  1. package cn.zxl.flowcountpartitioner;  
  2.   
  3. import java.io.DataInput;  
  4. import java.io.DataOutput;  
  5. import java.io.IOException;  
  6.   
  7. import org.apache.hadoop.io.WritableComparable;  
  8.   
  9. public class FlowBean implements WritableComparable<FlowBean>{  
  10.     private long upflow;//上行流量  
  11.     private long downflow;//下行流量  
  12.     private long sumflow;//总流量  
  13.     public long getUpflow() {  
  14.         return upflow;  
  15.     }  
  16.     public void setUpflow(long upflow) {  
  17.         this.upflow = upflow;  
  18.     }  
  19.     public long getDownflow() {  
  20.         return downflow;  
  21.     }  
  22.     public void setDownflow(long downflow) {  
  23.         this.downflow = downflow;  
  24.     }  
  25.     public long getSumflow() {  
  26.         return sumflow;  
  27.     }  
  28.     public void setSumflow(long sumflow) {  
  29.         this.sumflow = sumflow;  
  30.     }  
  31.     public FlowBean() {  
  32.     }  
  33.     public FlowBean(long upflow, long downflow) {  
  34.         super();  
  35.         this.upflow = upflow;  
  36.         this.downflow = downflow;  
  37.         this.sumflow = upflow+downflow;  
  38.     }  
  39.     @Override  
  40.     public void readFields(DataInput in) throws IOException {  
  41.         upflow=in.readLong();  
  42.         downflow=in.readLong();  
  43.         sumflow=in.readLong();  
  44.     }  
  45.     @Override  
  46.     public void write(DataOutput out) throws IOException {  
  47.         out.writeLong(upflow);  
  48.         out.writeLong(downflow);  
  49.         out.writeLong(sumflow);  
  50.           
  51.     }  
  52.     @Override  
  53.     public int compareTo(FlowBean bean) {  
  54.         return sumflow>bean.getSumflow()?-1:1;  
  55.     }  
  56.       
  57.     @Override  
  58.     public String toString() {  
  59.         return upflow+"\t"+downflow+"\t"+sumflow;  
  60.     }  
  61. }  
ProvicePartition类源码:

[java] view plain copy
 
 在CODE上查看代码片派生到我的代码片
  1. package cn.zxl.flowcountpartitioner;  
  2.   
  3. import java.util.HashMap;  
  4.   
  5. import org.apache.hadoop.io.Text;  
  6. import org.apache.hadoop.mapreduce.Partitioner;  
  7.   
  8. public class ProvicePartition extends Partitioner<Text, FlowBean>{  
  9.     //根据手机号前三位划分分组  
  10.     //Partitioner就是对key进行分组  
  11.     private static HashMap<String, Integer> pmap = new HashMap<String, Integer>();  
  12.     static{  
  13.         pmap.put("136"0);  
  14.         pmap.put("137"1);  
  15.         pmap.put("138"2);  
  16.         pmap.put("139"3);  
  17.     }  
  18.     @Override  
  19.     public int getPartition(Text key, FlowBean bean, int numPartitions) {  
  20.         String prex=key.toString().substring(0,3);  
  21.         Integer partNum=pmap.get(prex);//根据key截取的前三位做key和map的值是否匹配  
  22.         return partNum==null?4:partNum;  
  23.     }  
  24. }  

FlowCount类源码:

[java] view plain copy
 
 在CODE上查看代码片派生到我的代码片
  1. package cn.zxl.flowcountpartitioner;  
  2.   
  3. import java.io.IOException;  
  4.   
  5. import org.apache.hadoop.conf.Configuration;  
  6. import org.apache.hadoop.fs.FileSystem;  
  7. import org.apache.hadoop.fs.Path;  
  8. import org.apache.hadoop.io.LongWritable;  
  9. import org.apache.hadoop.io.Text;  
  10. import org.apache.hadoop.mapreduce.Job;  
  11. import org.apache.hadoop.mapreduce.Mapper;  
  12. import org.apache.hadoop.mapreduce.Reducer;  
  13. import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  
  14. import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  
  15.   
  16. public class FlowCount {  
  17.     static class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean>{  
  18.         @Override  
  19.         protected void map(LongWritable key, Text value,Context context)  
  20.                 throws IOException, InterruptedException {  
  21.             String line=value.toString();  
  22.             String[] phoneinfo=line.split("\t");  
  23.             String phoneN=phoneinfo[0];  
  24.             String upflow=phoneinfo[phoneinfo.length-3];  
  25.             String downflow=phoneinfo[phoneinfo.length-2];  
  26.             FlowBean fb=new FlowBean(Long.parseLong(upflow),Long.parseLong(downflow));  
  27.             context.write(new Text(phoneN),fb);  
  28.         }  
  29.     }  
  30.     //reducer里的值是<key,list(value)>,也就是相同的键里对应一个集合  
  31.     //reducer是根据key排序的,而不是value,要根据什么排序,那就得已什么作为key输出  
  32.     static class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean>{  
  33.         @Override  
  34.         protected void reduce(Text key, Iterable<FlowBean> values,Context context)  
  35.                 throws IOException, InterruptedException {  
  36.             long upflow_sum=0;  
  37.             long downflow_sum=0;  
  38.             for(FlowBean bean:values){  
  39.                 upflow_sum+=bean.getUpflow();  
  40.                 downflow_sum+=bean.getDownflow();  
  41.             }  
  42.             FlowBean fb=new FlowBean(upflow_sum,downflow_sum);  
  43.             context.write(new Text(key), fb);  
  44.         }  
  45.     }  
  46.       
  47.     public static void main(String[] args) throws Exception {  
  48.         Configuration conf=new Configuration();// cn.zxl.flowcountpartitioner.FlowCount  
  49.           
  50.         Job job=Job.getInstance(conf);  
  51.           
  52.         job.setJarByClass(FlowCount.class);  
  53.           
  54.         job.setMapperClass(FlowCountMapper.class);  
  55.         job.setReducerClass(FlowCountReducer.class);  
  56.           
  57.         job.setOutputKeyClass(Text.class);  
  58.         job.setOutputValueClass(FlowBean.class);  
  59.         //job指定自定义的Partitioner组件  
  60.         //job.setPartitionerClass(ProvicePartition.class);  
  61.         /*job中指定reducertask的数量,说明:这里的reducertask数量可以指定为1个,如果是1个reducertask, 
  62.         *那么所有的分区数据都输入到一个文件里,如果指定个数小于分区个数(这里是5个),那么程序会报错, 
  63.         *因为不知道对应的一个分区数据放置到哪里,如果指定个数超过分区个数,那么后面产生的文件是空的 
  64.         */  
  65.         //job.setNumReduceTasks(5);  
  66.           
  67.         FileInputFormat.setInputPaths(job, new Path(args[0]));  
  68.           
  69.         Path output = new Path(args[1]);  
  70.         FileSystem fs = output.getFileSystem(conf);  
  71.         //看输出是否存在,存在就删除,特别说明:安全起见正式的线上建议最好不要做这个判断,如果这样做,会把以前产生的数据删除  
  72.         //补充:正式生产环境最好指定删除多久后正式删除数据,以便错删时可以恢复数据  
  73.         /* 
  74.          * 添加在hdfs-site的配置文件里 
  75.          * <property> 
  76.             <name>fs.trash.interval</name> 
  77.             <value>60</value><!-- 回收站过期机制检查频率(分钟) --> 
  78.             </property> 
  79.              
  80.             <property> 
  81.             <name>fs.trash.checkpoint.interval</name> 
  82.             <value>20</value><!-- 回收站中文件过期的时间限制(分钟) --> 
  83.             </property> 
  84.          */  
  85.         if(fs.exists(output)){  
  86.             fs.delete(output, true);  
  87.         }  
  88.         FileOutputFormat.setOutputPath(job, new Path(args[1]));  
  89.           
  90.         job.waitForCompletion(true);  
  91.     }  
  92. }  

测试数据:

1363157985066 1372623050300-FD-07-A4-72-B8:CMCC120.196.100.82i02.c.aliimg.com2427 248124681 200
1363157995052 138265441015C-0E-8B-C7-F1-E0:CMCC120.197.40.440 264 0 200
1363157991076 1392643565620-10-7A-28-CC-0A:CMCC120.196.100.9924 132 1512 200
1363154400022 139262511065C-0E-8B-8B-B1-50:CMCC120.197.40.440 240 0 200
1363157993044 1821157596194-71-AC-CD-E6-18:CMCC-EASY120.196.100.99iface.qiyi.com视频网站15 12 1527 2106 200
1363157995074 841384135C-0E-8B-8C-E8-20:7DaysInn120.197.40.4122.72.52.122016 41161432 200
1363157993055 13560439658C4-17-FE-BA-DE-D9:CMCC120.196.100.991815 1116954 200
1363157995033 159201332575C-0E-8B-C7-BA-20:CMCC120.197.40.4sug.so.360.cn信息安全20 20 3156 2936 200
1363157983019 1371919941968-A1-B7-03-07-B1:CMCC-EASY120.196.100.8240 240 0 200
1363157984041 136605779915C-0E-8B-92-5C-20:CMCC-EASY120.197.40.4s19.cnzz.com站点统计24 9 6960 690 200
1363157973098 150136858585C-0E-8B-C7-F7-90:CMCC120.197.40.4rank.ie.sogou.com搜索引擎28 27 3659 3538 200
1363157986029 15989002119E8-99-C4-4E-93-E0:CMCC-EASY120.196.100.99www.umeng.com站点统计3 3 1938 180 200
1363157992093 13560439658C4-17-FE-BA-DE-D9:CMCC120.196.100.99159 918 4938 200
1363157986041 134802531045C-0E-8B-C7-FC-80:CMCC-EASY120.197.40.433 180 180 200
1363157984040 136028465655C-0E-8B-8B-B6-00:CMCC120.197.40.42052.flash2-http.qq.com综合门户15 12 1938 2910 200
1363157995093 1392231446600-FD-07-A2-EC-BA:CMCC120.196.100.82img.qfc.cn1212 30083720 200
1363157982040 135024688235C-0A-5B-6A-0B-D4:CMCC-EASY120.196.100.99y0.ifengimg.com综合门户57 102 7335 110349 200
1363157986072 1832017338284-25-DB-4F-10-1A:CMCC-EASY120.196.100.99input.shouji.sogou.com搜索引擎21 18 9531 2412 200
1363157990043 1392505741300-1F-64-E1-E6-9A:CMCC120.196.100.55t3.baidu.com搜索引擎69 63 11058 48243 200
1363157988072 1376077871000-FD-07-A4-7B-08:CMCC120.196.100.8222 120 120 200
1363157985066 1372623888800-FD-07-A4-72-B8:CMCC120.196.100.82i02.c.aliimg.com2427 248124681 200
1363157993055 13560436666C4-17-FE-BA-DE-D9:CMCC120.196.100.991815 1116954 200
1373157985066 1372623050300-FD-07-A4-72-B8:CMCC120.196.100.82i02.c.aliimg.com2427 248124681 200
1373157995052 138265441015C-0E-8B-C7-F1-E0:CMCC120.197.40.440 264 0 200
1373157991076 1392643565620-10-7A-28-CC-0A:CMCC120.196.100.9924 132 1512 200
1373154400022 139262511065C-0E-8B-8B-B1-50:CMCC120.197.40.440 240 0 200
1373157993044 1821157596194-71-AC-CD-E6-18:CMCC-EASY120.196.100.99iface.qiyi.com视频网站15 12 1527 2106 200
1373157995074 841384135C-0E-8B-8C-E8-20:7DaysInn120.197.40.4122.72.52.122016 41161432 200
1373157993055 13560439658C4-17-FE-BA-DE-D9:CMCC120.196.100.991815 1116954 200
1373157995033 159201332575C-0E-8B-C7-BA-20:CMCC120.197.40.4sug.so.360.cn信息安全20 20 3156 2936 200
1373157983019 1371919941968-A1-B7-03-07-B1:CMCC-EASY120.196.100.8240 240 0 200
1373157984041 136605779915C-0E-8B-92-5C-20:CMCC-EASY120.197.40.4s19.cnzz.com站点统计24 9 6960 690 200
1373157973098 150136858585C-0E-8B-C7-F7-90:CMCC120.197.40.4rank.ie.sogou.com搜索引擎28 27 3659 3538 200
1373157986029 15989002119E8-99-C4-4E-93-E0:CMCC-EASY120.196.100.99www.umeng.com站点统计3 3 1938 180 200
1373157992093 13560439658C4-17-FE-BA-DE-D9:CMCC120.196.100.99159 918 4938 200
1373157986041 134802531045C-0E-8B-C7-FC-80:CMCC-EASY120.197.40.433 180 180 200
1373157984040 136028465655C-0E-8B-8B-B6-00:CMCC120.197.40.42052.flash2-http.qq.com综合门户15 12 1938 2910 200
1373157995093 1392231446600-FD-07-A2-EC-BA:CMCC120.196.100.82img.qfc.cn1212 30083720 200
1373157982040 135024688235C-0A-5B-6A-0B-D4:CMCC-EASY120.196.100.99y0.ifengimg.com综合门户57 102 7335 110349 200
1373157986072 1832017338284-25-DB-4F-10-1A:CMCC-EASY120.196.100.99input.shouji.sogou.com搜索引擎21 18 9531 2412 200
1373157990043 1392505741300-1F-64-E1-E6-9A:CMCC120.196.100.55t3.baidu.com搜索引擎69 63 11058 48243 200
1373157988072 1376077871000-FD-07-A4-7B-08:CMCC120.196.100.8222 120 120 200
1373157985066 1372623888800-FD-07-A4-72-B8:CMCC120.196.100.82i02.c.aliimg.com2427 248124681 200
1373157993055 13560436666C4-17-FE-BA-DE-D9:CMCC120.196.100.991815 1116954 200
1383157985066 1372623050300-FD-07-A4-72-B8:CMCC120.196.100.82i02.c.aliimg.com2427 248124681 200
1383157995052 138265441015C-0E-8B-C7-F1-E0:CMCC120.197.40.440 264 0 200
1383157991076 1392643565620-10-7A-28-CC-0A:CMCC120.196.100.9924 132 1512 200
1383154400022 139262511065C-0E-8B-8B-B1-50:CMCC120.197.40.440 240 0 200
1383157993044 1821157596194-71-AC-CD-E6-18:CMCC-EASY120.196.100.99iface.qiyi.com视频网站15 12 1527 2106 200
1383157995074 841384135C-0E-8B-8C-E8-20:7DaysInn120.197.40.4122.72.52.122016 41161432 200
1383157993055 13560439658C4-17-FE-BA-DE-D9:CMCC120.196.100.991815 1116954 200
1383157995033 159201332575C-0E-8B-C7-BA-20:CMCC120.197.40.4sug.so.360.cn信息安全20 20 3156 2936 200
1383157983019 1371919941968-A1-B7-03-07-B1:CMCC-EASY120.196.100.8240 240 0 200
1383157984041 136605779915C-0E-8B-92-5C-20:CMCC-EASY120.197.40.4s19.cnzz.com站点统计24 9 6960 690 200
1383157973098 150136858585C-0E-8B-C7-F7-90:CMCC120.197.40.4rank.ie.sogou.com搜索引擎28 27 3659 3538 200
1383157986029 15989002119E8-99-C4-4E-93-E0:CMCC-EASY120.196.100.99www.umeng.com站点统计3 3 1938 180 200
1383157992093 13560439658C4-17-FE-BA-DE-D9:CMCC120.196.100.99159 918 4938 200
1383157986041 134802531045C-0E-8B-C7-FC-80:CMCC-EASY120.197.40.433 180 180 200
1383157984040 136028465655C-0E-8B-8B-B6-00:CMCC120.197.40.42052.flash2-http.qq.com综合门户15 12 1938 2910 200
1383157995093 1392231446600-FD-07-A2-EC-BA:CMCC120.196.100.82img.qfc.cn1212 30083720 200
1383157982040 135024688235C-0A-5B-6A-0B-D4:CMCC-EASY120.196.100.99y0.ifengimg.com综合门户57 102 7335 110349 200
1383157986072 1832017338284-25-DB-4F-10-1A:CMCC-EASY120.196.100.99input.shouji.sogou.com搜索引擎21 18 9531 2412 200
1383157990043 1392505741300-1F-64-E1-E6-9A:CMCC120.196.100.55t3.baidu.com搜索引擎69 63 11058 48243 200
1383157988072 1376077871000-FD-07-A4-7B-08:CMCC120.196.100.8222 120 120 200
1383157985066 1372623888800-FD-07-A4-72-B8:CMCC120.196.100.82i02.c.aliimg.com2427 248124681 200
1383157993055 13560436666C4-17-FE-BA-DE-D9:CMCC120.196.100.991815 1116954 200
1393157985066 1372623050300-FD-07-A4-72-B8:CMCC120.196.100.82i02.c.aliimg.com2427 248124681 200
1393157995052 138265441015C-0E-8B-C7-F1-E0:CMCC120.197.40.440 264 0 200
1393157991076 1392643565620-10-7A-28-CC-0A:CMCC120.196.100.9924 132 1512 200
13963154400022 139262511065C-0E-8B-8B-B1-50:CMCC120.197.40.440 240 0 200
1393157993044 1821157596194-71-AC-CD-E6-18:CMCC-EASY120.196.100.99iface.qiyi.com视频网站15 12 1527 2106 200
1393157995074 841384135C-0E-8B-8C-E8-20:7DaysInn120.197.40.4122.72.52.122016 41161432 200
1393157993055 13560439658C4-17-FE-BA-DE-D9:CMCC120.196.100.991815 1116954 200
1393157995033 159201332575C-0E-8B-C7-BA-20:CMCC120.197.40.4sug.so.360.cn信息安全20 20 3156 2936 200
1393157983019 1371919941968-A1-B7-03-07-B1:CMCC-EASY120.196.100.8240 240 0 200
1393157984041 136605779915C-0E-8B-92-5C-20:CMCC-EASY120.197.40.4s19.cnzz.com站点统计24 9 6960 690 200
1393157973098 150136858585C-0E-8B-C7-F7-90:CMCC120.197.40.4rank.ie.sogou.com搜索引擎28 27 3659 3538 200
1393157986029 15989002119E8-99-C4-4E-93-E0:CMCC-EASY120.196.100.99www.umeng.com站点统计3 3 1938 180 200
1393157992093 13560439658C4-17-FE-BA-DE-D9:CMCC120.196.100.99159 918 4938 200
1393157986041 134802531045C-0E-8B-C7-FC-80:CMCC-EASY120.197.40.433 180 180 200
1393157984040 136028465655C-0E-8B-8B-B6-00:CMCC120.197.40.42052.flash2-http.qq.com综合门户15 12 1938 2910 200
1393157995093 1392231446600-FD-07-A2-EC-BA:CMCC120.196.100.82img.qfc.cn1212 30083720 200
1393157982040 135024688235C-0A-5B-6A-0B-D4:CMCC-EASY120.196.100.99y0.ifengimg.com综合门户57 102 7335 110349 200
1393157986072 1832017338284-25-DB-4F-10-1A:CMCC-EASY120.196.100.99input.shouji.sogou.com搜索引擎21 18 9531 2412 200
1393157990043 1392505741300-1F-64-E1-E6-9A:CMCC120.196.100.55t3.baidu.com搜索引擎69 63 11058 48243 200
1393157988072 1376077871000-FD-07-A4-7B-08:CMCC120.196.100.8222 120 120 200
1393157985066 1372623888800-FD-07-A4-72-B8:CMCC120.196.100.82i02.c.aliimg.com2427 248124681 200
1393157993055 13560436666C4-17-FE-BA-DE-D9:CMCC120.196.100.991815 1116954 200
1503157985066 1372623050300-FD-07-A4-72-B8:CMCC120.196.100.82i02.c.aliimg.com2427 248124681 200
1503157995052 138265441015C-0E-8B-C7-F1-E0:CMCC120.197.40.440 264 0 200
1503157991076 1392643565620-10-7A-28-CC-0A:CMCC120.196.100.9924 132 1512 200
1503154400022 139262511065C-0E-8B-8B-B1-50:CMCC120.197.40.440 240 0 200
1503157993044 1821157596194-71-AC-CD-E6-18:CMCC-EASY120.196.100.99iface.qiyi.com视频网站15 12 1527 2106 200
1513157995074 841384135C-0E-8B-8C-E8-20:7DaysInn120.197.40.4122.72.52.122016 41161432 200
1513157993055 13560439658C4-17-FE-BA-DE-D9:CMCC120.196.100.991815 1116954 200
1513157995033 159201332575C-0E-8B-C7-BA-20:CMCC120.197.40.4sug.so.360.cn信息安全20 20 3156 2936 200
1513157983019 1371919941968-A1-B7-03-07-B1:CMCC-EASY120.196.100.8240 240 0 200
1513157984041 136605779915C-0E-8B-92-5C-20:CMCC-EASY120.197.40.4s19.cnzz.com站点统计24 9 6960 690 200
1523157973098 150136858585C-0E-8B-C7-F7-90:CMCC120.197.40.4rank.ie.sogou.com搜索引擎28 27 3659 3538 200
1523157986029 15989002119E8-99-C4-4E-93-E0:CMCC-EASY120.196.100.99www.umeng.com站点统计3 3 1938 180 200
1523157992093 13560439658C4-17-FE-BA-DE-D9:CMCC120.196.100.99159 918 4938 200
1523157986041 134802531045C-0E-8B-C7-FC-80:CMCC-EASY120.197.40.433 180 180 200
1523157984040 136028465655C-0E-8B-8B-B6-00:CMCC120.197.40.42052.flash2-http.qq.com综合门户15 12 1938 2910 200
1533157995093 1392231446600-FD-07-A2-EC-BA:CMCC120.196.100.82img.qfc.cn1212 30083720 200
1533157982040 135024688235C-0A-5B-6A-0B-D4:CMCC-EASY120.196.100.99y0.ifengimg.com综合门户57 102 7335 110349 200
1533157986072 1832017338284-25-DB-4F-10-1A:CMCC-EASY120.196.100.99input.shouji.sogou.com搜索引擎21 18 9531 2412 200
1533157990043 1392505741300-1F-64-E1-E6-9A:CMCC120.196.100.55t3.baidu.com搜索引擎69 63 11058 48243 200
1533157988072 1376077871000-FD-07-A4-7B-08:CMCC120.196.100.8222 120 120 200
1533157985066 1372623888800-FD-07-A4-72-B8:CMCC120.196.100.82i02.c.aliimg.com2427 248124681 200
1533157993055 13560436666C4-17-FE-BA-DE-D9:CMCC120.196.100.991815 1116954 200

原创粉丝点击