Implementing Custom Grouping (a Custom Partitioner) in hadoop-2.7.1 MapReduce


Group the records by phone number and compute the upload, download, and total traffic for each phone.

1. The data to be processed (one record per line; fields are tab-separated, and records without a visited URL/category have empty columns):

1363157985066  13726230503  00-FD-07-A4-72-B8:CMCC       120.196.100.82  i02.c.aliimg.com                      24  27   2481   24681   200
1363157995052  13826544101  5C-0E-8B-C7-F1-E0:CMCC       120.197.40.4                                          4   0    264    0       200
1363157991076  13926435656  20-10-7A-28-CC-0A:CMCC       120.196.100.99                                        2   4    132    1512    200
1363154400022  13926251106  5C-0E-8B-8B-B1-50:CMCC       120.197.40.4                                          4   0    240    0       200
1363157993044  18211575961  94-71-AC-CD-E6-18:CMCC-EASY  120.196.100.99  iface.qiyi.com           视频网站     15  12   1527   2106    200
1363157995074  84138413     5C-0E-8B-8C-E8-20:7DaysInn   120.197.40.4    122.72.52.12                          20  16   4116   1432    200
1363157993055  13560439658  C4-17-FE-BA-DE-D9:CMCC       120.196.100.99                                        18  15   1116   954     200
1363157995033  15920133257  5C-0E-8B-C7-BA-20:CMCC       120.197.40.4    sug.so.360.cn            信息安全     20  20   3156   2936    200
1363157983019  13719199419  68-A1-B7-03-07-B1:CMCC-EASY  120.196.100.82                                        4   0    240    0       200
1363157984041  13660577991  5C-0E-8B-92-5C-20:CMCC-EASY  120.197.40.4    s19.cnzz.com             站点统计     24  9    6960   690     200
1363157973098  15013685858  5C-0E-8B-C7-F7-90:CMCC       120.197.40.4    rank.ie.sogou.com        搜索引擎     28  27   3659   3538    200
1363157986029  15989002119  E8-99-C4-4E-93-E0:CMCC-EASY  120.196.100.99  www.umeng.com            站点统计     3   3    1938   180     200
1363157992093  13560439658  C4-17-FE-BA-DE-D9:CMCC       120.196.100.99                                        15  9    918    4938    200
1363157986041  13480253104  5C-0E-8B-C7-FC-80:CMCC-EASY  120.197.40.4                                          3   3    180    180     200
1363157984040  13602846565  5C-0E-8B-8B-B6-00:CMCC       120.197.40.4    2052.flash2-http.qq.com  综合门户     15  12   1938   2910    200
1363157995093  13922314466  00-FD-07-A2-EC-BA:CMCC       120.196.100.82  img.qfc.cn                            12  12   3008   3720    200
1363157982040  13502468823  5C-0A-5B-6A-0B-D4:CMCC-EASY  120.196.100.99  y0.ifengimg.com          综合门户     57  102  7335   110349  200
1363157986072  18320173382  84-25-DB-4F-10-1A:CMCC-EASY  120.196.100.99  input.shouji.sogou.com   搜索引擎     21  18   9531   2412    200
1363157990043  13925057413  00-1F-64-E1-E6-9A:CMCC       120.196.100.55  t3.baidu.com             搜索引擎     69  63   11058  48243   200
1363157988072  13760778710  00-FD-07-A4-7B-08:CMCC       120.196.100.82                                        2   2    120    120     200
1363157985066  13726238888  00-FD-07-A4-72-B8:CMCC       120.196.100.82  i02.c.aliimg.com                      24  27   2481   24681   200
1363157993055  13560436666  C4-17-FE-BA-DE-D9:CMCC       120.196.100.99                                        18  15   1116   954     200
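A note on field positions: the mapper in section 3 splits each line with commons-lang StringUtils.split, which treats runs of adjacent tabs as a single separator, so records without a URL/category end up with fewer fields and the traffic columns land at different absolute positions. This is why fields[7] and fields[8] are read there. A small illustration (hypothetical standalone snippet, not from the original post; it assumes the empty columns appear as consecutive tabs):

import org.apache.commons.lang.StringUtils;

public class SplitDemo {
    public static void main(String[] args) {
        // A record without URL/category: the empty tab-separated columns are
        // dropped by StringUtils.split, so the numeric columns shift left.
        String line = "1363157995052\t13826544101\t5C-0E-8B-C7-F1-E0:CMCC"
                + "\t120.197.40.4\t\t\t4\t0\t264\t0\t200";
        String[] fields = StringUtils.split(line, "\t");
        System.out.println(fields.length); // 9, not 11
        System.out.println(fields[7]);     // 0   (read as up_flow)
        System.out.println(fields[8]);     // 200 (read as d_flow)
    }
}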
2. Custom partitioning by the first three digits of the phone number:

package cn.nanda.area;

import java.util.HashMap;

import org.apache.hadoop.mapreduce.Partitioner;

public class AreaPartitioner<KEY, VALUE> extends Partitioner<KEY, VALUE> {

    private static HashMap<String, Integer> areaMap = new HashMap<String, Integer>();

    static {
        areaMap.put("135", 0);
        areaMap.put("136", 1);
        areaMap.put("137", 2);
        areaMap.put("138", 3);
        areaMap.put("139", 4);
    }

    @Override
    public int getPartition(KEY key, VALUE value, int numPartitions) {
        // Take the phone number from the key and look up its 3-digit prefix in
        // the dictionary; each known prefix gets its own partition number, and
        // any prefix not in the map falls into partition 5.
        int areaCoder = areaMap.get(key.toString().substring(0, 3)) == null ? 5
                : areaMap.get(key.toString().substring(0, 3));
        return areaCoder;
    }
}
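As a quick sanity check (a hypothetical snippet, not part of the original post), the partitioner can be exercised directly; getPartition ignores the value, so null is passed for it:

package cn.nanda.area;

import org.apache.hadoop.io.Text;

// Hypothetical check of the partition routing; not from the original post.
public class AreaPartitionerCheck {
    public static void main(String[] args) {
        AreaPartitioner<Text, Object> p = new AreaPartitioner<Text, Object>();
        System.out.println(p.getPartition(new Text("13502468823"), null, 6)); // 0 ("135")
        System.out.println(p.getPartition(new Text("13726230503"), null, 6)); // 2 ("137")
        System.out.println(p.getPartition(new Text("84138413"), null, 6));    // 5 ("841" not in areaMap)
    }
}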
3. Processing with MapReduce:
package cn.nanda.area;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import cn.nanda.wordCount.FlowBean;

/**
 * Aggregates the raw flow log and writes the per-user totals of each
 * province (phone-number prefix group) to a separate file. Two mechanisms
 * have to be customized:
 * 1. the partitioning logic, via a custom Partitioner;
 * 2. the number of concurrent reducer tasks.
 *
 * @author kun
 */
public class FlowSumArea {

    public static class FlowSumAreaMapper extends
            Mapper<LongWritable, Text, Text, FlowBean> {

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Take one line of input and split it on tabs. StringUtils.split
            // collapses adjacent separators, so empty URL/category columns
            // are dropped and the numeric columns shift left for records
            // that lack them.
            String line = value.toString();
            String[] fields = StringUtils.split(line, "\t");
            // Pick out the fields we need.
            String phoneNB = fields[1];
            long up_flow = Long.parseLong(fields[7]);
            long d_flow = Long.parseLong(fields[8]);
            // Wrap them in a k-v pair and emit.
            context.write(new Text(phoneNB), new FlowBean(phoneNB, up_flow, d_flow));
        }
    }

    public static class FlowSumAreaReducer extends
            Reducer<Text, FlowBean, Text, FlowBean> {

        @Override
        protected void reduce(Text key, Iterable<FlowBean> values, Context context)
                throws IOException, InterruptedException {
            // Sum the upload and download traffic of all records for this phone.
            long up_flow_count = 0;
            long d_flow_count = 0;
            for (FlowBean bean : values) {
                up_flow_count += bean.getUp_flow();
                d_flow_count += bean.getD_flow();
            }
            context.write(key, new FlowBean(key.toString(), up_flow_count, d_flow_count));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(FlowSumArea.class);
        job.setMapperClass(FlowSumAreaMapper.class);
        job.setReducerClass(FlowSumAreaReducer.class);

        // Plug in our custom partitioning logic.
        job.setPartitionerClass(AreaPartitioner.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        // The number of reduce tasks should match the number of partitions:
        // five known prefixes plus the catch-all group, i.e. 6.
        job.setNumReduceTasks(6);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
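The FlowBean imported from cn.nanda.wordCount is not shown in the post. A minimal sketch that satisfies everything the job uses (the three-argument constructor, getUp_flow()/getD_flow(), and a tab-separated toString() matching the output files in section 4) could look like this; the no-argument constructor required by Hadoop serialization and the exact field layout are assumptions:

package cn.nanda.wordCount;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

// Minimal sketch of the FlowBean used above; not the author's original class.
public class FlowBean implements Writable {

    private String phoneNB;
    private long up_flow;
    private long d_flow;
    private long s_flow; // total = upload + download

    // Hadoop's serialization needs a no-argument constructor.
    public FlowBean() {
    }

    public FlowBean(String phoneNB, long up_flow, long d_flow) {
        this.phoneNB = phoneNB;
        this.up_flow = up_flow;
        this.d_flow = d_flow;
        this.s_flow = up_flow + d_flow;
    }

    public long getUp_flow() {
        return up_flow;
    }

    public long getD_flow() {
        return d_flow;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(phoneNB);
        out.writeLong(up_flow);
        out.writeLong(d_flow);
        out.writeLong(s_flow);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        phoneNB = in.readUTF();
        up_flow = in.readLong();
        d_flow = in.readLong();
        s_flow = in.readLong();
    }

    // Produces the "up  down  total" columns seen in the part-r-* files.
    @Override
    public String toString() {
        return up_flow + "\t" + d_flow + "\t" + s_flow;
    }
}

Packaged into a jar together with AreaPartitioner, the job would then be submitted in the usual way, e.g. hadoop jar flowsum.jar cn.nanda.area.FlowSumArea <input> <output> (jar name and paths are placeholders).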
4. Results after partitioning and aggregation:


(Six partitions were produced; each output file stores the aggregated results of one custom group.)

part-r-00000:

13502468823    102    7335    7437
13560436666    954    200     1154
13560439658    5892   400     6292

part-r-00001:

13602846565    12     1938    1950
13660577991    9      6960    6969

part-r-00002:

13719199419    0      200     200
13726230503    2481   24681   27162
13726238888    2481   24681   27162
13760778710    120    200     320

part-r-00003:

13826544101    0      200     200

part-r-00004:

13922314466    3008   3720    6728
13925057413    63     11058   11121
13926251106    0      200     200
13926435656    1512   200     1712

part-r-00005:

13480253104    180    200     380
15013685858    27     3659    3686
15920133257    20     3156    3176
15989002119    3      1938    1941
18211575961    12     1527    1539
18320173382    18     9531    9549
84138413       4116   1432    5548
