日志数据中的上下行流量信息汇总-----总流量倒序排序

来源:互联网 发布:人工智能高峰论坛 编辑:程序博客网 时间:2024/05/16 10:18

对日志数据中的上下行流量信息汇总,并输出按照总流量倒序排序的结果


数据如下:

13480253104 180180 0
13502468823 1103497335 0
13560436666 9541116 0
13560439658 58922034 0
13602846565 29101938 0
13660577991 6906960 0
13719199419 0 240 0
13726230503 246812481 0
13726238888 246812481 0
13760778710 120120 0
13826544101 0 264 0
13922314466 37203008 0
13925057413 4824311058 0
13926251106 0 240 0
13926435656 1512132 0
15013685858 35383659 0
15920133257 29363156 0
15989002119 1801938 0
18211575961 21061527 0
18320173382 24129531 0
84138413 1432 4116 0

基本思路:实现自定义的bean来封装流量信息,并将bean作为map输出的key来传输

 

MR程序在处理数据的过程中会对数据排序(map输出的kv对传输到reduce之前,会排序),排序的依据是map输出的key

所以,我们如果要实现自己需要的排序规则,则可以考虑将排序因素放到key中,让key实现接口:WritableComparable

然后重写key的compareTo()方法


定义一个bean

import java.io.DataInput;import java.io.DataOutput;import java.io.IOException;import org.apache.hadoop.io.WritableComparable;/** * 把流星信息封装成对象 *  * @author * */public class FlowBean implements WritableComparable<FlowBean> {private long upFlow;private long dFlow;private long sumFlow;public FlowBean() {super();}public FlowBean(long upFlow, long dFlow) {this.upFlow = upFlow;this.dFlow = dFlow;this.sumFlow = upFlow + dFlow;}public void set(long upFlow, long dFlow) {this.upFlow = upFlow;this.dFlow = dFlow;this.sumFlow = upFlow + dFlow;}public long getUpFlow() {return upFlow;}public void setUpFlow(long upFlow) {this.upFlow = upFlow;}public long getdFlow() {return dFlow;}public void setdFlow(long dFlow) {this.dFlow = dFlow;}public long getSumFlow() {return sumFlow;}public void setSumFlow(long sumFlow) {this.sumFlow = sumFlow;}// 序列化 :将字段信息写到输出流中@Overridepublic void write(DataOutput out) throws IOException {out.writeLong(upFlow);out.writeLong(dFlow);out.writeLong(sumFlow);}// 反序列化:从输出流中读取各个字段的信息// 注意:反序列化的顺序必须跟序列化的对象一致@Overridepublic void readFields(DataInput in) throws IOException {upFlow = in.readLong();dFlow = in.readLong();sumFlow = in.readLong();}// 重写toString()方法@Overridepublic String toString() {return upFlow + "\t" + dFlow + "\t" + sumFlow;}@Overridepublic int compareTo(FlowBean o) {return this.sumFlow > o.getSumFlow() ? -1 : 1;}}
定义map
import java.io.IOException;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Mapper;public class FlowCountMapper extends Mapper<LongWritable, Text, FlowBean, Text> {FlowBean bean = new FlowBean();Text v = new Text();@Overrideprotected void map(LongWritable key, Text values, Context context) throws IOException, InterruptedException {// 将一行的内容转化为StringString value = values.toString();// 切分字段String[] split = value.split("\t");// 取出手机号码String phoneNum = split[0];// 取出上行流量和下行流量long upFlow = Long.parseLong(split[1]);long dFlow = Long.parseLong(split[2]);// context.write(new FlowBean(upFlow, dFlow),new Text(phoneNum));bean.set(upFlow, dFlow);v.set(phoneNum);System.out.println(bean.toString() + v);context.write(bean, v);}}
定义reduce
import java.io.IOException;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Reducer;public class FlowCountReduce extends Reducer<FlowBean, Text, Text, FlowBean> {@Overrideprotected void reduce(FlowBean key, Iterable<Text> values, Context context)throws IOException, InterruptedException {System.out.println(key);context.write(values.iterator().next(), key);}}

定义一个FlowCount主类

import java.io.IOException;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapreduce.Job;import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;public class FlowCount {public static void main(String[] args) throws Exception {String inPath = "";String outPath = "";if (args.length == 2) {inPath = args[0];outPath = args[1];}Configuration conf = new Configuration();Job job = Job.getInstance(conf);// 指定jar包所在的本地路径job.setJarByClass(FlowCount.class);// 指定jar包使用的mapper和Reduce业务类job.setMapperClass(FlowCountMapper.class);job.setReducerClass(FlowCountReduce.class);// 指定mapper输出数据的kv类型job.setMapOutputKeyClass(FlowBean.class);job.setMapOutputValueClass(Text.class);// 指定最终的输出数据的kv类型job.setOutputKeyClass(Text.class);job.setOutputValueClass(FlowBean.class);// 指定job的输入原始文件所在的目录FileInputFormat.setInputPaths(job, new Path(inPath));FileOutputFormat.setOutputPath(job, new Path(outPath));boolean res = job.waitForCompletion(true);System.exit(res ? 0 : 1);}}

排序结果如下:



原创粉丝点击