MapReduce流量统计

来源:互联网 发布:投稿可以造假数据吗 编辑:程序博客网 时间:2024/05/21 06:31

1、对流量日志中的用户统计总上、下行流量

技术点:自定义javaBean用来在mapreduce中充当value

注意: javaBean要实现Writable接口,实现write/readFields两个方法(若还需按其排序,则实现WritableComparable接口并额外实现compareTo方法)

package com.mr.flowsum;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Custom JavaBean used as a MapReduce key/value: carries a phone number's
 * upstream, downstream and total traffic counters.
 *
 * <p>Implements {@link WritableComparable} so it can be serialized between
 * map and reduce phases AND used as a sort key (descending by total flow).
 */
public class FlowBean implements WritableComparable<FlowBean> {

    private long upFlow;  // upstream traffic
    private long dFlow;   // downstream traffic
    private long sumFlow; // total = upFlow + dFlow

    /**
     * Deserialization instantiates the bean via reflection, which requires an
     * explicit no-arg constructor once another constructor is declared.
     */
    public FlowBean() {
    }

    public FlowBean(long upFlow, long dFlow) {
        set(upFlow, dFlow); // delegate instead of duplicating the assignment logic
    }

    /** Resets all counters; allows object reuse across map()/reduce() calls. */
    public void set(long upFlow, long dFlow) {
        this.upFlow = upFlow;
        this.dFlow = dFlow;
        this.sumFlow = upFlow + dFlow;
    }

    /** Serialization: write fields in a fixed order. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);
        out.writeLong(dFlow);
        out.writeLong(sumFlow);
    }

    /**
     * Deserialization: MUST read fields in exactly the same order they were
     * written by {@link #write(DataOutput)}.
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        upFlow = in.readLong();
        dFlow = in.readLong();
        sumFlow = in.readLong();
    }

    public long getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(long upFlow) {
        this.upFlow = upFlow;
    }

    public long getdFlow() {
        return dFlow;
    }

    public void setdFlow(long dFlow) {
        this.dFlow = dFlow;
    }

    public long getSumFlow() {
        return sumFlow;
    }

    public void setSumFlow(long sumFlow) {
        this.sumFlow = sumFlow;
    }

    @Override
    public String toString() {
        return upFlow + "\t" + dFlow + "\t" + sumFlow;
    }

    /**
     * Orders beans by total flow, descending (largest first).
     *
     * <p>BUG FIX: the original returned only -1 or 1 and never 0 — even for
     * equal totals — violating the compareTo contract (antisymmetry:
     * sgn(x.compareTo(y)) must equal -sgn(y.compareTo(x))). An inconsistent
     * comparator can make sorts fail with "Comparison method violates its
     * general contract!". {@link Long#compare} with reversed arguments gives
     * the same descending order while honoring the contract.
     */
    @Override
    public int compareTo(FlowBean o) {
        return Long.compare(o.sumFlow, this.sumFlow);
    }
}
package com.mr.flowsum;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Job 1: per phone number, sums upstream flow, downstream flow and total flow
 * from raw flow-log lines.
 */
public class FlowCount {

    /**
     * Mapper: parses one raw log line and emits (phone number, FlowBean).
     * Example input line:
     * 1363157985066 1372623050300-FD-07-A4-72-B8:CMCC120.196.100.82i02.c.aliimg.com2427248124681200
     */
    public static class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {

        // Reuse output objects across map() calls instead of allocating a new
        // Text/FlowBean per record (same pattern as FlowCountSortMapper).
        private final Text outKey = new Text();
        private final FlowBean outValue = new FlowBean();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // One raw log line.
            String line = value.toString();
            // Fields are tab-separated.
            String[] fields = line.split("\t");
            // Phone number is the second field.
            String phoneNbr = fields[1];
            // Up/down flow are indexed from the END of the record because the
            // middle columns (URL, category, ...) vary per record.
            long upFlow = Long.parseLong(fields[fields.length - 3]);
            long dFlow = Long.parseLong(fields[fields.length - 2]);
            outKey.set(phoneNbr);
            outValue.set(upFlow, dFlow);
            context.write(outKey, outValue);
        }
    }

    /**
     * Reducer: receives all beans of one phone number, e.g.
     * <183323,bean1><183323,bean2><183323,bean3>..., and accumulates them.
     */
    public static class FlowCountReduce extends Reducer<Text, FlowBean, Text, FlowBean> {

        // Reused result object — safe because the framework serializes it
        // immediately on context.write().
        private final FlowBean result = new FlowBean();

        @Override
        protected void reduce(Text key, Iterable<FlowBean> values, Context context)
                throws IOException, InterruptedException {
            long sumUpFlow = 0;
            long sumDFlow = 0;
            // Accumulate upstream and downstream flow over all beans of this phone.
            for (FlowBean bean : values) {
                sumUpFlow += bean.getUpFlow();
                sumDFlow += bean.getdFlow();
            }
            result.set(sumUpFlow, sumDFlow);
            context.write(key, result);
        }
    }

    /**
     * Driver: wires mapper/reducer, I/O types and paths, then submits the job.
     *
     * @param args args[0] = input directory, args[1] = output directory
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Locate the jar containing this driver class.
        job.setJarByClass(FlowCount.class);
        // Mapper / Reducer classes for this job.
        job.setMapperClass(FlowCountMapper.class);
        job.setReducerClass(FlowCountReduce.class);
        // Map output key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        // Final (reduce) output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);
        // Input directory with the raw flow logs.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Output directory (must not exist yet).
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submit and wait for completion.
        boolean res = job.waitForCompletion(true);
        // BUG FIX: the original printed the exit code with System.out.println,
        // so the process always exited 0; callers (scripts, schedulers) could
        // not detect failure. Exit with the proper status instead.
        System.exit(res ? 0 : 1);
    }

    /**
     * mvn clean package -DskipTests
     * hadoop jar hadoop-train-1.0.jar com.mr.flowsum.FlowCount /flowsum/input/flow.log /flowsum/output
     */
}
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

flow.log数据

1363157985066 1372623050300-FD-07-A4-72-B8:CMCC 120.196.100.82i02.c.aliimg.com 2427 248124681 200
1363157995052 138265441015C-0E-8B-C7-F1-E0:CMCC 120.197.40.44 0 264 0 200
1363157991076 1392643565620-10-7A-28-CC-0A:CMCC 120.196.100.992 4 132 1512 200
1363154400022 139262511065C-0E-8B-8B-B1-50:CMCC 120.197.40.44 0 240 0 200
1363157993044 1821157596194-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99iface.qiyi.com 视频网站15 12 1527 2106 200
1363157995074 841384135C-0E-8B-8C-E8-20:7DaysInn 120.197.40.4122.72.52.12 2016 41161432 200
1363157993055 13560439658C4-17-FE-BA-DE-D9:CMCC 120.196.100.9918 15 1116 954 200
1363157995033 159201332575C-0E-8B-C7-BA-20:CMCC 120.197.40.4sug.so.360.cn 信息安全20 20 3156 2936 200
1363157983019 1371919941968-A1-B7-03-07-B1:CMCC-EASY 120.196.100.824 0 240 0 200
1363157984041 136605779915C-0E-8B-92-5C-20:CMCC-EASY 120.197.40.4s19.cnzz.com 站点统计24 9 6960 690 200
1363157973098 150136858585C-0E-8B-C7-F7-90:CMCC 120.197.40.4rank.ie.sogou.com 搜索引擎28 27 3659 3538 200
1363157986029 15989002119E8-99-C4-4E-93-E0:CMCC-EASY 120.196.100.99www.umeng.com 站点统计3 3 1938 180 200
1363157992093 13560439658C4-17-FE-BA-DE-D9:CMCC 120.196.100.9915 9 918 4938 200
1363157986041 134802531045C-0E-8B-C7-FC-80:CMCC-EASY 120.197.40.43 3 180 180 200
1363157984040 136028465655C-0E-8B-8B-B6-00:CMCC 120.197.40.42052.flash2-http.qq.com 综合门户15 12 1938 2910 200
1363157995093 1392231446600-FD-07-A2-EC-BA:CMCC 120.196.100.82img.qfc.cn 1212 30083720 200
1363157982040 135024688235C-0A-5B-6A-0B-D4:CMCC-EASY 120.196.100.99y0.ifengimg.com 综合门户57 102 7335 110349 200
1363157986072 1832017338284-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99input.shouji.sogou.com 搜索引擎21 18 9531 2412 200
1363157990043 1392505741300-1F-64-E1-E6-9A:CMCC 120.196.100.55t3.baidu.com 搜索引擎69 63 11058 48243 200
1363157988072 1376077871000-FD-07-A4-7B-08:CMCC 120.196.100.822 2 120 120 200
1363157985066 1372623888800-FD-07-A4-72-B8:CMCC 120.196.100.82i02.c.aliimg.com 2427 248124681 200
1363157993055 13560436666C4-17-FE-BA-DE-D9:CMCC 120.196.100.9918 15 1116 954 200

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

第一个job统计汇总后的输出数据

13480253104     180     180     360
13502468823     7335    110349  117684
13560436666     1116    954     2070
13560439658     2034    5892    7926
13602846565     1938    2910    4848
13660577991     6960    690     7650
13719199419     240     0       240
13726230503     2481    24681   27162
13726238888     2481    24681   27162
13760778710     120     120     240
13826544101     264     0       264
13922314466     3008    3720    6728
13925057413     11058   48243   59301
13926251106     240     0       240
13926435656     132     1512    1644
15013685858     3659    3538    7197
15920133257     3156    2936    6092
15989002119     1938    180     2118
18211575961     1527    2106    3633
18320173382     9531    2412    11943
84138413        4116    1432    5548

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

2、统计流量且按照流量大小倒序排序

技术点:这种需求,用一个MapReduce job不好实现,需要两个MapReduce job

第一个job负责流量统计,跟上题相同

第二个job读入第一个job的输出,然后做排序

要将flowBean作为map的key输出,这样mapreduce就会自动排序

     此时,flowBean要实现接口WritableComparable

     要实现其中的compareTo()方法,方法中,我们可以定义倒序比较的逻辑

package com.mr.flowsum;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Job 2: reads the aggregated output of {@link FlowCount} and re-emits it
 * sorted by total flow, descending. Sorting is achieved by using FlowBean
 * (whose compareTo orders by total flow) as the map output KEY, so the
 * shuffle phase sorts the records for free.
 */
public class FlowCountSort {

    /**
     * Mapper: parses one aggregated line, e.g.
     * "13502468823  7335  110349  117684", and emits (FlowBean, phone).
     */
    static class FlowCountSortMapper extends Mapper<LongWritable, Text, FlowBean, Text> {

        // Reused across map() calls to avoid per-record allocation.
        FlowBean bean = new FlowBean();
        Text v = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Input is the previous job's output: phone \t up \t down \t total.
            String line = value.toString();
            String[] fields = line.split("\t");
            String phoneNbr = fields[0];
            long upFlow = Long.parseLong(fields[1]);
            long dFlow = Long.parseLong(fields[2]);
            bean.set(upFlow, dFlow); // total is recomputed inside set()
            v.set(phoneNbr);
            context.write(bean, v);
        }
    }

    /**
     * Reducer: keys arrive sorted by FlowBean.compareTo; swap key and value
     * back so the output reads "phone \t up \t down \t total".
     */
    static class FlowCountSortReducer extends Reducer<FlowBean, Text, Text, FlowBean> {

        //<bean(),phoneNum>
        @Override
        protected void reduce(FlowBean bean, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // BUG FIX: the original wrote only values.iterator().next(), which
            // silently drops every phone number after the first whenever the
            // grouping comparator puts several records under one key (i.e.
            // equal totals). Emitting all values is identical for singleton
            // groups and lossless otherwise.
            for (Text phone : values) {
                context.write(phone, bean);
            }
        }
    }

    /**
     * Driver: wires mapper/reducer, I/O types and paths, then submits the job.
     *
     * @param args args[0] = input directory (job 1 output), args[1] = output directory
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Locate the jar containing this driver class.
        job.setJarByClass(FlowCountSort.class);
        // Mapper / Reducer classes for this job.
        job.setMapperClass(FlowCountSortMapper.class);
        job.setReducerClass(FlowCountSortReducer.class);
        // Map output key/value types: bean as key so the shuffle sorts by it.
        job.setMapOutputKeyClass(FlowBean.class);
        job.setMapOutputValueClass(Text.class);
        // Final (reduce) output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);
        // Input directory (output of FlowCount).
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Output directory (must not exist yet).
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submit and wait for completion.
        boolean res = job.waitForCompletion(true);
        // Exit with the job status so callers can detect failure
        // (println would always exit 0).
        System.exit(res ? 0 : 1);
    }
}

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

运行后数据:

13502468823     7335    110349  117684
13925057413     11058   48243   59301
13726238888     2481    24681   27162
13726230503     2481    24681   27162
18320173382     9531    2412    11943
13560439658     2034    5892    7926
13660577991     6960    690     7650
15013685858     3659    3538    7197
13922314466     3008    3720    6728
15920133257     3156    2936    6092
84138413        4116    1432    5548
13602846565     1938    2910    4848
18211575961     1527    2106    3633
15989002119     1938    180     2118
13560436666     1116    954     2070
13926435656     132     1512    1644
13480253104     180     180     360
13826544101     264     0       264
13926251106     240     0       240
13760778710     120     120     240
13719199419     240     0       240

原创粉丝点击