Counting PV and UV with MapReduce


On the web, most sites need to report PV and UV figures. Roughly speaking, PV (page views) is the number of times a URL was visited, and UV (unique visitors) is the number of distinct IPs that visited it. So here is the problem: given a text log whose records have the format date,url,ip, use MapReduce to compute each URL's daily PV and UV.

Sample data:

20150405,url1,ip1
20150405,url2,ip1
20150405,url1,ip2
20150405,url1,ip3
20150405,url1,ip4
20150405,url1,ip5
20150405,url2,ip9
20150406,url1,ip1
20150406,url1,ip2
20150406,url1,ip1

Analysis: the statistics take two jobs. PV is easy: emit date:url as the key with a count of 1 and sum. For UV, the first job also emits date:url:ip as a key with value 1, which collapses repeat visits from the same IP into a single intermediate line; a second job then splits those lines back apart and counts one per distinct IP (see the intermediate output sketched below).
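Running the first job over the sample data should produce lines like the following (derived by hand from the mapper and reducer logic below). The date:url lines already carry the final PV counts, while each date:url:ip line represents one distinct visitor, no matter how many times that IP appears in the log:

20150405:url1:5
20150405:url1:ip1:1
20150405:url1:ip2:1
20150405:url1:ip3:1
20150405:url1:ip4:1
20150405:url1:ip5:1
20150405:url2:2
20150405:url2:ip1:1
20150405:url2:ip9:1
20150406:url1:3
20150406:url1:ip1:2
20150406:url1:ip2:1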

package com.demo.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;

/**
 * Created by leslie on 17/6/24.
 */
public class MapreduceForPVUV {

    // Job 1 mapper: each input record "date,url,ip" yields two keys,
    // "date:url" (feeds the PV count) and "date:url:ip" (feeds the UV count).
    static class myMapperPUV extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final IntWritable one = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split(",");
            context.write(new Text(fields[0] + ":" + fields[1]), one);
            context.write(new Text(fields[0] + ":" + fields[1] + ":" + fields[2]), one);
        }
    }

    // Job 1 reducer: sum the 1s per key and emit "key:sum" as a plain text line.
    static class myReducerPUV extends Reducer<Text, IntWritable, Text, NullWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            context.write(new Text(key.toString() + ":" + sum), NullWritable.get());
        }
    }

    // Job 2 mapper: split job 1's output on ":".
    // Four fields ("date:url:ip:count") mark one distinct visitor -> emit ("date:url:uv", 1).
    // Three fields ("date:url:pv") carry the finished PV total -> pass the count through.
    static class myMapperPV extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final IntWritable one = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split(":");
            if (fields.length == 4) {
                context.write(new Text(fields[0] + ":" + fields[1] + ":uv"), one);
            } else if (fields.length == 3) {
                context.write(new Text(fields[0] + ":" + fields[1]),
                        new IntWritable(Integer.parseInt(fields[2])));
            }
        }
    }

    // Job 2 reducer: the same summation; produces the final
    // "date:url:pv" and "date:url:uv:count" lines.
    static class myReducerPV extends Reducer<Text, IntWritable, Text, NullWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            context.write(new Text(key.toString() + ":" + sum), NullWritable.get());
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();

        // Job 1: raw log (args[0]) -> intermediate directory (args[1]).
        // Remove the output directory first if a previous run left it behind.
        Job job = Job.getInstance(conf, "mapreduce");
        Path midPath = new Path(args[1]);
        FileSystem fs = midPath.getFileSystem(conf);
        if (fs.exists(midPath)) {
            fs.delete(midPath, true);
        }
        job.setJarByClass(MapreduceForPVUV.class);
        job.setMapperClass(myMapperPUV.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setReducerClass(myReducerPUV.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, midPath);

        // Job 2 runs only if job 1 succeeded: intermediate (args[1]) -> final output (args[2]).
        if (job.waitForCompletion(true)) {
            Job jobN = Job.getInstance(conf, "mapreduceN");
            Path outPath = new Path(args[2]);
            FileSystem outFs = outPath.getFileSystem(conf);
            if (outFs.exists(outPath)) {
                outFs.delete(outPath, true);
            }
            jobN.setJarByClass(MapreduceForPVUV.class);
            jobN.setMapperClass(myMapperPV.class);
            jobN.setMapOutputKeyClass(Text.class);
            jobN.setMapOutputValueClass(IntWritable.class);
            jobN.setReducerClass(myReducerPV.class);
            jobN.setOutputKeyClass(Text.class);
            jobN.setOutputValueClass(NullWritable.class);
            jobN.setInputFormatClass(TextInputFormat.class);
            jobN.setOutputFormatClass(TextOutputFormat.class);
            FileInputFormat.addInputPath(jobN, new Path(args[1]));
            FileOutputFormat.setOutputPath(jobN, outPath);
            System.exit(jobN.waitForCompletion(true) ? 0 : 1);
        }
    }
}
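To run the two-job chain, package the class into a jar and pass three paths: the raw log, the intermediate directory for job 1, and the final output directory. A hypothetical invocation (the jar name and HDFS paths below are made up for illustration):

hadoop jar pvuv-stats.jar com.demo.mapreduce.MapreduceForPVUV /logs/raw /logs/pvuv-mid /logs/pvuv-out

Note that neither reducer can double as a combiner as written, since both emit NullWritable values rather than IntWritable counts; if the logs get large, a separate IntWritable-summing combiner on job 1 would cut shuffle traffic.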

Output:

20150405:url1:5
20150405:url1:uv:5
20150405:url2:2
20150405:url2:uv:2
20150406:url1:3
20150406:url1:uv:2
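As a sanity check against the input: on 20150406, url1 was requested three times (PV 3) but by only two distinct IPs, ip1 and ip2, so its UV is 2.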