mapreduce pv
来源:互联网 发布:Java开发dwg批量转jpg 编辑:程序博客网 时间:2024/06/05 06:01
分析网站基本指标PV
网站基本指标
1. PV(Page View),网站浏览量,指页面的浏览量或点击次数,用户每次刷新即被计算一次。如果用户刷新了页面100次,那么,流量统计工具就会显示100个“PV”量。这就是“PV”在流量统计中具体的一个定义了;
2. UV(Unique Visitor),独立访客数,指1天内访问某站点的人数,以cookie或者用户唯一ID为依据。1天内同一访客的多次访问只计为1个访客;
3. VV(Visit View),访客的访问次数,用以记录所有访客1天内访问了多少次您的网站。当访客完成所有浏览并最终关掉该网站的所有页面时便完成了一次访问,同一访客1天内可能有多次访问行为,次数累加。
4. IP(独立IP),指1天内使用不同IP地址的用户访问网站的数量,同一IP无论访问了几个页面,独立IP数均为1。
PV分析和编写程序
PV指标指的是网站的浏览量,通过查看日志的条数就可以知道某一时间段内的PV指标。下面我们统计各个省份的某一时间段内的PV指标。
1. 通过查看字段说明文档,可以查看到省份字段
2. 把省份的ID作为MAP任务输出结果的KEY,VALUE为计数1
3. 目录的创建
bin/hdfs dfs -mkdir -p /user/liangman/mapreduce/webpv/input
4. 文件上传
bin/hdfs dfs -put /home/liangman/2015082818 /user/liangman/mapreduce/webpv/input
1. 程序代码
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import com.ibeifeng.bigdata.hadoop.hdfs.LiangmanWordCount.WordCountCombiner;
/**
*
*@author liangman
*
*/
public class WebPVMapReduce extends Configuredimplements Tool {
// 1.map
public static class PVMap extends
Mapper<LongWritable, Text, IntWritable,IntWritable> {
private final static IntWritable mapOutputValue = new IntWritable(1);
//private IntWritable outputKey = new IntWritable();
private IntWritable mapOutputKey = new IntWritable();
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException{
String linevalue = value.toString();
// StringTokenizer tokenizer = new StringTokenizer(linevalue,"\t");
// int counttoken = tokenizer.countTokens();
// String[] tokenarray = new String[counttoken];
// int i = 0;
// while(tokenizer.hasMoreTokens()){
// tokenarray[i++] = tokenizer.nextToken();
// }
String[] tokenarray = linevalue.split("\t");
//if length less than 30 return
if(tokenarray.length < 30){
//counter
context.getCounter("WEB_PV_MAP_COUNTERS","LENGTH_LT_30_COUNTER").increment(1L);;
return;
}
//if url is null return
if(StringUtils.isBlank(tokenarray[1])){
context.getCounter("WEB_PV_MAP_COUNTERS","NULL_OF_URI").increment(1l);
return ;
}
//provinceid
String provinceID = tokenarray[23];
//if blank return
if(StringUtils.isBlank(provinceID)){
context.getCounter("WEB_PV_MAP_COUNTERS","NULL_OF_PROVICEID").increment(1l);
return ;
}
Integer pID = Integer.MAX_VALUE;
//parse error return
try{
pID = Integer.parseInt(provinceID);
}catch(Exception e){
context.getCounter("WEB_PV_MAP_COUNTERS","PARESEINT_ERROR").increment(1l);
return;
}
//map output
// map output key
mapOutputKey.set(pID);
//
context.write(mapOutputKey, mapOutputValue);
}
@Override
protected void cleanup(Context context) throws IOException,
InterruptedException {
// TODO Auto-generated method stub
super.cleanup(context);
}
@Override
protected void setup(Context context) throws IOException,
InterruptedException {
// TODO Auto-generated method stub
super.setup(context);
}
}
// TODO
public static class PVCombiner extends
Reducer<IntWritable, IntWritable,IntWritable, IntWritable> {
IntWritable outputvalue = new IntWritable();
@Override
public void reduce(IntWritable key, Iterable<IntWritable> values,
Context context) throws IOException,InterruptedException {
int sum = 0;
for (IntWritable intWritable : values){
sum = sum + intWritable.get();
}
outputvalue.set(sum);
context.write(key, outputvalue);
}
@Override
protected void cleanup(
org.apache.hadoop.mapreduce.Reducer.Contextcontext)
throws IOException, InterruptedException{
// TODO Auto-generated method stub
super.cleanup(context);
}
@Override
protected void setup(org.apache.hadoop.mapreduce.Reducer.Context context)
throws IOException, InterruptedException{
// TODO Auto-generated method stub
super.setup(context);
}
}
// 2.reducer
public static class PVReducer extends
Reducer<IntWritable, IntWritable,IntWritable, IntWritable> {
IntWritable outputvalue = new IntWritable();
@Override
public void reduce(IntWritable key, Iterable<IntWritable> values,
Context context) throws IOException,InterruptedException {
int sum = 0;
for (IntWritable intWritable : values){
sum = sum + intWritable.get();
}
outputvalue.set(sum);
context.write(key, outputvalue);
}
@Override
protected void cleanup(
org.apache.hadoop.mapreduce.Reducer.Contextcontext)
throws IOException, InterruptedException{
// TODO Auto-generated method stub
super.cleanup(context);
}
@Override
protected void setup(org.apache.hadoop.mapreduce.Reducer.Context context)
throws IOException, InterruptedException{
// TODO Auto-generated method stub
super.setup(context);
}
}
// 3.driver
public int run(String[] args) throws Exception {
//
Configuration configuration = this.getConf();
// one application is a job
Job job = Job.getInstance(configuration, this.getClass()
.getSimpleName());
// run a jar
job.setJarByClass(WebPVMapReduce.class);
// the input address
Path inpath = new Path(args[0]);
// add an input to the job
FileInputFormat.addInputPath(job, inpath);
// the output address
Path outpath = new Path(args[1]);
FileOutputFormat.setOutputPath(job, outpath);
// mapper
// TODO
job.setMapperClass(PVMap.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class);
// ###################################shuffle
// 1.fen qu
// job.setPartitionerClass(cls);
// 2.sort
// job.setSortComparatorClass(cls);
// 3.fen zu group
// job.setGroupingComparatorClass(cls);
job.setCombinerClass(PVCombiner.class);
// reducer
// TODO
job.setReducerClass(PVReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(IntWritable.class);
// TODO
// job.setNumReduceTasks(tasks);
// submit the job
boolean isSucess = job.waitForCompletion(true);
return isSucess ? 0 : 1;
}
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
// set the compress
//TODO
// configuration.set("mapreduce.map.output.compress", "true");
// configuration.set("mapreduce.map.output.compress.codec",
// "org.apache.hadoop.io.compress.SnappyCodec");
args=new String[]{
"hdfs://hadoop02-linux.alibaba.com:8020/user/liangman/mapreduce/webpv/input",
"hdfs://hadoop02-linux.alibaba.com:8020/user/liangman/mapreduce/webpv/output"
};
// run the job
int status = ToolRunner.run(configuration, new WebPVMapReduce(), args);
// exit
System.exit(status);
}
}
1. 运行结果
自定义程序计数器
● 自定义程序计数器(组名,计数器名)
context.getCounter("WEB_PV_MAP_COUNTERS","LENGTH_LT_30_COUNTER").increment(1L);;
● 自定义程序计数器输出结果
可以看到数据字段不全,不能很好地进行分析,要和相关部门进行沟通,清洗数据。
- mapreduce pv
- mapreduce 统计PV UV
- MapReduce功能实现九---Pv、Uv
- 【pv】pv信号量
- MapReduce
- MapReduce
- MapReduce
- MapReduce
- mapreduce
- MapReduce
- MapReduce
- MapReduce
- MapReduce
- mapreduce
- MapReduce
- MapReduce
- mapreduce
- mapreduce
- ava NIO使用及原理分析(三)
- ubuntu创建a sudo user
- ubuntu 16.04 安装Chrome
- 仿微信头像剪切
- 队列的链表存储实现
- mapreduce pv
- Java NIO使用及原理分析 (四)
- 1006
- Idea 快捷键
- MySQL更改数据库数据存储目录
- zoj1141
- oracle导入日期提示ora-01843
- log4net使用详解
- Spring MVC 知识汇总