mapreduce pv
来源:互联网 发布:Java开发dwg批量转jpg 编辑:程序博客网 时间:2024/06/05 06:01
分析网站基本指标PV
网站基本指标
1. PV(Page View),网站浏览量,指页面的浏览量或点击次数,用户每次刷新即被计算一次。如果用户刷新了页面100次,那么,流量统计工具就会显示100个“PV”量。这就是“PV”在流量统计中具体的一个定义了;
2. UV(Unique Visitor),独立访客数,指1天内访问某站点的人数,以cookie或者用户唯一ID为依据。1天内同一访客的多次访问只计为1个访客;
3. VV(Visit View),访客的访问次数,用以记录所有访客1天内访问了多少次您的网站。当访客完成所有浏览并最终关掉该网站的所有页面时便完成了一次访问,同一访客1天内可能有多次访问行为,次数累加。
4. IP(独立IP),指1天内使用不同IP地址的用户访问网站的数量,同一IP无论访问了几个页面,独立IP数均为1。
PV分析和编写程序
PV指标指的是网站的浏览量,通过查看日志的条数就可以知道某一时间段内的PV指标。下面我们统计各个省份的某一时间段内的PV指标。
1. 通过查看字段说明文档,可以查看到省份字段
2. 把省份的ID作为MAP任务输出结果的KEY,VALUE为计数1
3. 目录的创建
bin/hdfs dfs -mkdir -p /user/liangman/mapreduce/webpv/input
4. 文件上传
bin/hdfs dfs -put /home/liangman/2015082818 /user/liangman/mapreduce/webpv/input
1. 程序代码
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import com.ibeifeng.bigdata.hadoop.hdfs.LiangmanWordCount.WordCountCombiner;
/**
*
*@author liangman
*
*/
public class WebPVMapReduce extends Configuredimplements Tool {
// 1.map
public static class PVMap extends
Mapper<LongWritable, Text, IntWritable,IntWritable> {
private final static IntWritable mapOutputValue = new IntWritable(1);
//private IntWritable outputKey = new IntWritable();
private IntWritable mapOutputKey = new IntWritable();
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException{
String linevalue = value.toString();
// StringTokenizer tokenizer = new StringTokenizer(linevalue,"\t");
// int counttoken = tokenizer.countTokens();
// String[] tokenarray = new String[counttoken];
// int i = 0;
// while(tokenizer.hasMoreTokens()){
// tokenarray[i++] = tokenizer.nextToken();
// }
String[] tokenarray = linevalue.split("\t");
//if length less than 30 return
if(tokenarray.length < 30){
//counter
context.getCounter("WEB_PV_MAP_COUNTERS","LENGTH_LT_30_COUNTER").increment(1L);;
return;
}
//if url is null return
if(StringUtils.isBlank(tokenarray[1])){
context.getCounter("WEB_PV_MAP_COUNTERS","NULL_OF_URI").increment(1l);
return ;
}
//provinceid
String provinceID = tokenarray[23];
//if blank return
if(StringUtils.isBlank(provinceID)){
context.getCounter("WEB_PV_MAP_COUNTERS","NULL_OF_PROVICEID").increment(1l);
return ;
}
Integer pID = Integer.MAX_VALUE;
//parse error return
try{
pID = Integer.parseInt(provinceID);
}catch(Exception e){
context.getCounter("WEB_PV_MAP_COUNTERS","PARESEINT_ERROR").increment(1l);
return;
}
//map output
// map output key
mapOutputKey.set(pID);
//
context.write(mapOutputKey, mapOutputValue);
}
@Override
protected void cleanup(Context context) throws IOException,
InterruptedException {
// TODO Auto-generated method stub
super.cleanup(context);
}
@Override
protected void setup(Context context) throws IOException,
InterruptedException {
// TODO Auto-generated method stub
super.setup(context);
}
}
// TODO
public static class PVCombiner extends
Reducer<IntWritable, IntWritable,IntWritable, IntWritable> {
IntWritable outputvalue = new IntWritable();
@Override
public void reduce(IntWritable key, Iterable<IntWritable> values,
Context context) throws IOException,InterruptedException {
int sum = 0;
for (IntWritable intWritable : values){
sum = sum + intWritable.get();
}
outputvalue.set(sum);
context.write(key, outputvalue);
}
@Override
protected void cleanup(
org.apache.hadoop.mapreduce.Reducer.Contextcontext)
throws IOException, InterruptedException{
// TODO Auto-generated method stub
super.cleanup(context);
}
@Override
protected void setup(org.apache.hadoop.mapreduce.Reducer.Context context)
throws IOException, InterruptedException{
// TODO Auto-generated method stub
super.setup(context);
}
}
// 2.reducer
public static class PVReducer extends
Reducer<IntWritable, IntWritable,IntWritable, IntWritable> {
IntWritable outputvalue = new IntWritable();
@Override
public void reduce(IntWritable key, Iterable<IntWritable> values,
Context context) throws IOException,InterruptedException {
int sum = 0;
for (IntWritable intWritable : values){
sum = sum + intWritable.get();
}
outputvalue.set(sum);
context.write(key, outputvalue);
}
@Override
protected void cleanup(
org.apache.hadoop.mapreduce.Reducer.Contextcontext)
throws IOException, InterruptedException{
// TODO Auto-generated method stub
super.cleanup(context);
}
@Override
protected void setup(org.apache.hadoop.mapreduce.Reducer.Context context)
throws IOException, InterruptedException{
// TODO Auto-generated method stub
super.setup(context);
}
}
// 3.driver
public int run(String[] args) throws Exception {
//
Configuration configuration = this.getConf();
// one application is a job
Job job = Job.getInstance(configuration, this.getClass()
.getSimpleName());
// run a jar
job.setJarByClass(WebPVMapReduce.class);
// the input address
Path inpath = new Path(args[0]);
// add an input to the job
FileInputFormat.addInputPath(job, inpath);
// the output address
Path outpath = new Path(args[1]);
FileOutputFormat.setOutputPath(job, outpath);
// mapper
// TODO
job.setMapperClass(PVMap.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class);
// ###################################shuffle
// 1.fen qu
// job.setPartitionerClass(cls);
// 2.sort
// job.setSortComparatorClass(cls);
// 3.fen zu group
// job.setGroupingComparatorClass(cls);
job.setCombinerClass(PVCombiner.class);
// reducer
// TODO
job.setReducerClass(PVReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(IntWritable.class);
// TODO
// job.setNumReduceTasks(tasks);
// submit the job
boolean isSucess = job.waitForCompletion(true);
return isSucess ? 0 : 1;
}
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
// set the compress
//TODO
// configuration.set("mapreduce.map.output.compress", "true");
// configuration.set("mapreduce.map.output.compress.codec",
// "org.apache.hadoop.io.compress.SnappyCodec");
args=new String[]{
"hdfs://hadoop02-linux.alibaba.com:8020/user/liangman/mapreduce/webpv/input",
"hdfs://hadoop02-linux.alibaba.com:8020/user/liangman/mapreduce/webpv/output"
};
// run the job
int status = ToolRunner.run(configuration, new WebPVMapReduce(), args);
// exit
System.exit(status);
}
}
1. 运行结果
自定义程序计数器
● 自定义程序计数器(组名,计数器名)
context.getCounter("WEB_PV_MAP_COUNTERS","LENGTH_LT_30_COUNTER").increment(1L);;
● 自定义程序计数器输出结果
可以看到数据字段不全,不能很好地进行分析,要和相关部门进行沟通,清洗数据。
- mapreduce pv
- mapreduce 统计PV UV
- MapReduce功能实现九---Pv、Uv
- 【pv】pv信号量
- MapReduce
- MapReduce
- MapReduce
- MapReduce
- mapreduce
- MapReduce
- MapReduce
- MapReduce
- MapReduce
- mapreduce
- MapReduce
- MapReduce
- mapreduce
- mapreduce
- ava NIO使用及原理分析(三)
- ubuntu创建a sudo user
- ubuntu 16.04 安装Chrome
- 仿微信头像剪切
- 队列的链表存储实现
- mapreduce pv
- Java NIO使用及原理分析 (四)
- 1006
- Idea 快捷键
- MySQL更改数据库数据存储目录
- zoj1141
- oracle导入日期提示ora-01843
- log4net使用详解
- Spring MVC 知识汇总