使用hive和sqoop来实现统计24小时每个时段的PV和UV
来源:互联网 发布:卫星通信网络 编辑:程序博客网 时间:2024/05/21 09:15
【案例】使用hive和sqoop来实现网站基本指标,PV和UV
1、PV统计网页浏览总量
2、UV去重
-》【需求】统计24小时每个时段的PV和UV
-》建分区表,按天一级,按小时一级,多级分区
-》第一步分析需求
-》第二步获取时间字段,天,小时
-》对于时间格式进行数据清洗,比如:2015-08-28 18:10:00,从中获取日期和小时
-》获取需要有用的字段:id、url、guid、trackTime
-》第三步数据分析
-》使用select sql
-》第四步使用sqoop导出
-》预期结果
日期　小时　PV　UV
建库:
-- Dedicated database for the click-stream (track log) PV/UV analysis.
create database track_log2;
建表:源表
-- Raw click-stream source table: one row per tracking event, all fields
-- kept as string because the raw logs are tab-separated text.
-- NOTE(review): the original listing created `yhd_source2`, but every later
-- statement (load data / insert ... select ... from yhd_source) references
-- `yhd_source`; renamed here so the walkthrough runs end to end.
create table if not exists yhd_source(
id string,
url string,                -- page URL, counted later for PV
referer string,
keyword string,
type string,
guid string,               -- per-visitor id (visitor or member), deduplicated for UV
pageId string,
moduleId string,
linkId string,
attachedInfo string,
sessionId string,
trackerU string,
trackerType string,
ip string,
trackerSrc string,
cookie string,
orderCode string,
trackTime string,          -- event time, e.g. '2015-08-28 18:10:00'; day/hour extracted later
endUserId string,          -- member account id; empty for anonymous visitors
firstLink string,
sessionViewNo string,
productId string,
curMerchantId string,
provinceId string,
cityId string,
fee string,
edmActivity string,
edmEmail string,
edmJobId string,
ieVersion string,
platform string,
internalKeyword string,
resultSum string,
currentPage string,
linkPosition string,
buttonPosition string
)
row format delimited fields terminated by '\t'
stored as textfile;
(编辑技巧:按住 Shift+Alt 进行列式编辑,可快速批量输入字段定义)
加载数据:
-- Load the raw logs: one local file per hour bucket (2015-08-28, hours 18 and 19).
load data local inpath '/opt/datas/2015082818' into table yhd_source;
load data local inpath '/opt/datas/2015082819' into table yhd_source;
分区的方式:静态分区
-- Static-partitioned table: two-level partitioning by day then hour.
-- NOTE(review): `date` is a reserved word in newer Hive versions and would
-- need backtick quoting there — this listing targets an older Hive release.
create table yhd_part1(
id string,
url string,
guid string
)
partitioned by (date string,hour string)
row format delimited fields terminated by '\t';
加载数据,来源于source源表
-- Static partition load: the partition values are spelled out per statement.
-- NOTE(review): reads from yhd_qingxi, which is created further down the
-- article — the cleansing table must exist before these inserts run.
insert into table yhd_part1 partition (date='20150828',hour='18') select id,url,guid from yhd_qingxi where date='28' and hour='18';
insert into table yhd_part1 partition (date='20150828',hour='19') select id,url,guid from yhd_qingxi where date='28' and hour='19';
-- Quick probe: verify the partition was populated.
select id,date,hour from yhd_part1 where date='20150828' and hour='18';
建一张清洗表,将时间字段清洗,提取部分的时间字段出来
-- Cleansing ("qingxi") table: the useful subset of source columns plus the
-- day and hour extracted from trackTime.
create table yhd_qingxi(
id string,
url string,
guid string,
date string,
hour string
)
row format delimited fields terminated by '\t';
字段截取,天&小时
-- Extract day and hour from trackTime ('2015-08-28 18:10:00'): Hive substring
-- is 1-based, so (9,2) yields the day '28' and (12,2) yields the hour '18'.
insert into table yhd_qingxi select id,url,guid,substring(trackTime,9,2) date,substring(trackTime,12,2) hour from yhd_source;
分区的方式:动态分区
<property>
<name>hive.exec.dynamic.partition</name>
<value>true</value>
<description>Whether or not to allow dynamic partitions in DML/DDL.</description>
</property>
-》默认值是true,代表允许使用动态分区实现
<property>
<name>hive.exec.dynamic.partition.mode</name>
<value>strict</value>
<description>In strict mode, the user must specify at least one static partition in case the user accidentally overwrites all partitions.</description>
</property>
-》set hive.exec.dynamic.partition.mode=nonstrict; 使用非严格模式
建表:
-- Same layout as yhd_part1, but populated via dynamic partitioning
-- (requires hive.exec.dynamic.partition.mode=nonstrict, set above).
create table yhd_part2(
id string,
url string,
guid string
)
partitioned by (date string,hour string)
row format delimited fields terminated by '\t';
执行动态分区:
-- Dynamic partition insert: the last two columns of the select (date, hour)
-- are matched positionally to the partition columns.
insert into table yhd_part2 partition (date,hour) select * from yhd_qingxi;
-》也可以不写select * ,但是要写全字段
-》首先根据select * 找到表,按照里面的字段date hour进行匹配
实现PV和UV的统计
PV实现:
-- PV = total page views per (day, hour): every non-null url counts as one view.
select date,hour,count(url) PV from yhd_part1 group by date,hour;
-》按照天和小时进行分区
-》结果:
+-----------+-------+--------+--+
| date | hour | pv |
+-----------+-------+--------+--+
| 20150828 | 18 | 64972 |
| 20150828 | 19 | 61162 |
+-----------+-------+--------+--+
UV实现:
-- UV = unique visitors per (day, hour): deduplicate on guid, which is set
-- for both anonymous visitors and logged-in members.
select date,hour,count(distinct guid) UV from yhd_part1 group by date,hour;
-》结果:
+-----------+-------+--------+--+
| date | hour | uv |
+-----------+-------+--------+--+
| 20150828 | 18 | 23938 |
| 20150828 | 19 | 22330 |
+-----------+-------+--------+--+
endUserId guid
登录的身份:
-》游客
-》会员
无论是游客还是会员都会有一个guid
endUserId应该是只针对于会员的,使用账号登录的
将PV和UV结合统计
-- Combine PV and UV in one pass and materialize as the `result` table,
-- which is the export source for sqoop below.
create table if not exists result as select date,hour,count(url) PV ,count(distinct guid) UV from yhd_part1 group by date,hour;
-》结果:
+--------------+--------------+------------+------------+--+
| result.date | result.hour | result.pv | result.uv |
+--------------+--------------+------------+------------+--+
| 20150828 | 18 | 64972 | 23938 |
| 20150828 | 19 | 61162 | 22330 |
+--------------+--------------+------------+------------+--+
将结果导出到mysql表中
先在mysql建表:用于保存结果集
-- MySQL sink table holding the hourly PV/UV result exported by sqoop.
create table if not exists save2(
date varchar(30) not null,
hour varchar(30) not null,
-- PV/UV are counts: store them as integers rather than varchar(30) so they
-- sort and aggregate numerically; sqoop converts the text values on export.
pv bigint not null,
uv bigint not null,
primary key(date,hour)
);
使用sqoop实现导出到mysql
# Export the Hive `result` table (its HDFS warehouse directory) into the
# MySQL table save2. Hive's default field delimiter is \001, so
# --input-fields-terminated-by must be '\001' or every record fails to parse.
# NOTE(review): plain-text --password on the command line leaks via shell
# history / ps; prefer -P or --password-file in real deployments.
bin/sqoop export \
--connect \
jdbc:mysql://bigdata-senior02.ibeifeng.com:3306/sqoop \
--username root \
--password 123456 \
--table save2 \
--export-dir /user/hive/warehouse/track_log2.db/result \
--num-mappers 1 \
--input-fields-terminated-by '\001'
+----------+------+-------+-------+
| date | hour | pv | uv |
+----------+------+-------+-------+
| 20150828 | 18 | 64972 | 23938 |
| 20150828 | 19 | 61162 | 22330 |
+----------+------+-------+-------+
hive默认的分隔符:\001
1、PV统计网页浏览总量
2、UV去重
-》【需求】统计24小时每个时段的PV和UV
-》建分区表,按天一级,按小时一级,多级分区
-》第一步分析需求
-》第二步获取时间字段,天,小时
-》对于时间格式进行数据清洗,比如:2015-08-28 18:10:00,从中获取日期和小时
-》获取需要有用的字段:id、url、guid、trackTime
-》第三步数据分析
-》使用select sql
-》第四步使用sqoop导出
-》预期结果
日期　小时　PV　UV
建库:
-- NOTE(review): verbatim duplicate of the statement earlier in this page (scrape artifact).
create database track_log2;
建表:源表
-- NOTE(review): verbatim duplicate of the source-table DDL earlier in this
-- page (scrape artifact). Also note the table is created as `yhd_source2`
-- while the load/insert statements reference `yhd_source` — the name must
-- be unified before running.
create table yhd_source2(
id string,
url string,
referer string,
keyword string,
type string,
guid string,
pageId string,
moduleId string,
linkId string,
attachedInfo string,
sessionId string,
trackerU string,
trackerType string,
ip string,
trackerSrc string,
cookie string,
orderCode string,
trackTime string,
endUserId string,
firstLink string,
sessionViewNo string,
productId string,
curMerchantId string,
provinceId string,
cityId string,
fee string,
edmActivity string,
edmEmail string,
edmJobId string,
ieVersion string,
platform string,
internalKeyword string,
resultSum string,
currentPage string,
linkPosition string,
buttonPosition string
)
row format delimited fields terminated by '\t'
stored as textfile;
(编辑技巧:按住 Shift+Alt 进行列式编辑,可快速批量输入字段定义)
加载数据:
-- NOTE(review): duplicate of the load statements earlier in this page (scrape artifact).
load data local inpath '/opt/datas/2015082818' into table yhd_source;
load data local inpath '/opt/datas/2015082819' into table yhd_source;
分区的方式:静态分区
-- NOTE(review): duplicate of the yhd_part1 DDL earlier in this page (scrape artifact).
create table yhd_part1(
id string,
url string,
guid string
)
partitioned by (date string,hour string)
row format delimited fields terminated by '\t';
加载数据,来源于source源表
-- NOTE(review): duplicate of the static-partition inserts earlier in this page (scrape artifact).
insert into table yhd_part1 partition (date='20150828',hour='18') select id,url,guid from yhd_qingxi where date='28' and hour='18';
insert into table yhd_part1 partition (date='20150828',hour='19') select id,url,guid from yhd_qingxi where date='28' and hour='19';
select id,date,hour from yhd_part1 where date='20150828' and hour='18';
建一张清洗表,将时间字段清洗,提取部分的时间字段出来
-- NOTE(review): duplicate of the cleansing-table DDL earlier in this page (scrape artifact).
create table yhd_qingxi(
id string,
url string,
guid string,
date string,
hour string
)
row format delimited fields terminated by '\t';
字段截取,天&小时
-- NOTE(review): duplicate of the cleansing insert earlier in this page (scrape artifact).
insert into table yhd_qingxi select id,url,guid,substring(trackTime,9,2) date,substring(trackTime,12,2) hour from yhd_source;
分区的方式:动态分区
<property>
<name>hive.exec.dynamic.partition</name>
<value>true</value>
<description>Whether or not to allow dynamic partitions in DML/DDL.</description>
</property>
-》默认值是true,代表允许使用动态分区实现
<property>
<name>hive.exec.dynamic.partition.mode</name>
<value>strict</value>
<description>In strict mode, the user must specify at least one static partition in case the user accidentally overwrites all partitions.</description>
</property>
-》set hive.exec.dynamic.partition.mode=nonstrict; 使用非严格模式
建表:
-- NOTE(review): duplicate of the yhd_part2 DDL earlier in this page (scrape artifact).
create table yhd_part2(
id string,
url string,
guid string
)
partitioned by (date string,hour string)
row format delimited fields terminated by '\t';
执行动态分区:
-- NOTE(review): duplicate of the dynamic-partition insert earlier in this page (scrape artifact).
insert into table yhd_part2 partition (date,hour) select * from yhd_qingxi;
-》也可以不写select * ,但是要写全字段
-》首先根据select * 找到表,按照里面的字段date hour进行匹配
实现PV和UV的统计
PV实现:
-- NOTE(review): duplicate of the PV query earlier in this page (scrape artifact).
select date,hour,count(url) PV from yhd_part1 group by date,hour;
-》按照天和小时进行分区
-》结果:
+-----------+-------+--------+--+
| date | hour | pv |
+-----------+-------+--------+--+
| 20150828 | 18 | 64972 |
| 20150828 | 19 | 61162 |
+-----------+-------+--------+--+
UV实现:
-- NOTE(review): duplicate of the UV query earlier in this page (scrape artifact).
select date,hour,count(distinct guid) UV from yhd_part1 group by date,hour;
-》结果:
+-----------+-------+--------+--+
| date | hour | uv |
+-----------+-------+--------+--+
| 20150828 | 18 | 23938 |
| 20150828 | 19 | 22330 |
+-----------+-------+--------+--+
endUserId guid
登录的身份:
-》游客
-》会员
无论是游客还是会员都会有一个guid
endUserId应该是只针对于会员的,使用账号登录的
将PV和UV结合统计
-- NOTE(review): duplicate of the combined PV/UV CTAS earlier in this page (scrape artifact).
create table if not exists result as select date,hour,count(url) PV ,count(distinct guid) UV from yhd_part1 group by date,hour;
-》结果:
+--------------+--------------+------------+------------+--+
| result.date | result.hour | result.pv | result.uv |
+--------------+--------------+------------+------------+--+
| 20150828 | 18 | 64972 | 23938 |
| 20150828 | 19 | 61162 | 22330 |
+--------------+--------------+------------+------------+--+
将结果导出到mysql表中
先在mysql建表:用于保存结果集
-- NOTE(review): duplicate of the MySQL sink-table DDL earlier in this page
-- (scrape artifact). pv/uv are counts; numeric types would be preferable
-- to varchar(30).
create table if not exists save2(
date varchar(30) not null,
hour varchar(30) not null,
pv varchar(30) not null,
uv varchar(30) not null,
primary key(date,hour)
);
使用sqoop实现导出到mysql
# NOTE(review): duplicate of the sqoop export command earlier in this page (scrape artifact).
bin/sqoop export \
--connect \
jdbc:mysql://bigdata-senior02.ibeifeng.com:3306/sqoop \
--username root \
--password 123456 \
--table save2 \
--export-dir /user/hive/warehouse/track_log2.db/result \
--num-mappers 1 \
--input-fields-terminated-by '\001'
+----------+------+-------+-------+
| date | hour | pv | uv |
+----------+------+-------+-------+
| 20150828 | 18 | 64972 | 23938 |
| 20150828 | 19 | 61162 | 22330 |
+----------+------+-------+-------+
hive默认的分隔符:\001
阅读全文
0 0
- 使用hive和sqoop来实现统计24小时每个时段的PV和UV
- 统计UV和PV
- shell统计pv和uv、独立ip的方法
- 网站流量统计之PV和UV
- 网站流量统计之PV和UV
- 网站流量统计之PV和UV
- pv和uv的区别
- 什么是PV和UV?
- pv和uv
- 基于Hive及Sqoop的每日PV、UV、IP定时分析
- 基于Hive及Sqoop的每日PV、UV、IP定时分析
- ip、uv和pv的定义
- pv uv ip的定义和区别
- 如何清楚的理解pv和uv
- 通过nginx日志利用shell统计日pv和uv
- 有关cookie实现统计pv,uv的一些用法
- 网站统计IP PV UV实现原理
- 网站统计IP PV UV实现原理
- JavaScript 类型简介
- 3.Visio画图后,粘贴到word白边太宽?
- android gradle tools 3.X 中依赖,implement、api 指令
- linux 常用命令集合
- JAVA性能分析——JStack
- 使用hive和sqoop来实现统计24小时每个时段的PV和UV
- 前端ui框架为什么没有一枝独秀
- ofbiz 使用迷你语言对请求数据进行校验
- Butterknif自定义view注解 不用依赖
- 小猴子下落
- jQuery DataTables的serverSide选项
- 制作业务流程 timeline 结合AdminLTE
- CentOS搭建MySQL主从复制,读写分离
- JSP的MVC模式