,日志各地区访问量统计分析

来源:互联网 发布:ubuntu16如何安装软件 编辑:程序博客网 时间:2024/05/23 01:17

1. 对access.log进行数据的预处理并进行清洗,将静态请求去除,

//过滤所有静态的资源请求if(url.startsWith("GET /static")){return;}if(url.startsWith("GET")){url = url.substring("GET ".length()+1, url.length()-" HTTP/1.1".length());}if(url.startsWith("POST")){url = url.substring("POST ".length()+1, url.length()-" HTTP/1.1".length());}if(url.endsWith(".js")||url.endsWith(".css")||url.endsWith(".js")||url.endsWith(".jpg")||url.endsWith(".png")||url.endsWith(".gif"))return;v.set(ip+"\t"+logtime +"\t"+url);

hadoop jar /home/hadoop/cleaner.jar /flume/$CURRENT /cleaned/$CURRENT

2  hive外部表创建

create EXTERNAL  table accesslog(ip string, time string ,url string) partitioned by (cleantime string) row format delimited fields terminated by '\t' 

3. 加载数据

LOAD DATA INPATH '/cleaned/20160623' INTO TABLE accesslog partition (20160623) ;

4. 创建ip地址范围区域表,并加载到hive中

create table ip_address (start_ip string,end_ip string ,province_city string,area string) row format delimited fields terminated by '\t';./sqoop import --connect jdbc:mysql://10.28.0.132:3306/test --username root --password admin  --query 'SELECT start_ip,end_ip,province_city,area FROM ip_address where $CONDITIONS' --target-dir '/user/hive/warehouse/ip_address'   --hive-table ip_address -hive-import  -m 1


5.对accesslog进行初步分析

create table iptimes(ip string ,times int) row format delimited fields terminated by '\t';insert into iptimes  select ip,count(ip) from accesslog where cleantime='20160623' group by ip;

6. 编写hiveudf     inet_aton增加到hive中

add jar /home/hadoop/inet_aton.jar;create temporary function inet_aton as 'com.qht.core.inet2n';


7,ip地址段预处理,加速运行速度

create table ipadress_temp (start_ip bigint,end_ip bigint ,province_city string) row format delimited fields terminated by '\t';insert into ipadress_temp select inet_aton(start_ip), inet_aton(end_ip), province_city from ip_address;

8.  区域分析

create table iparea row format delimited fields terminated by '\t' as select a.ip, b.province_city , a.times from iptimes a left outer join ip_address b where inet_aton(a.ip)<inet_aton(b.end_ip) and inet_aton(a.ip)>inet_aton(b.start_ip);

9. 分析统计

<pre name="code" class="sql">create table arestimes row format delimited fields terminated by '\t' as select province_city,sum(times) as time from iparea group by province_city order by time;;












0 0
原创粉丝点击