Hive in Practice: Simple Web Log Processing

Requirement: do some simple processing of a web access log. The log has 11 fields and 5,794 records.

A sample record from the log:

"27.38.5.159" "-" "31/Aug/2015:00:04:37 +0800" "GET /course/view.php?id=27 HTTP/1.1" "303" "440" - "http://www.ibeifeng.com/user.php?act=mycourse" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36" "-" "learn.ibeifeng.com"

1. Create the source table in Hive and load the data

-- Create the source table; RegexSerDe splits each raw log line into the 11 columns
create table if not exists default.web_log_src (
  remote_addr string,
  remote_user string,
  time_local string,
  request string,
  status string,
  body_bytes_sent string,
  request_body string,
  http_refer string,
  http_user_agent string,
  http_x_forwarded_for string,
  host string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
  "input.regex" = "(\"[^ ]*\") (\"-|[^ ]*\") (\"[^\]]*\") (\"[^\"]*\") (\"[0-9]*\") (\"[0-9]*\") (-|[^ ]*) (\"[^ ]*\") (\"[^\"]*\") (-|[^ ]*) (\"[^ ]*\")"
)
STORED AS TEXTFILE;

-- Load the raw log file into the source table
load data local inpath '/usr/datas/moodle.ibeifeng.access.log' into table default.web_log_src;
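As a quick sanity check, the row count should equal the 5,794 lines in the file. Note that RegexSerDe returns all-NULL rows for lines the pattern fails to match, so it is also worth eyeballing a few rows to confirm the columns are actually populated:

select count(*) from default.web_log_src;
select * from default.web_log_src limit 3;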

2. Create a new table to hold the IP, time, request, and referer

create table if not exists default.web_log_common (
  remote_addr string,
  time_local string,
  request string,
  http_refer string
)
row format delimited fields terminated by '\t'
stored as orc tblproperties ("orc.compress"="SNAPPY");
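To confirm the new table really uses ORC with Snappy compression, Hive's describe formatted output lists the input/output format and the table properties:

describe formatted default.web_log_common;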
3. Write the UDFs: strip quotes, normalize the timestamp, and extract the first two IP segments

// RemoveQuotes.java -- strip the surrounding double quotes from a field
package com.bpf.hive.udf;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

public class RemoveQuotes extends UDF {

    public Text evaluate(Text a) {
        if (a == null || a.toString() == null) {
            return new Text();
        }
        return new Text(a.toString().replaceAll("\"", ""));
    }
}

// FormatDate.java -- convert "31/Aug/2015:00:04:37" into "2015-08-31 00:04:37"
package com.bpf.hive.udf;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

public class FormatDate extends UDF {

    private final SimpleDateFormat inputFormat =
            new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
    private final SimpleDateFormat outputFormat =
            new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    public Text evaluate(Text input) {
        if (input == null || input.toString() == null) {
            return null;
        }
        Text output = new Text();
        try {
            Date oldDate = inputFormat.parse(input.toString().trim());
            output.set(outputFormat.format(oldDate));
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return output;
    }
}

// IpAddress.java -- keep only the first two segments of the IP, e.g. "27.38.5.159" -> "27.38"
package com.bpf.hive.udf;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

public class IpAddress extends UDF {

    public Text evaluate(Text ip) {
        if (ip == null || ip.toString() == null || ip.toString().isEmpty()) {
            return new Text();
        }
        String[] split = ip.toString().split("\\.");
        Text output = new Text();
        output.set(split[0] + "." + split[1]);
        return output;
    }
}
Package the classes into a jar, add it to the Hive session, and register the three functions:

add jar /usr/datas/myUDF.jar;

create temporary function remove_quotes as "com.bpf.hive.udf.RemoveQuotes";
create temporary function formatDate as "com.bpf.hive.udf.FormatDate";
create temporary function ipAddress as "com.bpf.hive.udf.IpAddress";
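Once the functions are registered, a quick smoke test on literal values taken from the sample log line above verifies that they behave as intended (Hive 0.13+ accepts a SELECT without a FROM clause; on older versions run it against any one-row table):

select
  remove_quotes('"27.38.5.159"'),               -- 27.38.5.159
  formatDate('31/Aug/2015:00:04:37 +0800'),     -- 2015-08-31 00:04:37
  ipAddress('27.38.5.159');                     -- 27.38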
4. Insert data into the new table

insert overwrite table default.web_log_common
select
  remove_quotes(remote_addr),
  formatDate(remove_quotes(time_local)),
  remove_quotes(request),
  remove_quotes(http_refer)
from default.web_log_src;
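A spot check of a few cleaned rows should show a bare IP address, a yyyy-MM-dd HH:mm:ss timestamp, the request line, and the referer, all without the surrounding quotes:

select * from default.web_log_common limit 3;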
5. Which hours see the most page requests?

select t.hour, count(*) cnt
from (
  select substring(time_local, 12, 2) hour
  from web_log_common
) t
group by t.hour
order by cnt desc;
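Because time_local is stored as yyyy-MM-dd HH:mm:ss, the hour sits at characters 12-13, which is what the substring above extracts. An equivalent, less position-sensitive version uses Hive's built-in hour() function:

select hour(time_local) hr, count(*) cnt
from web_log_common
group by hour(time_local)
order by cnt desc;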
6. Which IP regions generate the most requests?

select t.ip_prex, count(*) cnt
from (
  select ipAddress(remote_addr) ip_prex
  from web_log_common
) t
group by t.ip_prex
order by cnt desc;
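If only the busiest prefixes are of interest, the same query can be capped. Note that mapping a two-segment IP prefix to an actual geographic region would still require an external IP-geolocation table, which is outside the scope of this example:

select t.ip_prex, count(*) cnt
from (
  select ipAddress(remote_addr) ip_prex
  from web_log_common
) t
group by t.ip_prex
order by cnt desc
limit 10;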







