Hive内置函数的应用案例

来源:互联网 发布:ubuntu tftp-hpa配置 编辑:程序博客网 时间:2024/06/05 20:21
1:计算两个时间相差多少天:hive内置函数datediff
 例子Demo:datediff('2009-07-28','2009-07-30')=2
2:也可以写自定义函数
package cdel.edu.hive.udf.demo04;


import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;


import org.apache.hadoop.hive.ql.exec.UDF;


/**
 * 计算一个时间到今天的相差多少天
 * @author dell
 *
 */
public class MyDateDiff extends UDF {


public String evaluate(String str) {
try {
String  today = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date().getTime());
int days = daysBetween(str,today);
return String.valueOf(days);
} catch (Exception e) {
return null;
}
}


/**
* 计算两个日期之间相差的天数
*
* @param smdate
*            较小的时间
* @param bdate
*            较大的时间
* @return 相差天数
* @throws ParseException
*/
public static int daysBetween(Date smdate, Date bdate)
throws ParseException {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
smdate = sdf.parse(sdf.format(smdate));
bdate = sdf.parse(sdf.format(bdate));
Calendar cal = Calendar.getInstance();
cal.setTime(smdate);
long time1 = cal.getTimeInMillis();
cal.setTime(bdate);
long time2 = cal.getTimeInMillis();
long between_days = (time2 - time1) / (1000 * 3600 * 24);


return Integer.parseInt(String.valueOf(between_days));
}


/**
* 字符串的日期格式的计算
*/
public static int daysBetween(String smdate, String bdate)
throws ParseException {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
Calendar cal = Calendar.getInstance();
cal.setTime(sdf.parse(smdate));
long time1 = cal.getTimeInMillis();
cal.setTime(sdf.parse(bdate));
long time2 = cal.getTimeInMillis();
long between_days = (time2 - time1) / (1000 * 3600 * 24);


return Integer.parseInt(String.valueOf(between_days));
}


public static void main(String[] args) throws Exception {
   String str="2015-04-27";
       System.out.println(new MyDateDiff().evaluate(str));
}
}




-创建自定义函数
add jar /home/admin/xxg/MyDateDiff.jar;
create temporary function myDateDiff as 'cdel.edu.hive.udf.demo04.MyDateDiff';


--hive 统计 
 
-- 最近30 天内学员的做题量统计 
--分网统计
create table  qz_count as
SELECT   mps.userID,mem.memberid,COUNT(distinct qpq.questionID) as  doQuesCount,mem.domain,
to_date(concat_ws(' ',substr(mps.createTime,1,10),substr(mps.createTime,11,length(mps.createTime))))   as daytime
from   tmp_qz_member_paper_score  mps   
left join  tmp_qz_member_paper_question qpq  on  qpq.paperScoreID = mps.paperScoreID  and mps.domain = qpq.domain 
left join order.tmp_member_info_new  mem on mem.uid = mps.userID and mem.domain = mps.domain
where   cast(myDateDiff(concat_ws(' ',substr(mps.createTime,1,10),'')) as int) <=30
group  by  mps.userID,mem.memberid ,mem.domain,
to_date(concat_ws(' ',substr(mps.createTime,1,10),substr(mps.createTime,11,length(mps.createTime))))
having  COUNT(distinct qpq.questionID)  >0
order by   doQuesCount  desc  ;


SELECT   mps.userID,mem.memberid,COUNT(distinct qpq.questionID) as  doQuesCount,mem.domain,
to_date(concat_ws(' ',substr(mps.createTime,1,10),substr(mps.createTime,11,length(mps.createTime))))   as daytime
from   tmp_qz_member_paper_score  mps   
left join  tmp_qz_member_paper_question qpq  on  qpq.paperScoreID = mps.paperScoreID  and mps.domain = qpq.domain 
left join order.tmp_member_info_new  mem on mem.uid = mps.userID and mem.domain = mps.domain
where  mps.userID='48888922'   and  cast(myDateDiff(concat_ws(' ',substr(mps.createTime,1,10),'')) as int) <=31
group  by  mps.userID,mem.memberid ,mem.domain,
to_date(concat_ws(' ',substr(mps.createTime,1,10),substr(mps.createTime,11,length(mps.createTime))))
having  COUNT(distinct qpq.questionID)  >0
order by   doQuesCount  desc  ;


SELECT   qpq.*
from   tmp_qz_member_paper_score  mps   
left join  tmp_qz_member_paper_question qpq  on  qpq.paperScoreID = mps.paperScoreID  and mps.domain = qpq.domain 
where  mps.userID='48888922'   limit 100;


经验验证:没有医学网做题题目信息
select  distinct logdata from tmp_qz_member_paper_question where domain='med' limit 10
验证数据
select * from qz_count  where userid=48888922 order by   doQuesCount  desc limit 100;


select * from qz_count  where doQuesCount>=500 order by   doQuesCount  desc ;


-- 不分网统计
create table  qz_count as
SELECT   mps.userID,mem.memberid,COUNT(distinct qpq.questionID) as  doQuesCount,
to_date(concat_ws(' ',substr(mps.createTime,1,10),substr(mps.createTime,11,length(mps.createTime))))   as daytime
from   tmp_qz_member_paper_score  mps   
left join  tmp_qz_member_paper_question qpq  on  qpq.paperScoreID = mps.paperScoreID  and mps.domain = qpq.domain 
left join order.tmp_member_info_new  mem on mem.uid = mps.userID and mem.domain = mps.domain
where   cast(myDateDiff(concat_ws(' ',substr(mps.createTime,1,10),'')) as int) <=30
group  by  mps.userID,mem.memberid , 
to_date(concat_ws(' ',substr(mps.createTime,1,10),substr(mps.createTime,11,length(mps.createTime))))
having  COUNT(distinct qpq.questionID)  >0
order by   doQuesCount  desc  ;




-- 统计学员每天做多少套试卷
-- 分网统计
create table  paper_count as
select   mps.userID,mem.Memberid,mem.domain, COUNT(distinct mps.paperViewID) AS doPaperCount , 
to_date(concat_ws(' ',substr(mps.createTime,1,10),substr(mps.createTime,11,length(mps.createTime))))   as daytime
from tmp_qz_member_paper_score mps
left join  order.tmp_member_info_new  mem  on mem.uID = mps.userID  and mem.domain = mps.domain
where  cast(myDateDiff(concat_ws(' ',substr(mps.createTime,1,10),'')) as int) <=31
group  by  mps.userID,mem.Memberid, mem.domain,
to_date(concat_ws(' ',substr(mps.createTime,1,10),substr(mps.createTime,11,length(mps.createTime))))
having  COUNT(distinct mps.paperViewID) >0
order by   doPaperCount   desc;


 -- 不分网统计
select   mps.userID,mem.Memberid,COUNT(distinct mps.paperViewID) AS doPaperCount , 
to_date(concat_ws(' ',substr(mps.createTime,1,10),substr(mps.createTime,11,length(mps.createTime))))   as daytime
from tmp_qz_member_paper_score mps
left join  order.tmp_member_info_new  mem  on mem.uID = mps.userID  and mem.domain = mps.domain
where   mps.userID=48888922
group  by mps.userID,mem.Memberid,
to_date(concat_ws(' ',substr(mps.createTime,1,10),substr(mps.createTime,11,length(mps.createTime))))
having  COUNT(distinct mps.paperViewID) >0
order by   doPaperCount   desc;




测试:
select * from  paper_count  order by   doPaperCount   desc  limit 100;


select   mps.userID,  mps.paperViewID  from tmp_qz_member_paper_score mps  where   mps.userID=48888922;






-- 导出本地文件中
insert overwrite local directory '/home/admin/xxg/qz/med/'
row format delimited
fields terminated by '\t'
select  *  from qz_count;






-- 最近30天内学员的提问数量
-- 针对主题的提问统计
 
SELECT ft.userID,ft.userName ,  COUNT(ft.userID)  as num FROM  FAQ_TOPIC  ft
group by FT.userid ,ft.userName




SELECT ft.userID,ft.userName ,  COUNT(af.USERID)  as num 
FROM  FAQ_TOPIC  ft
inner JOIN   faq   af on ft.topicID = af.topicID and af.istopic = 1
group by ft.userID,ft.userName


create table  faq_count as
SELECT   ft.userID, mem.memberid, COUNT(distinct ft.topicID) as num, mem.domain,
to_date(concat_ws(' ',substr(ft.createTime,1,10),substr(ft.createTime,11,length(ft.createTime))))   as daytime
FROM  tmp_faq_topic   ft  
left join order.tmp_member_info_new  mem on mem.uid = ft.userID   and mem.domain = ft.domain   
where     cast(myDateDiff(concat_ws(' ',substr(ft.createTime,1,10),'')) as int) <=30
group by ft.userID, mem.memberid,mem.domain,
to_date(concat_ws(' ',substr(ft.createTime,1,10),substr(ft.createTime,11,length(ft.createTime))))
having  COUNT(distinct ft.topicID)  >0
order by num desc  ;




--针对主题和继续提问的 --- 就是如果一个主题,两个继续的话 你统计的sum =3


create table  faq_count2 as
SELECT   ft.userID, mem.memberid, COUNT(distinct af.faqid) as num, mem.domain,
to_date(concat_ws(' ',substr(af.createTime,1,10),substr(af.createTime,11,length(af.createTime))))   as daytime
FROM   tmp_faq af  
left join  tmp_faq_topic   ft   on ft.topicID = af.topicID  and af.domain = ft.domain  
left join order.tmp_member_info_new  mem on mem.uid = ft.userID   and mem.domain = af.domain   
where     cast(myDateDiff(concat_ws(' ',substr(af.createTime,1,10),'')) as int) <=30
group by ft.userID, mem.memberid, mem.domain,
to_date(concat_ws(' ',substr(af.createTime,1,10),substr(af.createTime,11,length(af.createTime))))
having  COUNT(distinct af.faqid)  >0
order by num desc   limit 10;




-- 导出本地文件中
insert overwrite local directory '/home/admin/xxg/faq/'
row format delimited
fields terminated by '\t'
select  *  from faq_count;


-- 删除表信息
drop table qz_count;
drop table faq_count;
drop table paper_count;


-- 统计 
每天做题 >=3000道以上学员的个数
每天做题 >=2500  and <3000 道以上的学员个数
每天做题 >=2000  and <2500 道以上的学员个数
每天做题 >=1500 and <2000 道以上的学员个数
每天做题 >=1000 and <1500 道以上的学员个数
每天做题 >=500 and <1000 道以上的学员个数
每天做题 >=100 and <500 道以上的学员个数
每天做题 >=50 and <100 道以上的学员个数
每天做题 >0   and <50  道以上的学员个数


-- 根据学员分组统计,学员做题数量统计
select userid,memberid,domain,
SUM(case when doquescount=0    then 1 else 0 end) as n0,  
SUM(case when doquescount>0  and doquescount<50  then 1 else 0 end) as n1,  
SUM(case when doquescount>=50  and doquescount<100  then 1 else 0 end) as n2, 
SUM(case when doquescount>=100  and doquescount<500  then 1 else 0 end) as n3, 
SUM(case when doquescount>=500  and doquescount<1000  then 1 else 0 end) as n4,
SUM(case when doquescount>=1000  and doquescount<1500  then 1 else 0 end) as n5,
SUM(case when doquescount>=1500  and doquescount<2000  then 1 else 0 end) as n6,
SUM(case when doquescount>=2000  and doquescount<2500  then 1 else 0 end) as n7,
SUM(case when doquescount>=2500  and doquescount<3000  then 1 else 0 end) as n8,
SUM(case when doquescount>=3000    then 1 else 0 end) as n9   
from  qz_count 
group by  userid,memberid, domain limit 100;


-- 根据日期分组统计,学员做题量统计
select  daytime,
SUM(case when doquescount=0    then 1 else 0 end) as n0,  
SUM(case when doquescount>=1  and doquescount<50  then 1 else 0 end) as n1,  
SUM(case when doquescount>=50  and doquescount<100  then 1 else 0 end) as n2, 
SUM(case when doquescount>=100  and doquescount<500  then 1 else 0 end) as n3, 
SUM(case when doquescount>=500  and doquescount<1000  then 1 else 0 end) as n4,
SUM(case when doquescount>=1000  and doquescount<1500  then 1 else 0 end) as n5,
SUM(case when doquescount>=1500  and doquescount<2000  then 1 else 0 end) as n6,
SUM(case when doquescount>=2000  and doquescount<2500  then 1 else 0 end) as n7,
SUM(case when doquescount>=2500  and doquescount<3000  then 1 else 0 end) as n8,
SUM(case when doquescount>=3000    then 1 else 0 end) as n9 
from  qz_count 
group by   daytime;


测试
select  count(distinct userid)
from  qz_count  
where  doquescount>=2500  and doquescount<3000;


select  *
from  qz_count  
where  doquescount>=2500  and doquescount<3000;


select  count(distinct userid)
from  qz_count  
where  doquescount>=3000 ;
-- 
-- 根据日期统计每天学员的所做试卷量
select  daytime,
SUM(case when dopapercount=0  then 1 else 0 end) as n0,  
SUM(case when dopapercount>=1  and dopapercount<5  then 1 else 0 end) as n1,  
SUM(case when dopapercount>=5  and dopapercount<10  then 1 else 0 end) as n2, 
SUM(case when dopapercount>=10  and dopapercount<50  then 1 else 0 end) as n3, 
SUM(case when dopapercount>=50  and dopapercount<100  then 1 else 0 end) as n4,
SUM(case when dopapercount>=100  and dopapercount<150  then 1 else 0 end) as n5,
SUM(case when dopapercount>=150  and dopapercount<200  then 1 else 0 end) as n6,
SUM(case when dopapercount>=200  and dopapercount<250  then 1 else 0 end) as n7,
SUM(case when dopapercount>=250  and dopapercount<300  then 1 else 0 end) as n8,
SUM(case when dopapercount>=300    then 1 else 0 end) as n9 
from  paper_count 
group by   daytime  limit 100;


测试检验:
select *  from  paper_count  order by dopapercount desc limit 100;
select  count(*) from paper_count  where   dopapercount>=10  and dopapercount<50   and daytime='2015-04-26'   limit 100;
select  count(distinct userid) from paper_count  where   dopapercount>=10  and dopapercount<50   and daytime='2015-04-26'   limit 100;




 


-- 根据日期统计每天学员在不同区间内答疑数量
每天答疑提问主题 >=60  道以上学员的个数
每天答疑提问主题 >=55 and < 60道以上学员的个数
每天答疑提问主题 >=50 and < 55道以上学员的个数
每天答疑提问主题 >=45  and <50 道以上的学员个数
每天答疑提问主题 >=40  and <45 道以上的学员个数
每天答疑提问主题 >=35 and <40 道以上的学员个数
每天答疑提问主题 >=30 and <35 道以上的学员个数
每天答疑提问主题 >=25 and <30 道以上的学员个数
每天答疑提问主题 >=20 and <25 道以上的学员个数
每天答疑提问主题 >=15 and <20 道以上的学员个数
每天答疑提问主题 >=10 and <15 道以上的学员个数
每天答疑提问主题 >=5   and <10  道以上的学员个数
每天答疑提问主题 >=1   and <5  道以上的学员个数
每天答疑提问主题 =0      道以上的学员个数


select  daytime,
SUM(case when num=0  then 1 else 0 end) as n0,  
SUM(case when num>=1  and num<5  then 1 else 0 end) as n1,  
SUM(case when num>=5  and num<10  then 1 else 0 end) as n2, 
SUM(case when num>=10  and num<15  then 1 else 0 end) as n3, 
SUM(case when num>=15  and num<20  then 1 else 0 end) as n4,
SUM(case when num>=20  and num<25  then 1 else 0 end) as n5,
SUM(case when num>=25  and num<30  then 1 else 0 end) as n6,
SUM(case when num>=30  and num<35  then 1 else 0 end) as n7,
SUM(case when num>=35  and num<40  then 1 else 0 end) as n8,
SUM(case when num>=40  and num<45  then 1 else 0 end) as n9,
SUM(case when num>=45  and num<50  then 1 else 0 end) as n10,
SUM(case when num>=50  and num<55  then 1 else 0 end) as n11,
SUM(case when num>=55  and num<60  then 1 else 0 end) as n12,
SUM(case when num>=60    then 1 else 0 end) as n13 
from  faq_count 
group by   daytime  limit 100;
0 0