SQL构建特征--入门篇

来源：互联网发布：08总决赛皮尔斯数据编辑：程序博客网时间：2024/04/24 02:20

1.原始数据
共有4列，User_id是用户标识，Merchant_id是商家标识，Location_id是地点标识，Time_stamp是时间。
以第一条记录为例，代表着用户1027765在20151028日在地点172的4822商家有购买行为。

2.构建特征
主要有以下几大类特征（直接统计与加distinct进行统计）
（1）计数型
（2）rank型
（3）交叉型
下面是完整的代码，可以借鉴思路，使用的时候注意更换表名

select distinct tmp.User_id,tmp.Location_id,tmp.Merchant_id,tmp.ulm_count,tmp.u_count,DENSE_RANK() OVER(order by tmp.u_count desc) as u_count_rank,tmp.l_count,DENSE_RANK() OVER(order by tmp.l_count desc) as l_count_rank,tmp.m_count,DENSE_RANK() OVER(order by tmp.m_count desc) as m_count_rank,tmp.ul_count,DENSE_RANK() OVER(order by tmp.ul_count desc) as ul_count_rank,tmp.um_count,DENSE_RANK() OVER(order by tmp.um_count desc) as um_count_rank,tmp.lm_count,DENSE_RANK() OVER(order by tmp.lm_count desc) as lm_count_rank,tmp.ul_num,DENSE_RANK() OVER(order by tmp.ul_num desc) as ul_num_rank,tmp.lu_num,DENSE_RANK() OVER(order by tmp.lu_num desc) as lu_num_rank,tmp.um_num,DENSE_RANK() OVER(order by tmp.um_num desc) as um_num_rank,tmp.mu_num,DENSE_RANK() OVER(order by tmp.mu_num desc) as mu_num_rank,tmp.lm_num,DENSE_RANK() OVER(order by tmp.lm_num desc) as lm_num_rank,tmp.ml_num,DENSE_RANK() OVER(order by tmp.ml_num desc) as ml_num_rankfrom(    select distinct tmp1.*    ,tmp2.um_num,tmp2.mu_num    from    (        select distinct tmp1.*,        tmp2.lu_num,tmp2.ul_num        from         (            select distinct tmp1.User_id,tmp1.Location_id,tmp1.Merchant_id            ,tmp1.u_count,tmp1.l_count,tmp1.m_count,tmp1.ul_count,tmp1.um_count,tmp1.lm_count,tmp1.ulm_count            ,tmp2.lm_num,tmp2.ml_num            from             (   --可以多次                select distinct User_id,Location_id,Merchant_id                ,COUNT(*) over(partition by User_id) as u_count                ,COUNT(*) over(partition by Merchant_id) as m_count                           ,COUNT(*) over(partition by Location_id) as l_count                 ,COUNT(*) over(partition by User_id,Location_id) as ul_count                ,COUNT(*) over(partition by User_id,Merchant_id) as um_count                ,COUNT(*) over(partition by Merchant_id,Location_id) as lm_count                     ,COUNT(*) over(partition by User_id,Location_id,Merchant_id) as ulm_count                from ijcai.dbo.train                where Time_Stamp < '20151101'            )tmp1            join            (   --单次 distinct                select distinct tmp.Merchant_id,tmp.Location_id                ,COUNT(*) over(partition by tmp.Location_id)  as lm_num                ,COUNT(*) over(partition by tmp.Merchant_id)  as ml_num                from                 (                    select distinct Merchant_id,Location_id                    from ijcai.dbo.train                    where Time_Stamp < '20151101'                )tmp            )tmp2 on tmp1.Merchant_id = tmp2.Merchant_id and tmp1.Location_id = tmp2.Location_id        )tmp1        join         (   --单次 distinct            select distinct tmp.User_id,tmp.Location_id            ,COUNT(*) over(partition by tmp.User_id)  as ul_num            ,COUNT(*) over(partition by tmp.Location_id)  as lu_num            from             (                select distinct User_id,Location_id                from ijcai.dbo.train                where Time_Stamp < '20151101'            )tmp        )tmp2 on tmp1.User_id = tmp2.User_id and tmp1.Location_id = tmp2.Location_id    )tmp1    join    (   --单次 distinct        select distinct tmp.User_id,tmp.Merchant_id        ,COUNT(*) over(partition by tmp.User_id)  as um_num        ,COUNT(*) over(partition by tmp.Merchant_id)  as mu_num        from         (            select distinct User_id,Merchant_id            from ijcai.dbo.train            where Time_Stamp < '20151101'        )tmp    )tmp2 on tmp1.User_id = tmp2.User_id and tmp1.Merchant_id = tmp2.Merchant_id)tmp

0 0