O2O Python Code

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Use a raw string so the backslashes in the Windows path are not treated as escape sequences
train = pd.read_csv(r'F:\O2O\My data\ccf_offline_stage1_train.csv', na_values='null', dtype=str)
# Mark missing values (replace() returns a new DataFrame, so assign the result back)
train_data1 = pd.DataFrame(train)
train_data1 = train_data1.replace('null', np.nan)
# Drop the rows whose Coupon_id is missing (copy to avoid chained-assignment warnings)
train_data2 = train_data1.dropna(subset=['Coupon_id']).copy()
# Convert Distance to float
train_data2['Distance'] = train_data2['Distance'].astype(float)
# Fill missing Distance values with the mean, rounded to the nearest integer
train_data2['Distance'] = train_data2['Distance'].fillna(round(train_data2['Distance'].mean()))

# Convert Date and Date_received to datetime
train_data2['Date'] = pd.to_datetime(train_data2['Date'])
train_data2['Date_received'] = pd.to_datetime(train_data2['Date_received'])
# Construct a variable: the gap between receiving the coupon and consuming, in days
train_data2['days'] = train_data2['Date'] - train_data2['Date_received']
train_data2['number'] = train_data2['days'] / np.timedelta64(1, 'D')
# Construct the target column y: 1 if the coupon was redeemed within 15 days of
# being received, otherwise 0 (no consumption, or consumption after 15 days)
train_data2['target'] = ((train_data2['number'].notna()) & (train_data2['number'] <= 15)).astype(int)


# Inspect the result
print(train_data2.columns)   # column names
train_data2.info()           # dtype and non-null count of each field
print(train_data2.shape)     # number of rows and columns in the dataset
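# Optional sanity check: the 15-day redemption label tends to be heavily
# imbalanced, so it is worth looking at the class counts before modelling.
print(train_data2['target'].value_counts())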

# Reload the raw data to build the user/merchant/coupon count features
train1 = pd.read_csv(r'F:\O2O\My data\ccf_offline_stage1_train.csv', na_values='null', dtype=str)
train1 = train1.replace('null', np.nan)
train2 = pd.DataFrame(train1)
# Keep only the rows that have a consumption date (actual purchases)
train3 = train2.dropna(subset=['Date'])
# Total number of purchases per user
user_count = train3.groupby('User_id').size().to_frame('用户总购买次数')
# Number of purchases per user at each merchant
user_merchant_count = train3.groupby(['User_id', 'Merchant_id']).size().to_frame('用户购买此类商品的次数')
merge1 = pd.merge(user_count.reset_index(), user_merchant_count.reset_index())
# Share of the user's purchases that were made at this merchant
merge1['Percent1'] = merge1['用户购买此类商品的次数'] / merge1['用户总购买次数']
print(merge1)

# Purchases made with a coupon (rows that have both a Date and a Coupon_id)
train4 = train3.dropna(subset=['Coupon_id'])
# Number of coupon purchases per user
user_you_count = train4.groupby('User_id').size().to_frame('用户用券购买的次数')
merge2 = pd.merge(user_count.reset_index(), user_you_count.reset_index())
# Share of the user's purchases that used a coupon
merge2['Percent2'] = merge2['用户用券购买的次数'] / merge2['用户总购买次数']
print(merge2)
# Rows where the coupon was redeemed within 15 days
training = train_data2[train_data2.target > 0]
# Number of within-15-day coupon purchases per user
user_15count = training.groupby('User_id').size().to_frame('用户15天内使用优惠券购买次数')
merge3 = pd.merge(user_15count.reset_index(), user_you_count.reset_index())
# Share of the user's coupon purchases that happened within 15 days
merge3['Percent3'] = merge3['用户15天内使用优惠券购买次数'] / merge3['用户用券购买的次数']
print(merge3)
# Number of coupon purchases per user at each merchant
user_merchant_you_count = train4.groupby(['User_id', 'Merchant_id']).size().to_frame('用户使用消费券购买此类商品的次数')
merge4 = pd.merge(user_merchant_you_count.reset_index(), user_merchant_count.reset_index())
# Share of the user's purchases at this merchant that used a coupon
merge4['Percent4'] = merge4['用户使用消费券购买此类商品的次数'] / merge4['用户购买此类商品的次数']
print(merge4)

# Number of purchases per user at each merchant with each specific coupon
user_merchant_coupon_count = train4.groupby(['User_id', 'Merchant_id', 'Coupon_id']).size().to_frame('用户使用此类优惠券购买该商品的次数')
merge5 = pd.merge(user_merchant_you_count.reset_index(), user_merchant_coupon_count.reset_index())
# Share of the user's coupon purchases at this merchant that used this particular coupon
merge5['Percent5'] = merge5['用户使用此类优惠券购买该商品的次数'] / merge5['用户使用消费券购买此类商品的次数']
print(merge5)

# How many times each coupon was redeemed
coupon_count = train4.groupby('Coupon_id').size().to_frame('Count')
merge6 = coupon_count.reset_index()
print(merge6)
# Build the training samples by joining the features onto the labelled data
merge1_1=pd.merge(train_data2,merge1)
merge1_2=pd.merge(merge1_1,merge2)
merge1_3=pd.merge(merge1_2,merge3)
merge1_4=pd.merge(merge1_3,merge4)
merge1_5=pd.merge(merge1_4,merge5)
merge1_6=pd.merge(merge1_5,merge6)
print(merge1_6)
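# Note: pd.merge with no "on=" argument joins on every column the two frames
# share and keeps only matching rows (an inner join); being explicit about the
# keys, e.g. pd.merge(merge1_5, merge6, on='Coupon_id'), makes the intent clearer.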
final_train=merge1_6[['Percent1','Percent2','Percent3','Percent4','Percent5','Count','Distance','target']]
print(final_train)
# Min-max normalization: scale every column to the [0, 1] range
final_final_train=(final_train-final_train.min())/(final_train.max()-final_train.min())
print(final_final_train)
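# If any column were constant, max - min would be 0 and the division above would
# produce NaN; a quick null count catches that before modelling.
print(final_final_train.isnull().sum())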
# Build the logistic regression model
data = final_final_train
x = data.iloc[:, :7].values   # .as_matrix() was removed from pandas, use .values
y = data.iloc[:, 7].values
data1 = data.iloc[:, :7]
from sklearn.linear_model import LogisticRegression as LR
# RandomizedLogisticRegression is only available in scikit-learn versions before 0.21
from sklearn.linear_model import RandomizedLogisticRegression as RLR
rlr = RLR()
rlr.fit(x, y)
rlr.get_support()
print('Feature screening with the randomized logistic regression model is done.')
print('Selected features: %s' % ','.join(data1.columns[rlr.get_support()]))
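# RandomizedLogisticRegression no longer exists in current scikit-learn releases.
# As a rough optional substitute (an alternative technique, not what the original
# post used), an L1-penalised logistic regression wrapped in SelectFromModel can
# play a similar feature-screening role; this only prints which columns it would
# keep and does not change x or y.
from sklearn.feature_selection import SelectFromModel
l1_selector = SelectFromModel(LR(penalty='l1', solver='liblinear'))
l1_selector.fit(data1.values, y)
print('Features kept by the L1-based selector: %s'
      % ','.join(data1.columns[l1_selector.get_support()]))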
x = data[data1.columns[rlr.get_support()]].values
lr = LR()
lr.fit(x, y)
print('Logistic regression model trained.')
print('Mean accuracy of the model: %s' % lr.score(x, y))
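# lr.score(x, y) above is the in-sample accuracy. A minimal sketch of a more
# informative check, assuming scikit-learn's model_selection and metrics modules
# are available: hold out part of the data and report ROC AUC, which is closer
# to how the O2O coupon task is usually evaluated.
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size=0.3, random_state=0)
lr_holdout = LR()
lr_holdout.fit(x_tr, y_tr)
auc = roc_auc_score(y_te, lr_holdout.predict_proba(x_te)[:, 1])
print('Held-out ROC AUC: %s' % auc)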