利用ipython notebook --numpy,pandas构建特征

来源:互联网 发布:淘宝装修设计教程 编辑:程序博客网 时间:2024/05/27 08:13

工具:ipython notebook
数据来源:滴滴算法大赛
目标:统计预测时间的前三个时间片按照区域(district_id)、每分钟(time_piece)的需求(demand)、供应缺口(gap),及以10分钟为滑动窗口的统计值,再加上平均值、方差(多考虑了是否工作日)
亮点:一步代码,一步数据演示

import gc
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
def trans_time_slice(time):
    """Return the time-slice string immediately preceding *time*.

    *time* is formatted 'YYYY-MM-DD-S' where S is a 1-based slice index
    (1..144, i.e. 144 ten-minute slices per day, written without zero
    padding). Stepping back from slice 1 yields slice 144 of the
    previous calendar day.

    Fixes two defects of the original hand-rolled date arithmetic:
    - the `> 11` branch test was off by one, so day 11 became '010';
    - day 1 became day '00' instead of rolling to the previous month.
    """
    parts = time.split('-')
    slice_no = int(parts[3])
    if slice_no > 1:
        parts[3] = str(slice_no - 1)
        return '-'.join(parts)
    # Slice 1: move to the last slice of the previous calendar day,
    # letting datetime handle month/year rollover and zero padding.
    prev_day = datetime.strptime('-'.join(parts[:3]), '%Y-%m-%d') - timedelta(days=1)
    return prev_day.strftime('%Y-%m-%d') + '-144'
# Pre-cleaned order records: one row per order, with the ten-minute time
# slice ('YYYY-MM-DD-S') and the minute within that slice already attached.
# A null driver_id marks an unanswered order (it counts toward demand but
# not supply).
order_column_names = ['order_id', 'driver_id', 'passenger_id',
                      'start_district_id', 'dest_district_id',
                      'price', 'time_piece', 'minute']
# np.str was only a deprecated alias of the builtin str (removed in
# NumPy 1.24); use str directly — identical behavior.
order_column_dtypes = {
    'order_id': str,
    'driver_id': str,
    'passenger_id': str,
    'start_district_id': np.uint8,
    'dest_district_id': np.uint8,
    'price': np.float16,
    'time_piece': str,
    'minute': np.uint8,
}
order_data = pd.read_csv('./clear_data/add_minute_order_data.csv',
                         header=None,
                         names=order_column_names,
                         dtype=order_column_dtypes)
# print(...) is valid in both Python 2 and 3; the bare `print x`
# statement was Python-2-only.
print(order_data[:1])
order_id driver_id \ 0 97ebd0c6680f7c0535dbfdead6e51b4b dd65fa250fca2833a3a8c16d2cf0457c passenger_id start_district_id dest_district_id \ 0 ed180d7daf639d936f1aeae4f7fb482f 23 47 price time_piece minute 0 24.0 2016-01-01-82 8
def _has_three_prior_slices(time_piece):
    """True unless *time_piece* is one of the first 3 slices of the
    data's first day (2016-01-01), which have no 3 preceding slices
    to aggregate over.
    """
    date_part, slice_part = time_piece.rsplit('-', 1)
    return date_part != '2016-01-01' or int(slice_part) > 3

# Keep only slices with 3 predecessors. NOTE(review): the original used a
# plain string comparison (time_piece > '2016-01-01-3'), which is
# lexicographic on the unpadded slice number and therefore wrongly
# discarded slices 10-29 and 100-144 of 2016-01-01
# (e.g. '2016-01-01-29' < '2016-01-01-3'). Compare numerically instead.
order_data = order_data[order_data['time_piece'].apply(_has_three_prior_slices)]
# Per-minute counts by pickup district: count() of 'order_id' gives demand
# (every order has one) and count() of 'driver_id' gives supply (only
# answered orders have a driver); gap = demand - supply.
# Column selection uses a list — indexing a GroupBy with a bare tuple
# ('order_id','driver_id') is deprecated and removed in pandas 2.0.
account_order_data_minute = order_data.groupby(
    ['start_district_id', 'time_piece', 'minute'],
    as_index=False)[['order_id', 'driver_id']].count()
account_order_data_minute = account_order_data_minute.rename(
    columns={'start_district_id': 'district_id',
             'order_id': 'demand',
             'driver_id': 'supply'})
account_order_data_minute['gap'] = (account_order_data_minute['demand']
                                    - account_order_data_minute['supply'])
account_order_data_minute = account_order_data_minute.drop(['supply'], axis=1)
account_order_data_minute[:1]
district_id time_piece minute demand gap 0 1 2016-01-01-30 1 1 0
# Skeleton frame for the rolling-window features: one row per unique
# (district_id, time_piece) pair. The mean/std statistics computed later
# are joined back onto this frame.
unique_keys = account_order_data_minute.loc[:, ['district_id', 'time_piece']]
data_roll_10 = unique_keys.drop_duplicates()
data_roll_10[:1]
district_id time_piece 0 1 2016-01-01-30
# Attach the three preceding time slices. trans_time_slice maps a slice to
# the one immediately before it, so chaining it yields t-1, t-2 and t-3.
previous_col = 'time_piece'
for step in (1, 2, 3):
    current_col = 'time_piece_%d' % step
    data_roll_10[current_col] = data_roll_10[previous_col].apply(trans_time_slice)
    previous_col = current_col
data_roll_10[:1]
district_id time_piece time_piece_1 time_piece_2 time_piece_3 0 1 2016-01-01-30 2016-01-01-29 2016-01-01-28 2016-01-01-27
# Pull in the per-minute demand/gap for the 30 minutes preceding each row
# (3 slices x 10 minutes). minute_1..minute_10 are constant helper columns
# that act as join keys, lining each merge up with one minute of the
# per-minute stats table.
for m in range(1, 11):
    data_roll_10['minute_%d' % m] = m

# One left-merge per (slice, minute) pair: the stats table's columns are
# positionally relabelled each iteration so that the join keys and the
# resulting feature names (demand_s_m / gap_s_m) match the current pair.
for s in range(1, 4):
    for m in range(1, 11):
        account_order_data_minute.columns = [
            'district_id',
            'time_piece_%d' % s,
            'minute_%d' % m,
            'demand_%d_%d' % (s, m),
            'gap_%d_%d' % (s, m),
        ]
        data_roll_10 = pd.merge(
            left=data_roll_10,
            right=account_order_data_minute,
            how='left',
            on=['district_id', 'time_piece_%d' % s, 'minute_%d' % m],
            sort=False,
        ).fillna(0)

# The helper join-key columns are no longer needed.
data_roll_10 = data_roll_10.drop(['minute_%d' % m for m in range(1, 11)], axis=1)
data_roll_10 = data_roll_10.drop(['time_piece_1', 'time_piece_2', 'time_piece_3'], axis=1)
data_roll_10[:3]
district_id time_piece demand_1_1 gap_1_1 demand_1_2 gap_1_2 demand_1_3 gap_1_3 demand_1_4 gap_1_4 … demand_3_6 gap_3_6 demand_3_7 gap_3_7 demand_3_8 gap_3_8 demand_3_9 gap_3_9 demand_3_10 gap_3_10 0 1 2016-01-01-30 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 … 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1 1 2016-01-01-31 1.0 0.0 4.0 4.0 0.0 0.0 4.0 1.0 … 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2 1 2016-01-01-32 7.0 5.0 3.0 3.0 3.0 2.0 3.0 2.0 … 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

3 rows × 62 columns

# Build 21 sliding 10-minute windows over the 30 preceding minutes.
# A flat minute index k (0-based, 0..29) maps to slice k//10 + 1 and
# minute k%10 + 1; window w (0-based) covers flat minutes w..w+9, so the
# last window (w=20) ends at minute 29.
for w in range(21):
    demand_cols = ['demand_%d_%d' % (k // 10 + 1, k % 10 + 1)
                   for k in range(w, w + 10)]
    gap_cols = ['gap_%d_%d' % (k // 10 + 1, k % 10 + 1)
                for k in range(w, w + 10)]
    data_roll_10['demand_sum10_%d' % (w + 1)] = data_roll_10[demand_cols].sum(axis=1)
    data_roll_10['gap_sum10_%d' % (w + 1)] = data_roll_10[gap_cols].sum(axis=1)

# Drop the raw per-minute columns now that the window sums exist.
per_minute_cols = []
for k in range(30):
    per_minute_cols.append('demand_%d_%d' % (k // 10 + 1, k % 10 + 1))
    per_minute_cols.append('gap_%d_%d' % (k // 10 + 1, k % 10 + 1))
data_roll_10 = data_roll_10.drop(per_minute_cols, axis=1)
data_roll_10[:1]
district_id time_piece demand_sum10_1 gap_sum10_1 demand_sum10_2 gap_sum10_2 demand_sum10_3 gap_sum10_3 demand_sum10_4 gap_sum10_4 … demand_sum10_17 gap_sum10_17 demand_sum10_18 gap_sum10_18 demand_sum10_19 gap_sum10_19 demand_sum10_20 gap_sum10_20 demand_sum10_21 gap_sum10_21 0 1 2016-01-01-30 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 … 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

1 rows × 44 columns

# Flag weekends (and the 2016-01-01 New Year holiday) for each row.
# NOTE(review): despite its name, 'isWeekday' is 1 for a weekend/holiday
# and 0 for a working day — the name is kept as-is because the later
# mean/std joins key on this column.
from datetime import datetime

split_parts = data_roll_10['time_piece'].apply(lambda tp: tp.rsplit('-', 1))
data_roll_10['slice'] = split_parts.apply(lambda parts: parts[1])
dates = split_parts.apply(lambda parts: parts[0])
# Weekday numbered Monday=1 .. Sunday=7.
data_roll_10['week_info'] = dates.apply(
    lambda d: datetime.strptime(d, '%Y-%m-%d').weekday() + 1)
data_roll_10['isWeekday'] = (data_roll_10['week_info'] >= 6).astype(np.uint8)
data_roll_10.loc[dates == '2016-01-01', 'isWeekday'] = 1
data_roll_10['isWeekday'] = data_roll_10['isWeekday'].astype(np.uint8)
data_roll_10 = data_roll_10.drop(['week_info'], axis=1)
data_roll_10[:1]
district_id time_piece demand_sum10_1 gap_sum10_1 demand_sum10_2 gap_sum10_2 demand_sum10_3 gap_sum10_3 demand_sum10_4 gap_sum10_4 … demand_sum10_18 gap_sum10_18 demand_sum10_19 gap_sum10_19 demand_sum10_20 gap_sum10_20 demand_sum10_21 gap_sum10_21 slice isWeekday 0 1 2016-01-01-30 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 … 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 30 1

1 rows × 46 columns

# Per-(district, day-type, slice) statistics of every rolling-window
# feature, summarizing across the dates that share the same slot.
data_roll_10_tmp = data_roll_10.drop(['time_piece'], axis=1)
group_keys = ['district_id', 'isWeekday', 'slice']

# Means, feature columns suffixed '_avg'.
data_roll_10_avg = data_roll_10_tmp.groupby(group_keys).mean().reset_index()
data_roll_10_avg_name = (list(data_roll_10_avg.columns[:3])
                         + [c + '_avg' for c in data_roll_10_avg.columns[3:]])
data_roll_10_avg.columns = data_roll_10_avg_name

# Standard deviations, feature columns suffixed '_std'.
data_roll_10_std = data_roll_10_tmp.groupby(group_keys).std().reset_index()
data_roll_10_std_name = (list(data_roll_10_std.columns[:3])
                         + [c + '_std' for c in data_roll_10_std.columns[3:]])
data_roll_10_std.columns = data_roll_10_std_name
data_roll_10_avg[:1]
district_id isWeekday slice demand_sum10_1_avg gap_sum10_1_avg demand_sum10_2_avg gap_sum10_2_avg demand_sum10_3_avg gap_sum10_3_avg demand_sum10_4_avg … demand_sum10_17_avg gap_sum10_17_avg demand_sum10_18_avg gap_sum10_18_avg demand_sum10_19_avg gap_sum10_19_avg demand_sum10_20_avg gap_sum10_20_avg demand_sum10_21_avg gap_sum10_21_avg 0 1 0 1 54.857143 3.214286 55.571429 3.357143 56.285714 3.357143 57.357143 … 60.142857 1.928571 60.714286 2.357143 60.428571 2.428571 61.285714 2.428571 63.642857 2.357143

1 rows × 45 columns

# Preview the per-slot std features.
data_roll_10_std.head(1)
district_id isWeekday slice demand_sum10_1_std gap_sum10_1_std demand_sum10_2_std gap_sum10_2_std demand_sum10_3_std gap_sum10_3_std demand_sum10_4_std … demand_sum10_17_std gap_sum10_17_std demand_sum10_18_std gap_sum10_18_std demand_sum10_19_std gap_sum10_19_std demand_sum10_20_std gap_sum10_20_std demand_sum10_21_std gap_sum10_21_std 0 1 0 1 17.866415 1.57766 18.587482 1.736803 19.648561 1.736803 20.220352 … 19.743966 2.129077 20.020868 2.239751 19.712215 2.502746 20.306985 2.208873 22.057991 2.437121

1 rows × 45 columns

# Join the mean/std statistics back onto every row; -1 marks slots with no
# matching statistic (e.g. std is NaN when a slot has one observation).
join_keys = ['district_id', 'isWeekday', 'slice']
for stats_frame in (data_roll_10_avg, data_roll_10_std):
    data_roll_10 = pd.merge(left=data_roll_10, right=stats_frame,
                            how='left', on=join_keys, sort=False).fillna(-1)
data_roll_10 = data_roll_10.drop(['isWeekday', 'slice'], axis=1)

# Free the intermediates.
del data_roll_10_avg, data_roll_10_std, data_roll_10_tmp, \
    data_roll_10_avg_name, data_roll_10_std_name
gc.collect()
79
# Done: 21 rolling 10-minute demand/gap sums per (district, time_piece),
# each with its matching avg and std statistic (128 columns total).
data_roll_10.head(2)
district_id time_piece demand_sum10_1 gap_sum10_1 demand_sum10_2 gap_sum10_2 demand_sum10_3 gap_sum10_3 demand_sum10_4 gap_sum10_4 … demand_sum10_17_std gap_sum10_17_std demand_sum10_18_std gap_sum10_18_std demand_sum10_19_std gap_sum10_19_std demand_sum10_20_std gap_sum10_20_std demand_sum10_21_std gap_sum10_21_std 0 1 2016-01-01-30 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 … 4.577377 1.133893 4.755949 1.133893 4.572173 1.133893 4.336995 1.133893 4.472136 1.133893 1 1 2016-01-01-31 25.0 13.0 24.0 13.0 20.0 9.0 20.0 9.0 … 4.810702 0.377964 4.358899 0.377964 4.572173 0.377964 4.613644 0.377964 4.535574 0.000000

2 rows × 128 columns

0 0