利用ipython notebook --numpy,pandas构建特征
来源:互联网 发布:淘宝装修设计教程 编辑:程序博客网 时间:2024/05/27 08:13
工具:ipython notebook
数据来源:滴滴算法大赛
目标:统计预测时间的前三个时间片按照区域(district_id)、每分钟(time_piece)的需求(demand)、供应缺口(gap),及以10分钟为滑动窗口的统计值,再加上平均值、方差(多考虑了是否工作日)
亮点:一步代码,一步数据演示
import pandas as pdimport numpy as npimport gc
def trans_time_slice(time): time = time.split('-') if int(time[3]) > 1: time[3] = str(int(time[3]) - 1) else: if int(time[2]) > 11: time[2] = str(int(time[2]) - 1) else: time[2] = str(0)+str(int(time[2])-1) time[3] = '144' return '-'.join(time)
order_data = pd.read_csv('./clear_data/add_minute_order_data.csv',header=None ,names=['order_id','driver_id','passenger_id','start_district_id','dest_district_id','price','time_piece','minute'] ,dtype = {'order_id':np.str,'driver_id':np.str,'passenger_id':np.str,'start_district_id':np.uint8,'dest_district_id':np.uint8,'price':np.float16,'time_piece':np.str,'minute':np.uint8})
print order_data[:1]
order_id driver_id \ 0 97ebd0c6680f7c0535dbfdead6e51b4b dd65fa250fca2833a3a8c16d2cf0457c passenger_id start_district_id dest_district_id \ 0 ed180d7daf639d936f1aeae4f7fb482f 23 47 price time_piece minute 0 24.0 2016-01-01-82 8 order_data = order_data[(order_data['time_piece']>'2016-01-01-3')] #确保每个时间片都有前3个时间片
#得到分钟统计值account_order_data_minute = order_data.groupby(['start_district_id', 'time_piece', 'minute'],as_index=False)['order_id','driver_id'].count()account_order_data_minute = account_order_data_minute.rename(columns={'start_district_id':'district_id','order_id':'demand', 'driver_id':'supply'})account_order_data_minute['gap'] = account_order_data_minute['demand'] - account_order_data_minute['supply']account_order_data_minute = account_order_data_minute.drop(['supply'],axis=1)
account_order_data_minute[:1]
#为了后续求均值、方差,需要定义一个变量data_roll_10 = account_order_data_minute[['district_id','time_piece']].drop_duplicates()
data_roll_10[:1]
#添加前3个时间片data_roll_10['time_piece_1'] = data_roll_10['time_piece'].apply(trans_time_slice)data_roll_10['time_piece_2'] = data_roll_10['time_piece_1'].apply(trans_time_slice)data_roll_10['time_piece_3'] = data_roll_10['time_piece_2'].apply(trans_time_slice)
#这里可以很清晰的看到,trans_time_slice函数式用于得到某个时间片的前一个时间片data_roll_10[:1]
#得到前30分钟每分钟的demand,gapfor i in range(10): data_roll_10['minute_'+str(i+1)] = i+1#connect 1 minute featurefor i in range(30): time_slice = int(i/10+1) minute = int((i+1)%10 if (i+1)%10 != 0 else 10) account_order_data_minute.columns = ['district_id','time_piece_'+str(time_slice),'minute_'+str(minute),'demand_'+str(time_slice)+'_'+str(minute),'gap_'+str(time_slice)+'_'+str(minute)] data_roll_10 = pd.merge(left = data_roll_10, right = account_order_data_minute, how = 'left' ,on = ['district_id','time_piece_'+str(time_slice),'minute_'+str(minute)], sort = False).fillna(0)#删除用于连接的列for i in range(10): data_roll_10 = data_roll_10.drop(['minute_'+str(i+1)],axis=1)data_roll_10 = data_roll_10.drop(['time_piece_1','time_piece_2','time_piece_3'],axis=1)
data_roll_10[:3]
3 rows × 62 columns
#计算滑动窗口统计值all_slice = 1all_minute = 1for i in range(21): tmp_slice = all_slice tmp_minute = all_minute for j in range(10): if j == 0: data_roll_10['demand_sum10_'+str(i+1)] = data_roll_10['demand_'+str(tmp_slice)+'_'+str(tmp_minute)] data_roll_10['gap_sum10_'+str(i+1)] = data_roll_10['gap_'+str(tmp_slice)+'_'+str(tmp_minute)] else: data_roll_10['demand_sum10_'+str(i+1)] += data_roll_10['demand_'+str(tmp_slice)+'_'+str(tmp_minute)] data_roll_10['gap_sum10_'+str(i+1)] += data_roll_10['gap_'+str(tmp_slice)+'_'+str(tmp_minute)] if tmp_minute <= 9: tmp_minute += 1 else: tmp_slice += 1 tmp_minute = 1 if all_minute <= 9: all_minute += 1 else: all_slice += 1 all_minute = 1
#删除用于求滚动窗口的列for i in range(30): time_slice = int(i/10+1) minute = int((i+1)%10 if (i+1)%10 != 0 else 10) data_roll_10 = data_roll_10.drop(['demand_'+str(time_slice)+'_'+str(minute),'gap_'+str(time_slice)+'_'+str(minute)],axis=1)
data_roll_10[:1]
1 rows × 44 columns
#加上是否工作日from datetime import datetimedata_roll_10['slice'] = data_roll_10['time_piece'].apply(lambda x : x[x.rfind('-')+1:])data_roll_10['week_info'] = data_roll_10['time_piece'].apply(lambda x : datetime.strptime(x[:x.rfind('-')],'%Y-%m-%d').weekday()+1)data_roll_10.loc[(data_roll_10['week_info'] < 6),'isWeekday'] = 0data_roll_10.loc[(data_roll_10['week_info'] >= 6),'isWeekday'] = 1data_roll_10.loc[(data_roll_10['time_piece'].apply(lambda x : x[:x.rfind('-')]) == '2016-01-01'),'isWeekday'] = 1data_roll_10['isWeekday'] = data_roll_10['isWeekday'].astype(np.uint8)data_roll_10 = data_roll_10.drop(['week_info'],axis=1)
data_roll_10[:1]
1 rows × 46 columns
data_roll_10_tmp = data_roll_10.drop(['time_piece'],axis=1)#平均值data_roll_10_avg = data_roll_10_tmp.groupby(['district_id','isWeekday','slice']).mean().reset_index()data_roll_10_avg_name=list(data_roll_10_avg.columns)for i in range(3,len(data_roll_10_avg_name)): data_roll_10_avg_name[i]+='_avg'data_roll_10_avg.columns=data_roll_10_avg_name#方差data_roll_10_std = data_roll_10_tmp.groupby(['district_id','isWeekday','slice']).std().reset_index()data_roll_10_std_name=list(data_roll_10_std.columns)for i in range(3,len(data_roll_10_std_name)): data_roll_10_std_name[i]+='_std'data_roll_10_std.columns=data_roll_10_std_name
data_roll_10_avg[:1]
1 rows × 45 columns
data_roll_10_std[:1]
1 rows × 45 columns
#链接均值、方差data_roll_10 = pd.merge(left=data_roll_10,right=data_roll_10_avg,how='left',on=['district_id','isWeekday','slice'],sort=False).fillna(-1)data_roll_10 = pd.merge(left=data_roll_10,right=data_roll_10_std,how='left',on=['district_id','isWeekday','slice'],sort=False).fillna(-1)data_roll_10 = data_roll_10.drop(['isWeekday','slice'],axis=1)del data_roll_10_avg,data_roll_10_std,data_roll_10_tmp,data_roll_10_avg_name,data_roll_10_std_namegc.collect()
79#至此,我们完成了以10分钟为滑动窗口的demand,gap的统计,并且加上了avg,std这两个统计量data_roll_10[:2]
2 rows × 128 columns
0 0
- 利用ipython notebook --numpy,pandas构建特征
- windows64 配置numpy,scipy及ipython notebook
- python3 利用pip安装ipython notebook
- Ubuntu下安装numpy matplotlib scikit-learn ipython-notebook
- 无监督机器学习--Ipython notebook
- Mac下iPython-notebook搭建及使用
- mac下的Ipython Notebook-如何修改默认路径
- ipython notebook
- Ipython notebook
- ipython notebook
- ipython+notebook
- IPython notebook
- ipython notebook
- ipython notebook
- ipython + notebook
- ipython notebook
- IPython notebook快捷键(Jupyter notebook)
- IPython notebook搭建
- Android用代码判断是否在使用VPN
- linux screen命令详解
- Kodi ——7 Filling containers
- scrollView
- Oracle表空间和数据文件详解(一)
- 利用ipython notebook --numpy,pandas构建特征
- poj 1061 青蛙的约会 (线性同余,扩展欧几里得)
- java爬虫(Jsoup)爬取某站点评论
- 深入理解JSX
- 数据挖掘学习书籍推荐 1
- CSRF漏洞
- 批处理总结
- vim使用
- AABB轴向包围盒