Pandas常用笔记
来源:互联网 发布:阿里云快还是腾讯云快 编辑:程序博客网 时间:2024/06/15 19:14
官方文档
http://pandas.pydata.org/pandas-docs/stable/
导入包pandas
import pandas as pd
获取文件夹下文件名称
import osfilenames=[]for file in os.listdir(path): filenames.append(file)
读前几行文件(大文件)
# -*- coding: utf-8 -*-##读前几行文件f= open("/Users/yuyin/Downloads/Action_all.csv")for i in range(5): print(f.readline().strip())
读取文件
file = pd.read_csv("./非空记录/"+filenames[i],sep='--->',header=None)
写文件
file.to_csv("./非空all/alldata.csv",header=False,mode='a',index=False) #a追加
字符串截取与类型转换
str(12+int(t2[6:8])
创建DF list to DF
add_all=pd.DataFrame({'shop_id':add_01,'time':add_02,'year':add_03,'mouth':add_04,'day':add_05,'pay_count':add_06,'pay_user_count':add_07})
DF复制
re2=re.copy(deep=True)
DF类型转换astype
model_off_train_1[model_off_train_1.iloc[:,5].astype(int)<=20160401]
DF截取某一列
alldata.iloc[:,4]test_pre_uidmid['u1']
DF选取多列
train.iloc[:,2:(train.shape[1]-1)]#最后一列train.iloc[:,-1]#带名字的列val[['user_id','sku_id']]
抽样
re = train.sample(frac=0.25, random_state=66)
利用sql执行DF
from pandasql import sqldfpysqldf=lambda q:sqldf(q,globals())pysqldf("select * from test_sample limit 5")
DF列重命名
feature_1.columns=['c0','c1','c2','c3','c4','c5','c6']
DF左连接
train_sample_all=pd.merge(train_sample_all,feature_1,how='left',on=['c0','c1'])
查看缺失值及缺失值补为0
train_sample_all[pd.isnull(train_sample_all.iloc[:,7])].head() #查看缺失值train_sample_all.iloc[:,7]=train_sample_all.iloc[:,7].fillna(0) #填充缺失值
重新DF建立行索引
tmp=pd.DataFrame(model_off_train_1_tmp)tmp.index=[i for i in range(tmp.shape[0])]tmp=train_all[(train_all['time']<=20161031)&(train_all['time']>=20161018)].reset_index()
显示DF所有的行列
pd.set_option('display.max_rows', None)pd.set_option('display.max_columns', None)
选择DF满足多条件的部分
train_sample_1=model_off_train_1[(model_off_train_1.iloc[:,6].astype(int)<=20160615)&(model_off_train_1.iloc[:,6].astype(int)>=20160201)]tmp=train_all[(train_all['time']<=20161031)&(train_all['time']>=20161018)].reset_index()
抽样
pture.sample(int(pture.shape[0]/2))df.sample(n)
多个list转DF
pd.DataFrame({'c0':tmp2.index,'c1':tmp2.iloc[:,1]})feature_1=pd.DataFrame({'u5':tmp})
添加新列及列之间运算
re1['diff'] = map(lambda x, y: abs(x-y) , re1['label'], re1['pro'])
DF添加DF类型列(合并列)
test_pre_uidmid=pd.concat([test_pre_uidmid,feature_1],axis=1)
相同列的DF行合并(合并行)
test_pre_allend_all=pd.concat([test_pre_allend,test_pre_allend_1])
DF排序
lc.sort(["loan_amnt"])#默认升序Truelc.sort(["loan_amnt"],ascending=True)#降序lc.sort(["int_rate","loan_amnt"],ascending=False)
更改DF列排序
默认字典排序
columns = ['shop_id','time','year','mouth','day','pay_count','pay_user_count']add_all=add_all.ix[:,columns]
DF去重
tmp=tmp.drop_duplicates(['c0','c1','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','u1','u2','u3','u4','u5','u6','u7','m1','m2','m3','m4','m5'])shop_id = user_pay_all['shop_id']shop_id = shop_id.drop_duplicates()
DF选择缺失值部分
test_pre_allend_1=test_pre_allend[pd.isnull(test_pre_allend.iloc[:,7])]test_pre_allend=test_pre_allend[-pd.isnull(test_pre_allend.iloc[:,7])]
DF选取满足含有某字符串的部分=like匹配
alldata=alldata[(alldata['c2'].str.contains('@nuc',na=False))&(alldata['c3'].str.contains('@nuc',na=False))]
计算时间差
import datetime#计算时间差-天 (datetime.datetime(2016,2,11)-datetime.datetime(2016,2,01)).days#时间差 秒a=datetime.now()b=datetime.now()(b-a)(b-a).seconds
时间转换-字符串格式更改-时间戳转换
#如a = "2013-10-10 23:40:00",想改为 a = "2013/10/10 23:40:00"方法:先转换为时间数组,然后转换为其他格式import timea = "2013-10-10 23:40:00"timeArray = time.strptime(a, "%Y-%m-%d %H:%M:%S")otherStyleTime = time.strftime("%Y/%m/%d %H:%M:%S", timeArray)#转换为时间戳:timeStamp = int(time.mktime(timeArray))timeStamp == 1381419600#时间戳转换为指定格式日期:timeStamp = 1381419600timeArray = time.localtime(timeStamp)otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)otherStyletime == "2013-10-10 23:40:00"#获得当前时间时间戳now = int(time.time()) ->这是时间戳##datetimeimport datetime#获得当前时间now = datetime.datetime.now() ->这是时间数组格式#string转datetimestr = '2012-11-19'date_time = datetime.datetime.strptime(str,'%Y-%m-%d')#datetime转stringdate_time.strftime('%Y-%m-%d')#datetime转时间戳time_time = time.mktime(date_time.timetuple())#时间戳转stringtime.strftime('%Y-%m-%d',time.localtime(time_time))#date转datetimedate = datetime.date.today()datetime.datetime.strptime(str(date),'%Y-%m-%d') #将date转换为str,在由str转换为datetime
时间间隔增加/减少
import datetimenow_time = datetime.datetime.now()yes_time = now_time + datetime.timedelta(days=1) #-n nyes_time_nyr = yes_time.strftime('%Y%m%d') ##//格式化输出
np生成随机数
#生成100个0-100的等差数列x = np.linspace(0, 100, 100) x2=np.random.randint(0,50, 100) ##0-50范围100个随机数 np.random.random(100) 0-1范围 np.random.normal(0,0.1,100)均值0方差0.1正态分布
简单绘图
import matplotlib.pyplot as pltimport numpy as np#生成100个0-100的等差数列x = np.linspace(0, 100, 100) #plt.plot(x,tmp1[0:100],'')#默认折线图plt.plot(x,tmp1[0:100])plt.show()#直接把x按照折线图画出plt.plot(x)plt.ylabel('np_linspace')plt.xlabel('1-100')plt.title('test')plt.show()##颜色线的形状##'g--'意思 颜色g 和--形状plt.plot(x,'g--')plt.show()##'go-'意思 颜色g、点的形状o、线的形状-x2=np.random.randint(0,50, 100)plt.plot(x2,'go-')##等价于##plt.plot(x2,color='g',linestyle='-',marker='o')plt.show()##刻度plt.xlim([0,100])plt.ylim([0,100])#散点图 oplt.plot(x,tmp1[0:100],'o')plt.show()##柱状图plt.hist(x2)plt.show()#一个图绘制多个线 o散点r颜色plt.plot(x,tmp1[0:100],'or')plt.plot(x,tmp1[0:100])plt.show()#汇制多个图x = np.linspace(0, 100, 100) x2=np.random.randint(0,50, 100)fig=plt.figure()#设置宽高fig=plt.figure(figsize=(15,5)) ##宽x高# fig.set_figwidth(15)# fig.set_figheight(5)##p1图是画布1x2(1行2列)里面第1个p1=fig.add_subplot(1,2,1)p2=fig.add_subplot(1,2,2)p1.plot(x,'r--')p1.set_ylabel('xx')#用set方法p2.hist(x2)plt.show()
生成时间序列
import datetimedef datelist(start, end): start_date = datetime.date(*start) end_date = datetime.date(*end) result = [] curr_date = start_date while curr_date != end_date: result.append("%04d%02d%02d" % (curr_date.year, curr_date.month, curr_date.day)) curr_date += datetime.timedelta(1) result.append("%04d%02d%02d" % (curr_date.year, curr_date.month, curr_date.day)) return resultif __name__ == "__main__": print datelist((2014, 7, 28), (2014, 8, 3))
分组运算groupby count sum 自定义函数
mean(), sum(), max()
参考https://my.oschina.net/lionets/blog/280332
train_all.groupby(['send','recieve'])['time'].count().reset_index()
1 0
- Pandas常用笔记
- python | 工作笔记 | pandas 常用总结
- pandas 笔记
- pandas笔记
- pandas笔记
- Pandas笔记
- 学习笔记三、pandas包中常用方法整理
- Pandas学习笔记:pandas基础
- pandas常用操作
- pandas常用的方法
- Pandas常用操作
- pandas 常用函数
- python~pandas常用代码
- Pandas常用操作
- pandas 常用操作
- pandas常用功能整理
- pandas常用函数
- pandas常用函数
- Colored Sticks(欧拉回路+字典树+并查集)
- skearn.cluster.AffiinityPropagation
- J2EE学习笔记(三)
- LeetCode 27 Remove Element题解
- CLLocationManager定位坐标不准确问题以及WGS_84转GCJ_02坐标位置纠错的方法
- Pandas常用笔记
- win7低版本升级到win7旗舰版的方法
- 文本反垃圾的原理和应用
- 3 InetAddress
- Android shape属性
- shell 脚本之 Function 功能的使用
- hadoop基础----hadoop实战(九)-----hadoop管理工具---CDH的错误排查(持续更新)
- 微信js-sdk使用流程
- 单独设置UIView的坐标x或y或width或height