Pandas常用笔记

来源：互联网发布：阿里云快还是腾讯云快编辑：程序博客网时间：2024/06/15 19:14

官方文档

http://pandas.pydata.org/pandas-docs/stable/

导入包pandas

import pandas as pd

获取文件夹下文件名称

import osfilenames=[]for file in os.listdir(path):    filenames.append(file)

读前几行文件(大文件)

# -*- coding: utf-8 -*-##读前几行文件f= open("/Users/yuyin/Downloads/Action_all.csv")for i in range(5):    print(f.readline().strip())

读取文件

file = pd.read_csv("./非空记录/"+filenames[i],sep='--->',header=None)

写文件

file.to_csv("./非空all/alldata.csv",header=False,mode='a',index=False) #a追加

字符串截取与类型转换

str(12+int(t2[6:8])

创建DF list to DF

add_all=pd.DataFrame({'shop_id':add_01,'time':add_02,'year':add_03,'mouth':add_04,'day':add_05,'pay_count':add_06,'pay_user_count':add_07})

DF复制

re2=re.copy(deep=True)

DF类型转换astype

model_off_train_1[model_off_train_1.iloc[:,5].astype(int)<=20160401]

DF截取某一列

alldata.iloc[:,4]test_pre_uidmid['u1']

DF选取多列

train.iloc[:,2:(train.shape[1]-1)]#最后一列train.iloc[:,-1]#带名字的列val[['user_id','sku_id']]

抽样

re = train.sample(frac=0.25, random_state=66)

利用sql执行DF

from pandasql import sqldfpysqldf=lambda q:sqldf(q,globals())pysqldf("select * from test_sample  limit 5")

DF列重命名

feature_1.columns=['c0','c1','c2','c3','c4','c5','c6']

DF左连接

train_sample_all=pd.merge(train_sample_all,feature_1,how='left',on=['c0','c1'])

查看缺失值及缺失值补为0

train_sample_all[pd.isnull(train_sample_all.iloc[:,7])].head()  #查看缺失值train_sample_all.iloc[:,7]=train_sample_all.iloc[:,7].fillna(0)  #填充缺失值

重新DF建立行索引

tmp=pd.DataFrame(model_off_train_1_tmp)tmp.index=[i for i in range(tmp.shape[0])]tmp=train_all[(train_all['time']<=20161031)&(train_all['time']>=20161018)].reset_index()

显示DF所有的行列

pd.set_option('display.max_rows', None)pd.set_option('display.max_columns', None)

选择DF满足多条件的部分

train_sample_1=model_off_train_1[(model_off_train_1.iloc[:,6].astype(int)<=20160615)&(model_off_train_1.iloc[:,6].astype(int)>=20160201)]tmp=train_all[(train_all['time']<=20161031)&(train_all['time']>=20161018)].reset_index()

抽样

pture.sample(int(pture.shape[0]/2))df.sample(n)

多个list转DF

pd.DataFrame({'c0':tmp2.index,'c1':tmp2.iloc[:,1]})feature_1=pd.DataFrame({'u5':tmp})

添加新列及列之间运算

re1['diff'] = map(lambda x, y: abs(x-y) , re1['label'], re1['pro'])

DF添加DF类型列(合并列)

test_pre_uidmid=pd.concat([test_pre_uidmid,feature_1],axis=1)

相同列的DF行合并(合并行)

test_pre_allend_all=pd.concat([test_pre_allend,test_pre_allend_1])

DF排序

lc.sort(["loan_amnt"])#默认升序Truelc.sort(["loan_amnt"],ascending=True)#降序lc.sort(["int_rate","loan_amnt"],ascending=False)

更改DF列排序

默认字典排序

columns = ['shop_id','time','year','mouth','day','pay_count','pay_user_count']add_all=add_all.ix[:,columns]

DF去重

tmp=tmp.drop_duplicates(['c0','c1','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','u1','u2','u3','u4','u5','u6','u7','m1','m2','m3','m4','m5'])shop_id = user_pay_all['shop_id']shop_id = shop_id.drop_duplicates()

DF选择缺失值部分

test_pre_allend_1=test_pre_allend[pd.isnull(test_pre_allend.iloc[:,7])]test_pre_allend=test_pre_allend[-pd.isnull(test_pre_allend.iloc[:,7])]

DF选取满足含有某字符串的部分=like匹配

alldata=alldata[(alldata['c2'].str.contains('@nuc',na=False))&(alldata['c3'].str.contains('@nuc',na=False))]

计算时间差

import datetime#计算时间差-天 (datetime.datetime(2016,2,11)-datetime.datetime(2016,2,01)).days#时间差  秒a=datetime.now()b=datetime.now()(b-a)(b-a).seconds

时间转换-字符串格式更改-时间戳转换

#如a = "2013-10-10 23:40:00",想改为 a = "2013/10/10 23:40:00"方法:先转换为时间数组,然后转换为其他格式import timea = "2013-10-10 23:40:00"timeArray = time.strptime(a, "%Y-%m-%d %H:%M:%S")otherStyleTime = time.strftime("%Y/%m/%d %H:%M:%S", timeArray)#转换为时间戳:timeStamp = int(time.mktime(timeArray))timeStamp == 1381419600#时间戳转换为指定格式日期:timeStamp = 1381419600timeArray = time.localtime(timeStamp)otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)otherStyletime == "2013-10-10 23:40:00"#获得当前时间时间戳now = int(time.time())  ->这是时间戳##datetimeimport datetime#获得当前时间now = datetime.datetime.now()  ->这是时间数组格式#string转datetimestr = '2012-11-19'date_time = datetime.datetime.strptime(str,'%Y-%m-%d')#datetime转stringdate_time.strftime('%Y-%m-%d')#datetime转时间戳time_time = time.mktime(date_time.timetuple())#时间戳转stringtime.strftime('%Y-%m-%d',time.localtime(time_time))#date转datetimedate = datetime.date.today()datetime.datetime.strptime(str(date),'%Y-%m-%d') #将date转换为str，在由str转换为datetime

时间间隔增加/减少

import datetimenow_time = datetime.datetime.now()yes_time = now_time + datetime.timedelta(days=1) #-n nyes_time_nyr = yes_time.strftime('%Y%m%d')  ##//格式化输出

np生成随机数

#生成100个0-100的等差数列x = np.linspace(0, 100, 100) x2=np.random.randint(0,50, 100) ##0-50范围100个随机数 np.random.random(100) 0-1范围 np.random.normal(0,0.1,100)均值0方差0.1正态分布

简单绘图

import matplotlib.pyplot as pltimport numpy as np#生成100个0-100的等差数列x = np.linspace(0, 100, 100) #plt.plot(x,tmp1[0:100],'')#默认折线图plt.plot(x,tmp1[0:100])plt.show()#直接把x按照折线图画出plt.plot(x)plt.ylabel('np_linspace')plt.xlabel('1-100')plt.title('test')plt.show()##颜色线的形状##'g--'意思 颜色g 和--形状plt.plot(x,'g--')plt.show()##'go-'意思 颜色g、点的形状o、线的形状-x2=np.random.randint(0,50, 100)plt.plot(x2,'go-')##等价于##plt.plot(x2,color='g',linestyle='-',marker='o')plt.show()##刻度plt.xlim([0,100])plt.ylim([0,100])#散点图 oplt.plot(x,tmp1[0:100],'o')plt.show()##柱状图plt.hist(x2)plt.show()#一个图绘制多个线 o散点r颜色plt.plot(x,tmp1[0:100],'or')plt.plot(x,tmp1[0:100])plt.show()#汇制多个图x = np.linspace(0, 100, 100) x2=np.random.randint(0,50, 100)fig=plt.figure()#设置宽高fig=plt.figure(figsize=(15,5)) ##宽x高# fig.set_figwidth(15)# fig.set_figheight(5)##p1图是画布1x2(1行2列)里面第1个p1=fig.add_subplot(1,2,1)p2=fig.add_subplot(1,2,2)p1.plot(x,'r--')p1.set_ylabel('xx')#用set方法p2.hist(x2)plt.show()

生成时间序列

import datetimedef datelist(start, end):    start_date = datetime.date(*start)    end_date = datetime.date(*end)    result = []    curr_date = start_date    while curr_date != end_date:        result.append("%04d%02d%02d" % (curr_date.year, curr_date.month, curr_date.day))        curr_date += datetime.timedelta(1)    result.append("%04d%02d%02d" % (curr_date.year, curr_date.month, curr_date.day))    return resultif __name__ == "__main__":    print datelist((2014, 7, 28), (2014, 8, 3))

分组运算groupby count sum 自定义函数

mean(), sum(), max()
参考https://my.oschina.net/lionets/blog/280332

train_all.groupby(['send','recieve'])['time'].count().reset_index()

1 0