Pandas tips
来源:互联网 发布:苹果扫描软件下载 编辑:程序博客网 时间:2024/05/29 15:23
# NumPy is organised around array (list-like) data, whereas pandas offers
# labelled, dict-like access; pandas is built on top of NumPy.
# The two main pandas data structures are Series and DataFrame.
import pandas as pd
import numpy as np

# A Series prints with the index on the left and the values on the right.
# No index is supplied here, so an integer index 0..len-1 is created.
s = pd.Series([1, 3, 6, np.nan, 44, 1])
print(s)

# A DataFrame is a table with an ordered set of columns; each column may hold
# a different value type (numbers, strings, booleans, ...). It has both a row
# index and a column index and can be viewed as a dict of Series.
dates = pd.date_range('20171110', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
print(df)

# Select column 'b' by its label.
print(df['b'])

# Without explicit row/column labels both axes are numbered from 0.
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)))
print(df1)

# Build a frame column by column; scalar values are broadcast to the length
# of the array-like columns (4 rows here).
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20171201'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'too',
})
print(df2)

# Per-column dtypes: A float64, C float32, D int32, E category.
# B is a datetime64 column and F a string column; the exact dtype names
# printed for those two may differ between pandas versions.
print(df2.dtypes)

print(df2.index)       # row labels
print(df2.columns)     # column labels
print(df2.values)      # underlying values as an array
print(df2.describe())  # summary statistics
print(df2.T)           # transpose (swap rows and columns)
print(df2.sort_index(axis=1, ascending=False))  # column labels in descending order
print(df2.sort_values(by='B'))                  # rows sorted by the values of column B
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import pandas as pd
import numpy as np
# Build a 6x4 integer frame indexed by six consecutive dates.
dates = pd.date_range('20171201', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
##              A   B   C   D
## 2017-12-01   0   1   2   3
## 2017-12-02   4   5   6   7
## 2017-12-03   8   9  10  11
## 2017-12-04  12  13  14  15
## 2017-12-05  16  17  18  19
## 2017-12-06  20  21  22  23

# Select a single column, by key or by attribute.
print(df['A'])
print(df.A)

# Select several rows by slicing.
print(df[0:3])                      # positional slice: first three rows
print(df['20171203':'20171205'])    # label slice on the date index (end inclusive)
print(df[3:3])                      # empty slice -> empty frame

# Label-based selection with .loc.
print(df.loc['20171202'])               # one row
print(df.loc[:, ['A', 'B']])            # columns A and B, all rows
print(df.loc['20171203', ['A', 'B']])   # one row, selected columns

# Position-based selection with .iloc.
print(df.iloc[3, 1])
print(df.iloc[3:5, 1:3])
print(df.iloc[[1, 3, 5], 1:3])

# First three rows (by position), columns A and C (by label).
# The mixed indexer `.ix` was removed in pandas 1.0, so the positional part is
# turned into labels and the whole selection is done through .loc.
print(df.loc[df.index[:3], ['A', 'C']])

# Boolean selection: rows where A > 8.
print(df[df.A > 8])
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import pandas as pd
import numpy as np
# Build a 6x4 integer frame indexed by six consecutive dates.
dates = pd.date_range('20171201', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
##              A   B   C   D
## 2017-12-01   0   1   2   3
## 2017-12-02   4   5   6   7
## 2017-12-03   8   9  10  11
## 2017-12-04  12  13  14  15
## 2017-12-05  16  17  18  19
## 2017-12-06  20  21  22  23

# Overwrite single cells, by position and by label.
df.iloc[2, 2] = 1111
df.loc['20171201', 'B'] = 2222
print(df)
##              A     B     C   D
## 2017-12-01   0  2222     2   3
## 2017-12-02   4     5     6   7
## 2017-12-03   8     9  1111  11
## ...

# Conditional update: set B to 0 wherever A > 4.
# A single .loc call is used instead of the chained `df.B[mask] = 0`, which
# assigns through a temporary object and is not guaranteed to modify df.
df.loc[df.A > 4, 'B'] = 0
print(df)
##              A     B     C   D
## 2017-12-01   0  2222     2   3
## 2017-12-02   4     5     6   7
## 2017-12-03   8     0  1111  11
## 2017-12-04  12     0    14  15
## 2017-12-05  16     0    18  19
## 2017-12-06  20     0    22  23

# Add a new column and fill the whole column with one value.
df['F'] = np.nan
print(df)

# Add a column from a Series; values are aligned on the index, so the Series
# index must match the frame's row labels.
df['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20171201', periods=6))
print(df)
##              A     B     C   D   F  E
## 2017-12-01   0  2222     2   3 NaN  1
## 2017-12-02   4     5     6   7 NaN  2
## 2017-12-03   8     0  1111  11 NaN  3
## 2017-12-04  12     0    14  15 NaN  4
## 2017-12-05  16     0    18  19 NaN  5
## 2017-12-06  20     0    22  23 NaN  6
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import pandas as pd
import numpy as np
# Build a 6x4 integer frame indexed by six consecutive dates.
dates = pd.date_range('20171201', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
##              A   B   C   D
## 2017-12-01   0   1   2   3
## 2017-12-02   4   5   6   7
## 2017-12-03   8   9  10  11
## 2017-12-04  12  13  14  15
## 2017-12-05  16  17  18  19
## 2017-12-06  20  21  22  23

# NaN cannot be stored in an integer column. Cast the two affected columns to
# float explicitly rather than relying on silent upcasting during assignment,
# which recent pandas versions deprecate.
df = df.astype({'B': 'float64', 'C': 'float64'})
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
print(df)
##              A     B     C   D
## 2017-12-01   0   NaN   2.0   3
## 2017-12-02   4   5.0   NaN   7
## 2017-12-03   8   9.0  10.0  11
## 2017-12-04  12  13.0  14.0  15
## 2017-12-05  16  17.0  18.0  19
## 2017-12-06  20  21.0  22.0  23

# dropna returns a new frame with the NaN rows removed.
print(df.dropna(
    axis=0,     # 0 drops rows, 1 drops columns
    how='any',  # 'any': drop if any value is NaN; 'all': only if every value is NaN
))
print(df)  # the original frame is unchanged

print(df.fillna(value=0))  # replace NaN with 0 (also returns a new frame)
print(df.isnull())         # element-wise mask: True where a value is missing

# True if the frame contains at least one NaN anywhere.
print(df.isnull().values.any())
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#pandas可以读取与存取的资料格式有很多种,像csv、excel、json、html与pickle等
#读取csv
import pandas as pd
# Load the CSV file into a DataFrame and show it.
data = pd.read_csv(filepath_or_buffer='students.csv')
print(data)
# Persist the same data in pickle format.
data.to_pickle(path='students.pickle')
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#axis未设置时,默认值为0
import pandas as pd
import numpy as np
# Three 3x4 float frames sharing the columns a-d, filled with 0.0, 1.0 and 2.0.
df1 = pd.DataFrame(np.zeros((3, 4)), columns=list('abcd'))
df2 = pd.DataFrame(np.ones((3, 4)), columns=list('abcd'))
df3 = pd.DataFrame(np.full((3, 4), 2.0), columns=list('abcd'))
print(df1, df2, df3, sep='\n')

# Stack the frames on top of each other (row-wise, axis 0).
# Each frame keeps its own row labels, so the result index is 0,1,2,0,1,2,0,1,2.
res = pd.concat([df1, df2, df3])
print(res)

# Same stacking, but discard the old row labels and renumber the rows 0..8.
res = pd.concat([df1, df2, df3], ignore_index=True)
print(res)
# Two frames whose columns only partly overlap: df1 has a-d, df2 has b-e.
df1 = pd.DataFrame(np.zeros((3, 4)), columns=list('abcd'), index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)), columns=list('bcde'), index=[2, 3, 4])
print(df1)
print(df2)

# join='outer': keep the union of the columns; cells that a frame does not
# provide (column e for df1, column a for df2) are filled with NaN.
res = pd.concat([df1, df2], join='outer')
print(res)

# join='inner': keep only the columns present in both frames (b, c, d).
res = pd.concat([df1, df2], join='inner')
print(res)

# Inner join again, with the row labels replaced by 0..5.
res = pd.concat([df1, df2], join='inner', ignore_index=True)
print(res)
# Two frames with partly overlapping row labels (1-3 and 2-4).
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])

# Side-by-side (column-wise) concat restricted to df1's row labels.
# The `join_axes` argument was removed in pandas 1.0; reindexing the result
# on df1.index gives the same rows.
res = pd.concat([df1, df2], axis=1).reindex(df1.index)
print(res)
##      a    b    c    d    b    c    d    e
## 1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
## 2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
## 3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0

# Without the restriction the union of both indexes is kept.
res = pd.concat([df1, df2], axis=1)
print(res)
##      a    b    c    d    b    c    d    e
## 1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
## 2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
## 3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
## 4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])

# DataFrame.append (row-wise only) was removed in pandas 2.0; pd.concat with
# the default axis=0 does the same job.

# Put df2 below df1 and renumber the rows.
res = pd.concat([df1, df2], ignore_index=True)
print(res)

# Several frames can be stacked in one call.
res = pd.concat([df1, df2, df3], ignore_index=True)
print(res)

# Add a Series as one extra row: turn it into a one-row frame first so that
# its index labels (a-d) line up with the frame's columns.
res = pd.concat([df1, s1.to_frame().T], ignore_index=True)
print(res)
##      a    b    c    d
## 0  0.0  0.0  0.0  0.0
## 1  0.0  0.0  0.0  0.0
## 2  0.0  0.0  0.0  0.0
## 3  1.0  2.0  3.0  4.0
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import pandas as pd
# Two frames that share the column 'key' with identical values K0-K3.
left = pd.DataFrame(
    {
        'key': ['K0', 'K1', 'K2', 'K3'],
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
    }
)
right = pd.DataFrame(
    {
        'key': ['K0', 'K1', 'K2', 'K3'],
        'C': ['C0', 'C1', 'C2', 'C3'],
        'D': ['D0', 'D1', 'D2', 'D3'],
    }
)
print(left)
print(right)

# Join the two frames on the 'key' column. Every key occurs once on each
# side, so the result has four rows combining A, B with C, D:
##   K0: A0 B0 C0 D0
##   K1: A1 B1 C1 D1
##   K2: A2 B2 C2 D2
##   K3: A3 B3 C3 D3
res = pd.merge(left=left, right=right, on='key')
print(res)
# Two frames keyed by the column pair (key1, key2).
#   left  key pairs: (K0,K0) (K0,K1) (K1,K0) (K2,K1)
#   right key pairs: (K0,K0) (K1,K0) (K1,K0) (K2,K0)
left = pd.DataFrame(
    {
        'key1': ['K0', 'K0', 'K1', 'K2'],
        'key2': ['K0', 'K1', 'K0', 'K1'],
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
    }
)
right = pd.DataFrame(
    {
        'key1': ['K0', 'K1', 'K1', 'K2'],
        'key2': ['K0', 'K0', 'K0', 'K0'],
        'C': ['C0', 'C1', 'C2', 'C3'],
        'D': ['D0', 'D1', 'D2', 'D3'],
    }
)
print(left)
print(right)

# Merge on both key columns with each join type in turn:
#   inner - only key pairs found on both sides: (K0,K0) once and (K1,K0)
#           twice (one left row matches two right rows) -> 3 rows
#   outer - all key pairs from either side, NaN where a side has no match
#           -> 6 rows
#   left  - every left row, C/D are NaN for (K0,K1) and (K2,K1) -> 5 rows
#   right - every right row, A/B are NaN for (K2,K0) -> 4 rows
for join_kind in ('inner', 'outer', 'left', 'right'):
    res = pd.merge(left, right, on=['key1', 'key2'], how=join_kind)
    print(res)
- Pandas tips
- pandas tips one
- pandas
- pandas
- Pandas
- pandas
- pandas
- pandas
- pandas
- Pandas
- pandas
- pandas
- pandas
- pandas
- pandas
- pandas
- Pandas
- Tips
- LeetCode算法第三题
- POJ-1651 Multiplication Puzzle (区间DP)
- 深入Java关键字null
- czl的知识点整理5——单调队列
- 安装第三方库
- Pandas tips
- iOS11 打开系统相册 导航栏透明 且列表的frame也不对
- 【MYSQL】win7安装mysql-5.7.10绿色版
- 120行纯css代码制作沙漏小动画
- Java 中 String str = new String(“abc”) & String str = “abc” 区别
- C++ 如何快速清空vector以及释放vector内存?
- 煮熟的鸭子就剩嘴硬?贾跃亭全面否认负面消息,国外逍遥
- 如何解决Unsupported major.minor version 52.0问题?
- kNN分类算法