# pandas tips one
#
# 来源:互联网 发布:我的祖国 知乎 编辑:程序博客网 时间:2024/06/04 17:53
import pandas as pd

# Two small frames sharing the key column 'col1'; each dict entry defines one column.
df1 = pd.DataFrame({
    'col1': [0, 1],
    'col_left': ['a', 'b'],
})
df2 = pd.DataFrame({
    'col1': [1, 2, 2],
    'col_right': [2, 2, 2],
})

print(df1)
##   col1 col_left
##0     0        a
##1     1        b
print(df2)
##   col1  col_right
##0     1          2
##1     2          2
##2     2          2

# Outer-merge on 'col1'. indicator=True appends a '_merge' column recording
# where each row came from: left_only, right_only or both.
res = df1.merge(df2, how='outer', on='col1', indicator=True)
print(res)
##   col1 col_left  col_right      _merge
##0     0        a        NaN   left_only
##1     1        b        2.0        both
##2     2      NaN        2.0  right_only
##3     2      NaN        2.0  right_only

# Passing a string instead of True gives the indicator column a custom name.
res = df1.merge(df2, how='outer', on='col1', indicator='indicator_column')
print(res)
##   col1 col_left  col_right indicator_column
##0     0        a        NaN        left_only
##1     1        b        2.0             both
##2     2      NaN        2.0       right_only
##3     2      NaN        2.0       right_only

# Two frames keyed by their row labels; only 'K0' and 'K2' occur in both indexes.
left = pd.DataFrame(
    {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
    index=['K0', 'K1', 'K2'],
)
right = pd.DataFrame(
    {'C': ['C0', 'C2', 'C3'], 'D': ['D0', 'D2', 'D3']},
    index=['K0', 'K2', 'K3'],
)
print(left)
##     A   B
##K0  A0  B0
##K1  A1  B1
##K2  A2  B2
print(right)
##     C   D
##K0  C0  D0
##K2  C2  D2
##K3  C3  D3

# Outer join on the index: every label from either frame is kept, and the
# side that has no matching row is filled with NaN.
res = left.merge(right, how='outer', left_index=True, right_index=True)
print(res)
##      A    B    C    D
##K0   A0   B0   C0   D0
##K1   A1   B1  NaN  NaN
##K2   A2   B2   C2   D2
##K3  NaN  NaN   C3   D3

# Inner join on the index: only labels present in both frames survive.
res = left.merge(right, how='inner', left_index=True, right_index=True)
print(res)
##     A   B   C   D
##K0  A0  B0  C0  D0
##K2  A2  B2  C2  D2

# Both frames carry an 'age' column, so a merge on 'K' produces a name clash.
boys = pd.DataFrame({
    'K': ['K0', 'K1', 'K2'],
    'age': [1, 2, 3],
})
girls = pd.DataFrame({
    'K': ['K0', 'K1', 'K3'],
    'age': [4, 5, 6],
})

# suffixes disambiguates the overlapping 'age' columns: the first suffix is
# appended to the left frame's column, the second to the right frame's.
res = boys.merge(girls, how='inner', on='K', suffixes=('_boy', '_girl'))
print(res)
##    K  age_boy  age_girl
##0  K0        1         4
##1  K1        2         5
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#数据的可视化 matplotlib --> plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Generate 1000 random samples, indexed 0..999.
data = pd.Series(np.random.randn(1000), index=np.arange(1000))

# Accumulate the samples so the plotted curve is easier to read.
# cumsum() returns a new Series and leaves `data` untouched, so the result
# has to be assigned back; otherwise the raw samples are plotted.
data = data.cumsum()

# pandas objects can be plotted directly.
data.plot()

plt.show()

# Build a 1000x4 DataFrame of random values (columns A-D) and accumulate
# each column.
data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=list('ABCD'),
)
# cumsum() returns a new DataFrame instead of modifying `data` in place, so
# the result has to be assigned back for the plots below to show the running sums.
data = data.cumsum()
data.plot()
plt.show()

# Besides plot and scatter, there are also bar, hist, box, kde, area and hexbin.
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label='Class1')
# Draw a second column pair onto the same axes by passing ax.
data.plot.scatter(x='A', y='C', color='LightGreen', label='Class2', ax=ax)
plt.show()