pandas <十分钟掌握pandas常用操作>

来源:互联网 发布:淘宝网机械水压开关 编辑:程序博客网 时间:2024/06/06 02:59

pandas version: '0.19.2'

1.创建数据集

In [58]: names=['Bob','Jessica','Mary','John','Mel']
In [59]: births = [968, 155, 77, 578, 973]
In [60]: BabyDataSet = list(zip(names,births))
    ...: BabyDataSet
Out[60]: [('Bob', 968), ('Jessica', 155), ('Mary', 77), ('John', 578), ('Mel', 973)]
In [61]: df = pd.DataFrame(data = BabyDataSet, columns=['Names', 'Births'])
    ...: df
Out[61]:
     Names  Births
0      Bob     968
1  Jessica     155
2     Mary      77
3     John     578
4      Mel     973

2.查看数据类型

In [64]: df.dtypes
Out[64]:
Names     object
Births     int64
dtype: object
In [65]: df.Names.dtypes
Out[65]: dtype('O')
In [66]: df.Births.dtypes
Out[66]: dtype('int64')

3.按照指定的一列排序

In [68]: df.sort_values(['Births'])
Out[68]:
     Names  Births
2     Mary      77
1  Jessica     155
3     John     578
0      Bob     968
4      Mel     973
In [69]: df.sort_values(['Births'],ascending=False)
Out[69]:
     Names  Births
4      Mel     973
0      Bob     968
3     John     578
1  Jessica     155
2     Mary      77

4.选取前几行或者末尾几行

In [72]: df.head(3)
Out[72]:
     Names  Births
0      Bob     968
1  Jessica     155
2     Mary      77
In [73]: df.tail(2)
Out[73]:
  Names  Births
3  John     578
4   Mel     973
In [74]: df
Out[74]:
     Names  Births
0      Bob     968
1  Jessica     155
2     Mary      77
3     John     578
4      Mel     973

5.选取一列的最大值

In [75]: df['Births']
Out[75]:
0    968
1    155
2     77
3    578
4    973
Name: Births, dtype: int64
In [76]: df['Births'].max()
Out[76]: 973
In [77]: df['Names'][df['Births'] == df['Births'].max()]
Out[77]:
4    Mel
Name: Names, dtype: object
# encoding=utf-8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Creating a Series by passing a list of values, letting pandas create a default integer index
s = pd.Series([1, 3, 5, np.nan, 6, 8])
print(s)
# Creating a DataFrame by passing a numpy array, with a datetime index and labeled columns:
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
dates = pd.date_range('20130101', periods=6)
print(dates)
# Creating a DataFrame by passing a numpy array, with a datetime index and labeled columns:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"]),
                    'F': 'foo'})
print(df2)
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',               '2013-01-05', '2013-01-06'],              dtype='datetime64[ns]', freq='D')                   A         B         C         D2013-01-01 -1.324399 -0.917851 -0.710650  0.9770882013-01-02  0.034877 -0.417994  1.412711 -0.6261972013-01-03 -0.559784 -0.085540  1.067182  0.6496212013-01-04  0.849592 -1.251283  1.956991  1.1897812013-01-05 -1.742392  0.193744 -0.570087 -0.2771562013-01-06 -0.129934 -0.890113 -1.324529 -1.298726     A          B    C  D      E    F0  1.0 2013-01-02  1.0  3   test  foo1  1.0 2013-01-02  1.0  3  train  foo2  1.0 2013-01-02  1.0  3   test  foo3  1.0 2013-01-02  1.0  3  train  foo
# 查看数据的头部和尾部
# See the top & bottom rows of the frame
print "--------*---------------"
print(df2.head(1))
print(df2.tail(1))
print(df2.values)
# 排序
# Sorting by an axis(轴)
print "--------*---------------"
# 可以看到按照列名排序了
print df
print df.sort_index(axis=1, ascending=False)
# Sorting by values
# 按照B列排序
print  df.sort_values(by='B')
     A          B    C  D     E    F0  1.0 2013-01-02  1.0  3  test  foo     A          B    C  D      E    F3  1.0 2013-01-02  1.0  3  train  foo[[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo'] [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo'] [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo'] [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]--------*---------------                   A         B         C         D2013-01-01 -1.324399 -0.917851 -0.710650  0.9770882013-01-02  0.034877 -0.417994  1.412711 -0.6261972013-01-03 -0.559784 -0.085540  1.067182  0.6496212013-01-04  0.849592 -1.251283  1.956991  1.1897812013-01-05 -1.742392  0.193744 -0.570087 -0.2771562013-01-06 -0.129934 -0.890113 -1.324529 -1.298726                   D         C         B         A2013-01-01  0.977088 -0.710650 -0.917851 -1.3243992013-01-02 -0.626197  1.412711 -0.417994  0.0348772013-01-03  0.649621  1.067182 -0.085540 -0.5597842013-01-04  1.189781  1.956991 -1.251283  0.8495922013-01-05 -0.277156 -0.570087  0.193744 -1.7423922013-01-06 -1.298726 -1.324529 -0.890113 -0.129934                   A         B         C         D2013-01-04  0.849592 -1.251283  1.956991  1.1897812013-01-01 -1.324399 -0.917851 -0.710650  0.9770882013-01-06 -0.129934 -0.890113 -1.324529 -1.2987262013-01-02  0.034877 -0.417994  1.412711 -0.6261972013-01-03 -0.559784 -0.085540  1.067182  0.6496212013-01-05 -1.742392  0.193744 -0.570087 -0.277156
# Selecting a single column, which yields a Series, equivalent to df.A
print df.A
print df['A']
# Selecting via [], which slices the rows.
print df[0:3]
2013-01-01   -1.3243992013-01-02    0.0348772013-01-03   -0.5597842013-01-04    0.8495922013-01-05   -1.7423922013-01-06   -0.129934Freq: D, Name: A, dtype: float642013-01-01   -1.3243992013-01-02    0.0348772013-01-03   -0.5597842013-01-04    0.8495922013-01-05   -1.7423922013-01-06   -0.129934Freq: D, Name: A, dtype: float64                   A         B         C         D2013-01-01 -1.324399 -0.917851 -0.710650  0.9770882013-01-02  0.034877 -0.417994  1.412711 -0.6261972013-01-03 -0.559784 -0.085540  1.067182  0.649621
#选取第一行
print df.loc[dates[0]]
A   -1.324399B   -0.917851C   -0.710650D    0.977088Name: 2013-01-01 00:00:00, dtype: float64
#选取所有行中的A、B列
df.loc[:,['A','B']]
A B 2013-01-01 -1.324399 -0.917851 2013-01-02 0.034877 -0.417994 2013-01-03 -0.559784 -0.085540 2013-01-04 0.849592 -1.251283 2013-01-05 -1.742392 0.193744 2013-01-06 -0.129934 -0.890113
df.loc['20130102':'20130104',['A','B']]
#获取一个值
df.loc[dates[0],'A']
-1.3243992801327025
# Build a copy of df with an extra column E, used below to demonstrate isin() filtering:
df2 = df.copy()
df2['E'] = ['one', 'one','two','three','four','three']
df2
A B C D E 2013-01-01 -1.324399 -0.917851 -0.710650 0.977088 one 2013-01-02 0.034877 -0.417994 1.412711 -0.626197 one 2013-01-03 -0.559784 -0.085540 1.067182 0.649621 two 2013-01-04 0.849592 -1.251283 1.956991 1.189781 three 2013-01-05 -1.742392 0.193744 -0.570087 -0.277156 four 2013-01-06 -0.129934 -0.890113 -1.324529 -1.298726 three
#Using the isin() method for filtering:
df2[df2['E'].isin(['two','four'])]
A B C D E 2013-01-03 -0.559784 -0.085540 1.067182 0.649621 two 2013-01-05 -1.742392 0.193744 -0.570087 -0.277156 four
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
s1
2013-01-02 1 2013-01-03 2 2013-01-04 3 2013-01-05 4 2013-01-06 5 2013-01-07 6 Freq: D, dtype: int64
#在原有的数据集上添加新的数据
df['F'] = s1
df
A B C D F 2013-01-01 -1.324399 -0.917851 -0.710650 0.977088 NaN 2013-01-02 0.034877 -0.417994 1.412711 -0.626197 1.0 2013-01-03 -0.559784 -0.085540 1.067182 0.649621 2.0 2013-01-04 0.849592 -1.251283 1.956991 1.189781 3.0 2013-01-05 -1.742392 0.193744 -0.570087 -0.277156 4.0 2013-01-06 -0.129934 -0.890113 -1.324529 -1.298726 5.0
# 每列的平均数
df.mean()
A -0.478673 B -0.561506 C 0.305270 D 0.102402 F 3.000000 dtype: float64
# 使用lambda表达式作用于每一列
df.apply(lambda x: x.max() - x.min())
A 2.591984 B 1.445027 C 3.281519 D 2.488508 F 4.000000 dtype: float64
# 字符串大小写转换
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s.str.lower()
0 a 1 b 2 c 3 aaba 4 baca 5 NaN 6 caba 7 dog 8 cat dtype: object
# merge操作
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
left
right
pd.merge(left, right, on='key')
   key  lval  rval
0  foo     1     4
1  foo     1     5
2  foo     2     4
3  foo     2     5
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
left
right
pd.merge(left, right, on='key')
   key  lval  rval
0  foo     1     4
1  bar     2     5
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three','two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
df
A B C D 0 foo one 0.850151 -0.095071 1 bar one 0.252074 0.504999 2 foo two -0.139441 -1.190568 3 bar three -0.971856 0.340176 4 foo two 1.546175 -0.402114 5 bar two 0.026199 1.313452 6 foo one -0.267510 -0.981974 7 foo three 1.018972 1.100904
df.groupby('A').sum()
C D A bar -0.693583 2.158627 foo 3.008348 -1.568822
df.groupby(['A','B']).sum()
C D A B bar one 0.252074 0.504999 three -0.971856 0.340176 two 0.026199 1.313452 foo one 0.582641 -1.077045 three 1.018972 1.100904 two 1.406734 -1.592681
<matplotlib.axes._subplots.AxesSubplot at 0x9b27b00>
原创粉丝点击
热门问题 老师的惩罚 人脸识别 我在镇武司摸鱼那些年 重生之率土为王 我在大康的咸鱼生活 盘龙之生命进化 天生仙种 凡人之先天五行 春回大明朝 姑娘不必设防,我是瞎子 刚满月的宝宝黄疸高怎么办 换奶粉孩子不喝怎么办 宝宝整夜哭闹不睡觉怎么办 满月宝宝整夜不睡觉怎么办 6个半月宝宝一喂粥就哭怎么办 宝宝敷鸡蛋白过敏怎么办 七个月宝宝不吃米糊怎么办 涨奶引起的发烧怎么办 8个月宝宝积食怎么办 宝宝吃奶一会就睡了怎么办 宝宝喝凉酸奶拉肚子怎么办 宝宝戒奶不吃奶粉怎么办 三个月大婴儿不吃奶粉怎么办 三个月大的婴儿不吃奶粉怎么办 40天宝宝肚脐凸怎么办 6个月婴儿消化不好怎么办 2个月婴儿消化不好怎么办 10月婴儿不吃饭怎么办 9个月宝宝不吃饭怎么办 十个多月的宝宝便秘怎么办 十个多月宝宝便秘怎么办 8个月宝宝过敏怎么办 宝宝二十个月便秘怎么办 7个月宝宝便秘拉不出怎么办 二十六个月宝宝便秘怎么办 八个月宝宝吃鸡蛋过敏怎么办 8个月宝宝 吃盐怎么办 40多天婴儿拉肚怎么办 刚出生婴儿拉肚怎么办 20多天婴儿拉肚怎么办 米汤煮的太稠了怎么办 两岁宝宝不吃蔬菜怎么办 两岁宝宝不爱吃蔬菜怎么办 四个月宝宝头有点歪怎么办 宝宝吃过了还闹怎么办 ddrops d3吃多了怎么办 ddrops d3滴多了怎么办 维生素d滴多了怎么办 ddrops最后滴不出来的怎么办 ddrops一次滴3滴怎么办 小孩子头睡偏了怎么办