python pandas

来源：互联网　发布：大数据开发工程师面试　编辑：程序博客网　时间：2024/05/26 02:21

通过dict构建Series

# Conventional aliases; the snippets in this article use `pd` and `np`.
import numpy as np
import pandas as pd

# Build a Series from a dict: the keys become the index, the values the data.
year_data = {2001: 17.8, 2002: 20.1, 2003: 16.5}
ser_obj2 = pd.Series(year_data)
print(ser_obj2.head())

name属性

# Name the Series itself and its index; both names appear in the printed output.
ser_obj2.name = 'temp'
ser_obj2.index.name = 'year'
print(ser_obj2.head())

通过dict构建DataFrame

# Build a DataFrame from a dict: each key becomes a column. Scalar values
# ('A', 'B', 'F') are repeated for every row of the array-like columns, and
# every column keeps its own dtype.
dict_data = {
    'A': 1.,
    'B': pd.Timestamp('20161217'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["Python", "Java", "C++", "C#"]),
    'F': 'ChinaHadoop',
}
df_obj2 = pd.DataFrame(dict_data)
print(df_obj2.head())

DataFrame 不同的列可以有不同的数据类型，而 ndarray 的所有元素必须是同一种类型。

增加列

# Assigning to a new column label adds that column; the addition is element-wise.
df_obj2['G'] = df_obj2['D'] + 4
print(df_obj2.head())

切片索引

ser_obj = pd.Series(range(5), index=['a', 'b', 'c', 'd', 'e'])
print(ser_obj.head())
# a    0
# b    1
# c    2
# d    3
# e    4
# dtype: int64

# Positional slice: the end position is excluded.
print(ser_obj[1:3])
# b    1
# c    2
# dtype: int64

# Label slice: the end label is included.
print(ser_obj['b':'d'])
# b    1
# c    2
# d    3
# dtype: int64

不连续索引

# Select non-contiguous positions. Integer keys passed to plain [] on a
# label-indexed Series rely on a positional fallback that recent pandas
# deprecates, so state the intent explicitly with .iloc.
print(ser_obj.iloc[[0, 2, 4]])
# a    0
# c    2
# e    4
# dtype: int64

# Select non-contiguous labels.
print(ser_obj[['a', 'e']])
# a    0
# e    4
# dtype: int64

默认按照列索引

三种索引方式

# Three indexing styles:
#   label-based     .loc   -- the slice end is included
#   position-based  .iloc  -- the slice end is excluded
#   mixed           .ix    -- removed in pandas 1.0; use .loc or .iloc instead

# Series: label slicing, with and without .loc
print(ser_obj['b':'d'])
print(ser_obj.loc['b':'d'])

# DataFrame: label-based selection
print(df_obj['a'])
print(df_obj.loc[0:2, 'a'])
# Sample output (the data is random):
# 0   -0.504664
# 1    0.310520
# 2   -0.353881
# Name: a, dtype: float64

# Series: positional slicing, with and without .iloc
print(ser_obj[1:3])
print(ser_obj.iloc[1:3])

# DataFrame: positional versus label slicing
print(df_obj.iloc[0:2, 0])   # positional: the end is excluded
print(df_obj.loc[0:2, 'a'])  # label-based: the end is included

df1 = pd.DataFrame(np.ones((2, 2)), columns=['a', 'b'])
df2 = pd.DataFrame(np.ones((3, 3)), columns=['a', 'b', 'c'])

print(df1)
#      a    b
# 0  1.0  1.0
# 1  1.0  1.0

print(df2)
#      a    b    c
# 0  1.0  1.0  1.0
# 1  1.0  1.0  1.0
# 2  1.0  1.0  1.0

# DataFrame arithmetic aligns on both row and column labels; any cell that
# is missing from either operand becomes NaN in the result.
print(df1 + df2)
#      a    b   c
# 0  2.0  2.0 NaN
# 1  2.0  2.0 NaN
# 2  NaN  NaN NaN

fill_value

s1 = pd.Series(range(10, 20), index=range(10))
print(s1)
# 0    10
# 1    11
# 2    12
# 3    13
# 4    14
# 5    15
# 6    16
# 7    17
# 8    18
# 9    19
# dtype: int64

s2 = pd.Series(range(20, 25), index=range(5))
print(s2)
# 0    20
# 1    21
# 2    22
# 3    23
# 4    24
# dtype: int64

# fill_value stands in for a label that is missing from one operand, so
# labels 5-9 are computed as s1 + (-1) instead of becoming NaN.
print(s1.add(s2, fill_value=-1))
# 0    30.0
# 1    32.0
# 2    34.0
# 3    36.0
# 4    38.0
# 5    14.0
# 6    15.0
# 7    16.0
# 8    17.0
# 9    18.0
# dtype: float64

fillna 对空缺值进行填充

# Plain + aligns on the index; labels present in only one operand yield NaN.
s3 = s1 + s2
print(s3)
# 0    30.0
# 1    32.0
# 2    34.0
# 3    36.0
# 4    38.0
# 5     NaN
# 6     NaN
# 7     NaN
# 8     NaN
# 9     NaN
# dtype: float64

# fillna replaces the missing values after the fact and returns a new Series.
s3_filled = s3.fillna(-1)
print(s3_filled)
# 0    30.0
# 1    32.0
# 2    34.0
# 3    36.0
# 4    38.0
# 5    -1.0
# 6    -1.0
# 7    -1.0
# 8    -1.0
# 9    -1.0
# dtype: float64

idxmax、idxmin：返回最大值、最小值所对应的索引

# Group the whole DataFrame by the key1 column.
print(type(df_obj.groupby('key1')))
# <class 'pandas.core.groupby.DataFrameGroupBy'>
# (the exact module path printed here varies between pandas versions)

# Group only the data1 column by the key1 column.
print(type(df_obj['data1'].groupby(df_obj['key1'])))
# <class 'pandas.core.groupby.SeriesGroupBy'>

# Group by a custom list of keys, then count the rows in each group.
self_def_key = [1, 1, 2, 2, 2, 1, 1, 1]
print(df_obj.groupby(self_def_key).size())

内层索引和外层索引

# A list of two arrays as the index builds a two-level (hierarchical) index:
# the first array is the outer level, the second the inner level.
ser_obj = pd.Series(
    np.random.randn(12),
    index=[
        ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd', 'd'],
        [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2],
    ],
)
print(ser_obj)
# Sample output (the values are random):
# a  0   -0.247766
#    1    1.111938
#    2   -0.553020
# b  0   -0.255932
#    1    1.369432
#    2   -1.056148
# c  0   -0.026812
#    1    0.828609
#    2    0.232325
# d  0    0.211127
#    1    0.717061
#    2    1.891033
# dtype: float64

# Select by the outer level.
print(ser_obj['c'])
# 0   -0.026812
# 1    0.828609
# 2    0.232325
# dtype: float64

# Select by the inner level: every outer label, inner label 2.
print(ser_obj[:, 2])
# a   -0.553020
# b   -1.056148
# c    0.232325
# d    1.891033
# dtype: float64

交换并排序分层

# Swap the two index levels, then sort by the resulting index.
# Series.sortlevel() was deprecated in pandas 0.20 and later removed;
# sort_index() is the replacement.
print(ser_obj.swaplevel().sort_index())

迭代查看分组

dict_obj = {
    'key1': ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'],
    'key2': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'data1': np.random.randn(8),
    'data2': np.random.randn(8),
}
df_obj = pd.DataFrame(dict_obj)
print(df_obj)
# Sample output (the values are random; column order depends on the pandas version):
#       data1     data2 key1   key2
# 0 -0.351249  0.312981    a    one
# 1  0.350752  0.156992    b    one
# 2 -1.703088  1.374289    a    two
# 3  1.589940 -1.870309    b  three
# 4 -0.797821 -0.315044    a    two
# 5  1.983528  0.195856    b    two
# 6  1.481909  0.232982    a    one
# 7 -1.069567 -0.020851    a  three

grouped1 = df_obj.groupby('key1')

# Single-level grouping: iterating a GroupBy yields (group name, sub-frame) pairs.
for group_name, group_data in grouped1:
    print(group_name)
    print(group_data)
# a
#       data1     data2 key1   key2
# 0 -0.351249  0.312981    a    one
# 2 -1.703088  1.374289    a    two
# 4 -0.797821 -0.315044    a    two
# 6  1.481909  0.232982    a    one
# 7 -1.069567 -0.020851    a  three
# b
#       data1     data2 key1   key2
# 1  0.350752  0.156992    b    one
# 3  1.589940 -1.870309    b  three
# 5  1.983528  0.195856    b    two

通过函数分组

# Group by a function: the function is called once per index label and its
# return value is used as the group key.
df_obj3 = pd.DataFrame(
    np.random.randint(1, 10, (5, 5)),
    columns=['a', 'b', 'c', 'd', 'e'],
    index=['AA', 'BBB', 'CC', 'D', 'EE'],
)


def group_key(idx):
    """Return the group key for one index label: the length of the label.

    ``idx`` is a single row label (or column label when grouping columns).
    """
    return len(idx)


# Rows are grouped by default, so the `axis=0` argument (a keyword that
# pandas 2.1 deprecates) is unnecessary and has been dropped.
print(df_obj3.groupby(group_key).size())
# 1    1
# 2    3
# 3    1
# dtype: int64

columns = pd.MultiIndex.from_arrays(
    [
        ['Python', 'Java', 'Python', 'Java', 'Python'],
        ['A', 'A', 'B', 'C', 'B'],
    ],
    names=['language', 'index'],
)
df_obj4 = pd.DataFrame(np.random.randint(1, 10, (5, 5)), columns=columns)
print(df_obj4)
# Sample output (the values are random):
# language Python Java Python Java Python
# index         A    A      B    C      B
# 0             5    3      4    2      6
# 1             5    6      2    5      3
# 2             2    4      2    5      6
# 3             2    2      6    7      7
# 4             7    2      3    6      8

# Group the columns by one level of the column MultiIndex and sum each group.
# groupby(..., axis=1) is deprecated since pandas 2.1, so transpose, group
# the (now row) labels, and transpose back; the result is the same table.
print(df_obj4.T.groupby(level='language').sum().T)
# language  Java  Python
# 0            5      15
# 1           11      10
# 2            9      10
# 3            9      15
# 4            8      18

print(df_obj4.T.groupby(level='index').sum().T)
# index   A   B  C
# 0       8  10  2
# 1      11   5  5
# 2       6   8  5
# 3       4  13  7
# 4       9  11  6

自定义函数聚合

dict_obj = {
    'key1': ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'],
    'key2': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'data1': np.random.randint(1, 10, 8),
    'data2': np.random.randint(1, 10, 8),
}
df_obj5 = pd.DataFrame(dict_obj)
print(df_obj5)
# Sample output (the values are random):
#    data1  data2 key1   key2
# 0      6      2    a    one
# 1      8      9    b    one
# 2      3      9    a    two
# 3      4      8    b  three
# 4      4      4    a    two
# 5      9      2    b    two
# 6      1      2    a    one
# 7      5      3    a  three


def peak_range(df):
    """Return the value range (maximum minus minimum) of ``df``.

    Used as a custom aggregation: ``agg`` passes in the values that belong
    to one group.
    """
    return df.max() - df.min()


# Custom aggregation function. The numeric columns are selected explicitly:
# `max() - min()` is undefined for the string column key2, and pandas >= 2.0
# raises instead of silently dropping such columns.
print(df_obj5.groupby('key1')[['data1', 'data2']].agg(peak_range))
#       data1  data2
# key1
# a         5      7
# b         5      7

# Apply several aggregations at once; by default each result column is named
# after its function. (This uses df_obj, the random-float frame built earlier.)
print(df_obj.groupby('key1')[['data1', 'data2']].agg(['mean', 'std', 'count', peak_range]))
#          data1                                data2
#           mean       std count peak_range      mean       std count peak_range
# key1
# a    -0.487963  1.205186     5   3.184997  0.316871  0.640031     5   1.689332
# b     1.308073  0.852101     3   1.632776 -0.505820  1.181842     3   2.066165

# A dict maps each column to its own aggregation(s).
dict_mapping = {
    'data1': ['mean', 'max'],
    'data2': 'sum',
}
print(df_obj.groupby('key1').agg(dict_mapping))
#          data1               data2
#           mean       max       sum
# key1
# a    -0.487963  1.481909  1.584357
# b     1.308073  1.983528 -1.517461

0 0