pandas基础

来源：互联网发布：手机淘宝下载安装2017免费下载编辑：程序博客网时间：2024/06/04 19:01

pandas基础

pandas的基本用法01-Series

# -*- coding: utf-8 -*-
# pandas basics 01 - Series: construction, label / boolean selection,
# building from a dict, naming, and replacing the index.
# Ported to Python 3 (print() function). Output comments show the expected
# values; exact formatting depends on the installed pandas version.
from pandas import Series

obj = Series([4, 7, -5, 3], index=['a', 'b', 'c', 'd'])
print(obj)
# a    4
# b    7
# c   -5
# d    3
# dtype: int64

# Select several labels at once with a list of labels.
print(obj[['a', 'b', 'c']])
# a    4
# b    7
# c   -5
# dtype: int64

# Boolean mask selection.
print(obj[obj > 0])
# a    4
# b    7
# d    3
# dtype: int64

# `in` tests membership in the index, not in the values.
print('b' in obj)
# True

print('使用字典生成Series')
dic = {'Paul': 95, 'James': 98, 'Harden': 94}
obj = Series(dic)
print(obj)
# Paul      95
# James     98
# Harden    94
# dtype: int64
# (The original article showed the keys sorted; current pandas keeps the
# dict insertion order.)

# Passing an explicit index selects / reorders the dict entries; labels that
# are missing from the dict ('Wade') become NaN, which forces dtype float64.
name = ['Wade', 'James', 'Harden']
obj = Series(dic, index=name)
print(obj)
# Wade       NaN
# James     98.0
# Harden    94.0
# dtype: float64

print('指定Series及其索引的名字')
obj.name = 'name'
obj.index.name = 'score'
print(obj)
# score
# Wade       NaN
# James     98.0
# Harden    94.0
# Name: name, dtype: float64

print('替换index')
# Assigning a plain list replaces the labels in place (and drops the index
# name, since the new index is built from an unnamed list).
obj.index = ['韦德', '詹姆斯', '哈登']
print(obj)
# 韦德      NaN
# 詹姆斯    98.0
# 哈登     94.0
# Name: name, dtype: float64

pandas的基本用法02-DataFrame基础

# -*- coding: utf-8 -*-
# pandas basics 02 - DataFrame: construction from a dict, column order,
# row / column access, column assignment, transpose.
# Ported to Python 3; the removed `.ix` indexer is replaced by `.loc`.
import numpy as np
from pandas import Series, DataFrame

print('用字典生成DataFrame，key为列的名字。')
data = {'city': ['Beijing', 'Shanghai', 'Shenzheng', 'Nanjing', 'Hangzhou'],
        'gdp': [8000, 9000, 3000, 4000, 4500],
        'pop': [2500, 3500, 500, 1500, 1000]}
print(DataFrame(data))
#         city   gdp   pop
# 0    Beijing  8000  2500
# 1   Shanghai  9000  3500
# 2  Shenzheng  3000   500
# 3    Nanjing  4000  1500
# 4   Hangzhou  4500  1000

print('指定列顺序：')
print(DataFrame(data, columns=['city', 'pop', 'gdp']))
#         city   pop   gdp
# 0    Beijing  2500  8000
# 1   Shanghai  3500  9000
# 2  Shenzheng   500  3000
# 3    Nanjing  1500  4000
# 4   Hangzhou  1000  4500

print('指定索引，在列中指定不存在的列，默认数据用NaN')
data2 = DataFrame(data,
                  columns=['city', 'pop', 'gdp', 'env'],
                  index=['one', 'two', 'three', 'four', 'five'])
print(data2)
#             city   pop   gdp  env
# one      Beijing  2500  8000  NaN
# two     Shanghai  3500  9000  NaN
# three  Shenzheng   500  3000  NaN
# four     Nanjing  1500  4000  NaN
# five    Hangzhou  1000  4500  NaN

# Attribute access and item access return the same column.
# NOTE: attribute access only works when the column name does not clash with
# a DataFrame attribute (e.g. `data2.pop` is the DataFrame.pop method).
print(data2.city)
print(data2['city'])
# one        Beijing
# two       Shanghai
# three    Shenzheng
# four       Nanjing
# five      Hangzhou
# Name: city, dtype: object

# Row access by label (`.ix` in the original article).
print(data2.loc['three'])
# city    Shenzheng
# pop           500
# gdp          3000
# env           NaN
# Name: three, dtype: object

# Assign a whole column from an array; item syntax is used because it also
# works for columns that do not exist yet.
data2['env'] = np.arange(5)
print(data2)
#             city   pop   gdp  env
# one      Beijing  2500  8000    0
# two     Shanghai  3500  9000    1
# three  Shenzheng   500  3000    2
# four     Nanjing  1500  4000    3
# five    Hangzhou  1000  4500    4

print('用Series指定要修改的索引及其对应的值，没有指定的默认数据用NaN。')
# Assigning a Series aligns on the index labels.
val = Series([5, 3, 1, 3, 2], index=['one', 'two', 'three', 'four', 'five'])
data2['env'] = val
print(data2)
#             city   pop   gdp  env
# one      Beijing  2500  8000    5
# two     Shanghai  3500  9000    3
# three  Shenzheng   500  3000    1
# four     Nanjing  1500  4000    3
# five    Hangzhou  1000  4500    2

print('赋值给新列')
data2['suit'] = (data2.city == 'Shenzheng')
print(data2)
#             city   pop   gdp  env   suit
# one      Beijing  2500  8000    5  False
# two     Shanghai  3500  9000    3  False
# three  Shenzheng   500  3000    1   True
# four     Nanjing  1500  4000    3  False
# five    Hangzhou  1000  4500    2  False

print(data2.columns)
# Index(['city', 'pop', 'gdp', 'env', 'suit'], dtype='object')

print('DataFrame转置')
print(data2.T)
#           one       two      three     four      five
# city  Beijing  Shanghai  Shenzheng  Nanjing  Hangzhou
# pop      2500      3500        500     1500      1000
# gdp      8000      9000       3000     4000      4500
# env         5         3          1        3         2
# suit    False     False       True    False     False

print('指定索引顺序，以及使用切片初始化数据。')
data2.index = [1, 2, 3, 4, 5]
# `.iloc` makes the positional intent explicit now that the labels are ints.
print(data2['city'].iloc[:-1])
# 1      Beijing
# 2     Shanghai
# 3    Shenzheng
# 4      Nanjing
# Name: city, dtype: object

print('打印索引和列的名称')
print(data2.index.name)    # None - no name was assigned
print(data2.columns.name)  # None

pandas的基本用法03-reindex()

这里写图片描述

# -*- coding: utf-8 -*-
# pandas basics 03 - reindex(): conform a Series / DataFrame to a new set of
# labels, optionally filling the holes.
# Ported to Python 3. The unused `from matplotlib.pyplot import axis` import
# of the original snippet is dropped (nothing referenced it).
import numpy as np
from pandas import DataFrame, Series

s = Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])

# Labels that are new ('e') get `fill_value`.
s2 = s.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)
print(s2)
# a    1
# b    2
# c    3
# d    4
# e    0
# dtype: int64

# method='ffill' carries the previous label's value forward.
s2 = s.reindex(['a', 'b', 'c', 'd', 'e'], method='ffill')
print(s2)
# a    1
# b    2
# c    3
# d    4
# e    4
# dtype: int64

print('对DataFrame重新指定索引')
f = DataFrame(np.arange(9).reshape(3, 3),
              index=['i1', 'i2', 'i3'],
              columns=['c1', 'c2', 'c3'])
print(f)
#     c1  c2  c3
# i1   0   1   2
# i2   3   4   5
# i3   6   7   8

# None of the new row labels exist in `f`, so every cell is NaN.
f2 = f.reindex(['a', 'b', 'c', 'd'])
print(f2)
#    c1  c2  c3
# a NaN NaN NaN
# b NaN NaN NaN
# c NaN NaN NaN
# d NaN NaN NaN

print('重新指定column')
names = ['c1', 'c2', 'Tony']
print(f.reindex(columns=names))
#     c1  c2  Tony
# i1   0   1   NaN
# i2   3   4   NaN
# i3   6   7   NaN

print('对DataFrame重新指定索引并指定填元素充方法')
print(f)
#     c1  c2  c3
# i1   0   1   2
# i2   3   4   5
# i3   6   7   8

# Forward-fill applies to the rows only; the columns are re-labelled in a
# separate call so the new 'Tony' column is left as NaN, as in the article.
f2 = f.reindex(index=['i1', 'i2', 'x'], method='ffill').reindex(columns=names)
print(f2)
#     c1  c2  Tony
# i1   0   1   NaN
# i2   3   4   NaN
# x    6   7   NaN

# Fill along each row (axis=1): 'Tony' takes the value of the column to its
# left. `ffill` replaces the deprecated `fillna(method='ffill')`.
filled = f2.ffill(axis=1)
print(filled)
#     c1  c2  Tony
# i1   0   1     1
# i2   3   4     4
# x    6   7     7

pandas的基本用法04-drop()

# -*- coding: utf-8 -*-
# pandas basics 04 - drop(): remove entries by label from a Series, or rows /
# columns from a DataFrame. drop() returns a new object; the original is
# left untouched.
# Ported to Python 3. The unused imports of the original snippet
# (`numpy.core.defchararray.index`, `matplotlib.pyplot.axis`) are dropped.
import numpy as np
from pandas import Series, DataFrame

s = Series(np.arange(5.), index=[1, 2, 3, 4, 5])
newS = s.drop([1, 3])  # labels 1 and 3, not positions
print(newS)
# 2    1.0
# 4    3.0
# 5    4.0
# dtype: float64

print('DataFrame删除元素，可指定索引或列。')
df = DataFrame(np.arange(16).reshape(4, 4),
               index=['a', 'b', 'c', 'd'],
               columns=[1, 2, 3, 4])
print(df)
#     1   2   3   4
# a   0   1   2   3
# b   4   5   6   7
# c   8   9  10  11
# d  12  13  14  15

# The argument is a list of labels to remove (exactly 'a' and 'b'), not a
# range of labels.
print(df.drop(['a', 'b']))
#     1   2   3   4
# c   8   9  10  11
# d  12  13  14  15

# axis=1 drops columns instead of rows.
print(df.drop([1, 2], axis=1))
#     3   4
# a   2   3
# b   6   7
# c  10  11
# d  14  15

pandas的基本用法05-索引

这里写图片描述

# -*- coding: utf-8 -*-
# pandas basics 05 - indexing: label, positional and boolean selection on a
# Series and a DataFrame.
# Ported to Python 3. The removed `.ix` indexer is replaced by `.loc`
# (labels) / `.iloc` (positions); the unused
# `numpy.core.defchararray.index` import is dropped.
import numpy as np
from pandas import Series, DataFrame

print('Series的索引，默认数字索引可以工作。')
s = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
print(s[['a', 'b']])
# a    0.0
# b    1.0
# dtype: float64

# Positional access on a label-indexed Series must go through `.iloc`;
# plain `s[[0, 1, 2]]` relied on a fallback that pandas has deprecated.
print(s.iloc[[0, 1, 2]])
# a    0.0
# b    1.0
# c    2.0
# dtype: float64

print(s[s > 1])
# c    2.0
# d    3.0
# dtype: float64

print('Series的数组切片')
# Label slices include BOTH end points.
print(s['a':'c'])
# a    0.0
# b    1.0
# c    2.0
# dtype: float64

print('DataFrame的索引')
df = DataFrame(np.arange(16).reshape(4, 4),
               index=['a', 'b', 'c', 'd'],
               columns=[1, 2, 3, 4])
print(df)
#     1   2   3   4
# a   0   1   2   3
# b   4   5   6   7
# c   8   9  10  11
# d  12  13  14  15

print('打印列')
print(df[[1, 2]])  # a list inside [] selects columns by label
#     1   2
# a   0   1
# b   4   5
# c   8   9
# d  12  13

print('打印行')
print(df[:2])       # a slice inside [] selects rows by position
print(df.iloc[:2])  # same rows, explicit positional form
#    1  2  3  4
# a  0  1  2  3
# b  4  5  6  7

print('指定索引和列')
print(df.loc[['a', 'b'], [1, 2, 3]])
#    1  2  3
# a  0  1  2
# b  4  5  6

# Label slices on both axes; the column labels are the ints 1..4, so `:3`
# means "up to and including the column labelled 3".
print(df.loc[:'c', :3])
#    1  2   3
# a  0  1   2
# b  4  5   6
# c  8  9  10

print('根据条件选择')
print(df > 3)
#        1      2      3      4
# a  False  False  False  False
# b   True   True   True   True
# c   True   True   True   True
# d   True   True   True   True

print(df[[1, 2]] > 3)
#        1      2
# a  False  False
# b   True   True
# c   True   True
# d   True   True

# Boolean-frame assignment: every cell smaller than 3 becomes 0.
df[df < 3] = 0
print(df)
#     1   2   3   4
# a   0   0   0   3
# b   4   5   6   7
# c   8   9  10  11
# d  12  13  14  15

print(df.T)
#    a  b   c   d
# 1  0  4   8  12
# 2  0  5   9  13
# 3  0  6  10  14
# 4  3  7  11  15

pandas的基本用法06-算术和数据对齐

# -*- coding: utf-8 -*-
# pandas basics 06 - arithmetic and data alignment: operands are aligned on
# their labels first; labels present on only one side yield NaN unless a
# fill value is given.
# Ported to Python 3. The unused `import dataframeDemo` (a module that is not
# part of this snippet) and `from matplotlib.pyplot import axis` are dropped;
# the removed `.ix` indexer is replaced by `.iloc`.
import numpy as np
from pandas import Series, DataFrame

s1 = Series([-1, 3, 4], index=['a', 'b', 'c'])
s2 = Series([0, 9, 2, 5], index=['a', 'b', 'c', 'd'])
print(s1 + s2)
# a    -1.0
# b    12.0
# c     6.0
# d     NaN     <- 'd' exists only in s2
# dtype: float64

print('DataFrame加法')
df1 = DataFrame(np.arange(9).reshape(3, 3),
                index=list('abc'),
                columns=list('123'))
df2 = DataFrame(np.arange(9).reshape(3, 3),
                index=list('abd'),
                columns=list('023'))
print(df1)
#    1  2  3
# a  0  1  2
# b  3  4  5
# c  6  7  8
print(df2)
#    0  2  3
# a  0  1  2
# b  3  4  5
# d  6  7  8
# Only cells whose row AND column label exist in both frames get a value.
print(df1 + df2)
#     0   1    2     3
# a NaN NaN  2.0   4.0
# b NaN NaN  8.0  10.0
# c NaN NaN  NaN   NaN
# d NaN NaN  NaN   NaN

print('数据填充')
df1 = DataFrame(np.arange(12.).reshape(3, 4))
df2 = DataFrame(np.arange(20.).reshape(4, 5))
print(df1)
#      0    1     2     3
# 0  0.0  1.0   2.0   3.0
# 1  4.0  5.0   6.0   7.0
# 2  8.0  9.0  10.0  11.0
print(df2)
#       0     1     2     3     4
# 0   0.0   1.0   2.0   3.0   4.0
# 1   5.0   6.0   7.0   8.0   9.0
# 2  10.0  11.0  12.0  13.0  14.0
# 3  15.0  16.0  17.0  18.0  19.0

print(df1.add(df2))
#       0     1     2     3   4
# 0   0.0   2.0   4.0   6.0 NaN
# 1   9.0  11.0  13.0  15.0 NaN
# 2  18.0  20.0  22.0  24.0 NaN
# 3   NaN   NaN   NaN   NaN NaN

# fill_value substitutes for a cell that is missing on ONE side only.
print(df1.add(df2, fill_value=0))
#       0     1     2     3     4
# 0   0.0   2.0   4.0   6.0   4.0
# 1   9.0  11.0  13.0  15.0   9.0
# 2  18.0  20.0  22.0  24.0  14.0
# 3  15.0  16.0  17.0  18.0  19.0

print(df1.reindex(columns=df2.columns, fill_value=0))
#      0    1     2     3    4
# 0  0.0  1.0   2.0   3.0  0.0
# 1  4.0  5.0   6.0   7.0  0.0
# 2  8.0  9.0  10.0  11.0  0.0

print('DataFrame与Series之间的操作')
dataframe = DataFrame(np.arange(12.).reshape(3, 4))
series = dataframe.iloc[0]  # first row
print(dataframe)
#      0    1     2     3
# 0  0.0  1.0   2.0   3.0
# 1  4.0  5.0   6.0   7.0
# 2  8.0  9.0  10.0  11.0
print(series)
# 0    0.0
# 1    1.0
# 2    2.0
# 3    3.0

# By default the Series index is matched against the COLUMNS and the
# operation is broadcast down the rows.
print(dataframe - series)
#      0    1    2    3
# 0  0.0  0.0  0.0  0.0
# 1  4.0  4.0  4.0  4.0
# 2  8.0  8.0  8.0  8.0

# axis=0 matches the Series against the ROW index instead, i.e. column 0 is
# subtracted from every column.
series = dataframe[0]
print(dataframe.sub(series, axis=0))
#      0    1    2    3
# 0  0.0  1.0  2.0  3.0
# 1  0.0  1.0  2.0  3.0
# 2  0.0  1.0  2.0  3.0

pandas的基本用法07-匿名函数

# -*- coding: utf-8 -*-
# pandas basics 07 - anonymous functions with apply() / map().
# Ported to Python 3. The unused `pandas.core.format.DataFrameFormatter`
# import is dropped (nothing referenced it, and that module path no longer
# exists in current pandas).
# The frame holds random numbers, so printed values differ on every run.
import numpy as np
from pandas import Series, DataFrame

print('lambda以及应用')
frame = DataFrame(np.random.randn(3, 4),
                  index=list('abc'),
                  columns=[1, 2, 3, 4])
print(frame)
# 3 x 4 frame of random floats, rows a/b/c, columns 1..4

# Range (max - min) of whatever Series it receives.
f = lambda x: x.max() - x.min()

# DataFrame.apply() calls the function once per row or column:
#   axis=1 -> once per row, giving one value per row label.
# Series.map() is element-wise: the function is called once per value.
print(frame.apply(f, axis=1))
# a    <max - min of row a>
# b    <max - min of row b>
# c    <max - min of row c>

_format = lambda x: '%.2f' % x

# Element-wise formatting of the whole frame. The original used
# DataFrame.applymap(), which newer pandas deprecates; mapping each column
# gives the same result on old and new versions.
formatted = frame.apply(lambda column: column.map(_format))
print(formatted)
# same shape as `frame`, every cell a string with two decimals

print(frame[1].map(_format))
# column 1 as two-decimal strings, indexed a/b/c

pandas的基本用法08-重复索引和排序排名

# -*- coding: utf-8 -*-
# pandas basics 08 - duplicate index labels, sorting and ranking.
# Ported to Python 3 and to current pandas:
#   .ix                    -> .loc / .iloc
#   Series.order()         -> Series.sort_values()
#   df.sort_index(by=...)  -> df.sort_values(by=...)
import numpy as np
from pandas import Series, DataFrame

print('重复的索引')
s = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
print(s)
# a    0
# a    1
# b    2
# b    3
# c    4

# A duplicated label returns a Series; take its first element by position.
print(s['a'].iloc[0])
# 0

df = DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
print(df)
# 4 x 3 frame of random floats, row labels a, a, b, b

# All rows labelled 'a', then the first of them (values vary per run).
print(df.loc['a'].iloc[0])

print('根据索引排序，对于DataFrame可以指定轴。')
s = Series(range(4), index=['d', 'a', 'b', 'c'])
print(s.sort_index())
# a    1
# b    2
# c    3
# d    0

frame = DataFrame(np.arange(8).reshape((2, 4)),
                  index=['three', 'one'],
                  columns=list('dabc'))
print(frame)
#        d  a  b  c
# three  0  1  2  3
# one    4  5  6  7

print(frame.sort_index())  # sort by row labels
#        d  a  b  c
# one    4  5  6  7
# three  0  1  2  3

print(frame.sort_index(axis=1))  # sort by column labels
#        a  b  c  d
# three  1  2  3  0
# one    5  6  7  4

print(frame.sort_index(axis=1, ascending=False))  # columns, descending
#        d  c  b  a
# three  0  3  2  1
# one    4  7  6  5

print('根据值排序')
s = Series([4, -1, 10, 9])
print(s.sort_values())
# 1    -1
# 0     4
# 3     9
# 2    10

print('DataFrame指定列排序')
df = DataFrame({'b': [-2, 0, 10, 3], 'a': [0, 1, 1, 0]})
print(df)
#     b  a
# 0  -2  0
# 1   0  1
# 2  10  1
# 3   3  0
# (columns keep dict insertion order; the original article showed a, b)

print(df.sort_values(by='b'))
#     b  a
# 0  -2  0
# 1   0  1
# 3   3  0
# 2  10  1

print('rank，求排名的平均位置(从1开始)')
s = Series([7, -5, 7, 4, 4])  # sorted: -5 4 4 7 7
# Default method='average': tied values share the mean of their positions.
print(s.rank())
# 0    4.5   (4 + 5) / 2
# 1    1.0
# 2    4.5
# 3    2.5   (2 + 3) / 2
# 4    2.5

# method='first': ties are broken by order of appearance, no averaging.
print(s.rank(method='first'))
# 0    4.0
# 1    1.0
# 2    5.0
# 3    2.0
# 4    3.0

# Descending order, ties take the largest rank of the group, so -5 (the
# smallest value) gets rank 5.
print(s.rank(ascending=False, method='max'))
# 0    2.0
# 1    5.0
# 2    2.0
# 3    4.0
# 4    4.0

print('dataframe 排名')
frame = DataFrame({'b': [4.3, 7, -3, 2],
                   'a': [0, 1, 0, 1],
                   'c': [-2, 5, 8, -2.5]})
print(frame)
#      b  a    c
# 0  4.3  0 -2.0
# 1  7.0  1  5.0
# 2 -3.0  0  8.0
# 3  2.0  1 -2.5

# axis=0 ranks the values within each column.
print(frame.rank(axis=0, method='first', ascending=False))
#      b    a    c
# 0  2.0  3.0  3.0
# 1  1.0  1.0  2.0
# 2  4.0  4.0  1.0
# 3  3.0  2.0  4.0

pandas的基本用法09-统计函数

这里写图片描述

# -*- coding: utf-8 -*-
# pandas basics 09 - descriptive statistics, unique values, value counts and
# membership tests.
# Ported to Python 3. The top-level `pd.value_counts` helper (deprecated in
# newer pandas) is replaced by the `Series.value_counts` method.
import numpy as np
import pandas as pd
from pandas import Series, DataFrame


def count_values(values):
    """Return how often each distinct value occurs in the Series `values`."""
    return values.value_counts()


print('求和')
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
print(df)
#     one  two
# a  1.40  NaN
# b  7.10 -4.5
# c   NaN  NaN
# d  0.75 -1.3

print(df.sum())  # column sums by default; NaN is skipped
# one    9.25
# two   -5.80

print(df.sum(axis=1))  # row sums
# a    1.40
# b    2.60
# c    0.00
# d   -0.55
# (The original article showed NaN for the all-NaN row 'c'. Current pandas
# returns 0 there; pass min_count=1 to get NaN back.)

print(df.idxmax())  # label of the maximum of each column
# one    b
# two    d

print(df.cumsum())
#     one  two
# a  1.40  NaN
# b  8.50 -4.5
# c   NaN  NaN
# d  9.25 -5.8

print(df.describe())
#             one       two
# count  3.000000  2.000000
# mean   3.083333 -2.900000
# std    3.493685  2.262742
# min    0.750000 -4.500000
# 25%    1.075000 -3.700000
# 50%    1.400000 -2.900000
# 75%    4.250000 -2.100000
# max    7.100000 -1.300000

# The original article also carried a commented-out correlation / covariance
# example. It downloaded 'Adj Close' and 'Volume' for AAPL, IBM, MSFT and
# GOOG through `web.get_data_yahoo` (network access required, `web` was never
# imported) and then called:
#     returns = price.pct_change()
#     returns.MSFT.corr(returns.IBM)
#     returns.corr()                  # a column's correlation with itself is 1
#     returns.cov()                   # covariance
#     returns.corrwith(returns.IBM)
# It is left out of the runnable script for that reason.

print('去重')
s = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
print(s.unique())  # distinct values in order of first appearance
# ['c' 'a' 'd' 'b']

print(s.value_counts())  # most frequent first
# c    3
# a    3
# b    2
# d    1

print('判断元素存在')
mask = s.isin(['a', 'b', 'c'])
print(mask)
# 0     True
# 1     True
# 2    False
# 3     True
# 4     True
# 5     True
# 6     True
# 7     True
# 8     True

print(s[mask])
# 0    c
# 1    a
# 3    a
# 4    a
# 5    b
# 6    b
# 7    c
# 8    c

data = DataFrame({'c1': [1, 3, 4, 3, 4],
                  'c2': [2, 3, 1, 2, 3],
                  'c3': [1, 5, 2, 4, 4]})
print(data)
#    c1  c2  c3
# 0   1   2   1
# 1   3   3   5
# 2   4   1   2
# 3   3   2   4
# 4   4   3   4

# Per-column histogram: the row labels of the result are the distinct values,
# each cell is how often that value occurs in the column.
print(data.apply(count_values).fillna(0))
#     c1   c2   c3
# 1  1.0  1.0  1.0
# 2  0.0  2.0  1.0
# 3  2.0  2.0  0.0
# 4  2.0  0.0  2.0
# 5  0.0  0.0  1.0

# Per-row histogram: the column labels of the result are the distinct values.
print(data.apply(count_values, axis=1).fillna(0))
#      1    2    3    4    5
# 0  2.0  1.0  0.0  0.0  0.0
# 1  0.0  0.0  2.0  0.0  1.0
# 2  1.0  1.0  0.0  1.0  0.0
# 3  0.0  1.0  1.0  1.0  0.0
# 4  0.0  0.0  1.0  2.0  0.0

pandas的基本用法10-处理NAN

这里写图片描述

# -*- coding: utf-8 -*-
# pandas basics 10 - missing data: detecting, dropping and filling NaN.
# Ported to Python 3; the removed `.ix` indexer is replaced by `.loc`.
# Frames built from np.random hold different numbers on every run, so only
# the NaN pattern of their output is documented.
import numpy as np
from numpy import nan as NA
from pandas import Series, DataFrame

print('作为null处理的值')
s = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
print(s)
print(s.isnull())
# 0    False
# 1    False
# 2     True
# 3    False

# None is treated as missing as well.
s[0] = None
print(s.isnull())
# 0     True
# 1    False
# 2     True
# 3    False

print(s.dropna())
# 1    artichoke
# 3      avocado

print(s[s.notnull()])  # same result as dropna()
# 1    artichoke
# 3      avocado

print('DataFrame对丢弃NA的处理')
data = DataFrame([[1., 6.5, 3.], [1., NA, NA],
                  [NA, NA, NA], [NA, 6.5, 3.]])
print(data)
#      0    1    2
# 0  1.0  6.5  3.0
# 1  1.0  NaN  NaN
# 2  NaN  NaN  NaN
# 3  NaN  6.5  3.0

print(data.dropna())  # default: drop every row that contains any NaN
#      0    1    2
# 0  1.0  6.5  3.0

print(data.dropna(axis=0, how='all'))  # drop only rows that are entirely NaN
#      0    1    2
# 0  1.0  6.5  3.0
# 1  1.0  NaN  NaN
# 3  NaN  6.5  3.0

data = DataFrame(np.random.randn(7, 3))
# Label slices with .loc include the end label: rows 0..4 and rows 0..2.
data.loc[:4, 1] = NA
data.loc[:2, 2] = NA
print(data)
# rows 0-2: column 0 only        (columns 1 and 2 are NaN)
# rows 3-4: columns 0 and 2      (column 1 is NaN)
# rows 5-6: all three columns

# thresh=2 keeps the rows that have at least 2 non-NaN values; with three
# columns that drops the rows holding two or more NaN (rows 0-2).
print(data.dropna(thresh=2))
# rows 3, 4, 5, 6 remain

print('填充0')
data.fillna(0, inplace=True)  # modifies `data` itself
print(data)
# same frame, every former NaN is now 0.0

print('不同行列填充不同的值')
data.loc[:4, 1] = NA
data.loc[:2, 2] = NA
print(data)
# NaN pattern as above

# A dict gives a separate fill value per column label.
print(data.fillna({1: 0.5, 2: -1}))  # column 1 -> 0.5, column 2 -> -1

pandas的基本用法11-层次化索引

# -*- coding: utf-8 -*-
# pandas basics 11 - hierarchical (MultiIndex) indexing on Series and
# DataFrame: selection, stack / unstack, level swapping, sorting, per-level
# aggregation, and moving columns into / out of the index.
# Ported to Python 3 and to current pandas:
#   .ix                -> .loc / .iloc
#   sortlevel(...)     -> sort_index(level=...)
#   sum(level=...)     -> groupby(level=...).sum()
import numpy as np
from pandas import Series, DataFrame, MultiIndex

print('Series的层次索引')
data = Series([1, 3, 56, 2, 88, 32, 43, 12, 65, 90],
              index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                     [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
print(data)
# a  1     1
#    2     3
#    3    56
# b  1     2
#    2    88
#    3    32
# c  1    43
#    2    12
# d  2    65
#    3    90

print(data.index)
# MultiIndex of the pairs (a,1) (a,2) (a,3) (b,1) (b,2) (b,3) (c,1) (c,2)
# (d,2) (d,3)

print(data.iloc[:2])  # first two entries by position
# a  1    1
#    2    3

# unstack() pivots the inner level into columns; missing pairs become NaN.
print(data.unstack())
#       1     2     3
# a   1.0   3.0  56.0
# b   2.0  88.0  32.0
# c  43.0  12.0   NaN
# d   NaN  65.0  90.0

# stack() is the inverse. NOTE(review): older implementations drop the NaN
# cells again (restoring the 10 original entries); whether NaN is kept
# depends on the pandas version - confirm against the installed one.
print(data.unstack().stack())

print('DataFrame的层次索引')
frame = DataFrame(np.arange(12).reshape((4, 3)),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['Ohio', 'Ohio', 'Colorado'],
                           ['Green', 'Red', 'Green']])
print(frame)
#       Ohio       Colorado
#      Green  Red     Green
# a 1      0    1         2
#   2      3    4         5
# b 1      6    7         8
#   2      9   10        11

frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
print(frame)
# state       Ohio       Colorado
# color      Green  Red     Green
# key1 key2
# a    1         0    1         2
#      2         3    4         5
# b    1         6    7         8
#      2         9   10        11

# A tuple selects one row of the two-level row index; the trailing `:` makes
# it explicit that all columns are wanted.
print(frame.loc[('a', 1), :])
# state     color
# Ohio      Green    0
#           Red      1
# Colorado  Green    2

row_a2 = frame.loc[('a', 2), :]
print(row_a2['Colorado'])
# color
# Green    5

print(row_a2['Ohio']['Red'])
# 4

print('直接用MultiIndex创建层次索引结构')
print(MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'],
                              ['Gree', 'Red', 'Green']],
                             names=['state', 'color']))

print('索引层级交换')
frame_swapped = frame.swaplevel('key1', 'key2')
print(frame_swapped)
# state       Ohio       Colorado
# color      Green  Red     Green
# key2 key1
# 1    a         0    1         2
# 2    a         3    4         5
# 1    b         6    7         8
# 2    b         9   10        11

print(frame_swapped.swaplevel(0, 1))  # levels may also be given by position
# state       Ohio       Colorado
# color      Green  Red     Green
# key1 key2
# a    1         0    1         2
#      2         3    4         5
# b    1         6    7         8
#      2         9   10        11

print('根据索引排序')
print(frame.sort_index(level='key2'))
# state       Ohio       Colorado
# color      Green  Red     Green
# key1 key2
# a    1         0    1         2
# b    1         6    7         8
# a    2         3    4         5
# b    2         9   10        11

print(frame.swaplevel(0, 1).sort_index(level=0))
# state       Ohio       Colorado
# color      Green  Red     Green
# key2 key1
# 1    a         0    1         2
#      b         6    7         8
# 2    a         3    4         5
#      b         9   10        11

print('根据指定的key计算统计信息')
key2_totals = frame.groupby(level='key2').sum()
print(key2_totals)
# state   Ohio       Colorado
# color  Green  Red     Green
# key2
# 1          6    8        10
# 2         12   14        16

print('使用列生成层次索引')
frame = DataFrame({'a': range(7),
                   'b': range(7, 0, -1),
                   'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                   'd': [0, 1, 2, 0, 1, 2, 3]})
print(frame)
#    a  b    c  d
# 0  0  7  one  0
# 1  1  6  one  1
# 2  2  5  one  2
# 3  3  4  two  0
# 4  4  3  two  1
# 5  5  2  two  2
# 6  6  1  two  3

print(frame.set_index(['c', 'd']))  # columns c and d become the row index
#        a  b
# c   d
# one 0  0  7
#     1  1  6
#     2  2  5
# two 0  3  4
#     1  4  3
#     2  5  2
#     3  6  1

print(frame.set_index(['c', 'd'], drop=False))  # keep c and d as columns too
#        a  b    c  d
# c   d
# one 0  0  7  one  0
#     1  1  6  one  1
#     2  2  5  one  2
# two 0  3  4  two  0
#     1  4  3  two  1
#     2  5  2  two  2
#     3  6  1  two  3

frame2 = frame.set_index(['c', 'd'])
print(frame2.reset_index())  # move the index levels back into columns
#      c  d  a  b
# 0  one  0  0  7
# 1  one  1  1  6
# 2  one  2  2  5
# 3  two  0  3  4
# 4  two  1  4  3
# 5  two  2  5  2
# 6  two  3  6  1

pandas的基本用法12-整数索引

# -*- coding: utf-8 -*-
# pandas basics 12 - integer indexes: why `[]` with an int is ambiguous and
# how `.iloc` removes the ambiguity.
# Ported to Python 3. The bare `except:` is narrowed to KeyError, which makes
# the `sys` import unnecessary.
import numpy as np
from pandas import Series, DataFrame

print('整数索引')
ser = Series(np.arange(3.))
print(ser)
# 0    0.0
# 1    1.0
# 2    2.0

# With an integer index, `ser[-1]` is looked up as the LABEL -1 (which does
# not exist) rather than "the last element", so it raises KeyError.
try:
    print(ser[-1])
except KeyError as err:
    print(type(err))
# <class 'KeyError'>

# With a non-integer index there is no label to confuse -1 with; `.iloc`
# states the positional intent explicitly (plain `ser2[-1]` relied on a
# positional fallback that pandas has deprecated).
ser2 = Series(np.arange(3.), index=['a', 'b', 'c'])
print(ser2.iloc[-1])
# 2.0

ser3 = Series(range(3), index=[-5, 1, 3])
print(ser3.iloc[2])  # third element by position, whatever the labels are
# 2

print()
print('对DataFrame使用整数索引')
frame = DataFrame(np.arange(6).reshape((3, 2)), index=[2, 0, 1])
print(frame)
#    0  1
# 2  0  1
# 0  2  3
# 1  4  5

print(frame.iloc[0])  # first row by position (its label is 2)
# 0    0
# 1    1
# Name: 2

print(frame.iloc[:, 1])  # second column by position
# 2    1
# 0    3
# 1    5
# Name: 1

阅读全文

1 0