Python3-pandas

来源：互联网　发布：结构的刚度矩阵　编辑：程序博客网　时间：2024/06/04 19:58

参考：

1、http://pandas.pydata.org/pandas-docs/stable/10min.html

2、http://pandas.pydata.org/pandas-docs/stable/tutorials.html

10 Minutes to pandas

Object Creation
Viewing Data
Selection
- Getting
- Selection by Label
- Selection by Position
- Boolean Indexing
- Setting
Missing Data
Operations
- Stats
- Apply
- Histogramming
- String Methods
Merge
- Concat
- Join
- Append
Grouping
Reshaping
- Stack
- Pivot Tables
Time Series
Categoricals
Plotting
Getting Data In/Out
- CSV
- HDF5
- Excel
Gotchas

10 Minutes to pandas

Object Creation

# Object creation: build a Series and two DataFrames in different ways.
import numpy as np
import pandas as pd

# Creating a Series from a list; np.nan marks a missing value.
s = pd.Series([1, 3, 5, np.nan, 6, 8])
# print(s)
# Example output:
# 0    1.0
# 1    3.0
# 2    5.0
# 3    NaN
# 4    6.0
# 5    8.0
# dtype: float64

# Creating a DataFrame by passing a numpy array, with a datetime index
# and labelled columns.
dates = pd.date_range('20130101', periods=6)
# print(dates)
# Example output:
# DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
#                '2013-01-05', '2013-01-06'],
#               dtype='datetime64[ns]', freq='D')

df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
# print(df)
# Example output (values are random and differ on every run):
#                    A         B         C         D
# 2013-01-01 -2.106954  1.731281  0.252205 -1.257373
# 2013-01-02 -1.297739 -0.144438  1.405233  0.128684
# 2013-01-03 -1.515715  0.068778  0.313289  0.127228
# 2013-01-04 -0.028522  0.732110 -0.289821 -0.145544
# 2013-01-05  0.505480  0.918043  0.159986 -0.480223
# 2013-01-06  0.237698 -0.030478  0.920267  1.040430

# Creating a DataFrame by passing a dict; scalar values are broadcast to
# the length of the array-like values.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
# print(df2)
# Example output:
#      A          B    C  D      E    F
# 0  1.0 2013-01-02  1.0  3   test  foo
# 1  1.0 2013-01-02  1.0  3  train  foo
# 2  1.0 2013-01-02  1.0  3   test  foo
# 3  1.0 2013-01-02  1.0  3  train  foo

# Each column keeps its own dtype.
print(df2.dtypes)
# Example output:
# A           float64
# B    datetime64[ns]
# C           float32
# D             int32
# E          category
# F            object
# dtype: object

Viewing Data

# Viewing data: inspect the top/bottom rows, the axes and summary statistics.
import numpy as np
import pandas as pd

# Creating a DataFrame by passing a numpy array
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
# print(df)
# Example output (values are random and differ on every run):
#                    A         B         C         D
# 2013-01-01 -0.144929  0.516025  1.414747  0.956931
# 2013-01-02 -0.114975 -1.999099 -1.305006 -1.263897
# 2013-01-03  0.409725 -0.829613  0.010113 -1.991170
# 2013-01-04  1.282872 -1.338264 -3.314622  0.447457
# 2013-01-05  0.499045  0.326436  0.117400 -0.188779
# 2013-01-06 -0.701257 -1.522831 -1.270148 -1.496934

print(df.head(2))  # first 2 rows
# Example output:
#                    A         B         C         D
# 2013-01-01 -0.144929  0.516025  1.414747  0.956931
# 2013-01-02 -0.114975 -1.999099 -1.305006 -1.263897

print(df.tail(3))  # last 3 rows
# Example output:
#                    A         B         C         D
# 2013-01-04  1.282872 -1.338264 -3.314622  0.447457
# 2013-01-05  0.499045  0.326436  0.117400 -0.188779
# 2013-01-06 -0.701257 -1.522831 -1.270148 -1.496934

# Display the index, columns, and the underlying numpy data
print(df.index)          # row labels
print(df.columns)        # column labels
print(df.values)         # the underlying values
print(type(df.values))   # <class 'numpy.ndarray'>
print(df.describe())     # quick statistic summary of the data
print(df.T)              # transposed data
print(df.sort_index(axis=1, ascending=False))  # sort by an axis (here: column labels, descending)
print(df.sort_values(by='B'))                  # sort rows by the values in column B

Selection

Getting

# Selection - getting: select by column, by label, by position and by mask.
# Every result is printed explicitly; a bare expression only echoes its
# value in an interactive session, not when this is run as a script.
import numpy as np
import pandas as pd

# Creating a DataFrame by passing a numpy array
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

# --- Getting ---
print(df['A'])    # column A as a Series
print(df[0:3])    # first 3 rows (positional slice, end excluded)
# Label slices include BOTH endpoints: rows 2013-01-02 through 2013-01-04.
print(df['20130102':'20130104'])

# --- Selection by label ---
print(df.loc[dates[0]])           # first row
print(df.loc[:, ['A', 'B']])      # all rows, columns A and B
# Rows 2013-01-02 through 2013-01-04 (both endpoints included), columns A and B.
print(df.loc['20130102':'20130104', ['A', 'B']])
print(df.loc['20130102', ['A', 'B']])  # row 2013-01-02, columns A and B
print(df.loc[dates[0], 'A'])      # scalar: first row, column A
print(df.at[dates[0], 'A'])       # same scalar through .at

# --- Selection by position (end of a slice is excluded) ---
print(df.iloc[3])                    # fourth row
print(df.iloc[3:5, 0:2])             # rows [3, 5), columns [0, 2)
print(df.iloc[[1, 2, 4], [0, 2]])    # rows 1, 2, 4 and columns 0, 2
print(df.iloc[1:3, :])               # rows [1, 3), all columns
print(df.iloc[:, 1:3])               # all rows, columns [1, 3)
print(df.iloc[1, 1])                 # scalar at row 1, column 1
print(df.iat[1, 1])                  # same scalar through .iat

# --- Boolean indexing ---
print(df[df.A > 0])   # rows where column A is positive (all columns are kept)
print(df[df > 0])     # same shape as df; non-positive values become NaN

df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
print(df2[df2['E'].isin(['two', 'four'])])  # rows whose E value is 'two' or 'four'

# Selection - setting: add a column and overwrite values by label, by
# position, with an array and with a boolean mask.
import numpy as np
import pandas as pd

# Creating a DataFrame by passing a numpy array
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

# Setting a new column aligns the data by index: s1 starts one day later
# than df, so the first row of F is NaN and the last value of s1 is dropped.
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6))
df['F'] = s1
# print(df)
# Example output (values are random and differ on every run):
#                    A         B         C         D    F
# 2013-01-01 -0.447267 -1.585050 -0.388142  0.378795  NaN
# 2013-01-02 -0.045593 -0.876191 -1.910121  0.365615  1.0
# 2013-01-03  2.137712 -0.384255 -0.940755  1.987526  2.0
# 2013-01-04 -1.662465 -0.264535 -0.315382 -0.448721  3.0
# 2013-01-05  0.568981 -0.448420 -0.294313  1.914237  4.0
# 2013-01-06  1.867176  0.646454  1.968858 -0.290831  5.0

# Setting values by label
df.at[dates[0], 'A'] = 0  # first row, column A becomes 0

# Setting values by position
df.iat[0, 1] = 0  # first row, second column (column B here) becomes 0

# Setting by assigning with a numpy array
df.loc[:, 'D'] = np.array([5] * len(df))  # overwrite every value in column D
print(df)
# Example output (newer pandas versions keep the float dtype, so column D
# may print as 5.0 instead of 5):
#                    A         B         C  D    F
# 2013-01-01  0.000000  0.000000  0.038916  5  NaN
# 2013-01-02  0.730072  1.325252 -0.166603  5  1.0
# 2013-01-03  0.727788  1.001638 -0.293277  5  2.0
# 2013-01-04  1.801973  0.814501 -0.145767  5  3.0
# 2013-01-05 -0.245231 -0.060449 -0.244515  5  4.0
# 2013-01-06  0.116800  0.115574 -0.300012  5  5.0

# A "where" operation with setting: negate every positive value.
df2 = df.copy()
df2[df2 > 0] = -df2

Missing Data

# Missing data: create NaN values through reindexing, then drop, fill and
# detect them.
import numpy as np
import pandas as pd

# Creating a DataFrame by passing a numpy array
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6))
df['F'] = s1

# Reindexing returns a copy; the new column E starts out as all NaN.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0]:dates[1], 'E'] = 1
print(df1)
# Example output (values are random and differ on every run):
#                    A         B         C         D    F    E
# 2013-01-01  1.777087 -1.400922 -0.719084 -0.025552  NaN  1.0
# 2013-01-02  0.577299  1.455149 -1.410174 -0.331292  1.0  1.0
# 2013-01-03  0.078308  0.173087 -1.370062 -1.497506  2.0  NaN
# 2013-01-04  1.032272  0.805309  1.447437  0.027947  3.0  NaN

# To drop any rows that have missing data.
print(df1.dropna(how='any'))
# Example output:
#                    A         B         C         D    F    E
# 2013-01-02  0.577299  1.455149 -1.410174 -0.331292  1.0  1.0

# Filling missing data
print(df1.fillna(value=5))
# Example output:
#                    A         B         C         D    F    E
# 2013-01-01  1.777087 -1.400922 -0.719084 -0.025552  5.0  1.0
# 2013-01-02  0.577299  1.455149 -1.410174 -0.331292  1.0  1.0
# 2013-01-03  0.078308  0.173087 -1.370062 -1.497506  2.0  5.0
# 2013-01-04  1.032272  0.805309  1.447437  0.027947  3.0  5.0

# To get the boolean mask where values are nan
print(pd.isna(df1))
# Example output:
#                 A      B      C      D      F      E
# 2013-01-01  False  False  False  False   True  False
# 2013-01-02  False  False  False  False  False  False
# 2013-01-03  False  False  False  False  False   True
# 2013-01-04  False  False  False  False  False   True

Operations

Stats

Operations in general exclude missing data.

# Operations - stats and apply: descriptive statistics, index-aligned
# arithmetic and applying functions column by column.
import numpy as np
import pandas as pd

# Creating a DataFrame by passing a numpy array
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

# Operations in general exclude missing data.
# print(df.dropna())
print(df.mean(0))  # same as df.mean(): mean of every column
print(df.mean(1))  # mean of every row
print(df)
# Example output (values are random and differ on every run):
#                    A         B         C         D
# 2013-01-01 -0.677487 -0.807600 -0.487288  0.025362
# 2013-01-02 -0.647777  1.900635  0.372034  0.785723
# 2013-01-03  0.893734  0.837393  1.175039 -0.235330
# 2013-01-04 -0.334574 -0.250514  0.474364  1.041698
# 2013-01-05  0.264945  1.679856 -0.716078  1.298507
# 2013-01-06  0.051315  1.352663  0.678500  2.070729

# shift(2) moves the values two rows down; the first two rows become NaN.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
print(s)
# Example output:
# 2013-01-01    NaN
# 2013-01-02    NaN
# 2013-01-03    1.0
# 2013-01-04    3.0
# 2013-01-05    5.0
# 2013-01-06    NaN
# Freq: D, dtype: float64

# Subtract s from every column, aligning on the row index
# (same as df.sub(s, axis='index')); rows where s is NaN become NaN.
print(df.sub(s, axis=0))
# Example output:
#                    A         B         C         D
# 2013-01-01       NaN       NaN       NaN       NaN
# 2013-01-02       NaN       NaN       NaN       NaN
# 2013-01-03 -0.106266 -0.162607  0.175039 -1.235330
# 2013-01-04 -3.334574 -3.250514 -2.525636 -1.958302
# 2013-01-05 -4.735055 -3.320144 -5.716078 -3.701493
# 2013-01-06       NaN       NaN       NaN       NaN

# Applying functions to the data: cumulative sum down every column.
print(df.apply(np.cumsum))
# Example output:
#                    A         B         C         D
# 2013-01-01 -0.677487 -0.807600 -0.487288  0.025362
# 2013-01-02 -1.325264  1.093034 -0.115254  0.811084
# 2013-01-03 -0.431530  1.930427  1.059785  0.575754
# 2013-01-04 -0.766105  1.679913  1.534150  1.617452
# 2013-01-05 -0.501159  3.359769  0.818072  2.915959
# 2013-01-06 -0.449844  4.712432  1.496572  4.986688

# Range (max - min) of every column.
print(df.apply(lambda x: x.max() - x.min()))
# Example output:
# A    1.571221
# B    2.708235
# C    1.891117
# D    2.306059
# dtype: float64

Histogramming

See more at Histogramming and Discretization

# Histogramming: count how often each value occurs in a Series.
import numpy as np
import pandas as pd

s = pd.Series(np.random.randint(0, 7, size=10))  # 10 random integers in [0, 7)
print(s)
# Example output (values are random; the integer dtype is platform dependent):
# 0    2
# 1    4
# 2    0
# 3    5
# 4    6
# 5    3
# 6    5
# 7    2
# 8    3
# 9    6
# dtype: int32

print(s.value_counts())  # number of occurrences of each distinct value
# Example output:
# 6    2
# 5    2
# 3    2
# 2    2
# 4    1
# 0    1
# dtype: int64

String Methods

# String methods: vectorized string operations through the .str accessor.
# Missing values (np.nan) are passed through unchanged.
import numpy as np
import pandas as pd

s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
print(s)
print(s.str.lower())  # every string in lower case
print(s.str.upper())  # every string in upper case

Merge

Concat

See the Merging section

# Merge - concat: split a DataFrame into pieces and glue them back together.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(10, 4))

# Break it into pieces
pieces = [df[:3], df[3:7], df[7:]]
print(pieces[0])
# Example output (values are random and differ on every run):
#           0         1         2         3
# 0 -0.524490  0.940037  0.866867 -0.030973
# 1  0.679116  1.187920  1.519773 -2.596930
# 2 -0.526557  0.436916 -1.804245  0.058277

# Concatenating the pieces along the rows restores the original frame.
print(pd.concat(pieces))
# Example output:
#           0         1         2         3
# 0 -0.524490  0.940037  0.866867 -0.030973
# 1  0.679116  1.187920  1.519773 -2.596930
# 2 -0.526557  0.436916 -1.804245  0.058277
# 3 -1.204341  0.771885  0.474900 -0.308840
# 4 -0.018233 -0.405723 -0.344591 -0.454778
# 5 -1.255896  0.352891  0.231837 -0.802345
# 6  0.777226  0.252132 -0.252539 -0.779598
# 7  1.726603  0.210850  0.118263 -0.082848
# 8 -0.507362 -0.265372 -0.468006  0.997232
# 9  0.825417 -1.098757 -0.920184  0.227833

Join

SQL-style merges. See the Database-style joining section.

# Merge - join: SQL-style merge on a key that is duplicated on both sides.
# Every left row pairs with every right row sharing the key, so 2 x 2 rows
# produce 4 result rows.
import pandas as pd

left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
print(left)
# Output:
#    key  lval
# 0  foo     1
# 1  foo     2

print(right)
# Output:
#    key  rval
# 0  foo     4
# 1  foo     5

print(pd.merge(left, right, on='key'))
# Output:
#    key  lval  rval
# 0  foo     1     4
# 1  foo     1     5
# 2  foo     2     4
# 3  foo     2     5

Another example that can be given is:

# Merge - join: SQL-style merge on a key that is unique on both sides, so
# every key yields exactly one result row.
import pandas as pd

left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
print(left)
# Output:
#    key  lval
# 0  foo     1
# 1  bar     2

print(right)
# Output:
#    key  rval
# 0  foo     4
# 1  bar     5

print(pd.merge(left, right, on='key'))
# Output:
#    key  lval  rval
# 0  foo     1     4
# 1  bar     2     5

Append

Append rows to a dataframe. See the Appending section.

# Merge - append: add one row to the end of a DataFrame.
# DataFrame.append was removed in pandas 2.0, so the row is turned into a
# one-row frame and concatenated instead; this works on old and new versions.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(4, 4), columns=['A', 'B', 'C', 'D'])
print(df)
# Example output (values are random and differ on every run):
#           A         B         C         D
# 0 -0.131941  0.687743  0.096007  0.421632
# 1  1.299803  0.878197  0.734132 -0.685885
# 2  1.578180  0.371533  0.914458  0.603601
# 3  0.844306  1.265807  0.039494  1.894346

s = df.iloc[3]  # the row at position 3 (the fourth row), as a Series

# to_frame().T turns the Series into a single-row DataFrame with the same
# columns; ignore_index=True renumbers the result 0..n-1.
appended = pd.concat([df, s.to_frame().T], ignore_index=True)
print(appended)
# Example output:
#           A         B         C         D
# 0 -0.131941  0.687743  0.096007  0.421632
# 1  1.299803  0.878197  0.734132 -0.685885
# 2  1.578180  0.371533  0.914458  0.603601
# 3  0.844306  1.265807  0.039494  1.894346
# 4  0.844306  1.265807  0.039494  1.894346

Grouping

Splitting the data into groups based on some criteria
Applying a function to each group independently
Combining the results into a data structure

See the Grouping section

# Grouping: split the rows into groups, sum each group and combine the
# results. The numeric columns C and D are selected explicitly because
# recent pandas versions no longer drop non-numeric columns silently: a
# plain .sum() would also "sum" (concatenate) the strings in column B.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})
print(df)
# Example output (values are random and differ on every run):
#      A      B         C         D
# 0  foo    one -0.805533  0.878175
# 1  bar    one  0.587196  1.019560
# 2  foo    two  0.428103 -0.288053
# 3  bar  three  0.085747  0.479096
# 4  foo    two -0.460235 -0.323406
# 5  bar    two -1.180654 -1.925048
# 6  foo    one  1.866390 -0.750412
# 7  foo  three  0.146175 -0.692545

# Sum C and D for each distinct value in column A.
by_a = df.groupby('A')[['C', 'D']].sum()
print(by_a)
# Example output:
#            C         D
# A
# bar -0.50771 -0.426392   # -0.50771 = 0.587196 + 0.085747 - 1.180654
# foo  1.17490 -1.176241

# Grouping by several columns yields a hierarchical (MultiIndex) result.
by_a_b = df.groupby(['A', 'B'])[['C', 'D']].sum()
print(by_a_b)
# Example output:
#                   C         D
# A   B
# bar one    0.587196  1.019560
#     three  0.085747  0.479096
#     two   -1.180654 -1.925048
# foo one    1.060857  0.127763
#     three  0.146175 -0.692545
#     two   -0.032132 -0.611460

Reshaping

See the sections on Hierarchical Indexing and Reshaping.

Stack

# Reshaping - stack: move the column labels into the innermost index level
# with stack(), and move index levels back into columns with unstack().
import numpy as np
import pandas as pd

tuples = list(zip(
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
))
print(tuples)
# Output:
# [('bar', 'one'), ('bar', 'two'), ('baz', 'one'), ('baz', 'two'), ('foo', 'one'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')]

index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
print(df)
# Example output (values are random and differ on every run):
#                      A         B
# first second
# bar   one    -3.084166 -1.439038
#       two     0.692997 -0.267035
# baz   one    -0.183196  0.616568
#       two     0.055133 -0.214597
# foo   one     0.503489  0.046369
#       two     0.978864 -0.185691
# qux   one     0.945434 -0.199447
#       two     0.839038  0.193106

df2 = df[:4]
print(df2)
# Example output:
#                      A         B
# first second
# bar   one    -3.084166 -1.439038
#       two     0.692997 -0.267035
# baz   one    -0.183196  0.616568
#       two     0.055133 -0.214597

# stack() "compresses" the columns into a new innermost index level.
stacked = df2.stack()
print(stacked)
# Example output:
# first  second
# bar    one     A   -3.084166
#                B   -1.439038
#        two     A    0.692997
#                B   -0.267035
# baz    one     A   -0.183196
#                B    0.616568
#        two     A    0.055133
#                B   -0.214597
# dtype: float64

# unstack() is the inverse; by default it unstacks the last level.
print(stacked.unstack())
# Example output:
#                      A         B
# first second
# bar   one    -3.084166 -1.439038
#       two     0.692997 -0.267035
# baz   one    -0.183196  0.616568
#       two     0.055133 -0.214597

# Unstack level 1 ("second") into the columns.
print(stacked.unstack(1))
# Example output:
# second        one       two
# first
# bar   A -3.084166  0.692997
#       B -1.439038 -0.267035
# baz   A -0.183196  0.055133
#       B  0.616568 -0.214597

# Unstack level 0 ("first") into the columns.
print(stacked.unstack(0))
# Example output:
# first          bar       baz
# second
# one    A -3.084166 -0.183196
#        B -1.439038  0.616568
# two    A  0.692997  0.055133
#        B -0.267035 -0.214597

Pivot Tables

See the section on Pivot Tables.

# Reshaping - pivot tables: spread the values of column D over a grid whose
# rows are the (A, B) pairs and whose columns are the values of C.
# Combinations that do not occur in the data are filled with NaN.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'A': ['one', 'one', 'two', 'three'] * 3,
    'B': ['A', 'B', 'C'] * 4,
    'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
    'D': np.random.randn(12),
    'E': np.random.randn(12),
})
print(df)
# Example output (values are random and differ on every run):
#         A  B    C         D         E
# 0     one  A  foo -0.082127 -0.526810
# 1     one  B  foo -0.657548 -1.120610
# 2     two  C  foo  0.332124 -0.180561
# 3   three  A  bar  1.181363  0.078891
# 4     one  B  bar -0.787990  0.427932
# 5     one  C  bar  0.897438  0.834303
# 6     two  A  foo -0.156936  1.245256
# 7   three  B  foo -0.453510  0.209916
# 8     one  C  foo  0.936943  0.219536
# 9     one  A  bar -0.972587 -1.417912
# 10    two  B  bar  1.456860 -0.198735
# 11  three  C  bar -0.750585  0.108095

print(pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C']))
# Example output:
# C             bar       foo
# A     B
# one   A -0.972587 -0.082127
#       B -0.787990 -0.657548
#       C  0.897438  0.936943
# three A  1.181363       NaN
#       B       NaN -0.453510
#       C -0.750585       NaN
# two   A       NaN -0.156936
#       B  1.456860       NaN
#       C       NaN  0.332124

Time Series

See the Time Series section

# Time series: resampling, time zones, and converting between timestamps
# and periods.
# Frequency aliases are spelled so that they work on old and new pandas:
# lowercase 's', 'min' and 'h' replace the deprecated 'S', 'Min'/'T' and 'H',
# and the month-end frequency is given as an offset object because its
# string alias was renamed from 'M' to 'ME'.
import numpy as np
import pandas as pd

# Resampling: sum 100 one-second samples into 5-minute bins.
rng = pd.date_range('1/1/2012', periods=100, freq='s')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
print(ts.resample('5min').sum())

# Time zone representation
rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
ts = pd.Series(np.random.randn(len(rng)), rng)
print(ts)
ts_utc = ts.tz_localize('UTC')
print(ts_utc)

# Convert to another time zone
print(ts_utc.tz_convert('US/Eastern'))

# Converting between time span representations
rng = pd.date_range('1/1/2012', periods=5, freq=pd.offsets.MonthEnd())
ts = pd.Series(np.random.randn(len(rng)), index=rng)
print(ts)
ps = ts.to_period()
print(ps)
print(ps.to_timestamp())

# Convert a quarterly frequency with the year ending in November to
# 9am on the first day of the month following each quarter end.
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
ts = pd.Series(np.random.randn(len(prng)), prng)
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('h', 's') + 9
print(ts.head())

Categoricals

For full docs, see the categorical introduction and the API documentation.

# Categoricals: convert a column to a categorical dtype, rename and extend
# its categories, then sort and group by it.
import pandas as pd

df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6],
                   "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
print(df)
# Output:
#    id raw_grade
# 0   1         a
# 1   2         b
# 2   3         b
# 3   4         a
# 4   5         a
# 5   6         e

# Convert the raw grades to a categorical data type.
df["grade"] = df["raw_grade"].astype("category")
print(df["grade"])
# Output:
# 0    a
# 1    b
# 2    b
# 3    a
# 4    a
# 5    e
# Name: grade, dtype: category
# Categories (3, object): [a, b, e]

print(df)
# Output:
#    id raw_grade grade
# 0   1         a     a
# 1   2         b     b
# 2   3         b     b
# 3   4         a     a
# 4   5         a     a
# 5   6         e     e

# Rename the categories to more meaningful names. Assigning to
# Series.cat.categories is no longer supported (removed in pandas 2.0);
# rename_categories returns a new Series, which is assigned back.
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])
print(df["grade"])
# Output:
# 0    very good
# 1         good
# 2         good
# 3    very good
# 4    very good
# 5     very bad
# Name: grade, dtype: category
# Categories (3, object): [very good, good, very bad]

# Reorder the categories and add the missing ones at the same time.
df["grade"] = df["grade"].cat.set_categories(
    ["very bad", "bad", "medium", "good", "very good"])
print(df["grade"])
# Output:
# 0    very good
# 1         good
# 2         good
# 3    very good
# 4    very good
# 5     very bad
# Name: grade, dtype: category
# Categories (5, object): [very bad, bad, medium, good, very good]

# Sorting follows the order of the categories, not the lexical order.
print(df.sort_values(by="grade"))
# Output:
#    id raw_grade      grade
# 5   6         e   very bad
# 1   2         b       good
# 2   3         b       good
# 0   1         a  very good
# 3   4         a  very good
# 4   5         a  very good

# observed=False keeps the categories that have no rows ("bad", "medium")
# in the result regardless of the default of the installed pandas version.
print(df.groupby("grade", observed=False).size())
# Output:
# grade
# very bad     1
# bad          0
# medium       0
# good         2
# very good    3
# dtype: int64

Plotting

Plotting docs.

# Plotting: draw a cumulative random walk as a Series (two ways) and as a
# four-column DataFrame.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()

# One figure with two stacked axes: the top one drawn through pandas, the
# bottom one through matplotlib directly.
plt.figure('1')
plt.subplot(211)
ts.plot()
plt.subplot(212)
plt.plot(ts)
# plt.legend(loc='best')
plt.show()

df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
                  columns=['A', 'B', 'C', 'D'])
df = df.cumsum()

# No plt.figure() beforehand: DataFrame.plot() is expected to open its own
# figure when no `ax` is passed, so an explicit one would stay empty
# (NOTE(review): confirm against the installed pandas plotting backend).
# The legend is set on the axes returned by the plot call.
ax = df.plot()
ax.legend(loc='best')
plt.show()

Getting Data In/Out

CSV

Writing to a csv file

# Getting data in/out - CSV: write a DataFrame to foo.csv and read it back.
import numpy as np
import pandas as pd

ts = pd.Series(np.random.randn(4), index=pd.date_range('1/1/2000', periods=4))
df = pd.DataFrame(np.random.randn(4, 4), index=ts.index,
                  columns=['A', 'B', 'C', 'D'])

# Writing to a csv file (the index is written as an unnamed first column)
df.to_csv('foo.csv')

# Reading from a csv file; without index_col the former index comes back
# as an ordinary column called "Unnamed: 0".
datas = pd.read_csv('foo.csv')
print(df)
# Example output (values are random and differ on every run):
#                    A         B         C         D
# 2000-01-01 -0.886796  0.150827  1.891757  0.703912
# 2000-01-02 -0.174584 -2.120584  0.251963 -1.786527
# 2000-01-03 -0.190375  0.603245  0.965307  0.259912
# 2000-01-04  0.615358  0.432191  0.781446  0.883223

print(datas)
# Example output:
#    Unnamed: 0         A         B         C         D
# 0  2000-01-01 -0.886796  0.150827  1.891757  0.703912
# 1  2000-01-02 -0.174584 -2.120584  0.251963 -1.786527
# 2  2000-01-03 -0.190375  0.603245  0.965307  0.259912
# 3  2000-01-04  0.615358  0.432191  0.781446  0.883223

# Drop the first column to keep only the data columns.
datas2 = datas.iloc[:, 1:]
print(datas2)
# Example output:
#           A         B         C         D
# 0 -0.886796  0.150827  1.891757  0.703912
# 1 -0.174584 -2.120584  0.251963 -1.786527
# 2 -0.190375  0.603245  0.965307  0.259912
# 3  0.615358  0.432191  0.781446  0.883223

HDF5

Reading and writing to HDFStores

# Getting data in/out - HDF5: write a DataFrame to an HDF5 store and read
# it back. Requires the optional PyTables dependency.
import numpy as np
import pandas as pd

ts = pd.Series(np.random.randn(4), index=pd.date_range('1/1/2000', periods=4))
df = pd.DataFrame(np.random.randn(4, 4), index=ts.index,
                  columns=['A', 'B', 'C', 'D'])

# Writing to an HDF5 store under the key 'df' (passed by keyword, as newer
# pandas versions require)
df.to_hdf('foo.h5', key='df')

# Reading from the HDF5 store
datas = pd.read_hdf('foo.h5', key='df')
print(datas)

Excel

Reading and writing to MS Excel

Writing to an excel file

# Getting data in/out - Excel: write a DataFrame to foo.xlsx and read it
# back. Requires an optional Excel engine such as openpyxl.
import numpy as np
import pandas as pd

ts = pd.Series(np.random.randn(4), index=pd.date_range('1/1/2000', periods=4))
df = pd.DataFrame(np.random.randn(4, 4), index=ts.index,
                  columns=['A', 'B', 'C', 'D'])

# Writing to an excel file
df.to_excel('foo.xlsx', sheet_name='Sheet1')

# Reading from an excel file; the string 'NA' is treated as a missing value
# and no column is used as the index.
datas = pd.read_excel('foo.xlsx', sheet_name='Sheet1', index_col=None,
                      na_values=['NA'])
print(datas)

Gotchas

If you are trying an operation and you see an exception like:

>>> if pd.Series([False, True, False]):
...     print("I was true")
Traceback
    ...
ValueError: The truth value of an array is ambiguous. Use a.empty, a.any() or a.all().

See Comparisons for an explanation and what to do.

See Gotchas as well.

# Gotchas: a Series has no single truth value, so `if ds:` raises
# ValueError. Test the elements one by one instead (or reduce the Series
# with ds.any() / ds.all()).
import pandas as pd

ds = pd.Series([False, True, False])
print(ds)

# Prints "true" once for every element that is False (twice for this data).
for flag in ds:
    if not flag:
        print("true")

阅读全文

0 0