pandas常用操作

来源：互联网发布：真维斯淘宝店电话编辑：程序博客网时间：2024/06/05 04:37

In [6]: dates = pd.date_range(’20130101’,periods=6)In [8]: df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list(’ABCD’))In [9]: dfOut[9]:                  A         B         C         D2013-01-01 0.469112 -0.282863 -1.509059 -1.1356322013-01-02 1.212112 -0.173215 0.119209 -1.0442362013-01-03 -0.861849 -2.104569 -0.494929 1.0718042013-01-04 0.721555 -0.706771 -1.039575 0.2718602013-01-05 -0.424972 0.567020 0.276232 -1.0874012013-01-06 -0.673690 0.113648 -1.478427 0.524988df.valuesdf.sort_index(axis=1, ascending=False)df.sort(columns=’B’)df[’A’]df[0:3]df[’20130102’:’20130104’]df.loc[:,[’A’,’B’]]df.loc[’20130102’:’20130104’,[’A’,’B’]]df.iloc[3]df.iloc[3:5,0:2]df.iloc[[1,2,4],[0,2]]df1 = df.reindex(index=dates[0:4],columns=list(df.columns) + [’E’])df.mean()df.mean(1)df.apply(lambda x: x.max() - x.min())

In [77]: left = pd.DataFrame({’key’: [’foo’, ’foo’], ’lval’: [1, 2]})In [78]: right = pd.DataFrame({’key’: [’foo’, ’foo’], ’rval’: [4, 5]})In [79]: leftOut[79]:  key lval0 foo 11 foo 2In [80]: rightOut[80]:  key rval0 foo 41 foo 5In [81]: pd.merge(left, right, on=’key’)Out[81]:key lval rval0 foo 1 41 foo 1 52 foo 2 43 foo 2 5

df.append(s, ignore_index=True)

In [87]: dfOut[87]:    A   B         C         D0 foo one -1.202872 -0.0552241 bar one -1.814470 2.3959852 foo two 1.018601 1.5528253 bar three -0.595447 0.1665994 foo two 1.395433 0.0476095 bar two -0.392670 -0.1364736 foo one 0.007207 -0.5617577 foo three 1.928123 -1.623033In [88]: df.groupby(’A’).sum()Out[88]:            C       DAbar -2.802588 2.42611foo 3.146492 -0.63958In [89]: df.groupby([’A’,’B’]).sum()Out[89]:                  C        DA Bbar one   -1.814470 2.395985    three -0.595447 0.166599    two   -0.392670 -0.136473foo one   -1.195665 -0.616981    three 1.928123 -1.623033    two   2.414034 1.600434

In [101]: dfOut[101]:    A B   C        D         E0 one A foo 1.418757 -0.1796661 one B foo -1.879024 1.2918362 two C foo 0.536826 -0.0096143 three A bar 1.006160 0.3921494 one B bar -0.029716 0.2645995 one C bar -1.146178 -0.0574096 two A foo 0.100900 -1.4256387 three B foo -1.035018 1.0240988 one C foo 0.314665 -0.1060629 one A bar -0.773723 1.82437510 two B bar -1.170653 0.59597411 three C bar 0.648740 1.167115In [102]: pd.pivot_table(df, values=’D’, index=[’A’, ’B’], columns=[’C’])Out[102]:C             bar      fooA     Bone   A -0.773723 1.418757      B -0.029716 -1.879024      C -1.146178 0.314665three A 1.006160 NaN      B NaN -1.035018      C 0.648740 NaNtwo   A NaN 0.100900      B -1.170653 NaN      C NaN 0.536826

# List to hold file namesFileNames = []# Your path will be different, please modify the path below.#os.chdir()函数功能：改变当前路径到指定路径os.chdir(r"C:\Users\david\notebooks\pandas")# Find any file that ends with ".xlsx"for files in os.listdir("."):    if files.endswith(".xlsx"):        FileNames.append(files)FileNamesdef GetFile(fnombre):    # Path to excel file    # Your path will be different, please modify the path below.    location = r'C:\Users\david\notebooks\pandas\\' + fnombre    # Parse the excel file    # 0 = first sheet    df = pd.read_excel(location, 0)    # Tag record to file name    df['File'] = fnombre    # Make the "File" column the index of the df    return df.set_index(['File'])# Create a list of dataframesdf_list = [GetFile(fname) for fname in FileNames]# Combine all of the dataframes into onebig_df = pd.concat(df_list)

df One_X One_Y Two_X Two_Y row0 1.1   1.2   1.11   1.22  01 1.1   1.2   1.11   1.22  12 1.1   1.2   1.11   1.22  2In [60]: df = df.set_index(’row’);dfOut[60]:   One_X One_Y Two_X Two_Yrow0   1.1   1.2   1.11  1.221   1.1   1.2   1.11  1.222   1.1   1.2   1.11  1.22In [61]: df.columns = pd.MultiIndex.from_tuples([tuple(c.split(’_’)) for c in df.columns]);dfOut[61]:      One     Two     X   Y    X   Yrow0   1.1 1.2 1.11 1.221   1.1 1.2 1.11 1.222   1.1 1.2 1.11 1.22In [62]: df = df.stack(0).reset_index(1);dfOut[62]:   level_1  X   Yrow0   One   1.10 1.200   Two   1.11 1.221   One   1.10 1.201   Two   1.11 1.222   One   1.10 1.202   Two   1.11 1.22In [63]: df.columns = [’Sample’,’All_X’,’All_Y’];dfOut[63]:   Sample All_X All_Yrow0   One   1.10   1.200   Two   1.11   1.221   One   1.10   1.201   Two   1.11   1.222   One   1.10   1.202   Two   1.11   1.22In [64]: cols = pd.MultiIndex.from_tuples([ (x,y) for x in [’A’,’B’,’C’] for y in [’O’,’I’]])In [65]: df = pd.DataFrame(np.random.randn(2,6),index=[’n’,’m’],columns=cols); dfOut[65]:     A      B      C     O  I   O  I   O  In 1.920906 -0.388231 -2.314394 0.665508 0.402562 0.399555m -1.765956 0.850423 0.388054 0.992312 0.744086 -0.739776In [72]: index = list(itertools.product([’Ada’,’Quinn’,’Violet’],[’Comp’,’Math’,’Sci’]))In [73]: headr = list(itertools.product([’Exams’,’Labs’],[’I’,’II’]))In [74]: indx = pd.MultiIndex.from_tuples(index,names=[’Student’,’Course’])In [75]: cols = pd.MultiIndex.from_tuples(headr) #Notice these are un-namedIn [76]: data = [[70+x+y+(x*y)%3 for x in range(4)] for y in range(9)]In [77]: df = pd.DataFrame(data,indx,cols); dfOut[77]:                Exams    Labs                I   II   I   IIStudent CourseAda     Comp    70  71   72  73        Math    71  73   75  74        Sci     72  75   75  75Quinn   Comp    73  74   75  76        Math    74  76   78  77        Sci     75  78   78  78Violet  Comp    76  77   78  79        Math    77  79   81  80        Sci     78  81   81  81In [78]: All = slice(None)In [79]: df.loc[’Violet’]Out[79]:      Exams   Labs      I   II   I   IICourseComp  76  77   78  79Math  77  79   81  80Sci   78  81   81  81In [80]: df.loc[(All,’Math’),All]Out[80]:              Exams   Labs              I  II   I  IIStudent CourseAda     Math  71 73   75 74Quinn   Math  74 76   78 77Violet  Math  77 79   81 80In [81]: df.loc[(slice(’Ada’,’Quinn’),’Math’),All]Out[81]:              Exams   Labs              I  II   I  IIStudent CourseAda     Math  71 73   75 74Quinn   Math  74 76   78 77In [82]: df.loc[(All,’Math’),(’Exams’)]Out[82]:              I  IIStudent CourseAda     Math  71 73Quinn   Math  74 76Violet  Math  77 79In [83]: df.loc[(All,’Math’),(All,’II’)]Out[83]:              Exams Labs              II    IIStudent CourseAda     Math  73    74Quinn   Math  76    77Violet  Math  79    80

df = pd.DataFrame({’animal’: ’cat dog cat fish dog cat cat’.split(),....: ’size’: list(’SSMMMLL’),....: ’weight’: [8, 10, 11, 1, 20, 12, 12],....: ’adult’ : [False] * 5 + [True] * 2});df  adult animal size weight0 False cat      S    81 False dog      S   102 False cat      M   113 False fish     M    14 False dog      M   205 True  cat      L   126 True  cat      L   12#List the size of the animals with the highest weight.In [90]: df.groupby(’animal’).apply(lambda subf: subf[’size’][subf[’weight’].idxmax()])Out[90]:animalcat Ldog Mfish Mdtype: objectIn [91]: gb = df.groupby([’animal’])In [92]: gb.get_group(’cat’)Out[92]:  adult animal size weight0 False cat     S     82 False cat     M     115 True  cat     L     126 True  cat     L     12Apply to different items in a groupIn [93]: def GrowUp(x):....: avg_weight = sum(x[x.size == ’S’].weight * 1.5)....: avg_weight += sum(x[x.size == ’M’].weight * 1.25)....: avg_weight += sum(x[x.size == ’L’].weight)....: avg_weight = avg_weight / len(x)....: return pd.Series([’L’,avg_weight,True], index=[’size’, ’weight’, ’adult’])....:In [94]: expected_df = gb.apply(GrowUp)In [95]: expected_dfOut[95]:      size weight  adultanimalcat     L  12.4375 Truedog     L  20.0000 Truefish    L  1.2500  TrueCreate a value counts column and reassign back to the DataFrameIn [115]: df = pd.DataFrame({’Color’: ’Red Red Red Blue’.split(),.....: ’Value’: [100, 150, 50, 50]}); df.....:Out[115]:  Color Value0 Red   1001 Red   1502 Red   503 Blue  50In [116]: df[’Counts’] = df.groupby([’Color’]).transform(len)In [117]: dfOut[117]:  Color Value Counts0 Red   100   31 Red   150   32 Red   50    33 Blue  50    1Rolling Apply to multiple columns where function returns a Scalar (Volume Weighted Average Price)In [138]: rng = pd.date_range(start = ’2014-01-01’,periods = 100)In [139]: df = pd.DataFrame({’Open’ : np.random.randn(len(rng)),.....: ’Close’ : np.random.randn(len(rng)),’Volume’ : np.random.randint(100,2000,len(rng))}, index=rng)df           Close    Open     Volume2014-01-01 1.550590 0.458513 13712014-01-02 -0.818812 -0.508850 14332014-01-03 1.160619 0.257610 6452014-01-04 0.081521 -1.773393 8782014-01-05 1.083284 -0.560676 11432014-01-06 -0.518721 0.284174 10882014-01-07 0.140661 1.146889 1722... ... ... ...2014-04-04 0.458193 -0.669474 17682014-04-05 0.108502 -1.616315 8362014-04-06 1.418082 -1.294906 6942014-04-07 0.486530 1.171647 7962014-04-08 0.181885 0.501639 2652014-04-09 -0.707238 -0.361868 12932014-04-10 1.211432 1.564429 1088In [140]: def vwap(bars): return ((bars.Close*bars.Volume).sum()/bars.Volume.sum()).round(2)In [141]: window = 5In [142]: s = pd.concat([ (pd.Series(vwap(df.iloc[i:i+window]), index=[df.index[i+window]])) for i in range(95)])Out[142]:2014-01-06 0.552014-01-07 0.062014-01-08 0.322014-01-09 0.032014-01-10 0.08...2014-04-05 0.482014-04-06 0.542014-04-07 0.462014-04-08 0.452014-04-09 0.532014-04-10 0.15Length: 95

Replacing some values with mean of the rest of a groupIn [100]: df = pd.DataFrame({’A’ : [1, 1, 2, 2], ’B’ : [1, -1, 1, 2]})In [101]: gb = df.groupby(’A’)In [102]: def replace(g):.....: mask = g < 0.....: g.loc[mask] = g[~mask].mean().....: return gIn [103]: gb.transform(replace)Out[103]:B0 11 12 13 2Sort groups by aggregated dataIn [104]: df = pd.DataFrame({’code’: [’foo’, ’bar’, ’baz’] * 2,.....: ’data’: [0.16, -0.21, 0.33, 0.45, -0.59, 0.62],.....: ’flag’: [False, True] * 3}).....:In [105]: code_groups = df.groupby(’code’)In [106]: agg_n_sort_order = code_groups[[’data’]].transform(sum).sort(’data’)In [107]: sorted_df = df.ix[agg_n_sort_order.index]In [108]: sorted_dfOut[108]:  code data flag1 bar -0.21 True4 bar -0.59 False0 foo 0.16 False3 foo 0.45 True2 baz 0.33 False5 baz 0.62 TrueCreate multiple aggregated columnsIn [109]: rng = pd.date_range(start="2014-10-07",periods=10,freq=’2min’)In [110]: ts = pd.Series(data = list(range(10)), index = rng)In [111]: def MyCust(x):.....: if len(x) > 2:.....: return x[1] * 1.234.....: return pd.NaTIn [112]: mhc = {’Mean’ : np.mean, ’Max’ : np.max, ’Custom’ : MyCust}In [113]: ts.resample("5min",how = mhc)Out[113]:                   Max Custom Mean2014-10-07 00:00:00 2  1.234  1.02014-10-07 00:05:00 4  NaN    3.52014-10-07 00:10:00 7  7.404  6.02014-10-07 00:15:00 9  NaN    8.5In [114]: tsOut[114]:2014-10-07 00:00:00 02014-10-07 00:02:00 12014-10-07 00:04:00 22014-10-07 00:06:00 32014-10-07 00:08:00 42014-10-07 00:10:00 52014-10-07 00:12:00 62014-10-07 00:14:00 72014-10-07 00:16:00 82014-10-07 00:18:00 9Freq: 2T, dtype: int64Create a value counts column and reassign back to the DataFrameIn [115]: df = pd.DataFrame({’Color’: ’Red Red Red Blue’.split(),.....: ’Value’: [100, 150, 50, 50]}); df.....:Out[115]: Color Value0 Red  1001 Red  1502 Red  503 Blue 50In [116]: df[’Counts’] = df.groupby([’Color’]).transform(len)In [117]: dfOut[117]: Color Value Counts0 Red  100   31 Red  150   32 Red  50    33 Blue 50    1

In [30]: i = pd.date_range(’20000101’,periods=10000)In [31]: df = pd.DataFrame(dict(year = i.year, month = i.month, day = i.day))In [32]: df.head()Out[32]:  day month year0 1   1     20001 2   1     20002 3   1     20003 4   1     20004 5   1     2000In [34]: ds = df.apply(lambda x: "%04d%02d%02d" % (x[’year’],x[’month’],x[’day’]),axis=1)In [35]: ds.head()Out[35]:0 200001011 200001022 200001033 200001044 20000105dtype: object

In [180]: def expand_grid(data_dict):.....: rows = itertools.product(*data_dict.values()).....: return pd.DataFrame.from_records(rows, columns=data_dict.keys())In [181]: df = expand_grid(.....: {’height’: [60, 70],.....: ’weight’: [100, 140, 180],.....: ’sex’: [’Male’, ’Female’]})Out[182]:  sex      weight height0 Male     100    601 Male     100    702 Male     140    603 Male     140    704 Male     180    605 Male     180    706 Female   100    607 Female   100    708 Female   140    609 Female   140    7010 Female  180    6011 Female  180    70

In [43]: DataFrame(data)Out[43]:A B C0 1 2 Hello1 2 3 WorldIn [44]: DataFrame(data, index=[’first’, ’second’])Out[44]:       A B Cfirst  1 2 Hellosecond 2 3 WorldIn [45]: DataFrame(data, columns=[’C’, ’A’, ’B’])Out[45]:  C     A B0 Hello 1 21 World 2 3

0 0