pandas indexing and slicing data

来源：互联网发布：数据库大并发处理编辑：程序博客网时间：2024/06/05 20:53

```
In [9]: df
Out[9]:
                   A         B         C         D
2000-01-01  0.469112 -0.282863 -1.509059 -1.135632
2000-01-02  1.212112 -0.173215  0.119209 -1.044236
2000-01-03 -0.861849 -2.104569 -0.494929  1.071804
2000-01-04  0.721555 -0.706771 -1.039575  0.271860
2000-01-05 -0.424972  0.567020  0.276232 -1.087401
2000-01-06 -0.673690  0.113648 -1.478427  0.524988
2000-01-07  0.404705  0.577046 -1.715002 -1.039268
2000-01-08 -0.370647 -1.157892 -1.344312  0.844885

In [10]: df[['B', 'A']] = df[['A', 'B']]

In [11]: df
Out[11]:
                   A         B         C         D
2000-01-01 -0.282863  0.469112 -1.509059 -1.135632
2000-01-02 -0.173215  1.212112  0.119209 -1.044236
2000-01-03 -2.104569 -0.861849 -0.494929  1.071804
2000-01-04 -0.706771  0.721555 -1.039575  0.271860
2000-01-05  0.567020 -0.424972  0.276232 -1.087401
2000-01-06  0.113648 -0.673690 -1.478427  0.524988
2000-01-07  0.577046  0.404705 -1.715002 -1.039268
2000-01-08 -1.157892 -0.370647 -1.344312  0.844885
```

```
In [21]: dfa['A'] = list(range(len(dfa.index))) # use this form to create a new column

In [22]: dfa
Out[22]:
            A         B         C         D
2000-01-01  0  0.469112 -1.509059 -1.135632
2000-01-02  1  1.212112  0.119209 -1.044236
2000-01-03  2 -0.861849 -0.494929  1.071804
2000-01-04  3  0.721555 -1.039575  0.271860
2000-01-05  4 -0.424972  0.276232 -1.087401
2000-01-06  5 -0.673690 -1.478427  0.524988
2000-01-07  6  0.404705 -1.715002 -1.039268
2000-01-08  7 -0.370647 -1.344312  0.844885
```

```
In [94]: df[df['A'] > 0]
Out[94]:
                   A         B         C         D    E   0
2000-01-04  7.000000  0.721555 -1.039575  0.271860  NaN NaN
2000-01-05  0.567020 -0.424972  0.276232 -1.087401  NaN NaN
2000-01-06  0.113648 -0.673690 -1.478427  0.524988    7 NaN
2000-01-07  0.577046  0.404705 -1.715002 -1.039268  NaN NaN
```

```
In [95]: df2 = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
   ....:                  'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
   ....:                  'c' : randn(7)})
   ....:

# only want 'two' or 'three'
In [96]: criterion = df2['a'].map(lambda x: x.startswith('t'))

In [97]: df2[criterion]
Out[97]:
       a  b         c
2    two  y  0.995761
3  three  x  2.396780
4    two  y  0.014871

In [99]: df2[criterion & (df2['b'] == 'x')]
Out[99]:
       a  b        c
3  three  x  2.39678
```

```
In [104]: s[s.isin([2, 4, 6])]
Out[104]:
2    2
0    4
dtype: int64

In [107]: s_mi = Series(np.arange(6),
   .....:               index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']]))
   .....:

In [108]: s_mi
Out[108]:
0  a    0
   b    1
   c    2
1  a    3
   b    4
   c    5
dtype: int32

In [109]: s_mi.iloc[s_mi.index.isin([(1, 'a'), (2, 'b'), (0, 'c')])]
Out[109]:
0  c    2
1  a    3
dtype: int32

In [110]: s_mi.iloc[s_mi.index.isin(['a', 'c', 'e'], level=1)]
Out[110]:
0  a    0
   c    2
1  a    3
   c    5
dtype: int32
```

```
In [146]: df = DataFrame(randint(n / 2, size=(n, 2)), columns=list('bc'))

In [147]: df.index.name = 'a'

In [148]: df
Out[148]:
   b  c
a
0  2  3
1  4  1
2  4  0
3  4  1
4  1  4
5  1  4
6  0  1
7  0  0
8  4  0
9  4  2

In [149]: df.query('a < b and b < c')
Out[149]:
   b  c
a
0  2  3
```

```
In [157]: import pandas.util.testing as tm

In [158]: n = 10

In [159]: colors = tm.choice(['red', 'green'], size=n)

In [160]: foods = tm.choice(['eggs', 'ham'], size=n)

In [163]: index = MultiIndex.from_arrays([colors, foods], names=['color', 'food'])

In [164]: df = DataFrame(randn(n, 2), index=index)

In [165]: df
Out[165]:
                    0         1
color food
red   ham    0.157622 -0.293555
green eggs   0.111560  0.597679
red   ham   -1.270093  0.120949
green ham   -0.193898  1.804172
red   ham   -0.234694  0.939908
green eggs  -0.171520 -0.153055
red   eggs  -0.363095 -0.067318
green eggs   1.444721  0.325771
      ham   -0.855732 -0.697595
      eggs  -0.276134 -1.258759

In [166]: df.query('color == "red"')
Out[166]:
                    0         1
color food
red   ham    0.157622 -0.293555
      ham   -1.270093  0.120949
      ham   -0.234694  0.939908
      eggs  -0.363095 -0.067318
```

```
In [208]: df2 = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
   .....:                  'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
   .....:                  'c' : np.random.randn(7)})
   .....:

In [209]: df2.duplicated(['a','b'])
Out[209]:
0    False
1    False
2    False
3    False
4     True
5     True
6    False
dtype: bool

In [210]: df2.drop_duplicates(['a','b'])
Out[210]:
       a  b         c
0    one  x  0.932713
1    one  y -0.393510
2    two  y -0.548454
3  three  x  1.130736
6    six  x -1.233298

In [211]: df2.drop_duplicates(['a','b'], take_last=True)
Out[211]:
       a  b         c
1    one  y -0.393510
3  three  x  1.130736
4    two  y -0.447217
5    one  x  1.043921
6    six  x -1.233298
```

```
In [223]: index = Index(list(range(5)), name='rows')

In [224]: columns = Index(['A', 'B', 'C'], name='cols')

In [225]: df = DataFrame(np.random.randn(5, 3), index=index, columns=columns)

In [226]: df
Out[226]:
cols         A         B         C
rows
0     0.603791  0.388713  0.544331
1    -0.152978  1.929541  0.202138
2     0.024972  0.117533 -0.184740
3     1.054144 -0.736061 -0.785352
4    -1.362549 -0.063514  0.487562

In [227]: df['A']
Out[227]:
rows
0    0.603791
1   -0.152978
2    0.024972
3    1.054144
4   -1.362549
Name: A, dtype: float64

In [250]: indexed2 = data.set_index(['a', 'b'])

In [251]: indexed2
Out[251]:
         c  d
a   b
bar one  z  1
    two  y  2
foo one  x  3
    two  w  4

In [255]: data.set_index('c', drop=False)
Out[255]:
     a    b  c  d
c
z  bar  one  z  1
y  bar  two  y  2
x  foo  one  x  3
w  foo  two  w  4

In [256]: data.set_index(['a', 'b'], inplace=True)

In [257]: data
Out[257]:
         c  d
a   b
bar one  z  1
    two  y  2
foo one  x  3
    two  w  4

In [259]: data.reset_index()
Out[259]:
     a    b  c  d
0  bar  one  z  1
1  bar  two  y  2
2  foo  one  x  3
3  foo  two  w  4
```

You can use the `level` keyword to remove only a portion of the index:

```
In [260]: frame
Out[260]:
           c  d
c a   b
z bar one  z  1
y bar two  y  2
x foo one  x  3
w foo two  w  4

In [261]: frame.reset_index(level=1)
Out[261]:
         a  c  d
c b
z one  bar  z  1
y two  bar  y  2
x one  foo  x  3
w two  foo  w  4
```

0 0