pandas常用操作
来源:互联网 发布:真维斯淘宝店电话 编辑:程序博客网 时间:2024/06/05 04:37
In [6]: dates = pd.date_range(’20130101’,periods=6)In [8]: df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list(’ABCD’))In [9]: dfOut[9]: A B C D2013-01-01 0.469112 -0.282863 -1.509059 -1.1356322013-01-02 1.212112 -0.173215 0.119209 -1.0442362013-01-03 -0.861849 -2.104569 -0.494929 1.0718042013-01-04 0.721555 -0.706771 -1.039575 0.2718602013-01-05 -0.424972 0.567020 0.276232 -1.0874012013-01-06 -0.673690 0.113648 -1.478427 0.524988df.valuesdf.sort_index(axis=1, ascending=False)df.sort(columns=’B’)df[’A’]df[0:3]df[’20130102’:’20130104’]df.loc[:,[’A’,’B’]]df.loc[’20130102’:’20130104’,[’A’,’B’]]df.iloc[3]df.iloc[3:5,0:2]df.iloc[[1,2,4],[0,2]]df1 = df.reindex(index=dates[0:4],columns=list(df.columns) + [’E’])df.mean()df.mean(1)df.apply(lambda x: x.max() - x.min())
In [77]: left = pd.DataFrame({’key’: [’foo’, ’foo’], ’lval’: [1, 2]})In [78]: right = pd.DataFrame({’key’: [’foo’, ’foo’], ’rval’: [4, 5]})In [79]: leftOut[79]: key lval0 foo 11 foo 2In [80]: rightOut[80]: key rval0 foo 41 foo 5In [81]: pd.merge(left, right, on=’key’)Out[81]:key lval rval0 foo 1 41 foo 1 52 foo 2 43 foo 2 5
df.append(s, ignore_index=True)
In [87]: dfOut[87]: A B C D0 foo one -1.202872 -0.0552241 bar one -1.814470 2.3959852 foo two 1.018601 1.5528253 bar three -0.595447 0.1665994 foo two 1.395433 0.0476095 bar two -0.392670 -0.1364736 foo one 0.007207 -0.5617577 foo three 1.928123 -1.623033In [88]: df.groupby(’A’).sum()Out[88]: C DAbar -2.802588 2.42611foo 3.146492 -0.63958In [89]: df.groupby([’A’,’B’]).sum()Out[89]: C DA Bbar one -1.814470 2.395985 three -0.595447 0.166599 two -0.392670 -0.136473foo one -1.195665 -0.616981 three 1.928123 -1.623033 two 2.414034 1.600434
In [101]: dfOut[101]: A B C D E0 one A foo 1.418757 -0.1796661 one B foo -1.879024 1.2918362 two C foo 0.536826 -0.0096143 three A bar 1.006160 0.3921494 one B bar -0.029716 0.2645995 one C bar -1.146178 -0.0574096 two A foo 0.100900 -1.4256387 three B foo -1.035018 1.0240988 one C foo 0.314665 -0.1060629 one A bar -0.773723 1.82437510 two B bar -1.170653 0.59597411 three C bar 0.648740 1.167115In [102]: pd.pivot_table(df, values=’D’, index=[’A’, ’B’], columns=[’C’])Out[102]:C bar fooA Bone A -0.773723 1.418757 B -0.029716 -1.879024 C -1.146178 0.314665three A 1.006160 NaN B NaN -1.035018 C 0.648740 NaNtwo A NaN 0.100900 B -1.170653 NaN C NaN 0.536826
# List to hold file namesFileNames = []# Your path will be different, please modify the path below.#os.chdir()函数功能:改变当前路径到指定路径os.chdir(r"C:\Users\david\notebooks\pandas")# Find any file that ends with ".xlsx"for files in os.listdir("."): if files.endswith(".xlsx"): FileNames.append(files)FileNamesdef GetFile(fnombre): # Path to excel file # Your path will be different, please modify the path below. location = r'C:\Users\david\notebooks\pandas\\' + fnombre # Parse the excel file # 0 = first sheet df = pd.read_excel(location, 0) # Tag record to file name df['File'] = fnombre # Make the "File" column the index of the df return df.set_index(['File'])# Create a list of dataframesdf_list = [GetFile(fname) for fname in FileNames]# Combine all of the dataframes into onebig_df = pd.concat(df_list)
df One_X One_Y Two_X Two_Y row0 1.1 1.2 1.11 1.22 01 1.1 1.2 1.11 1.22 12 1.1 1.2 1.11 1.22 2In [60]: df = df.set_index(’row’);dfOut[60]: One_X One_Y Two_X Two_Yrow0 1.1 1.2 1.11 1.221 1.1 1.2 1.11 1.222 1.1 1.2 1.11 1.22In [61]: df.columns = pd.MultiIndex.from_tuples([tuple(c.split(’_’)) for c in df.columns]);dfOut[61]: One Two X Y X Yrow0 1.1 1.2 1.11 1.221 1.1 1.2 1.11 1.222 1.1 1.2 1.11 1.22In [62]: df = df.stack(0).reset_index(1);dfOut[62]: level_1 X Yrow0 One 1.10 1.200 Two 1.11 1.221 One 1.10 1.201 Two 1.11 1.222 One 1.10 1.202 Two 1.11 1.22In [63]: df.columns = [’Sample’,’All_X’,’All_Y’];dfOut[63]: Sample All_X All_Yrow0 One 1.10 1.200 Two 1.11 1.221 One 1.10 1.201 Two 1.11 1.222 One 1.10 1.202 Two 1.11 1.22In [64]: cols = pd.MultiIndex.from_tuples([ (x,y) for x in [’A’,’B’,’C’] for y in [’O’,’I’]])In [65]: df = pd.DataFrame(np.random.randn(2,6),index=[’n’,’m’],columns=cols); dfOut[65]: A B C O I O I O In 1.920906 -0.388231 -2.314394 0.665508 0.402562 0.399555m -1.765956 0.850423 0.388054 0.992312 0.744086 -0.739776In [72]: index = list(itertools.product([’Ada’,’Quinn’,’Violet’],[’Comp’,’Math’,’Sci’]))In [73]: headr = list(itertools.product([’Exams’,’Labs’],[’I’,’II’]))In [74]: indx = pd.MultiIndex.from_tuples(index,names=[’Student’,’Course’])In [75]: cols = pd.MultiIndex.from_tuples(headr) #Notice these are un-namedIn [76]: data = [[70+x+y+(x*y)%3 for x in range(4)] for y in range(9)]In [77]: df = pd.DataFrame(data,indx,cols); dfOut[77]: Exams Labs I II I IIStudent CourseAda Comp 70 71 72 73 Math 71 73 75 74 Sci 72 75 75 75Quinn Comp 73 74 75 76 Math 74 76 78 77 Sci 75 78 78 78Violet Comp 76 77 78 79 Math 77 79 81 80 Sci 78 81 81 81In [78]: All = slice(None)In [79]: df.loc[’Violet’]Out[79]: Exams Labs I II I IICourseComp 76 77 78 79Math 77 79 81 80Sci 78 81 81 81In [80]: df.loc[(All,’Math’),All]Out[80]: Exams Labs I II I IIStudent CourseAda Math 71 73 75 74Quinn Math 74 76 78 77Violet Math 77 79 81 80In [81]: df.loc[(slice(’Ada’,’Quinn’),’Math’),All]Out[81]: Exams Labs I II I IIStudent CourseAda Math 71 73 75 74Quinn Math 74 76 78 77In [82]: df.loc[(All,’Math’),(’Exams’)]Out[82]: I IIStudent CourseAda Math 71 73Quinn Math 74 76Violet Math 77 79In [83]: df.loc[(All,’Math’),(All,’II’)]Out[83]: Exams Labs II IIStudent CourseAda Math 73 74Quinn Math 76 77Violet Math 79 80
df = pd.DataFrame({’animal’: ’cat dog cat fish dog cat cat’.split(),....: ’size’: list(’SSMMMLL’),....: ’weight’: [8, 10, 11, 1, 20, 12, 12],....: ’adult’ : [False] * 5 + [True] * 2});df adult animal size weight0 False cat S 81 False dog S 102 False cat M 113 False fish M 14 False dog M 205 True cat L 126 True cat L 12#List the size of the animals with the highest weight.In [90]: df.groupby(’animal’).apply(lambda subf: subf[’size’][subf[’weight’].idxmax()])Out[90]:animalcat Ldog Mfish Mdtype: objectIn [91]: gb = df.groupby([’animal’])In [92]: gb.get_group(’cat’)Out[92]: adult animal size weight0 False cat S 82 False cat M 115 True cat L 126 True cat L 12Apply to different items in a groupIn [93]: def GrowUp(x):....: avg_weight = sum(x[x.size == ’S’].weight * 1.5)....: avg_weight += sum(x[x.size == ’M’].weight * 1.25)....: avg_weight += sum(x[x.size == ’L’].weight)....: avg_weight = avg_weight / len(x)....: return pd.Series([’L’,avg_weight,True], index=[’size’, ’weight’, ’adult’])....:In [94]: expected_df = gb.apply(GrowUp)In [95]: expected_dfOut[95]: size weight adultanimalcat L 12.4375 Truedog L 20.0000 Truefish L 1.2500 TrueCreate a value counts column and reassign back to the DataFrameIn [115]: df = pd.DataFrame({’Color’: ’Red Red Red Blue’.split(),.....: ’Value’: [100, 150, 50, 50]}); df.....:Out[115]: Color Value0 Red 1001 Red 1502 Red 503 Blue 50In [116]: df[’Counts’] = df.groupby([’Color’]).transform(len)In [117]: dfOut[117]: Color Value Counts0 Red 100 31 Red 150 32 Red 50 33 Blue 50 1Rolling Apply to multiple columns where function returns a Scalar (Volume Weighted Average Price)In [138]: rng = pd.date_range(start = ’2014-01-01’,periods = 100)In [139]: df = pd.DataFrame({’Open’ : np.random.randn(len(rng)),.....: ’Close’ : np.random.randn(len(rng)),’Volume’ : np.random.randint(100,2000,len(rng))}, index=rng)df Close Open Volume2014-01-01 1.550590 0.458513 13712014-01-02 -0.818812 -0.508850 14332014-01-03 1.160619 0.257610 6452014-01-04 0.081521 -1.773393 8782014-01-05 1.083284 -0.560676 11432014-01-06 -0.518721 0.284174 10882014-01-07 0.140661 1.146889 1722... ... ... ...2014-04-04 0.458193 -0.669474 17682014-04-05 0.108502 -1.616315 8362014-04-06 1.418082 -1.294906 6942014-04-07 0.486530 1.171647 7962014-04-08 0.181885 0.501639 2652014-04-09 -0.707238 -0.361868 12932014-04-10 1.211432 1.564429 1088In [140]: def vwap(bars): return ((bars.Close*bars.Volume).sum()/bars.Volume.sum()).round(2)In [141]: window = 5In [142]: s = pd.concat([ (pd.Series(vwap(df.iloc[i:i+window]), index=[df.index[i+window]])) for i in range(95)])Out[142]:2014-01-06 0.552014-01-07 0.062014-01-08 0.322014-01-09 0.032014-01-10 0.08...2014-04-05 0.482014-04-06 0.542014-04-07 0.462014-04-08 0.452014-04-09 0.532014-04-10 0.15Length: 95
Replacing some values with mean of the rest of a groupIn [100]: df = pd.DataFrame({’A’ : [1, 1, 2, 2], ’B’ : [1, -1, 1, 2]})In [101]: gb = df.groupby(’A’)In [102]: def replace(g):.....: mask = g < 0.....: g.loc[mask] = g[~mask].mean().....: return gIn [103]: gb.transform(replace)Out[103]:B0 11 12 13 2Sort groups by aggregated dataIn [104]: df = pd.DataFrame({’code’: [’foo’, ’bar’, ’baz’] * 2,.....: ’data’: [0.16, -0.21, 0.33, 0.45, -0.59, 0.62],.....: ’flag’: [False, True] * 3}).....:In [105]: code_groups = df.groupby(’code’)In [106]: agg_n_sort_order = code_groups[[’data’]].transform(sum).sort(’data’)In [107]: sorted_df = df.ix[agg_n_sort_order.index]In [108]: sorted_dfOut[108]: code data flag1 bar -0.21 True4 bar -0.59 False0 foo 0.16 False3 foo 0.45 True2 baz 0.33 False5 baz 0.62 TrueCreate multiple aggregated columnsIn [109]: rng = pd.date_range(start="2014-10-07",periods=10,freq=’2min’)In [110]: ts = pd.Series(data = list(range(10)), index = rng)In [111]: def MyCust(x):.....: if len(x) > 2:.....: return x[1] * 1.234.....: return pd.NaTIn [112]: mhc = {’Mean’ : np.mean, ’Max’ : np.max, ’Custom’ : MyCust}In [113]: ts.resample("5min",how = mhc)Out[113]: Max Custom Mean2014-10-07 00:00:00 2 1.234 1.02014-10-07 00:05:00 4 NaN 3.52014-10-07 00:10:00 7 7.404 6.02014-10-07 00:15:00 9 NaN 8.5In [114]: tsOut[114]:2014-10-07 00:00:00 02014-10-07 00:02:00 12014-10-07 00:04:00 22014-10-07 00:06:00 32014-10-07 00:08:00 42014-10-07 00:10:00 52014-10-07 00:12:00 62014-10-07 00:14:00 72014-10-07 00:16:00 82014-10-07 00:18:00 9Freq: 2T, dtype: int64Create a value counts column and reassign back to the DataFrameIn [115]: df = pd.DataFrame({’Color’: ’Red Red Red Blue’.split(),.....: ’Value’: [100, 150, 50, 50]}); df.....:Out[115]: Color Value0 Red 1001 Red 1502 Red 503 Blue 50In [116]: df[’Counts’] = df.groupby([’Color’]).transform(len)In [117]: dfOut[117]: Color Value Counts0 Red 100 31 Red 150 32 Red 50 33 Blue 50 1
In [30]: i = pd.date_range(’20000101’,periods=10000)In [31]: df = pd.DataFrame(dict(year = i.year, month = i.month, day = i.day))In [32]: df.head()Out[32]: day month year0 1 1 20001 2 1 20002 3 1 20003 4 1 20004 5 1 2000In [34]: ds = df.apply(lambda x: "%04d%02d%02d" % (x[’year’],x[’month’],x[’day’]),axis=1)In [35]: ds.head()Out[35]:0 200001011 200001022 200001033 200001044 20000105dtype: object
In [180]: def expand_grid(data_dict):.....: rows = itertools.product(*data_dict.values()).....: return pd.DataFrame.from_records(rows, columns=data_dict.keys())In [181]: df = expand_grid(.....: {’height’: [60, 70],.....: ’weight’: [100, 140, 180],.....: ’sex’: [’Male’, ’Female’]})Out[182]: sex weight height0 Male 100 601 Male 100 702 Male 140 603 Male 140 704 Male 180 605 Male 180 706 Female 100 607 Female 100 708 Female 140 609 Female 140 7010 Female 180 6011 Female 180 70
In [43]: DataFrame(data)Out[43]:A B C0 1 2 Hello1 2 3 WorldIn [44]: DataFrame(data, index=[’first’, ’second’])Out[44]: A B Cfirst 1 2 Hellosecond 2 3 WorldIn [45]: DataFrame(data, columns=[’C’, ’A’, ’B’])Out[45]: C A B0 Hello 1 21 World 2 3
0 0
- pandas常用操作
- Pandas常用操作
- Pandas常用操作
- pandas 常用操作
- pandas常用操作
- pandas <十分钟掌握pandas常用操作>
- 几个pandas数据处理中的常用操作
- pandas操作
- pandas做数据分析(三):常用预处理操作
- python:pandas模块中的DataFrame结构及常用操作
- Python Pandas常用数据结构Series和DataFrame的相关操作
- pandas数据处理常用函数demo之创建/行列操作/查看/文件操作
- pandas常用的方法
- pandas 常用函数
- python~pandas常用代码
- Pandas常用笔记
- pandas常用功能整理
- pandas常用函数
- POJ 3304 Segment
- OpenCV入门笔记(一) Linux下的安装
- 一致性哈希算法
- java定时器
- 笔记:sh中从提交一个表单开始发生了什么
- pandas常用操作
- 【计蒜客系列】挑战难题18:跳跃游戏
- js动画2 多物体运动
- HDOJ 2035 人见人爱A^B(快速幂取模)
- poj3735 Training little cats
- 编写一个函数将数组中的数字前后颠倒,并输出到屏幕上来。
- 二叉树的链式存储的递归和非递归遍历
- Android AIDL跨进程通讯
- Hibernate学习笔记: