python-pandas usage examples and cheat sheet


Common pandas usage examples, organized as a quick reference.


1. Rebuilding the index

# Imports used throughout the examples below
import re
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

frame = DataFrame({'a': range(7), 'b': range(7, 0, -1),
                   'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                   'd': [0, 1, 2, 0, 1, 2, 3]})
print(frame)
# Rebuild the index from existing columns; drop controls whether the columns that became the index are removed
print(frame.set_index(['c', 'd'], drop=False))
print(frame.reset_index())  # move the index back into columns, restoring the default index
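
If the old index is not needed as a column, reset_index(drop=True) simply discards it, and set_index(..., append=True) adds new key columns on top of the existing index instead of replacing it. A minimal sketch using the frame above (standard pandas parameters):

# Discard the current index instead of moving it back into columns
print(frame.set_index(['c', 'd']).reset_index(drop=True))
# append=True keeps the existing index and adds 'c' as an extra level
print(frame.set_index('c', append=True))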


2. Swapping index levels

frame = DataFrame(np.arange(12).reshape((4, 3)),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['Ohio', 'Ohio', 'Colorado'],
                           ['Green', 'Red', 'Green']])
print(frame)
# Swap the two levels of the column index
print(frame.swaplevel(0, 1, axis=1))
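
After swaplevel the levels are usually no longer sorted, so a follow-up sort_index keeps the result lexsorted; the same pair of calls works on the row index too. A small sketch with the same frame (standard pandas calls):

# Swap the two row index levels and sort by the new outer level
print(frame.swaplevel(0, 1).sort_index(level=0))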


3. Merging data

df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})
df2 = DataFrame({'key': ['a', 'b', 'd'], 'data2': range(3)})
df3 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})
df4 = DataFrame({'rkey': ['a', 'b', 'd'], 'data2': range(3)})
left = DataFrame({'key1': ['foo', 'foo', 'bar'],
                  'key2': ['one', 'two', 'one'],
                  'lval': [1, 2, 3]})
right = DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                   'key2': ['one', 'one', 'one', 'two'],
                   'rval': [4, 5, 6, 7]})
left1 = DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'], 'value': range(6)})
right1 = DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
lefth = DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
                   'key2': [2000, 2001, 2002, 2001, 2002],
                   'data': np.arange(5.)})
righth = DataFrame(np.arange(12).reshape((6, 2)),
                   index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
                          [2001, 2000, 2000, 2000, 2001, 2002]],
                   columns=['event1', 'event2'])

# Join on the shared 'key' column; the default merge is an inner join
print(pd.merge(df1, df2, on='key'))


# Left key 'lkey', right key 'rkey', outer join
print(pd.merge(df3, df4, left_on='lkey', right_on='rkey', how='outer'))


# Merge on the composite key ['key1', 'key2']
print(pd.merge(left, right, on=['key1', 'key2'], how='outer'))


# Merge on 'key1' only; suffixes disambiguate the other overlapping column names
print(pd.merge(left, right, on='key1', suffixes=('_left', '_right'), how='outer'))


# Left key 'key', right side joined on its index
print(pd.merge(left1, right1, left_on='key', right_index=True))
# Left keys ['key1', 'key2'], right side joined on its (multi-level) index
print(pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True))
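
merge also accepts how='left' and how='right', and for column-against-index joins DataFrame.join is a convenient shorthand. A small sketch reusing the frames above (standard pandas API):

# Keep every row of left1; unmatched right-hand values become NaN
print(pd.merge(left1, right1, left_on='key', right_index=True, how='left'))
# join matches the calling frame's 'key' column against the other frame's index (left join by default)
print(left1.join(right1, on='key'))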



4. Concatenating data

s1 = Series([0, 1], index=['a', 'b'])
s2 = Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = Series([5, 6], index=['f', 'g'])
s4 = pd.concat([s1 * 5, s3])
df1 = DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
                columns=['one', 'two'])
df2 = DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
                columns=['three', 'four'])

# Concatenate along the rows (axis=0, the default)
print(pd.concat([s1, s2, s3]))
# Concatenate along the columns, producing a DataFrame
print(pd.concat([s1, s2, s3], axis=1))


# Specify the join behaviour on the other axis
print(pd.concat([s1, s4], axis=1, join='inner'))
# Specify the exact index to use on the other axis
print(pd.concat([s1, s4], axis=1, join_axes=[['a', 'c', 'b', 'e']]))
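
Note that join_axes was removed in pandas 1.0; on recent versions the same result can be obtained by reindexing after the concat, roughly:

# join_axes equivalent on newer pandas: concat first, then reindex the rows
print(pd.concat([s1, s4], axis=1).reindex(['a', 'c', 'b', 'e']))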


# Tag each input with a key so the source is visible in the result's index
print(pd.concat([s1, s1, s4], keys=['one', 'two', 'three']))
# With axis=1 the keys become the outer level of the column index
print(pd.concat([df1, df2], axis=1, keys=['level1', 'level2']))


df1 = DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
# Ignore the original row labels and renumber the concatenated result
print(pd.concat([df1, df2], ignore_index=True))


5. Transforming data

data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'pastrami',
                           'corned beef', 'bacon', 'pastrami', 'honey ham',
                           'nova lox', 'apple'],
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6, 0]})
meat_to_animal = {'bacon': 'pig', 'pulled pork': 'pig', 'pastrami': 'cow',
                  'corned beef': 'cow', 'honey ham': 'pig', 'nova lox': 'salmon'}
# map: translate values through a dict
print(data['food'].map(meat_to_animal))
# transform: apply a custom function to every value
print(data['food'].transform(lambda x: meat_to_animal[x] if x in meat_to_animal else 'fruit'))
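
The same fallback can be written with map alone: values missing from the dict become NaN and can then be filled. An equivalent sketch:

# 'apple' is not in the dict, so map yields NaN and fillna supplies the fallback label
print(data['food'].map(meat_to_animal).fillna('fruit'))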


6. Binning data

ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
# Bin the values into the given intervals
print(pd.cut(ages, bins).value_counts())
# Make the intervals closed on the left, open on the right
print(pd.cut(ages, bins, right=False).value_counts())
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
# Give the bins names
print(pd.cut(ages, bins, labels=group_names).value_counts())
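
cut splits on fixed bin edges, whereas pd.qcut (used again in section 8) splits on sample quantiles so each bin holds roughly the same number of values. A small sketch:

# Four quantile-based bins, each containing about a quarter of the ages
print(pd.qcut(ages, 4).value_counts())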



7. String handling

data = Series({'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
               'Rob': 'rob@gmail.com', 'Wes': np.nan})
pattern = '([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'
# Substring matching
print(data.str.contains('google'))
# Find all regex matches in each string
print(data.str.findall(pattern, flags=re.IGNORECASE))
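
To pull the capture groups out into separate columns, str.extract applies the same pattern and returns one column per group. A small sketch with the pattern above:

# One column per capture group: user name, domain, and top-level domain
print(data.str.extract(pattern, flags=re.IGNORECASE))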


8. Aggregating data

df = DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                'key2': ['one', 'two', 'one', 'two', 'one'],
                'data1': np.random.randn(5), 'data2': np.random.randn(5)})
# Group one column by several keys
print(df['data1'].groupby([df['key1'], df['key2']]).mean())
# Same grouping, with the inner key level unstacked into columns
print(df['data1'].groupby([df['key1'], df['key2']]).mean().unstack())
# Restrict the aggregation to a single column
print(df.groupby('key1')['data1'].mean())

people = DataFrame(np.arange(25).reshape(5, 5), columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}
key_list = ['one', 'one', 'one', 'two', 'two']
# Group the columns by a dict mapping column name -> group
print(people.groupby(mapping, axis=1).sum())
# Group by a function: here the length of each index label
print(people.groupby(len).sum())
# Combine several grouping keys
print(people.groupby([len, key_list]).sum())


columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                     [1, 3, 5, 1, 3]], names=['cty', 'tenor'])
hier_df = DataFrame(np.arange(20).reshape(4, 5), columns=columns)
# Group by one level of the column index
print(hier_df.groupby(level='tenor', axis=1).sum())

tips_dict = {'total_bill': [16.99, 10.34, 21.01, 23.68, 24.59, 25.29],
             'tip': [1.01, 1.66, 3.50, 3.31, 3.61, 4.71],
             'sex': ['F', 'M', 'M', 'M', 'F', 'M'],
             'smoker': ['N', 'Y', 'N', 'N', 'Y', 'Y'],
             'sizes': [1, 2, 3, 4, 5, 6]}
tips = DataFrame(tips_dict)
tips['tip_pct'] = tips['tip'] / tips['total_bill']
print(tips)
# groupby without turning the keys into the index
print(tips.groupby(['sex', 'smoker'], as_index=False).mean())
# Apply a function to each group
print(tips.groupby('smoker').apply(lambda x: x.sort_values(by='tip_pct')))
# Pivot table; the default aggregation is mean
print(tips.pivot_table(index=['sex'], columns=['smoker']))
# margins=True adds an 'All' row and column of totals
print(tips.pivot_table(index=['sex'], columns=['smoker'], margins=True))
# aggfunc chooses the aggregation rule
print(tips.pivot_table(index='sex', columns='smoker', aggfunc=sum))
# crosstab quickly builds a frequency (count) table
print(pd.crosstab([tips.sex, tips.sizes], tips.smoker, margins=True))
grouped = tips.groupby(['sex', 'smoker'])
print(grouped['tip_pct'].mean())
# Name the aggregations: each tuple is (output column name, aggregation rule)
print(grouped['tip_pct'].agg([('foo', 'mean'), ('bar', np.std)]))
# Different aggregations per column
print(grouped.agg({'tip_pct': ['min', 'max', 'mean', ('std_name', np.std)], 'tip': 'sum'}))
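
On pandas 0.25+ the same renaming can also be written with keyword-based named aggregation, which tends to read more clearly. A sketch:

# Named aggregation: keyword = (column, aggregation rule)
print(grouped.agg(foo=('tip_pct', 'mean'), bar=('tip_pct', 'std')))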


frame = DataFrame({'data1': np.arange(1, 1001), 'data2': np.arange(1001, 2001)})

def get_stats(group):
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}

# Cut data1 into 4 equal-width bins, then summarize data2 within each bin
print(frame.data2.groupby(pd.cut(frame.data1, 4)).apply(get_stats).unstack())
# Cut data1 into 10 equal-count (quantile) bins, then summarize data2 within each bin
print(frame.data2.groupby(pd.qcut(frame.data1, 10, labels=False)).apply(get_stats).unstack())
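
The same per-bin summary can be produced without the helper function by passing the statistic names straight to agg. A sketch:

# agg with a list of statistics yields one column per statistic
print(frame.data2.groupby(pd.cut(frame.data1, 4)).agg(['min', 'max', 'count', 'mean']))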

states = ['Ohio', 'New York', 'Vermont', 'Florida', 'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4
data = Series(np.arange(8), index=states)
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
fill_values = {'East': 50, 'West': 100}
# Fill missing values per group with apply
print(data.groupby(group_key).apply(lambda x: x.fillna(x.mean())))
print(data.groupby(group_key).apply(lambda x: x.fillna(fill_values[x.name])))
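
When the filled values should keep the original shape and index, transform is the more idiomatic tool than apply for this kind of fill. A small sketch:

# transform broadcasts each group's mean back onto the original index
print(data.fillna(data.groupby(group_key).transform('mean')))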