Python for Data Analysis, Chapter 9


Starting with merge (Chapter 7), you can escape the long-running large-table joins in MySQL; with .loc (DataFrame selection) and .str (vectorized string handling), you no longer need MySQL UPDATEs; and with groupby (Chapter 9), you can mostly do away with MySQL's slow SELECT ... GROUP BY. MySQL is then used only to read the raw data and to write the final results, while all the intermediate data-cleaning steps happen in Python.
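A minimal sketch of that workflow, assuming SQLAlchemy for the connection; the connection string, the table names (orders, users) and the column names (user_id, status, is_failed) are made-up placeholders, not anything from the book or from my data:

import pandas as pd
from sqlalchemy import create_engine

# hypothetical connection; MySQL is touched only for two reads and one write
engine = create_engine('mysql+pymysql://user:pwd@localhost/mydb')
orders = pd.read_sql('SELECT * FROM orders', engine)
users = pd.read_sql('SELECT * FROM users', engine)

# merge instead of a large-table JOIN
merged = pd.merge(orders, users, on='user_id', how='left')

# .loc + .str instead of UPDATE ... WHERE
merged['is_failed'] = 0
merged.loc[merged['status'].str.startswith('ERR', na=False), 'is_failed'] = 1

# groupby instead of SELECT ... GROUP BY
summary = merged.groupby('user_id')['is_failed'].sum().reset_index()

# write only the final result back
summary.to_sql('order_summary', engine, if_exists='replace', index=False)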

There are several places where you can select the columns around a groupby, and their efficiency naturally differs (see the timing sketch after these four variants):

p_prs_23.groupby('NAT_AN')['is_failed'].sum()

p_prs_23[['NAT_AN','is_failed']].groupby('NAT_AN').sum()

p_prs_23.groupby('NAT_AN')[['NAT_AN','is_failed']].sum()

p_prs_23.groupby('NAT_AN').sum().reset_index()[['NAT_AN','is_failed']]
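A rough way to compare them on your own machine; p_prs_23 here is a synthetic stand-in, since the real table isn't shown, and the numbers will vary with data size and pandas version:

import numpy as np
import pandas as pd
import timeit

# synthetic stand-in for p_prs_23
p_prs_23 = pd.DataFrame({'NAT_AN': np.random.randint(0, 200, 1000000),
                         'is_failed': np.random.randint(0, 2, 1000000)})

variants = [
    lambda: p_prs_23.groupby('NAT_AN')['is_failed'].sum(),
    lambda: p_prs_23[['NAT_AN', 'is_failed']].groupby('NAT_AN').sum(),
    lambda: p_prs_23.groupby('NAT_AN')[['NAT_AN', 'is_failed']].sum(),
    lambda: p_prs_23.groupby('NAT_AN').sum().reset_index()[['NAT_AN', 'is_failed']],
]
for fn in variants:
    print(timeit.timeit(fn, number=10))  # seconds for 10 runs of each variant

Selecting only the columns you need before or right after the groupby (the first two forms) avoids aggregating columns that are thrown away afterwards, which is usually where the difference comes from.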

Chapter 9 code (lines marked # × are attempts that didn't work):

# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np

# p265: GroupBy mechanics
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': np.random.randn(5),
                   'data2': np.random.randn(5)})
grouped = df['data1'].groupby(df['key1'])
grouped.mean()
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means.unstack()
states = np.array(['ohio', 'california', 'california', 'ohio', 'ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby([states, years]).mean()
df.groupby('key1').mean()
df.groupby(['key1', 'key2']).mean()
df.groupby(['key1', 'key2']).size()

# Iterating over groups
for name, group in df.groupby('key1'):
    print name
    print group
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print k1, k2
    print group
pieces = dict(list(df.groupby('key1')))
pieces['b']
df.dtypes
grouped = df.groupby(df.dtypes, axis=1)
dict(list(grouped))

# Selecting a column or subset of columns
df.groupby('key1')['data1'].mean()
df['data1'].groupby(df['key1']).mean()
df.groupby('key1')[['data1']].mean()
df[['data1']].groupby(df['key1']).mean()
df.groupby(['key1', 'key2'])[['data1']].mean()
s_group = df.groupby(['key1', 'key2'])['data2']
s_group.mean()

# Grouping with dicts, Series, and functions
people = pd.DataFrame(np.random.randn(5, 5),
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['joe', 'steve', 'wes', 'jim', 'travis'])
people.ix[2:3, ['b', 'c']] = np.nan
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}
by_column = people.groupby(mapping, axis=1)
by_column.sum()
map_series = pd.Series(mapping)
people.groupby(map_series, axis=1).count()
people.groupby(len).sum()
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()

# Grouping by index levels
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'], [1, 3, 5, 1, 3]],
                                    names=['cty', 'tensor'])
hief_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hief_df.groupby(level='cty', axis=1).count()

# Data aggregation
grouped = df.groupby('key1')
grouped['data1'].quantile(0.9)
grouped['data1'].quantile(0)
grouped['data1'].quantile(1)
def peak_to_peak(arr):
    return arr.max() - arr.min()
grouped.agg(peak_to_peak)
grouped.describe()

# Column-wise and multiple-function aggregation (tips dataset)
tips = pd.read_csv('g:/tips.csv')
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips[:6]
grouped = tips.groupby(['sex', 'smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')
grouped_pct.agg(['mean', 'std', peak_to_peak])
grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])
functions = ['count', 'mean', 'max']
result = grouped['tip_pct', 'total_bill'].agg(functions)
result = grouped[['tip_pct', 'total_bill']].agg(functions)  # same as the previous line
result['tip_pct']
ftuples = [('durchschnitt', 'mean'), ('abweichung', np.var)]
grouped['tip_pct', 'total_bill'].agg(ftuples)
grouped.agg({'tip': np.max, 'size': 'sum'})
grouped.agg({'tip_pct': ['min', 'max', 'mean', 'std'], 'size': 'sum'})
tips.groupby(['sex', 'smoker'], as_index=False).mean()
tips.groupby(['sex', 'smoker']).mean().reset_index()  # same result as as_index=False

# Group-wise operations: transform
k1_means = df.groupby('key1').mean().add_prefix('mean_')
pd.merge(df, k1_means, left_on='key1', right_index=True)
key = ['one', 'two', 'one', 'two', 'one']
people.groupby(key).mean()
people.groupby(key).transform(np.mean)
people.groupby(key).transform().mean()  # × transform needs a function argument
def demean(arr):
    return arr - arr.mean()
demeaned = people.groupby(key).transform(demean)
demeaned.groupby(key).mean()

# apply: general split-apply-combine
def top(df, n=5, column='tip_pct'):
    return df.sort_index(by=column)[-n:]  # sorts ascending by default, so take the last n rows
top(tips, n=6)
tips.groupby('smoker').apply(top)
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')  # ×
result = tips.groupby('smoker')['tip_pct'].describe()
result.unstack('smoker')
f = lambda x: x.describe()
grouped.apply(f)
tips.groupby('smoker', group_keys=False).apply(top)

# Quantile and bucket analysis
frame = pd.DataFrame({'data1': np.random.randn(1000), 'data2': np.random.randn(1000)})
factor = pd.cut(frame.data1, 4)
factor[:10]
def get_stats(group):
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}
grouped = frame.data2.groupby(factor)
grouped.apply(get_stats).unstack()
grouping = pd.qcut(frame.data1, 10, labels=False)
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()

# Example: filling missing values with group-specific values
s = pd.Series(np.random.randn(6))
s[::2] = np.nan
s.fillna(s.mean())
states = ['ohio', 'new york', 'vermont', 'florida', 'oregon', 'nevada', 'california', 'idaho']
group_key = ['east'] * 4 + ['west'] * 4
data = pd.Series(np.random.randn(8), index=states)
data[['vermont', 'nevada', 'idaho']] = np.nan
data.groupby(group_key).mean()
fill_mean = lambda g: g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)
def xx(g):
    return g.fillna(g.mean())
data.groupby(group_key).apply(xx)
fill_values = {'east': 0.5, 'west': -1}
fill_func = lambda g: g.fillna(fill_values[g.name])
data.groupby(group_key).apply(fill_func)
def yy(g):
    return g.fillna(fill_values[g.name])
data.groupby(group_key).apply(yy)

# Example: random sampling and permutation (a deck of cards)
suits = ['h', 's', 'c', 'd']
card_val = (range(1, 11) + [10] * 3) * 4
base_names = ['a'] + range(2, 11) + ['j', 'q', 'k']
cards = []
for suit in suits:
    cards.extend(str(num) + suit for num in base_names)
cards = []
for suit in suits:
    for num in base_names:
        cards.extend(str(num) + suit)  # × extend on a string adds it character by character
deck = pd.Series(card_val, index=cards)
def draw(deck, n=5):
    return deck.take(np.random.permutation(len(deck))[:n])
draw(deck)
get_suit = lambda card: card[-1]
deck.groupby(get_suit).apply(draw, n=2)
deck.groupby(get_suit, group_keys=False).apply(draw, n=2)

# Example: group weighted average and correlation
df = pd.DataFrame({'category': ['a'] * 4 + ['b'] * 4,
                   'data': np.random.randn(8),
                   'weights': np.random.randn(8)})
grouped = df.groupby('category')
get_wavg = lambda g: np.average(g['data'], weights=g['weights'])
grouped.apply(get_wavg)
close_px = pd.read_csv('g:/stock_px.csv', parse_dates=True, index_col=0)
close_px[-4:]
rets = close_px.pct_change().dropna()
spx_corr = lambda x: x.corrwith(x['SPX'])
by_year = rets.groupby(lambda x: x.year)  # why does this work? groupby calls the function on each index label
by_year.apply(spx_corr)
by_year = rets.groupby(rets.index.year)  # this works too
by_year.apply(spx_corr)
by_year.apply(corrwith(rets.SPX))  # ×
by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))
by_year.apply(rets['AAPL'].corr(rets['MSFT']))  # × apply needs a callable, not a value

# Example: group-wise linear regression
import statsmodels.api as sm
def regress(data, yvar, xvars):
    y = data[yvar]
    x = data[xvars]
    x['intercept'] = 1.
    result = sm.OLS(y, x).fit()
    return result.params
by_year.apply(regress, 'AAPL', ['SPX'])

# Pivot tables and cross-tabulation
tips.pivot_table(index=['sex', 'smoker'])
tips.groupby(['sex', 'smoker']).mean()
tips.pivot_table(['tip_pct', 'size'], index=['sex', 'day'], columns='smoker')
tips.pivot_table(['tip_pct', 'size'], index=['sex', 'day'], columns='smoker', margins=True)
tips.pivot_table('tip_pct', index=['sex', 'smoker'], columns='day', aggfunc='count', margins=True)
tips.pivot_table('size', index=['time', 'sex', 'smoker'], columns='day', aggfunc=sum, fill_value=0)
data = pd.DataFrame({'sample': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                     'gender': ['f', 'm', 'f', 'm', 'm', 'm', 'f', 'f', 'm', 'f'],
                     'handedness': ['r', 'l', 'r', 'r', 'l', 'r', 'r', 'l', 'r', 'r']})
pd.crosstab(data.gender, data.handedness, margins=True)
pd.crosstab([tips.time, tips.day], tips.smoker, margins=True)

# Example: 2012 Federal Election Commission database
fec = pd.read_csv('g:/p_all.csv')
fec.ix[123456]
unique_cands = fec.cand_nm.unique()
unique_cands[2]
parties = {'Bachmann, Michelle': 'r', 'Romney, Mitt': 'r', 'Obama, Barack': 'd'}
fec.cand_nm[123456:123461]
fec.cand_nm[123456:123461].map(parties)
fec['party'] = fec.cand_nm.map(parties)
fec.party.value_counts()
fec.party.counts()  # × no such method; value_counts() is the right one
(fec.contb_receipt_amt > 0).value_counts()
fec = fec[fec.contb_receipt_amt > 0]
fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack', 'Romney, Mitt'])]
fec_mrbo = fec[fec.cand_nm.isin(('Obama, Barack', 'Romney, Mitt'))]
fec_mrbo = fec[fec.cand_nm in ['Obama, Barack', 'Romney, Mitt']]  # ×
fec_mrbo = fec[fec.cand_nm.str.isin('Obama, Barack', 'Romney, Mitt')]  # ×
fec_mrbo = fec[fec.cand_nm.str in ('Obama, Barack', 'Romney, Mitt')]  # ×
fec.contbr_occupation.value_counts()[:10]
occ_mapping = {'INFORMATION REQUESTED PER BEST EFFORTS': 'not provided',
               'INFORMATION REQUESTED': 'not provided',
               'INFORMATION REQUESTED (BEST EFFORTS)': 'not provided',
               'C.E.O.': 'CEO'}
f = lambda x: occ_mapping.get(x, x)
fec.contbr_occupation = fec.contbr_occupation.map(f)
emp_mapping = {'INFORMATION REQUESTED PER BEST EFFORTS': 'not provided',
               'INFORMATION REQUESTED': 'not provided',
               'SELF': 'SELF-EMPLOYED',
               'SELF EMPLOYED': 'SELF-EMPLOYED'}
f = lambda x: emp_mapping.get(x, x)
fec.contbr_employer = fec.contbr_employer.map(f)
by_occupation = fec.pivot_table('contb_receipt_amt', index='contbr_occupation',
                                columns='party', aggfunc='sum')
over_2mm = by_occupation[by_occupation.sum(1) > 2000000]
over_2mm.plot(kind='barh')
def get_top_amounts(group, key, n=5):
    totals = group.groupby(key)['contb_receipt_amt'].sum()
    return totals.order(ascending=False)[:n]
grouped = fec_mrbo.groupby('cand_nm')
grouped.apply(get_top_amounts, 'contbr_occupation', n=7)
grouped.apply(get_top_amounts, 'contbr_employer', n=7)

# Bucketing donation amounts
bins = np.array([0, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000])
labels = pd.cut(fec_mrbo.contb_receipt_amt, bins)
grouped = fec_mrbo.groupby(['cand_nm', labels])
grouped.size().unstack(0)
bucket_sums = grouped.contb_receipt_amt.sum().unstack(0)
normed_sum = bucket_sums.div(bucket_sums.sum(axis=1), axis=0)
normed_sum[:-2].plot(kind='barh', stacked=True)

# Donation statistics by state
grouped = fec_mrbo.groupby(['cand_nm', 'contbr_st'])
totals = grouped.contb_receipt_amt.sum().unstack(0).fillna(0)
totals[:10]
percent = totals.div(totals.sum(axis=1), axis=0)
percent = totals.div(totals.sum(1), axis=0)  # same as the previous line
percent[:10]

# p255: Haiti earthquake data plotted with Basemap
from mpl_toolkits.basemap import Basemap, cm
from matplotlib import rcParams
from matplotlib.collections import LineCollection
import matplotlib.pyplot as plt
from shapelib import ShapeFile
import dbflib

data = pd.read_csv('g:/haiti.csv')
data.head()
data.describe()
data = data[(data.LATITUDE > 18) & (data.LATITUDE < 20) &
            (data.LONGITUDE > -75) & (data.LONGITUDE < -70) &
            data.CATEGORY.notnull()]
def to_cat_list(catstr):
    stripped = (x.strip() for x in catstr.split(','))
    return [x for x in stripped if x]
def get_all_categories(cat_series):
    cat_sets = (set(to_cat_list(x)) for x in cat_series)
    return sorted(set.union(*cat_sets))
def get_english(cat):
    code, names = cat.split('.')
    if '|' in names:
        names = names.split('|')[1]
    return code, names
all_cats = get_all_categories(data.CATEGORY)
english_mapping = dict(get_english(x) for x in all_cats)
english_mapping['2a']
def get_code(seq):
    return [x.split('.')[0] for x in seq if x]
all_codes = get_code(all_cats)
all_cats.str.split('.')[0]  # × 'list' object has no attribute 'str'
code_index = pd.Index(np.unique(all_codes))
dummy_frame = pd.DataFrame(np.zeros((len(data), len(code_index))),
                           index=data.index, columns=code_index)
dummy_frame.ix[:, :6]
for row, cat in zip(data.index, data.CATEGORY):
    codes = get_code(to_cat_list(cat))
    dummy_frame.ix[row, codes] = 1
data = data.join(dummy_frame.add_prefix('category_'))
data.ix[:, 10:15]
def basic_haiti_map(ax=None, lllat=17.25, urlat=20.25, lllon=-75, urlon=-71):
    m = Basemap(ax=ax, projection='stere',
                lon_0=(urlon + lllon) / 2, lat_0=(urlat + lllat) / 2,
                llcrnrlat=lllat, urcrnrlat=urlat, llcrnrlon=lllon, urcrnrlon=urlon,
                resolution='f')
    m.drawcoastlines()
    m.drawstates()
    m.drawcountries()
    return m
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
fig.subplots_adjust(hspace=0.05, wspace=0.05)
to_plot = ['2a', '1', '3c', '7a']
lllat = 17.25; urlat = 20.25; lllon = -75; urlon = -71
for code, ax in zip(to_plot, axes.flat):
    m = basic_haiti_map(ax, lllat=lllat, urlat=urlat, lllon=lllon, urlon=urlon)
    cat_data = data[data['category_%s' % code] == 1]
    # x, y = m(cat_data.LONGITUDE, cat_data.LATITUDE)
    add_v = pd.DataFrame()
    for i, j in zip(cat_data.LONGITUDE, cat_data.LATITUDE):
        x, y = m(i, j)
        add_v = add_v.append(pd.DataFrame([x, y]).T)
    # m.plot(x, y, 'k.', alpha=0.5)
    m.plot(add_v[0], add_v[1], 'k.', alpha=0.5)
    ax.set_title('%s:%s' % (code, english_mapping[code]))

# the same inner loop run once at top level, for checking
add_v = pd.DataFrame()
for i, j in zip(cat_data.LONGITUDE, cat_data.LATITUDE):
    x, y = m(i, j)
    add_v = add_v.append(pd.DataFrame([x, y]).T)
m.plot(add_v[0], add_v[1], 'k.', alpha=0.5)

# variant passing the whole Series to m() at once
for code, ax in zip(to_plot, axes.flat):
    m = basic_haiti_map(ax, lllat=lllat, urlat=urlat, lllon=lllon, urlon=urlon)
    cat_data = data[data['category_%s' % code] == 1]
    x, y = m(cat_data.LONGITUDE, cat_data.LATITUDE)
    m.plot(x, y, 'k.', alpha=0.5)
    ax.set_title('%s:%s' % (code, english_mapping[code]))
