第九章 数据分组与聚合(下)

来源:互联网 发布:有关网络直播的论文 编辑:程序博客网 时间:2024/06/04 19:38



import pandas as pdfrom pandas import Seriesfec=pd.read_csv("e:/P00000001-ALL.csv")fec[:2]     cmte_id    cand_id             cand_nm        contbr_nm contbr_city  \0  C00410118  P20002978  Bachmann, Michelle  HARVEY, WILLIAM      MOBILE   1  C00410118  P20002978  Bachmann, Michelle  HARVEY, WILLIAM      MOBILE     contbr_st  contbr_zip contbr_employer contbr_occupation  contb_receipt_amt  \0        AL  3.6601e+08         RETIRED           RETIRED              250.0   1        AL  3.6601e+08         RETIRED           RETIRED               50.0     contb_receipt_dt receipt_desc memo_cd memo_text form_tp  file_num  0        20-JUN-11          NaN     NaN       NaN   SA17A    736166  1        23-JUN-11          N
fec.ix[123456]
unique_cands
cmte_id                             C00431445
cand_id                             P80003338
cand_nm                         Obama, Barack
contbr_nm                         ELLMAN, IRA
contbr_city                             TEMPE
contbr_st                                  AZ
contbr_zip                          852816719
contbr_employer      ARIZONA STATE UNIVERSITY
contbr_occupation                   PROFESSOR
contb_receipt_amt                          50
contb_receipt_dt                    01-DEC-11
receipt_desc                              NaN
memo_cd                                   NaN
memo_text                                 NaN
form_tp                                 SA17A
file_num                               772372
Name: 123456, dtype: object
unique_cands=fec.cand_nm.unique()
patries={'Bachmann, Michelle':'Republican','Cain, Herman':'Republican',         'Gingrich, Newt':'Republican','Huntsman, Jon':'Republican',         'Johnson, Gary Earl':'Republican','McCotter, Thaddeus G':'Republican',         'Obama, Barack':'Democrat','Paul, Ron':'Republican',         'Pawlenty, Timothy':'Republican','Perry, Rick':'Republican',         "Roemer, Charles E. 'Buddy' III":'Republican',         'Romney, Mitt':'Republican',         'Santorum, Rick':'Republican'         }
unique_cands[2]
'Obama, Barack'fec.cand_nm[123456:123461]
123456    Obama, Barack
123457    Obama, Barack
123458    Obama, Barack
123459    Obama, Barack
123460    Obama, Barack
Name: cand_nm, dtype: object
fec.cand_nm[123456:123461].map(patries)#通过这个映射以及Series对象的map方法,根据候选人姓名得到党派信息123456    Democrat
123457    Democrat
123458    Democrat
123459    Democrat
123460    Democrat
Name: cand_nm, dtype: object
#添加一个新列fec['party']=fec.cand_nm.map(patries)fec['party'].value_counts()
Democrat      593746
Republican    407985
Name: party, dtype: int64
#这里需要注意:该数字既包括赞助也包括退款(负的出资额),限定该数据集只能有正的出资额

fec=fec[fec.contb_receipt_amt>0]

#由于Barack Obama和Mitt Romney是最主要的两名候选人,建立一个子集只包含针对他们两人的竞选活动的赞助信息

fec_mrbo=fec[fec.cand_nm.isin(['Obama, Barack','Romney, Mitt'])]





根据职业和雇主统计信息

#首先根据职业计算总出资额fec.contbr_occupation.value_counts()[:10]Out[30]: RETIRED                                   233990INFORMATION REQUESTED                      35107ATTORNEY                                   34286HOMEMAKER                                  29931PHYSICIAN                                  23432INFORMATION REQUESTED PER BEST EFFORTS     21138ENGINEER                                   14334TEACHER                                    13990CONSULTANT                                 13273PROFESSOR                                  12555Name: contbr_occupation, dtype: int64#巧妙运用dict.get,允许没有映射关系的职业也能通过,处理职业信息occ_mapping={             'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',             'INFORMATION REQUESTED':'NOT PROVIDED',             'INFORMATION REQUESTED(BEST EFFORTS)':'NOT PROVIDED',             'C.E.O.':'CEO'             }#如果没有提供相关映射,则返回xf=lambda x:occ_mapping.get(x,x)fec.contbr_employer=fec.contbr_employer.map(f)#通过pivot_table根据党派和职业对数据进行聚合,过滤掉总出资额不足200万美元的数据by_occupation=fec.pivot_table('contb_receipt_amt',index='contbr_occupation',                              columns='party',aggfunc='sum')over_2mm=by_occupation[by_occupation.sum(1)>2000000]over_2mmOut[42]: party                                      Democrat    Republicancontbr_occupation                                                ATTORNEY                                11141982.97  7.477194e+06C.E.O.                                      1690.00  2.592983e+06CEO                                      2073284.79  1.618057e+06CONSULTANT                               2459912.71  2.544725e+06ENGINEER                                  951525.55  1.818374e+06EXECUTIVE                                1355161.05  4.138850e+06HOMEMAKER                                4248875.80  1.363428e+07INFORMATION REQUESTED                    4866973.96  3.896616e+06INFORMATION REQUESTED PER BEST EFFORTS          NaN  1.634053e+07INVESTOR                                  884133.00  2.431769e+06LAWYER                                   3160478.87  3.912243e+05MANAGER                                   762883.22  1.444532e+06OWNER                                    1001567.36  2.408287e+06PHYSICIAN                                3735124.94  3.594320e+06PRESIDENT                                1878509.95  4.720924e+06PROFESSOR                                2165071.08  2.967027e+05REAL ESTATE                               528902.09  1.625902e+06RETIRED                                 25305116.38  2.356124e+07SELF-EMPLOYED                             672393.40  1.640253e+06import matplotlib%matplotlib inlineover_2mm.plot(kind='barh')

def get_top_amounts(group,key,n=5):    totals=group.groupby(key)['contb_receipt_amt'].sum()#    根据key对totals进行降序排列    return totals.sort_values(ascending=False)[n:]#根据职业和雇主进行聚合grouped=fec_mrbo.groupby('cand_nm')grouped.apply(get_top_amounts,'contbr_occupation',n=7)Out[56]: cand_nm        contbr_occupation                     Obama, Barack  PROFESSOR                                 2165071.08               CEO                                       2073284.79               PRESIDENT                                 1878509.95               NOT EMPLOYED                              1709188.20               EXECUTIVE                                 1355161.05               TEACHER                                   1250969.15               WRITER                                    1084188.88               OWNER                                     1001567.36               ENGINEER                                   951525.55               INVESTOR                                   884133.00               ARTIST                                     763125.00               MANAGER                                    762883.22               SELF-EMPLOYED                              672393.40               STUDENT                                    628099.75               REAL ESTATE                                528902.09               CHAIRMAN                                   496547.00               ARCHITECT                                  483859.89               DIRECTOR                                   471741.73               BUSINESS OWNER                             449979.30               EDUCATOR                                   436600.89               PSYCHOLOGIST                               427299.92               SOFTWARE ENGINEER                          396985.65               PARTNER                                    395759.50               SALES                                      392886.91               EXECUTIVE DIRECTOR                         348180.94               MANAGING DIRECTOR                          329688.25               SOCIAL WORKER                              326844.43               VICE PRESIDENT                             325647.15               ADMINISTRATOR                              323079.26               SCIENTIST                                  319227.88   Romney, Mitt   NON-PROFIT VETERANS ORG. CHAIR/ANNUITA         10.00               PARAPLANNER                                    10.00               APPRAISAL                                      10.00               SIGN CONTRACTOR                                10.00               POLITICAL OPERATIVE                            10.00               PORT MGT                                       10.00               PRESIDENT EMERITUS                             10.00               CONTRACTS SPECIALIST                            9.00               TEACHER & FREE-LANCE JOURNALIST                 9.00               FOUNDATION CONSULTANT                           6.00               MAIL HANDLER                                    6.00               TREASURER & DIRECTOR OF FINANCE                 6.00               SECRETARY/BOOKKEPPER                            6.00               ELAYNE WELLS HARMER                             6.00               CHICKEN GRADER                                  5.00               DIRECTOR REISCHAUER CENTER FOR EAST A           5.00               SCOTT GREENBAUM                                 5.00               EDUCATION ADMIN                                 5.00               ENGINEER/RISK EXPERT                            5.00               PLANNING AND OPERATIONS ANALYST                 5.00               VILLA NOVA                                      5.00               FINANCIAL INSTITUTION - CEO                     5.00               HORTICULTURIST                                  5.00               MD - UROLOGIST                                  5.00               DISTRICT REPRESENTATIVE                         5.00               INDEPENDENT PROFESSIONAL                        3.00               REMODELER & SEMI RETIRED                        3.00               AFFORDABLE REAL ESTATE DEVELOPER                3.00               IFC CONTRACTING SOLUTIONS                       3.00               3RD GENERATION FAMILY BUSINESS OWNER            3.00Name: contb_receipt_amt, dtype: float64
grouped.apply(get_top_amounts,'contbr_occupation',n=10)Out[57]: cand_nm        contbr_occupation                     Obama, Barack  NOT EMPLOYED                              1709188.20               EXECUTIVE                                 1355161.05               TEACHER                                   1250969.15               WRITER                                    1084188.88               OWNER                                     1001567.36               ENGINEER                                   951525.55               INVESTOR                                   884133.00               ARTIST                                     763125.00               MANAGER                                    762883.22               SELF-EMPLOYED                              672393.40               STUDENT                                    628099.75               REAL ESTATE                                528902.09               CHAIRMAN                                   496547.00               ARCHITECT                                  483859.89               DIRECTOR                                   471741.73               BUSINESS OWNER                             449979.30               EDUCATOR                                   436600.89               PSYCHOLOGIST                               427299.92               SOFTWARE ENGINEER                          396985.65               PARTNER                                    395759.50               SALES                                      392886.91               EXECUTIVE DIRECTOR                         348180.94               MANAGING DIRECTOR                          329688.25               SOCIAL WORKER                              326844.43               VICE PRESIDENT                             325647.15               ADMINISTRATOR                              323079.26               SCIENTIST                                  319227.88               VOLUNTEER                                  305233.64               FINANCE                                    296031.40               MARKETING                                  263610.68   Romney, Mitt   NON-PROFIT VETERANS ORG. CHAIR/ANNUITA         10.00               PARAPLANNER                                    10.00               APPRAISAL                                      10.00               SIGN CONTRACTOR                                10.00               POLITICAL OPERATIVE                            10.00               PORT MGT                                       10.00               PRESIDENT EMERITUS                             10.00               CONTRACTS SPECIALIST                            9.00               TEACHER & FREE-LANCE JOURNALIST                 9.00               FOUNDATION CONSULTANT                           6.00               MAIL HANDLER                                    6.00               TREASURER & DIRECTOR OF FINANCE                 6.00               SECRETARY/BOOKKEPPER                            6.00               ELAYNE WELLS HARMER                             6.00               CHICKEN GRADER                                  5.00               DIRECTOR REISCHAUER CENTER FOR EAST A           5.00               SCOTT GREENBAUM                                 5.00               EDUCATION ADMIN                                 5.00               ENGINEER/RISK EXPERT                            5.00               PLANNING AND OPERATIONS ANALYST                 5.00               VILLA NOVA                                      5.00               FINANCIAL INSTITUTION - CEO                     5.00               HORTICULTURIST                                  5.00               MD - UROLOGIST                                  5.00               DISTRICT REPRESENTATIVE                         5.00               INDEPENDENT PROFESSIONAL                        3.00               REMODELER & SEMI RETIRED                        3.00               AFFORDABLE REAL ESTATE DEVELOPER                3.00               IFC CONTRACTING SOLUTIONS                       3.00               3RD GENERATION FAMILY BUSINESS OWNER            3.00Name: contb_receipt_amt, dtype: float64
对出资额分组

import numpy as npbins=np.array([0,1,10,100,1000,10000,100000,1000000,10000000])labels=pd.cut(fec_mrbo.contb_receipt_amt,bins)labels411           (10, 100]412         (100, 1000]413         (100, 1000]414           (10, 100]415           (10, 100]416           (10, 100]417         (100, 1000]418           (10, 100]419         (100, 1000]420           (10, 100]421           (10, 100]422         (100, 1000]423         (100, 1000]424         (100, 1000]425         (100, 1000]426         (100, 1000]427       (1000, 10000]428         (100, 1000]429         (100, 1000]430           (10, 100]431       (1000, 10000]432         (100, 1000]433         (100, 1000]434         (100, 1000]435         (100, 1000]436         (100, 1000]437           (10, 100]438         (100, 1000]439         (100, 1000]440           (10, 100]     701356        (10, 100]701357          (1, 10]701358        (10, 100]701359        (10, 100]701360        (10, 100]701361        (10, 100]701362      (100, 1000]701363        (10, 100]701364        (10, 100]701365        (10, 100]701366        (10, 100]701367        (10, 100]701368      (100, 1000]701369        (10, 100]701370        (10, 100]701371        (10, 100]701372        (10, 100]701373        (10, 100]701374        (10, 100]701375        (10, 100]701376    (1000, 10000]701377        (10, 100]701378        (10, 100]701379      (100, 1000]701380    (1000, 10000]701381        (10, 100]701382      (100, 1000]701383          (1, 10]701384        (10, 100]701385      (100, 1000]Name: contb_receipt_amt, dtype: categoryCategories (8, object): [(0, 1] < (1, 10] < (10, 100] < (100, 1000] < (1000, 10000] < (10000, 100000] < (100000, 1000000] < (1000000, 10000000]]#根据候选人姓名以及面元标签对数据进行分组grouped=fec_mrbo.groupby(['cand_nm',labels])grouped.size().unstack(0)Out[67]: cand_nm              Obama, Barack  Romney, Mittcontb_receipt_amt                               (0, 1]                       493.0          77.0(1, 10]                    40070.0        3681.0(10, 100]                 372280.0       31853.0(100, 1000]               153991.0       43357.0(1000, 10000]              22284.0       26186.0(10000, 100000]                2.0           1.0(100000, 1000000]              3.0           NaN(1000000, 10000000]            4.0           NaN#对出资额求合并在面元内规格化,以便图形化显示两位候选人各种赞助额度bucket_sumsOut[69]: cand_nm              Obama, Barack  Romney, Mittcontb_receipt_amt                               (0, 1]                      318.24         77.00(1, 10]                  337267.62      29819.66(10, 100]              20288981.41    1987783.76(100, 1000]            54798531.46   22363381.69(1000, 10000]          51753705.67   63942145.42(10000, 100000]           59100.00      12700.00(100000, 1000000]       1490683.08           NaN(1000000, 10000000]     7148839.76           NaN
normed_sums=bucket_sums.div(bucket_sums.sum(axis=1),axis=0)normed_sumsOut[71]: cand_nm              Obama, Barack  Romney, Mittcontb_receipt_amt                               (0, 1]                    0.805182      0.194818(1, 10]                   0.918767      0.081233(10, 100]                 0.910769      0.089231(100, 1000]               0.710176      0.289824(1000, 10000]             0.447326      0.552674(10000, 100000]           0.823120      0.176880(100000, 1000000]         1.000000           NaN(1000000, 10000000]       1.000000           NaN
normed_sums[:-2].plot(kind='barh',stacked=True)


#根据州统计赞助信息
















阅读全文
0 0
原创粉丝点击