
来源:互联网 发布:arcgis api for js 编辑:程序博客网 时间:2024/06/04 01:24

1 统计婴儿姓名


1.1 下载数据


import pandas as pd

# Location of the 1880 file; it has no header row (the first line is already data).
yob1880_path = r'E:\python\pythonDataAnalysis\pydata-book-master\ch02\names\yob1880.txt'

# Look at the first raw line; the context manager closes the file afterwards.
with open(yob1880_path) as raw_file:
    print(raw_file.readline())

# Column names are supplied explicitly because the file carries none.
names1880 = pd.read_csv(yob1880_path, names=['name', 'sex', 'births'])
print(type(names1880))
names1880[:5]
  Mary,F,7065    <class 'pandas.core.frame.DataFrame'>
name sex births 0 Mary F 7065 1 Anna F 2604 2 Emma F 2003 3 Elizabeth F 1939 4 Minnie F 1746

1.2 查看1880年男女婴儿的出生数

# Basic dimensions of the 1880 table.
print(names1880.shape)
print(len(names1880))
print(names1880.size)  # size = rows * columns (2000 * 3 = 6000 in the output below)

# Aggregate the 1880 table by sex to get the number of births per sex.
names1880.groupby(['sex']).sum()
(2000, 3)    2000    6000
births sex F 90993 M 110493

1.3 实现多个txt文本文件的融合(多个DataFrame的联结)

# Load one file per year. range() excludes its stop value, so this covers
# 1880 through 2010 (the concatenated data below indeed ends at 2010).
years = range(1880, 2011)
columns = ['name', 'sex', 'births']

pieces = []
for year in years:
    path = r'E:\python\pythonDataAnalysis\pydata-book-master\ch02\names\yob%d.txt' % year
    frame = pd.read_csv(path, names=columns)
    frame['year'] = year  # remember which file each row came from
    pieces.append(frame)

# ignore_index=True replaces the per-file row numbers with one continuous index.
names = pd.concat(pieces, ignore_index=True)
print(len(names))
print(names.shape)
    1690784    (1690784, 4)

1.4 统计并可视化每年不同性别婴儿的出生数量


# Total births per year (rows) and sex (columns).
total_births_by_sex = pd.pivot_table(
    names,
    values='births',
    index='year',
    columns='sex',
    aggfunc=sum,
)
total_births_by_sex.tail()  # tail() shows the last 5 rows by default
sex F M year 2006 1896468 2050234 2007 1916888 2069242 2008 1883645 2032310 2009 1827643 1973359 2010 1759010 1898382
# NOTE(review): the original line was cut off after "import matplotlib.pyplot as";
# the conventional alias `plt` and the show() call are a reconstruction - confirm.
import matplotlib.pyplot as plt

# Line chart of yearly births, one line per sex.
total_births_by_sex.plot(title='total births by sex and year')
plt.show()


1.5 找出最受欢迎的名字


def add_prop(group):
    """Add a 'prop' column: each row's share of the group's total births.

    Args:
        group: DataFrame for one (year, sex) group with a 'births' column.

    Returns:
        The same DataFrame, with the new 'prop' column attached.
    """
    # Cast to float first so the division is never integer division (Python 2).
    births = group.births.astype(float)
    group['prop'] = births / births.sum()
    return group


names = names.groupby(['year', 'sex']).apply(add_prop)
name sex births year prop 0 Mary F 7065 1880 0.077643 1 Anna F 2604 1880 0.028618 2 Emma F 2003 1880 0.022013 3 Elizabeth F 1939 1880 0.021309 4 Minnie F 1746 1880 0.019188
import numpy as np

# Sanity check: within every (year, sex) group the proportions should sum to
# (approximately) 1; allclose tolerates floating-point rounding.
np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)
def get_top1000(group):
    """Return the 1000 rows of `group` with the most births, largest first.

    Args:
        group: DataFrame for one (year, sex) group with a 'births' column.

    Returns:
        A DataFrame of at most 1000 rows sorted by 'births' descending.
    """
    # sort_values replaces the deprecated sort_index(by=...).
    # The slice is what actually limits the result to the top 1000; without it
    # every row is kept (the info() output recorded in this article, showing
    # 1690784 entries, was produced without the slice).
    return group.sort_values(by='births', ascending=False)[:1000]


top1000 = names.groupby(['year', 'sex']).apply(get_top1000)
    C:\Program Files\anaconda\lib\site-packages\ipykernel\ FutureWarning: by argument to sort_index is deprecated, pls use .sort_values(by=...)
      from ipykernel import kernelapp as app

查看top1000的相关信息
    <class 'pandas.core.frame.DataFrame'>    MultiIndex: 1690784 entries, (1880, F, 0) to (2010, M, 1690783)    Data columns (total 5 columns):    name      1690784 non-null object    sex       1690784 non-null object    births    1690784 non-null int64    year      1690784 non-null int64    prop      1690784 non-null float64    dtypes: float64(1), int64(2), object(2)    memory usage: 77.4+ MB


name sex births year prop year sex 1880 F 0 Mary F 7065 1880 0.077643 1 Anna F 2604 1880 0.028618 2 Emma F 2003 1880 0.022013 3 Elizabeth F 1939 1880 0.021309 4 Minnie F 1746 1880 0.019188


1.6 分析并可视化某个名字的随时间变化趋势

# Split top1000 into boys and girls.
# NOTE(review): the mask expression was lost in the original text; filtering on
# the 'sex' column is a reconstruction consistent with the output shown below.
boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']
boys[:5]
name sex births year prop year sex 1880 M 942 John M 9655 1880 0.087381 943 William M 9533 1880 0.086277 944 James M 5927 1880 0.053641 945 Charles M 5348 1880 0.048401 946 George M 5126 1880 0.046392
name sex births year prop year sex 1880 F 0 Mary F 7065 1880 0.077643 1 Anna F 2604 1880 0.028618 2 Emma F 2003 1880 0.022013 3 Elizabeth F 1939 1880 0.021309 4 Minnie F 1746 1880 0.019188
# Pivot table of births: one row per year, one column per name.
total_births = top1000.pivot_table(
    values='births',
    index='year',
    columns='name',
    aggfunc=sum,
)
name Aaban Aabid Aabriella Aadam Aadan Aadarsh Aaden Aadesh Aadhav Aadhavan … Zyrus Zysean Zyshaun Zyshawn Zyshon Zyshonne Zytavious Zyvion Zyyanna Zzyzx year 1880 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN … NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1881 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN … NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1882 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN … NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1883 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN … NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1884 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN … NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 88496 columns
    <class 'pandas.core.frame.DataFrame'>    Int64Index: 131 entries, 1880 to 2010    Columns: 88496 entries, Aaban to Zzyzx    dtypes: float64(88496)    memory usage: 88.4 MB
# Yearly birth counts for a single name.
subset = total_births['John']
subset[:5]
    year    1880    9701.0    1881    8795.0    1882    9597.0    1883    8934.0    1884    9427.0    Name: John, dtype: float64
# Yearly birth counts for a handful of names (one column per name).
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
subset[:5]
name John Harry Mary Marilyn year 1880 9701.0 2158.0 7092.0 NaN 1881 8795.0 2002.0 6948.0 NaN 1882 9597.0 2246.0 8179.0 NaN 1883 8934.0 2116.0 8044.0 NaN 1884 9427.0 2338.0 9253.0 NaN
# Visualize subset, one subplot per name.
# Author's note: the figure shown came from the third run; the first two runs
# had minor problems.
subset.plot(
    subplots=True,
    figsize=(12, 10),
    grid=False,
    title='Number of births per year',
)



1.7 命名多样性增加


# Sum of 'prop' per year (rows) and sex (columns) over the rows in top1000.
tabal = top1000.pivot_table(
    values='prop',
    index='year',
    columns='sex',
    aggfunc=sum,
)
tabal[:5]
sex F M year 1880 1.0 1.0 1881 1.0 1.0 1882 1.0 1.0 1883 1.0 1.0 1884 1.0 1.0


# Variant with explicit axis ticks, left disabled by the author:
# tabal.plot(title ='sum of table1000.prop by year and sex',yticks = np.linspace(0,1.2,20),xticks=range(1880,2020,10))
tabal.plot(title='sum of table1000.prop by year and sex')





def get_quantile_count(group, q=0.5):
    """Count how many of the most popular names are needed to reach share `q`.

    Args:
        group: DataFrame for one (year, sex) group with a 'prop' column.
        q: Cumulative proportion to reach (0.5 = half of the births).

    Returns:
        The 1-based position, in descending 'prop' order, at which the
        cumulative sum of 'prop' first reaches `q`.
    """
    # sort_values replaces the deprecated sort_index(by=...).
    ranked = group.sort_values(by='prop', ascending=False)
    # Search on the underlying NumPy array: for a scalar q it returns a scalar,
    # so no [0] indexing is needed regardless of what Series.searchsorted
    # returns. searchsorted is 0-based, hence the +1.
    return ranked.prop.cumsum().values.searchsorted(q) + 1


diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)
print(type(diversity))
diversity[:5]
    C:\Program Files\anaconda\lib\site-packages\ipykernel\ FutureWarning: by argument to sort_index is deprecated, pls use .sort_values(by=...)      from ipykernel import kernelapp as app    <class 'pandas.core.series.Series'>
year  sex
1880  F      38
      M      14
1881  F      38
      M      14
1882  F      38
dtype: int64
# A Series with a multi-level index can be unstacked: the 'sex' level becomes
# the columns, leaving 'year' as the row index.
diversity = diversity.unstack('sex')
diversity[:5]
sex F M year 1880 38 14 1881 38 14 1882 38 15 1883 39 15 1884 39 16
# Chart the per-year diversity counts, one line per sex.
diversity.plot(
    title='number of popular names in top 50%',
)



1.8 男孩名与女孩名字的混用情况



# Collect the set of all names that occur in top1000 (no duplicates).
# NOTE(review): the right-hand side was lost in the original text; unique() on
# the 'name' column is a reconstruction matching the array output shown below.
all_names = top1000.name.unique()
all_names[:5]
    array(['Mary', 'Anna', 'Emma', 'Elizabeth', 'Minnie'], dtype=object)
# Boolean mask: True where the lower-cased name contains 'lesl' anywhere.
mask = np.array(['lesl' in x.lower() for x in all_names])
mask
  array([False, False, False, ..., False, False, False], dtype=bool)
# Names containing 'lesl' anywhere.
lesley_like = all_names[mask]
print(lesley_like.shape)

# Names that start with 'lesl' (a stricter filter than the mask above).
lesl_like = np.array([x for x in all_names if x.lower().startswith('lesl')])
print(lesl_like.shape)
    (24L,)    (21L,)
# Keep only the rows whose name is one of the 'lesl...' names.
# NOTE(review): the indexer was lost in the original text (`top1000[]`);
# isin(lesl_like) is a reconstruction - the grouped output below lists 21
# names, matching lesl_like's shape of (21,).
lesl_top1000 = top1000[top1000.name.isin(lesl_like)]

# Total births per name across all years and both sexes.
lesl_top1000.groupby(['name']).births.sum()
    name    Lesle            187    Leslea           349    Leslee          4863    Leslei            52    Lesleigh         436    Lesley         37945    Lesleyann         86    Lesleyanne        80    Lesli           5473    Leslian           27    Lesliann           6    Leslianne         10    Leslie        371686    Leslieann        465    Leslieanne        93    Lesliee            8    Leslly             5    Lesly          12407    Leslyann          16    Leslye          2295    Leslyn           166    Name: births, dtype: int64
name sex births year prop year sex 1880 F 654 Leslie F 8 1880 0.000088 M 1108 Leslie M 79 1880 0.000715 1881 F 2523 Leslie F 11 1881 0.000120 M 3072 Leslie M 92 1881 0.000913 1882 F 4593 Leslie F 9 1882 0.000083


# Births of the 'lesl...' names per year (rows) and sex (columns).
lesl_pivot = lesl_top1000.pivot_table(
    values='births',
    index='year',
    columns='sex',
    aggfunc=sum,
)
lesl_pivot[:5]
sex F M year 1880 8 79 1881 11 92 1882 9 128 1883 7 125 1884 15 125
# Per-column matplotlib line formats: solid black for M, dashed black for F.
lesl_pivot.plot(
    style={
        'M': 'k-',
        'F': 'k--',
    },
)



# Normalize each year: divide every row by its row total so that the F and M
# shares sum to 1 within a year.
lesl_pivot_div = lesl_pivot.div(lesl_pivot.sum(axis=1), axis=0)
lesl_pivot_div.tail()
sex F M year 2006 0.979139 0.020861 2007 0.978508 0.021492 2008 0.977437 0.022563 2009 0.971627 0.028373 2010 0.978482 0.021518
# Per-column matplotlib line formats: solid black for M, dashed black for F.
lesl_pivot_div.plot(
    style={
        'M': 'k-',
        'F': 'k--',
    },
)



2 总结

