利用python进行数据分析-pandas.concat/subplots/groupby/pivot_table，多文件整合、聚合、分组、子图

来源：互联网发布：36o软件管家下载编辑：程序博客网时间：2024/05/29 16:02

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# One input file per year. range() excludes its stop value, so the files
# actually read are yob1880.txt through yob2010.txt.
years = range(1880, 2011)

# Column names for the header-less CSV files.
columns = ['name', 'sex', 'births']

# Read every yearly file into its own frame and tag each row with its year.
pieces = []
for year in years:
    # Common path pattern shared by all yearly files.
    path = 'D:\\python program\\names\\babynames\\yob%d.txt' % year
    frame = pd.read_csv(path, names=columns)
    frame['year'] = year
    pieces.append(frame)

# Merge all frames into a single DataFrame. ignore_index=True is required so
# the per-file row numbers are dropped instead of being carried over.
names = pd.concat(pieces, ignore_index=True)

# Total births aggregated by year (rows) and sex (columns).
# The string 'sum' is used instead of the builtin sum, which newer pandas
# releases warn about when passed as aggfunc.
total_births = pd.pivot_table(names, index='year', columns=['sex'],
                              values=['births'], aggfunc='sum')
# print(total_births.tail())  # uncomment to inspect the last rows

# Plot the yearly totals.
plt.plot(total_births)
plt.title('total births by sex and year')
plt.show()

此时生成的names数据为：

---------------------------------------------------------------------
# Group the rows by year and sex and total the births inside each group.
# The result is not assigned to a name here.
names.groupby(['year', 'sex'])['births'].sum()
----------------------------------------------------------------------------------------------
# Add a "prop" column: the share of babies given each name relative to the
# total number of births in the same (year, sex) group.
def add_prop(group):
    """Add a ``prop`` column holding each row's share of the group's births.

    Args:
        group: DataFrame for one group; must contain a ``births`` column.

    Returns:
        The same ``group`` object with the ``prop`` column added.
    """
    # Convert to float before dividing (the original note warns that
    # integer division would round down).
    births = group.births.astype(float)
    group['prop'] = births / births.sum()
    return group


# Attach the new column to every (year, sex) group. group_keys=False keeps
# the original row index, so 'year' and 'sex' remain plain columns rather
# than also becoming index levels (which would make later groupby calls on
# those names ambiguous).
names = names.groupby(['year', 'sex'], group_keys=False).apply(add_prop)
图为：
------------------------------------------------------------------------------
进行验证：通过np.allclose检查每个分组的prop总计值是否为1，是则输出True
# Sanity check: the prop values of every (year, sex) group should add up
# to 1 (within floating-point tolerance); prints True when they do.
print(
    np.allclose(
        names.groupby(['year', 'sex'])['prop'].sum(),
        1,
    )
)
------------------------------------------------------------------------------------
# Collect a subset of the data: the top 1000 names for every
# (year, sex) combination.
def get_top1000(group):
    """Return the rows of ``group`` with the most births, at most 1000.

    Args:
        group: DataFrame for one group; must contain a ``births`` column.

    Returns:
        A DataFrame sorted by ``births`` in descending order and cut to
        its first 1000 rows.
    """
    # Sort on the single column 'births', largest first.
    return group.sort_values(by='births', ascending=False)[0:1000]


# group_keys=False keeps the original row index so that 'year' and 'sex'
# stay ordinary columns; later pivot_table/groupby calls address them by
# column name.
grouped = names.groupby(['year', 'sex'], group_keys=False)
top1000 = grouped.apply(get_top1000)
图：

-----------------------------------------------------------------------------------------------

分别索引男、女

# Split the top-1000 table by the value of its 'sex' column.
boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']

---------------------------------------------------------------------------------------

基于top1000数据，生成按year和name统计的出生总数透视表

# Total births per year (rows) and name (columns) within the top-1000 data.
# values is passed as a list, so the result is accessed through its
# 'births' level. The string 'sum' replaces the builtin sum, which newer
# pandas releases warn about when passed as aggfunc.
total_births = pd.pivot_table(top1000, values=['births'], index=['year'],
                              columns='name', aggfunc='sum')

--------------------------------------------------------------------------------------

在一张画布上查看4个名字的使用量随年份的变化，建立4个折线子图

# Select four names from the 'births' columns and draw one line chart per
# name (subplots=True) on a single figure.
subset = total_births.births[['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(subplots=True, grid=False, figsize=(12, 10),
            title="numbers of births per year")
plt.show()

图：

--------------------------------------------------------------------------------------------

计算最流行的1000个名字所占的比例，按照year和sex聚合并绘图

# Sum of prop over the top-1000 names, by year (rows) and sex (columns).
# The string 'sum' replaces the builtin sum, which newer pandas releases
# warn about when passed as aggfunc.
table = pd.pivot_table(top1000, values='prop', index=['year'],
                       columns='sex', aggfunc='sum')

# NOTE(review): the title text looks like a typo for "top1000.prop";
# left unchanged here because it is runtime output.
table.plot(title="sum of table1000.pro by year and sex",
           yticks=np.linspace(0, 1.2, 13),
           xticks=range(1880, 2020, 10))
plt.show()
图：分性别统计的前1000个名字在总出生人数中的比例

-----------------------------------------------------------------------------------------------

对2010年男孩的名字按prop降序排列后，计算多少个名字的人数加起来才够50%

# Boys' names for the year 2010.
df = boys[boys.year == 2010]

# Cumulative sum of prop, with the largest prop values first.
prop_cumsum = df.sort_values(by='prop', ascending=False).prop.cumsum()

# searchsorted finds the position at which 0.5 would be inserted; positions
# are zero-based, hence the +1 to turn it into a count of names.
print(prop_cumsum.searchsorted(0.5) + 1)

---------------------------------------------------------------------------------------------------

按照上例，对所有的year/sex组合执行上述计算：按这两个字段进行groupby处理，然后用一个函数计算各分组的这个值

def get_quantile_count(group, q=0.5):
    """Count how many top names are needed for their prop to reach ``q``.

    Args:
        group: DataFrame for one group; must contain a ``prop`` column.
        q: Cumulative proportion to reach (default 0.5).

    Returns:
        One plus the zero-based position at which ``q`` would be inserted
        into the cumulative sum of ``prop`` sorted in descending order.
    """
    group = group.sort_values(by='prop', ascending=False)
    # Search on the underlying NumPy array so a scalar is returned for a
    # scalar q regardless of the pandas version in use.
    return group.prop.cumsum().values.searchsorted(q) + 1


# Apply the count to every (year, sex) group, then move 'sex' from the
# index into the columns so each sex becomes its own line in the plot.
diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)
diversity = diversity.unstack('sex')

plt.plot(diversity)
plt.title("number of popular names in top 50%")
plt.show()
图：每年的prop在前1000个名字中，累计达到50%的位置按照sex分类的趋势图

-------------------------------------------------------------------------------------

分析名字中最后一个字母上的分布变化

def get_last_letter(x):
    """Return the last character of the string ``x``."""
    return x[-1]


# Extract the last letter of every entry in the 'name' column and label
# the resulting Series so the pivot index gets a readable name.
last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'

# Aggregate all births by last letter (rows) and by sex and year (columns).
# The string 'sum' replaces the builtin sum, which newer pandas releases
# warn about when passed as aggfunc.
table = pd.pivot_table(names, values='births', index=last_letters,
                       columns=['sex', 'year'], aggfunc='sum')

# Keep three representative years from the 'year' column level.
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
图：

-----------------------------------------------------------------------------------------

各年度各性别的名字末字母所占总人数的条形图

# Share of births per last letter: divide each column by its own total.
letter_prop = subtable / subtable.sum().astype(float)

# Two stacked bar charts, one per sex; the legend is shown only on the
# first chart.
fig, axes = plt.subplots(2, 1, figsize=(10, 10))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title="Male")
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title="Female",
                      legend=False)
plt.show()
图：

------------------------------------------------------------------------------------------------------

上例中男孩名字末字母所占比例的变化，本例选取几个特定字母进行分析

# Normalise the full table so every (sex, year) column sums to 1.
letter_prop = table / table.sum().astype(float)

# Select the rows for the letters d, n and y under the 'M' columns and
# transpose so that each letter becomes one time series. .loc is used for
# the label-based selection because the .ix indexer no longer exists in
# current pandas.
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T
dny_ts.plot()
plt.show()
图：各年出生的男孩中名字以d/n/y结尾的人数比例

-------------------------------------------------------------------------------

早年流行于男孩的名字近年来流行于女孩；回到top1000数据集，找出名字中含有‘lesl’（不区分大小写）的一组名字

# Boy names that became girl names.
all_names = top1000.name.unique()

# Select every name that contains 'lesl', compared case-insensitively.
# Note this is a containment test, not a starts-with test.
mask = np.array(['lesl' in x.lower() for x in all_names])
lesl_like = all_names[mask]

# Restrict the data to those names and total the births per name.
# The total is not assigned to a name here.
filtered = top1000[top1000.name.isin(lesl_like)]
filtered.groupby('name').births.sum()
图：

---------------------------------------------------------------------------------------

分析各年以‘lesl’开头的名字男女比例

# Aggregate births of the filtered names by year (rows) and sex (columns).
# The string 'sum' replaces the builtin sum, which newer pandas releases
# warn about when passed as aggfunc.
table = pd.pivot_table(filtered, values='births', index='year',
                       columns='sex', aggfunc='sum')

# Turn counts into proportions: divide each row by its row total so the
# values of one year add up to 1.
table = table.div(table.sum(axis=1), axis=0)

# Plot: solid black line for 'M', dashed black line for 'F'.
table.plot(style={'M': 'k-', 'F': 'k--'})
plt.show()

图：各年度使用‘lesl’型名字的男女比例

阅读全文

0 0