来自bit.ly的1.usa.gov数据

来源：互联网 | 发布：中美网络安全问题博弈 | 编辑：程序博客网 | 时间：2024/04/27 23:15

import json
from pandas import DataFrame,Series
import pandas as pd;import numpy as np
import matplotlib
#from pylab import *
# Path to the bit.ly 1.usa.gov data file; it is read as one JSON object per line.
path='usagov_bitly_data2012-03-16-1331923249.txt'
# Parse every non-blank line into a Python dict. The context manager closes
# the file handle as soon as parsing finishes, and blank lines (for example a
# trailing newline at the end of the file) are skipped rather than handed to
# json.loads, which would reject them.
with open(path) as log_file:
    records=[json.loads(line) for line in log_file if line.strip()]
# One row per parsed record, one column per key seen in the records.
frame = DataFrame(records)
#print frame['tz'][:10].value_counts()

# Count how many records fall into each time zone (column 'tz').
clean_tz=frame['tz'].fillna('Missing')# records with no 'tz' value become 'Missing'
# Compare after stripping so that both the empty string and whitespace-only
# values count as blank; comparing against a single space alone would leave
# empty-string time zones unlabeled.
clean_tz[clean_tz.str.strip()=='']='Unknown'
tz_counts=clean_tz.value_counts()# occurrences per time zone, most frequent first
#print tz_counts[:10]
#tz_counts[:10].plot(kind='barh',rot=0)# horizontal bar chart of the ten most frequent time zones

# Take the first whitespace-delimited token of every non-missing value in
# column 'a' (roughly the browser/client identifier) and collect the tokens
# into a new Series with a default integer index.
result=Series(
    [agent.split(None,1)[0] for agent in frame['a'].dropna()]
)
#print result.value_counts()[:10]# counts in descending order

# Split users into Windows and non-Windows based on the string in column 'a'.
cframe= frame[frame.a.notnull()]# keep only rows that have a value in 'a'
# Label every remaining row 'Windows' or 'not Windows'. The labels are given
# cframe's own index on purpose: groupby aligns a Series key with the frame by
# index label, and filtering leaves gaps in cframe's index, so a Series with a
# fresh 0..n-1 index would pair rows with the wrong label (or with NaN).
operating_system=Series(np.where(cframe['a'].str.contains('Windows'),'Windows','not Windows'),index=cframe.index)

# Group the users by time zone and operating system.
by_tz_os = cframe.groupby(['tz',operating_system])
# Count the rows in each (tz, OS) group, pivot the OS level into columns, and
# record 0 for combinations that never occur.
agg_counts= by_tz_os.size().unstack().fillna(0)
#print agg_counts[:10]

# Sum across the OS columns to get one total per time zone (axis=1 adds up the
# values within each row), then compute the positions that would sort those
# totals in ascending order.
indexer=agg_counts.sum(axis=1).argsort()
#print indexer[:]

# Reorder the rows by ascending total and keep the last ten, i.e. the ten
# time zones with the largest totals.
count_subset=agg_counts.take(indexer).iloc[-10:]
#print count_subset

#count_subset.plot(kind='barh',stacked=True)
# Divide every row by its own total so each time zone sums to 1; this makes the
# Windows / not-Windows split comparable between time zones of different size.
normed_subset=count_subset.div(count_subset.sum(axis=1),axis=0)
normed_subset.plot.barh(stacked=True)
#show()

# Summary: a Series handles one-dimensional labeled data; a DataFrame handles two-dimensional tabular data.

0 0