计数:统计出现的次数

来源:互联网 发布:网络借贷办法 编辑:程序博客网 时间:2024/06/06 09:04

统计出现的次数的方法:

# coding=utf-8
# Setup for the counting examples: load the bit.ly usa.gov sample file (one
# JSON document per line) and collect the time zone of every record.
__author__ = 'mac'

import json  # was commented out although json.loads is called below
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab as pl

path = '/Users/mac/PycharmProjects/python2.7/BigData/pydata-book-2nd-edition/datasets/bitly_usagov/example.txt'

# Use a context manager so the file handle is closed as soon as the
# records have been parsed.
with open(path) as source:
    records = [json.loads(line) for line in source]

# Single-argument print(...) behaves the same on Python 2.7 and Python 3.
print(records[0]['tz'])

# Some records have no 'tz' key at all, so filter on key presence.
time_zone = [rec['tz'] for rec in records if 'tz' in rec]
print(time_zone[:10])
# Method 1: plain Python, no library helpers.
def get_counts(sequence):
    """Count how many times each item occurs in *sequence*.

    Args:
        sequence: any iterable of hashable items.

    Returns:
        A plain ``dict`` mapping each distinct item to its number of
        occurrences. An empty iterable yields an empty dict.

    Example:
        >>> get_counts(['a', 'b', 1, 'a', 'c', 'a']) == {'a': 3, 'b': 1, 1: 1, 'c': 1}
        True
    """
    counts = {}
    for item in sequence:
        # dict.get supplies 0 the first time an item is seen.
        counts[item] = counts.get(item, 0) + 1
    return counts
# Method 2: use collections.defaultdict so every new key starts at 0.
from collections import defaultdict


def get_counts2(sequence):
    """Count how many times each item occurs in *sequence*.

    Args:
        sequence: any iterable of hashable items.

    Returns:
        A ``defaultdict(int)`` mapping each distinct item to its number of
        occurrences.

    Example:
        >>> get_counts2(['a', 'b', 1, 'a', 'c', 'a'])['a']
        3
    """
    counts = defaultdict(int)  # a missing key is created with value int() == 0
    for item in sequence:
        counts[item] += 1
    return counts


# Tally every collected time zone.
counts = get_counts(time_zone)
print(counts['America/New_York'])
print(len(time_zone))


def top_counts(count_dict, n=10):
    """Return the *n* largest entries of *count_dict*.

    Args:
        count_dict: mapping of key -> count.
        n: how many entries to return (default 10).

    Returns:
        A list of ``(count, key)`` tuples sorted in ascending order, so the
        most frequent entry is last. If *n* is zero or negative the result
        is an empty list.

    Example:
        >>> top_counts({'a': 3, 'b': 5, 'c': 1, 'd': 3}, n=2)
        [(3, 'd'), (5, 'b')]
    """
    # Guard first: pairs[-0:] is the same as pairs[0:], which would return
    # every entry instead of none.
    if n <= 0:
        return []
    # Sorting is ascending, so the largest counts are at the end of the list.
    value_key_pairs = sorted((count, key) for key, count in count_dict.items())
    return value_key_pairs[-n:]


# Show the ten most frequent time zones.
print(top_counts(counts))
# Method 3: collections.Counter does the tallying, and Counter.most_common
# returns the entries ordered from the highest count down.
from collections import Counter

counts = Counter(time_zone)

# The ten most frequent time zones as (time_zone, count) pairs.
print(counts.most_common(10))
# Method 4: pandas. A DataFrame column is a Series, which carries an index
# alongside its values (otherwise it is much like a numpy array).
from pandas import DataFrame, Series

frame = DataFrame(records)
print(frame['tz'])

# Count time zones as loaded, before any cleaning.
tz_counts = frame['tz'].value_counts()

# fillna replaces missing values (NaN); unknown values (empty strings) are
# replaced through boolean-array indexing.
clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'

# Recount on the cleaned column and show the ten most frequent entries.
tz_counts = clean_tz.value_counts()
print(tz_counts[:10])

print(frame['a'][1])

# Optional bar chart of the top ten; pl.show() is required before the
# figure is actually displayed.
# plot(tz_counts[:10], kind='barh', rot=0)
# pl.show()

# First whitespace-separated token of every non-missing 'a' value.
# NOTE(review): x.split()[0] raises IndexError for an empty or
# whitespace-only string - confirm the data never contains one.
results = Series([x.split()[0] for x in frame.a.dropna()])  # dropna() drops missing entries
print(results[:5])