分析python处理基本数据<四>

来源:互联网 发布:阿里旅行软件 编辑:程序博客网 时间:2024/06/10 16:58
# coding=utf-8
"""Answer a set of exercise questions about the records stored in ``twitter.txt``.

Every line of the input file is split into the columns named in ``data_keys``.
Each answer is left in a module-level variable and followed by an ``assert``
that checks the type of the result.  The code avoids constructs that exist only
in Python 2 (``has_key``, ``xrange``, list-returning ``filter``/``items``) so
the same source runs on Python 2.7 and Python 3.
"""
import linecache
import time
from collections import Counter

now = time.time()  # start timestamp, used to measure the total running time

# Column names, in the order in which the fields appear in each record.
data_keys = (
    'bid', 'uid', 'username', 'v_class', 'content', 'img', 'created_at',
    'source', 'rt_num', 'cm_num', 'rt_uid', 'rt_username', 'rt_v_class',
    'rt_content', 'rt_img', 'src_rt_num', 'src_cm_num', 'gender', 'rt_bid',
    'location', 'rt_mid', 'mid', 'lat', 'lon', 'lbs_type', 'lbs_title',
    'poiid', 'links', 'hashtags', 'ats', 'rt_links', 'rt_hashtags', 'rt_ats',
    'v_url', 'rt_v_url',
)
# Column name -> position of that column inside a split record.
keys = {name: index for index, name in enumerate(data_keys)}

F = linecache.getlines('twitter.txt')
# x[1:-2] drops the first character and the last two characters of each raw
# line before splitting on '","' -- presumably the opening quote and the
# closing quote plus newline; confirm against the data file.
lines = [x[1:-2].split('","') for x in F]

users = {line[keys['username']] for line in lines}
users_total = len(users)  # total number of distinct users
assert isinstance(users_total, int)

users = list(users)  # list of the distinct user names
assert isinstance(users, list)

tweets_from_2012_11 = [
    line for line in lines if line[keys['created_at']].startswith('2012-11')
]
tweets_2012_11_count = len(tweets_from_2012_11)  # number of tweets in 2012-11
assert isinstance(tweets_2012_11_count, int)

dates = sorted({line[keys['created_at']].split(' ')[0] for line in lines})
# the distinct days that have data, in ascending order
assert isinstance(dates, list)

# Characters 11-12 of created_at are parsed as the hour of the record.
time_list = [int(line[keys['created_at']][11:13]) for line in lines]
hour_totals = Counter(time_list)  # one pass instead of 24 list.count() scans
time_count = [(h, hour_totals[h]) for h in range(24)]
# The sort is stable, so hours with equal counts keep ascending hour order.
time_count.sort(key=lambda pair: pair[1], reverse=True)
time_most_tweet = time_count[0][0]  # hour of day with the most records
assert isinstance(time_most_tweet, int)

date_user_num = {day: {} for day in dates}
date_most_user = {day: '' for day in dates}
for line in lines:
    date_line = line[keys['created_at']].split(' ')[0]
    user_line = line[keys['username']]
    per_user = date_user_num[date_line]
    per_user[user_line] = per_user.get(user_line, 0) + 1
for day in dates:
    # max() returns the first maximal item, matching the first element of a
    # stable descending sort of the same items.
    top_user, top_count = max(date_user_num[day].items(),
                              key=lambda item: item[1])
    # per date: the most active user together with that user's tweet count
    date_user_num[day] = {top_user: top_count}
    # per date: the most active user
    date_most_user[day] = top_user
assert isinstance(date_user_num, dict)

tweets_from_2012_11_03 = [
    line for line in lines
    if line[keys['created_at']].startswith('2012-11-03')
]
date_time_list = [
    int(line[keys['created_at']][11:13]) for line in tweets_from_2012_11_03
]
day_hour_totals = Counter(date_time_list)
# tweet frequency for every hour of 2012-11-03, in chronological order
date_time_count = [(str(i), day_hour_totals[i]) for i in range(24)]
assert isinstance(date_time_count, list)

tweets_source_dict = {}
for line in lines:
    source = line[keys['source']]
    tweets_source_dict[source] = tweets_source_dict.get(source, 0) + 1
# every source found in the file with its number of occurrences, most frequent
# first
tweets_source = sorted(tweets_source_dict.items(),
                       key=lambda item: item[1], reverse=True)
assert isinstance(tweets_source, list)

tweet_transmit_url = [
    line for line in lines
    if line[keys['rt_v_url']].startswith('https://twitter.com/umiushi_no_uta')
]
# number of retweet URLs that start with "https://twitter.com/umiushi_no_uta"
tweet_transmit_url_count = len(tweet_transmit_url)
assert isinstance(tweet_transmit_url_count, int)

# number of posts published by the user whose uid is 573638104
tweet_user_count = sum(1 for line in lines if line[keys['uid']] == '573638104')
assert isinstance(tweet_user_count, int)


# A function that accepts any number of user uids and returns the uid of the
# user who posted the most.
def most_tweet_count(*temp):
    """Return the uid, among those passed in, that occurs most often in ``lines``.

    Args:
        *temp: any number of candidate uids.  The uid column holds strings, so
            an argument of another type (for example an int) never matches.

    Returns:
        The candidate with the highest number of records, or the string
        ``'null'`` when no candidates are given or none of them occurs in the
        data.  Among candidates with equal counts the first one in the
        iteration order of the internal counting dict is returned.
    """
    if not temp:
        return 'null'
    uid_count = dict.fromkeys(temp, 0)
    uid_column = keys['uid']
    for line in lines:
        uid = line[uid_column]
        # Dict membership replaces the original linear scan of the tuple.
        if uid in uid_count:
            uid_count[uid] += 1
    best_uid, best_count = max(uid_count.items(), key=lambda item: item[1])
    return best_uid if best_count > 0 else 'null'


assert most_tweet_count() == 'null'
assert most_tweet_count('ab', 'cds') == 'null'
assert most_tweet_count('ab', 'cds', '123b') == 'null'
assert most_tweet_count('12342', 'cd') == 'null'
assert most_tweet_count('28803555', 28803555) == '28803555'
assert most_tweet_count('28803555', 28803555, '96165754') == '28803555'

# 12. Who wrote the longest post content in the file?
#     (Required output: the user's uid, as a string.)
max_len = 0
max_uid = ''
for line in lines:
    content_len = len(line[keys['content']])
    # Strictly greater: the first record of maximal length wins.
    if content_len > max_len:
        max_uid = line[keys['uid']]
        max_len = content_len
assert isinstance(max_uid, str)

# 13. Who retweeted the most URLs in the file?
#     (Required output: the user's uid, as a string.)
# NOTE(review): this takes the uid of the single record with the largest
# rt_num; counts are not summed per user -- confirm that is the intent.
trans_uid_count = [
    (line[keys['uid']], int(line[keys['rt_num']]))
    for line in lines if line[keys['rt_num']] != ''
]
trans_uid_count.sort(key=lambda pair: pair[1], reverse=True)
most_trans_url = trans_uid_count[0][0]
assert isinstance(most_trans_url, str)

# 14. Who posted the most during the 11 o'clock hour?
#     (Required output: the user's uid, as a string.)
time_eleven = {}
lines_eleven = [
    line for line in lines
    if line[keys['created_at']].startswith('11', 11, 13)
]
for line in lines_eleven:
    uid = line[keys['uid']]
    time_eleven[uid] = time_eleven.get(uid, 0) + 1
time_eleven_list = sorted(time_eleven.items(),
                          key=lambda item: item[1], reverse=True)
time_eleven_most = time_eleven_list[0][0]
assert isinstance(time_eleven_most, str)

# 15. Which user has the largest number of source-post URLs?
#     (Required output: the user's uid, as a string.)
# Every uid starts at 0 so users without any non-empty v_url are still listed.
url_user = {line[keys['uid']]: 0 for line in lines}
for line in lines:
    if line[keys['v_url']] != '':
        url_user[line[keys['uid']]] += 1
url_user_list = sorted(url_user.items(),
                       key=lambda item: item[1], reverse=True)
url_trans_most = url_user_list[0][0]
assert isinstance(url_trans_most, str)
总时间:
<pre name="code" class="python">
d = time.time() - now
print d
print '运算时间%s' % d
print '运算时间%d' % d
print type(d)
# 这里面是%s, 代表字符串,%d代表整型会变成0
</pre>

<pre name="code" class="python">
0.523000001907
运算时间0.523000001907
运算时间0
<type 'float'>
</pre>




0 0
原创粉丝点击