《Python数据挖掘入门与实战》第四章电影推荐案例

来源:互联网 发布:网络东北大秧歌曲大全 编辑:程序博客网 时间:2024/06/15 23:31
# Movie recommendation through affinity analysis (Apriori-style frequent
# itemsets turned into association rules, then scored on held-out users).
# Expects the rating file 'u.data' and the movie file 'u.item' in the working
# directory (presumably the MovieLens 100k distribution -- verify).
import sys
from collections import defaultdict
from operator import itemgetter

import pandas as pd


def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, minsupport):
    """Grow frequent itemsets of length k-1 into frequent itemsets of length k.

    Args:
        favorable_reviews_by_users: mapping of user id -> frozenset of the
            movie ids that user rated favorably.
        k_1_itemsets: iterable of frozensets (e.g. a dict keyed by them), the
            frequent itemsets found in the previous round.
        minsupport: minimum number of users that must like every movie of an
            itemset for it to be kept.

    Returns:
        dict mapping each frequent superset (frozenset) to the number of users
        whose favorable reviews contain it.
    """
    counts = defaultdict(int)
    for reviews in favorable_reviews_by_users.values():
        # Gather this user's supersets in a set first: the same superset is
        # reachable from each of its (k-1)-subsets, and counting it once per
        # subset would inflate its support.
        supersets = set()
        for itemset in k_1_itemsets:
            if not itemset.issubset(reviews):
                continue
            for other_reviewed_movie in reviews - itemset:
                supersets.add(itemset | frozenset((other_reviewed_movie,)))
        for current_superset in supersets:
            counts[current_superset] += 1
    # Filter on the parameter, not on a module-level global.
    return {itemset: frequency
            for itemset, frequency in counts.items()
            if frequency >= minsupport}


def compute_rule_confidence(candidate_rules, reviews_by_users):
    """Score every (premise, conclusion) rule against a set of users.

    Confidence is the fraction of users satisfying the premise who also
    reviewed the conclusion favorably.

    Args:
        candidate_rules: iterable of (premise frozenset, conclusion movie id).
        reviews_by_users: mapping of user id -> frozenset of liked movie ids.

    Returns:
        dict mapping each rule to its confidence. A rule whose premise is
        satisfied by no user gets 0.0 instead of raising ZeroDivisionError.
    """
    correct_counts = defaultdict(int)
    incorrect_counts = defaultdict(int)
    for reviews in reviews_by_users.values():
        for candidate_rule in candidate_rules:
            premise, conclusion = candidate_rule
            if premise.issubset(reviews):
                if conclusion in reviews:
                    correct_counts[candidate_rule] += 1
                else:
                    incorrect_counts[candidate_rule] += 1

    confidence = {}
    for candidate_rule in candidate_rules:
        correct = correct_counts[candidate_rule]
        applicable = correct + incorrect_counts[candidate_rule]
        confidence[candidate_rule] = correct / float(applicable) if applicable else 0.0
    return confidence


def get_movie_name(movie_id, movie_names=None):
    """Return the title stored for ``movie_id``.

    Args:
        movie_id: value to look up in the 'MovieID' column.
        movie_names: DataFrame with 'MovieID' and 'Title' columns. Defaults to
            the module-level ``movie_name_data`` loaded by the script below.

    Raises:
        IndexError: if no row matches ``movie_id``.
    """
    if movie_names is None:
        movie_names = movie_name_data
    title_object = movie_names[movie_names['MovieID'] == movie_id]['Title']
    return title_object.values[0]


def print_top_rules(sorted_confidence, labelled_confidences, name_of=None, top_n=5):
    """Print the first ``top_n`` rules together with their confidence values.

    Args:
        sorted_confidence: list of (rule, confidence) pairs, best rule first.
        labelled_confidences: list of (label, confidence dict) pairs; one
            ' - <label>: <value>' line is printed per pair.
        name_of: optional callable turning a movie id into a title. When
            omitted, raw ids are printed.
        top_n: maximum number of rules to print; fewer are printed if fewer
            rules exist.
    """
    for index, (rule, _) in enumerate(sorted_confidence[:top_n]):
        premise, conclusion = rule
        if name_of is None:
            premise_text, conclusion_text = premise, conclusion
        else:
            premise_text = ', '.join(name_of(idx) for idx in premise)
            conclusion_text = name_of(conclusion)
        print('Rule #{0}'.format(index + 1))
        print('Rule: if a person recommends {0} they will also recommend {1}'.format(
            premise_text, conclusion_text))
        for label, confidences in labelled_confidences:
            print(' - {0}: {1:.3f}'.format(label, confidences[rule]))
        print('')


if __name__ == '__main__':
    # Ratings from roughly 1000 users on roughly 1700 movies.
    all_ratings = pd.read_csv('u.data', delimiter='\t', header=None,
                              names=['UserID', 'MovieID', 'Rating', 'Datetime'])
    all_ratings['Datetime'] = pd.to_datetime(all_ratings['Datetime'], unit='s')
    # A rating above 3 counts as the user liking the movie.
    all_ratings['Favorable'] = all_ratings['Rating'] > 3

    # Training set: users whose id is in range(200).
    ratings = all_ratings[all_ratings['UserID'].isin(range(200))]
    # Only the favorable ratings of the training users.
    favorable_ratings = ratings[ratings['Favorable']]
    # Which movies each training user likes.
    favorable_reviews_by_users = dict(
        (k, frozenset(v.values))
        for k, v in favorable_ratings.groupby('UserID')['MovieID'])
    # Number of fans per movie.
    num_favorable_by_movie = ratings[['MovieID', 'Favorable']].groupby('MovieID').sum()
    # The five most liked movies (displayed in a notebook; no effect as a script).
    num_favorable_by_movie.sort_values(by='Favorable', ascending=False)[:5]

    # Frequent itemsets, keyed by itemset length.
    frequent_itemsets = {}
    # Minimum support.
    min_support = 50
    # NOTE(review): single movies are kept with a strict '>' while longer
    # itemsets are kept with '>=' in find_frequent_itemsets.
    frequent_itemsets[1] = dict(
        (frozenset((movie_id,)), row['Favorable'])
        for movie_id, row in num_favorable_by_movie.iterrows()
        if row['Favorable'] > min_support)

    for k in range(2, 20):
        cur_frequent_itemsets = find_frequent_itemsets(
            favorable_reviews_by_users, frequent_itemsets[k - 1], min_support)
        frequent_itemsets[k] = cur_frequent_itemsets
        if not cur_frequent_itemsets:
            print('Did not find any frequent itemsets of length {0}'.format(k))
            sys.stdout.flush()  # push buffered output to the terminal
            break
        print('I find {0} frequent itemsets of length {1}'.format(
            len(cur_frequent_itemsets), k))
        sys.stdout.flush()
    # Itemsets of length 1 cannot be split into a premise and a conclusion.
    del frequent_itemsets[1]

    # Every movie of an itemset becomes the conclusion once; the remaining
    # movies form the premise.
    candidate_rules = []
    for itemset_length, itemset_counts in frequent_itemsets.items():
        for itemset in itemset_counts:
            for conclusion in itemset:
                premise = itemset - set((conclusion,))
                candidate_rules.append((premise, conclusion))
    print(candidate_rules[:5])

    rule_confidence = compute_rule_confidence(candidate_rules, favorable_reviews_by_users)
    sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)
    print_top_rules(sorted_confidence, [('Confidence', rule_confidence)])

    movie_name_data = pd.read_csv('u.item', delimiter='|', header=None, encoding='mac_roman')
    movie_name_data.columns = ['MovieID', 'Title', 'Release Date', 'Video Release', 'IMDB',
                               '<UNK>', 'Action', 'Adventure', 'Animation', "Children's",
                               'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                               'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
                               'Sci-Fi', 'Thriller', 'War', 'Western']
    print_top_rules(sorted_confidence, [('Confidence', rule_confidence)],
                    name_of=get_movie_name)

    # Evaluate the rules on the users that were not used for training.
    test_dataset = all_ratings[~all_ratings['UserID'].isin(range(200))]
    test_favorable = test_dataset[test_dataset['Favorable']]
    test_favorable_by_users = dict(
        (k, frozenset(v.values))
        for k, v in test_favorable.groupby('UserID')['MovieID'])
    test_confidence = compute_rule_confidence(candidate_rules, test_favorable_by_users)
    print_top_rules(sorted_confidence,
                    [('Train Confidence', rule_confidence),
                     ('Test Confidence', test_confidence)],
                    name_of=get_movie_name)

阅读全文
0 0
原创粉丝点击