《Python数据挖掘入门与实战》第四章电影推荐案例
来源:互联网 发布:网络东北大秧歌曲大全 编辑:程序博客网 时间:2024/06/15 23:31
# Movie recommendation by affinity analysis (Apriori-style) on MovieLens 100k.
#
# Pipeline:
#   1. Load the ratings (about 1000 users, 1700 movies) and mark a rating > 3
#      as "favorable".
#   2. Use the users with UserID < 200 as the training set and mine frequent
#      itemsets of movies that the same users liked.
#   3. Turn every frequent itemset into candidate rules "premise -> conclusion"
#      and score each rule by its confidence on the training users.
#   4. Re-score the same rules on the remaining (test) users and print the
#      top rules with both confidences.
#
# Expects the MovieLens files 'u.data' and 'u.item' in the working directory.
import sys
from collections import defaultdict
from operator import itemgetter

import pandas as pd

# Minimum number of users that must like an itemset for it to be "frequent".
MIN_SUPPORT = 50
# Users whose ratings form the training set; everyone else is the test set.
TRAINING_USER_IDS = range(200)
# Number of best rules shown in each report.
TOP_RULES = 5
# Upper bound (exclusive) on the itemset length explored by the search.
MAX_ITEMSET_LENGTH = 20

# Column layout of the MovieLens 'u.item' file.
MOVIE_COLUMNS = [
    'MovieID', 'Title', 'Release Date', 'Video Release', 'IMDB', '<UNK>',
    'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]


def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, minsupport):
    """Grow frequent itemsets of length k-1 into frequent itemsets of length k.

    Args:
        favorable_reviews_by_users: mapping user id -> frozenset of the movie
            ids that user rated favorably.
        k_1_itemsets: iterable of frozensets (typically the keys of the
            previous level's result) that are already known to be frequent.
        minsupport: minimum number of users that must like every movie of an
            itemset for the itemset to be kept.

    Returns:
        dict mapping each frequent frozenset of length k to its support, i.e.
        the number of users whose favorable movies contain the whole itemset.
    """
    counts = defaultdict(int)
    for reviews in favorable_reviews_by_users.values():
        # Collect this user's supersets in a set first: the same superset can
        # be reached from several (k-1)-subsets, but one user must contribute
        # at most one count to its support.
        supersets_for_user = set()
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews - itemset:
                    supersets_for_user.add(
                        itemset | frozenset((other_reviewed_movie,)))
        for superset in supersets_for_user:
            counts[superset] += 1
    # Use the threshold that was passed in, not a module-level global.
    return {itemset: frequency
            for itemset, frequency in counts.items()
            if frequency >= minsupport}


def _load_ratings(path='u.data'):
    """Read the tab-separated ratings file and add a boolean 'Favorable' column."""
    all_ratings = pd.read_csv(path, delimiter='\t', header=None,
                              names=['UserID', 'MovieID', 'Rating', 'Datetime'])
    all_ratings['Datetime'] = pd.to_datetime(all_ratings['Datetime'], unit='s')
    # A rating above 3 is taken to mean the user liked the movie.
    all_ratings['Favorable'] = all_ratings['Rating'] > 3
    return all_ratings


def _load_movie_names(path='u.item'):
    """Read the pipe-separated movie metadata file and name its columns."""
    movie_name_data = pd.read_csv(path, delimiter='|', header=None,
                                  encoding='mac_roman')
    movie_name_data.columns = MOVIE_COLUMNS
    return movie_name_data


def get_movie_name(movie_id, movie_name_data=None):
    """Return the title of the movie with the given id.

    Args:
        movie_id: value to look up in the 'MovieID' column.
        movie_name_data: DataFrame with 'MovieID' and 'Title' columns. When
            omitted, the table is read from 'u.item'.

    Raises:
        IndexError: if no row has the requested movie id.
    """
    if movie_name_data is None:
        movie_name_data = _load_movie_names()
    title_object = movie_name_data[movie_name_data['MovieID'] == movie_id]['Title']
    return title_object.values[0]


def _favorable_movies_by_user(ratings):
    """Map each user id to the frozenset of movie ids that user rated favorably."""
    favorable_ratings = ratings[ratings['Favorable']]
    return {user_id: frozenset(movie_ids.values)
            for user_id, movie_ids in favorable_ratings.groupby('UserID')['MovieID']}


def _generate_candidate_rules(frequent_itemsets):
    """Build every (premise, conclusion) rule obtainable from the itemsets.

    Each movie of an itemset becomes the conclusion once, with the remaining
    movies of that itemset as the premise.
    """
    candidate_rules = []
    for itemset_counts in frequent_itemsets.values():
        for itemset in itemset_counts:
            for conclusion in itemset:
                premise = itemset - {conclusion}
                candidate_rules.append((premise, conclusion))
    return candidate_rules


def _compute_rule_confidence(candidate_rules, reviews_by_user):
    """Return {rule: confidence} measured on the given users.

    Confidence is correct / (correct + incorrect), counted over the users
    whose favorable movies contain the rule's premise. A rule whose premise
    matches no user gets 0.0 instead of raising ZeroDivisionError.
    """
    correct_counts = defaultdict(int)
    incorrect_counts = defaultdict(int)
    for reviews in reviews_by_user.values():
        for candidate_rule in candidate_rules:
            premise, conclusion = candidate_rule
            if premise.issubset(reviews):
                if conclusion in reviews:
                    correct_counts[candidate_rule] += 1
                else:
                    incorrect_counts[candidate_rule] += 1

    confidence = {}
    for candidate_rule in candidate_rules:
        correct = correct_counts[candidate_rule]
        applicable = correct + incorrect_counts[candidate_rule]
        confidence[candidate_rule] = correct / float(applicable) if applicable else 0.0
    return confidence


def _print_top_rules(sorted_confidence, rule_confidence,
                     movie_name_data=None, test_confidence=None,
                     limit=TOP_RULES):
    """Print the best rules, optionally with movie titles and test confidence.

    Args:
        sorted_confidence: list of (rule, confidence) pairs, best first.
        rule_confidence: {rule: training confidence}.
        movie_name_data: when given, movie ids are printed as titles.
        test_confidence: when given, train and test confidence are both shown.
        limit: maximum number of rules to print.
    """
    # Slicing (rather than indexing 0..limit-1) tolerates having fewer rules.
    for index, (rule, _score) in enumerate(sorted_confidence[:limit]):
        premise, conclusion = rule
        if movie_name_data is None:
            premise_text, conclusion_text = premise, conclusion
        else:
            premise_text = ', '.join(
                get_movie_name(idx, movie_name_data) for idx in premise)
            conclusion_text = get_movie_name(conclusion, movie_name_data)

        print('Rule #{0}'.format(index + 1))
        print('Rule: if a person recommends {0} they will also recommend {1}'
              .format(premise_text, conclusion_text))
        if test_confidence is None:
            print(' - Confidence: {0:.3f}'.format(rule_confidence[rule]))
        else:
            print(' - Train Confidence: {0:.3f}'.format(rule_confidence[rule]))
            print(' - Test Confidence: {0:.3f}'.format(test_confidence[rule]))
        print('')


def main():
    """Run the full mining / evaluation pipeline and print the reports."""
    all_ratings = _load_ratings('u.data')

    # Training set: the first users only; the rest is held out for testing.
    in_training = all_ratings['UserID'].isin(TRAINING_USER_IDS)
    ratings = all_ratings[in_training]

    # Which movies each training user likes.
    favorable_reviews_by_users = _favorable_movies_by_user(ratings)
    # Number of fans per movie (sum of the boolean 'Favorable' column).
    num_favorable_by_movie = ratings[['MovieID', 'Favorable']].groupby('MovieID').sum()

    # Seed the search with single movies liked by enough users.
    # NOTE(review): this level uses a strict '>' while longer itemsets use
    # '>='; kept as in the original.
    frequent_itemsets = {}
    frequent_itemsets[1] = {
        frozenset((movie_id,)): row['Favorable']
        for movie_id, row in num_favorable_by_movie.iterrows()
        if row['Favorable'] > MIN_SUPPORT
    }

    for k in range(2, MAX_ITEMSET_LENGTH):
        cur_frequent_itemsets = find_frequent_itemsets(
            favorable_reviews_by_users, frequent_itemsets[k - 1], MIN_SUPPORT)
        frequent_itemsets[k] = cur_frequent_itemsets
        if len(cur_frequent_itemsets) == 0:
            print('Did not find any frequent itemsets of length {0}'.format(k))
            sys.stdout.flush()  # push buffered output to the terminal now
            break
        print('I find {0} frequent itemsets of length {1}'.format(
            len(cur_frequent_itemsets), k))
        sys.stdout.flush()

    # Itemsets of length 1 cannot form a rule (the premise would be empty).
    del frequent_itemsets[1]

    candidate_rules = _generate_candidate_rules(frequent_itemsets)
    print(candidate_rules[:5])

    rule_confidence = _compute_rule_confidence(
        candidate_rules, favorable_reviews_by_users)
    sorted_confidence = sorted(rule_confidence.items(),
                               key=itemgetter(1), reverse=True)

    # Report 1: raw movie ids.
    _print_top_rules(sorted_confidence, rule_confidence)

    # Report 2: movie titles.
    movie_name_data = _load_movie_names('u.item')
    _print_top_rules(sorted_confidence, rule_confidence,
                     movie_name_data=movie_name_data)

    # Report 3: same rules evaluated on the held-out users.
    test_dataset = all_ratings[~in_training]
    test_favorable_by_users = _favorable_movies_by_user(test_dataset)
    test_confidence = _compute_rule_confidence(
        candidate_rules, test_favorable_by_users)
    _print_top_rules(sorted_confidence, rule_confidence,
                     movie_name_data=movie_name_data,
                     test_confidence=test_confidence)


if __name__ == '__main__':
    main()
阅读全文
0 0
- 《Python数据挖掘入门与实战》第四章电影推荐案例
- 《python数据挖掘入门与实践》“电影推荐” 笔记3
- python数据挖掘入门与实战——学习笔记(第1、2章)
- python数据挖掘入门与实战——学习笔记(第3、4章)
- python数据挖掘入门与实战——学习笔记(第5、6章)
- pthon数据挖掘与分析实战【笔记】-第四章 数据预处理4.1数据清洗
- python数据分析与挖掘实战 第六章 拓展思考
- python数据分析与挖掘实战 第七章 拓展思考
- python数据分析与挖掘实战 第九章 拓展练习
- 笔记《Python数据分析与实战挖掘》
- python数据分析与挖掘实战-4
- python 数据分析与挖掘实战
- Python数据分析与实战挖掘
- nba2014年数据获取——最近在读《Python数据挖掘入门与实战》
- 数据挖掘竞赛题目 -- 电影推荐
- Python数据分析与挖掘实战—挖掘建模
- 『Python数据分析与挖掘实战』第五章:挖掘建模
- 数据挖掘-推荐算法入门
- 傅里叶级数与复的傅里叶级数、傅里叶变换
- 实验四 网络模拟器RouterSim的使用
- Android中AsyncTask的简单用法
- 《智能路由器开发指南》核心笔记(全)
- 加油
- 《Python数据挖掘入门与实战》第四章电影推荐案例
- 免费馅饼 HDU
- html无序有序列表嵌套
- mysql 乱码之 Incorrect string value: 'XXXXXX' for column 'XXX' at row 1 类问题
- opencv(二):FileStorage类
- python爬数据小试牛刀--beautifulSoup使用
- 关于js对象键顺序的更改
- 通俗理解KMP字符串匹配算法
- 494. Target Sum