python_movie_apriori

来源：互联网发布：爱易编程工具箱编辑：程序博客网时间：2024/06/14 02:51
#! /usr/bin/env python
# coding=utf-8
"""Mine "users who like X also like Y" rules from the ml-100k ratings.

Pipeline (run by ``main()`` when the file is executed as a script):

1. Load ``u.data`` and flag every rating above 3 as "favorable".
2. Use the ratings of users whose id is in ``range(200)`` as the training
   set and grow frequent itemsets of favorably-rated movies level by level.
3. Turn every frequent itemset into candidate rules (premise -> conclusion)
   and score each rule's confidence on the training users.
4. Print the five most confident rules, then re-score the same rules on the
   remaining users and print train/test confidence side by side.
"""
import sys
from collections import defaultdict
from operator import itemgetter

import pandas as pd

# Directory that holds the dataset files ``u.data`` and ``u.item``.
root = "F:/Data/exe/ml-100k/"

# Movie metadata table; loaded by main() and read by get_movie_name().
movie_name_data = None


def find_frequent_itemsets(favorable_reviews_by_users, k_1_itermsets, min_support):
    """Grow frequent itemsets by one movie and keep the frequent results.

    Every itemset in ``k_1_itermsets`` that a user fully likes is extended
    with each other movie that user likes.  The support of a resulting
    superset is the number of users who produced it.

    Each user contributes at most 1 to a given superset.  Incrementing once
    per matching subset would count a user with k liked movies up to k times
    for the same k-itemset and inflate the support above the user count.

    Args:
        favorable_reviews_by_users: mapping of user id -> frozenset of the
            movie ids that user rated favorably.
        k_1_itermsets: iterable of frozensets (the itemsets found at the
            previous level); a dict keyed by those frozensets also works.
        min_support: minimum number of users for an itemset to be kept.

    Returns:
        dict mapping each frequent superset (frozenset) to its user count.
    """
    counts = defaultdict(int)
    for reviews in favorable_reviews_by_users.values():
        # A set de-duplicates supersets reached via different subsets.
        user_supersets = set()
        for itemset in k_1_itermsets:
            if not itemset.issubset(reviews):
                continue
            for other_reviewed_movie in reviews - itemset:
                user_supersets.add(itemset | frozenset((other_reviewed_movie,)))
        for current_superset in user_supersets:
            counts[current_superset] += 1
    return {
        itemset: frequency
        for itemset, frequency in counts.items()
        if frequency >= min_support
    }


def get_movie_name(movie_id):
    """Return the title stored for ``movie_id`` in ``movie_name_data``.

    Relies on the module-level ``movie_name_data`` table having been loaded
    (``main()`` does this).  Raises IndexError if no row matches.
    """
    title_object = movie_name_data[movie_name_data["Movie_Id"] == movie_id]["Title"]
    # The selection is a Series; only its first value (the title) is needed.
    return title_object.values[0]


def _favorable_movies_by_user(ratings):
    """Map each user id to the frozenset of movie ids they rated favorably."""
    favorable_ratings = ratings[ratings["Favorable"]]
    return dict(
        (user_id, frozenset(movie_ids.values))
        for user_id, movie_ids in favorable_ratings.groupby("UserId")["MovieId"]
    )


def _rule_confidences(reviews_by_user, candidate_rules):
    """Score each (premise, conclusion) rule against the given users.

    A rule applies to a user who likes every movie in the premise; it is
    correct for that user when they also like the conclusion.  Confidence is
    correct / (correct + incorrect).  Rules that apply to no user are left
    out of the result instead of triggering a division by zero.
    """
    correct_counts = defaultdict(int)
    incorrect_counts = defaultdict(int)
    for reviews in reviews_by_user.values():
        for candidate_rule in candidate_rules:
            premise, conclusion = candidate_rule
            if not premise.issubset(reviews):
                continue
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

    confidences = {}
    for candidate_rule in candidate_rules:
        applied = correct_counts[candidate_rule] + incorrect_counts[candidate_rule]
        if applied:
            # float() keeps true division on Python 2 as well.
            confidences[candidate_rule] = correct_counts[candidate_rule] / float(applied)
    return confidences


def main():
    """Run the whole analysis and print the results."""
    global movie_name_data

    # The raw file is tab separated and has no header row.
    all_ratings = pd.read_csv(root + "u.data", delimiter="\t", header=None)
    all_ratings.columns = ["UserId", "MovieId", "Rating", "DateTime"]
    # The timestamp column holds seconds since the epoch.
    all_ratings["DateTime"] = pd.to_datetime(all_ratings["DateTime"], unit="s")
    # A user "likes" a movie when the rating is above 3.
    all_ratings["Favorable"] = all_ratings["Rating"] > 3

    # Training set: all rows of the users whose id is in range(200)
    # (a selection by user, not the first 200 rows).
    in_training = all_ratings["UserId"].isin(range(200))
    ratings = all_ratings[in_training]
    favorable_reviews_by_users = _favorable_movies_by_user(ratings)
    # Summing the boolean flag gives the number of favorable ratings per movie.
    num_favorable_by_movie = ratings[["MovieId", "Favorable"]].groupby("MovieId").sum()

    # ---------------------------- Apriori ----------------------------
    frequent_itemsets = {}
    min_support = 50
    # Itemsets are frozensets so they can act as sets and as dict keys.
    # NOTE: this seed level uses a strict ">" while later levels use ">=".
    frequent_itemsets[1] = dict(
        (frozenset((movie_id,)), row["Favorable"])
        for movie_id, row in num_favorable_by_movie.iterrows()
        if row["Favorable"] > min_support
    )
    print(frequent_itemsets[1])

    for k in range(2, 20):
        cur_frequent_itemsets = find_frequent_itemsets(
            favorable_reviews_by_users, frequent_itemsets[k - 1], min_support)
        frequent_itemsets[k] = cur_frequent_itemsets
        if len(cur_frequent_itemsets) == 0:
            # Nothing new at this size, so no larger itemset can be found.
            print("Did not find any frequent itemsets of lenth{}".format(k))
            sys.stdout.flush()
            break
        else:
            print("I find {} frequent itemsets of lenth {}".format(len(cur_frequent_itemsets), k))
            sys.stdout.flush()
    # Single-movie itemsets cannot form a rule with a non-empty premise.
    del frequent_itemsets[1]

    # Rule: a user who likes every movie of the premise also likes the
    # conclusion.  Each member of an itemset takes a turn as the conclusion.
    candidate_rules = []
    for itemset_length, itemset_counts in frequent_itemsets.items():
        for itemset in itemset_counts.keys():
            for conclusion in itemset:
                premise = itemset - set((conclusion,))
                candidate_rules.append((premise, conclusion))
    print(candidate_rules[:5])

    rule_confidence = _rule_confidences(favorable_reviews_by_users, candidate_rules)

    movie_name_data = pd.read_csv(root + "u.item", delimiter="|", header=None,
                                  encoding="mac-roman")
    movie_name_data.columns = ["Movie_Id", "Title", "Release Date",
                               "Video Release", "IMDB", "<UNK>",
                               "Action", "Adventure", "Animation",
                               "Children", "Comedy", "Crim",
                               "Documentary", "Drama", "Fantasy",
                               "Film-Noir", "Horror", "Musical",
                               "Mystery", "Romance", "Sci-Fi",
                               "Thriller", "War", "Western"]

    # Five most confident rules on the training users; slicing copes with
    # fewer than five rules being available.
    sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)
    top_rules = [rule for rule, _ in sorted_confidence[:5]]
    for index, (premise, conclusion) in enumerate(top_rules):
        print("Rule #{}".format(index + 1))
        premise_names = ",".join(get_movie_name(idx) for idx in premise)
        conclusion_name = get_movie_name(conclusion)
        print("Rule:If a person recommends {0} they will also recommend {1}".format(premise_names, conclusion_name))
        print("-Confidence: {}".format(rule_confidence[(premise, conclusion)]))
        print("")

    # Test set: every user that was not part of the training selection.
    test_dataset = all_ratings[~in_training]
    test_favorable_by_users = _favorable_movies_by_user(test_dataset)
    test_confidence = _rule_confidences(test_favorable_by_users, candidate_rules)

    for index, (premise, conclusion) in enumerate(top_rules):
        print("Rule {}".format(index + 1))
        premise_names = ",".join(get_movie_name(idx) for idx in premise)
        conclusion_name = get_movie_name(conclusion)
        print("Rule:If a person recommends {0} they will also recommend {1}"
              .format(premise_names, conclusion_name))
        # Format inside the call: on Python 3 print() returns None, so
        # chaining .format() onto print(...) fails.
        print("-Train Confidence:{}".format(rule_confidence[(premise, conclusion)]))
        # nan marks a rule whose premise no test user satisfies.
        print("-Test Confidence:{}".format(
            test_confidence.get((premise, conclusion), float("nan"))))


if __name__ == "__main__":
    main()
F:\Amy\anaconnda\python.exe F:/Amy/3_python文件/exe/2_movies_apriori.py
{frozenset([286]): 59.0, frozenset([7]): 67.0, frozenset([64]): 58.0, frozenset([79]): 58.0, frozenset([258]): 83.0, frozenset([50]): 100.0, frozenset([313]): 60.0, frozenset([174]): 74.0, frozenset([100]): 89.0, frozenset([181]): 79.0, frozenset([1]): 66.0, frozenset([127]): 70.0, frozenset([172]): 59.0, frozenset([98]): 70.0, frozenset([56]): 67.0, frozenset([9]): 53.0}
I find 93 frequent itemsets of lenth 2
I find 295 frequent itemsets of lenth 3
I find 593 frequent itemsets of lenth 4
I find 785 frequent itemsets of lenth 5
I find 677 frequent itemsets of lenth 6
I find 373 frequent itemsets of lenth 7
I find 126 frequent itemsets of lenth 8
I find 24 frequent itemsets of lenth 9
I find 2 frequent itemsets of lenth 10
Did not find any frequent itemsets of lenth11
[(frozenset([50]), 64), (frozenset([64]), 50), (frozenset([127]), 181), (frozenset([181]), 127), (frozenset([127]), 1)]
Rule #1
Rule:If a person recommends Pulp Fiction (1994),Contact (1997),Empire Strikes Back, The (1980),Return of the Jedi (1983),Twelve Monkeys (1995) they will also recommend Star Wars (1977)
-Confidence: 1.0
Rule #2
Rule:If a person recommends Silence of the Lambs, The (1991),Godfather, The (1972),Empire Strikes Back, The (1980),Raiders of the Lost Ark (1981),Twelve Monkeys (1995) they will also recommend Shawshank Redemption, The (1994)
-Confidence: 1.0
Rule #3
Rule:If a person recommends Pulp Fiction (1994),Toy Story (1995),Shawshank Redemption, The (1994),Godfather, The (1972) they will also recommend Silence of the Lambs, The (1991)
-Confidence: 1.0
Rule #4
Rule:If a person recommends Shawshank Redemption, The (1994),Fargo (1996),Return of the Jedi (1983),Raiders of the Lost Ark (1981),Fugitive, The (1993) they will also recommend Pulp Fiction (1994)
-Confidence: 1.0
Rule #5
Rule:If a person recommends Pulp Fiction (1994),Fargo (1996),Return of the Jedi (1983),Raiders of the Lost Ark (1981),Godfather, The (1972) they will also recommend Star Wars (1977)
-Confidence: 1.0
Rule 1
Rule:If a person recommends Pulp Fiction (1994),Contact (1997),Empire Strikes Back, The (1980),Return of the Jedi (1983),Twelve Monkeys (1995) they will also recommend Star Wars (1977)
-Train Confidence:1.0
-Test Confidence:0.965517241379
Rule 2
Rule:If a person recommends Silence of the Lambs, The (1991),Godfather, The (1972),Empire Strikes Back, The (1980),Raiders of the Lost Ark (1981),Twelve Monkeys (1995) they will also recommend Shawshank Redemption, The (1994)
-Train Confidence:1.0
-Test Confidence:0.853658536585
Rule 3
Rule:If a person recommends Pulp Fiction (1994),Toy Story (1995),Shawshank Redemption, The (1994),Godfather, The (1972) they will also recommend Silence of the Lambs, The (1991)
-Train Confidence:1.0
-Test Confidence:0.869565217391
Rule 4
Rule:If a person recommends Shawshank Redemption, The (1994),Fargo (1996),Return of the Jedi (1983),Raiders of the Lost Ark (1981),Fugitive, The (1993) they will also recommend Pulp Fiction (1994)
-Train Confidence:1.0
-Test Confidence:0.755555555556
Rule 5
Rule:If a person recommends Pulp Fiction (1994),Fargo (1996),Return of the Jedi (1983),Raiders of the Lost Ark (1981),Godfather, The (1972) they will also recommend Star Wars (1977)
-Train Confidence:1.0
-Test Confidence:0.975
进程已结束,退出代码0
0 0