Python-基于物品的协同过滤算法(附两种算法的对比)

来源:互联网 发布:python归并排序算法 编辑:程序博客网 时间:2024/06/05 01:04

与上一篇UserCF数据来源一致。

先贴代码ItemCF:

#coding=utf-8import mathclass ItemCF:    def __init__(self,basefile,testfile):        self.datafile = basefile        self.testfile = testfile        self.readData()        self.readTestData()    def readData(self):        self.traindata = {}        for line in open(self.datafile):            userid,itemid,record,_ = line.split()            self.traindata.setdefault(userid,{})            self.traindata[userid][itemid] = int(record)     def readTestData(self):        self.testdata = {}        for line in open(self.testfile):            userid,itemid,record,_ = line.split()            self.testdata.setdefault(userid,{})            self.testdata[userid][itemid] = int(record)     def ItemSimilarity(self):        train = self.traindata        C = dict()        N = dict()        for u, items in train.items():            for i in items.keys():                N.setdefault(i,0)                N[i] += 1                for j in items.keys():                    if i == j:                        continue                    C.setdefault(i,{})                    C[i].setdefault(j,0)                    C[i][j] += 1        self.itermSimBest = dict()#物品与物品之间的相似度        for i,related_items in C.items():            self.itermSimBest.setdefault(i,{})            for j, cij in related_items.items():                self.itermSimBest[i].setdefault(j,0);                self.itermSimBest[i][j] = cij / math.sqrt(N[i] * N[j])    def Recommendation(self,user_id,K = 8,nitem = 40):        train = self.traindata        rank = dict()         ru = train.get(user_id)#用户历史记录        for i,pi in ru.items():            for j, wj in sorted(self.itermSimBest[i].items(),key=lambda x : x[1],reverse=True)[0:K]:                if j in ru:                    continue                rank.setdefault(j,0)                rank[j] += pi * wj        return  dict(sorted(rank.items(),key = lambda x :x[1],reverse = True)[0:nitem])    def recallAndPrecision(self,test = None,k = 8,nitem = 10):        train  = self.traindata        test = self.testdata        hit = 0        recall = 0        precision = 0        for user in train.keys():            tu = test.get(user,{})            rank = self.Recommendation(user,k,nitem)             for item,_ in rank.items():                if item in tu:                    hit += 1            recall += len(tu)            precision += nitem        return (hit / (recall * 1.0),hit / (precision * 1.0))def testUserCF():    cf = ItemCF("train.txt","test.txt")    cf.ItemSimilarity()    print("%5s%5s%20s%20s" % ('K','N',"recall",'precision'))    for k in [5,10,20,40,80,160]:        for nitem in [5,10,15,20]:            recall,precision = cf.recallAndPrecision( k=k,nitem=nitem )            print("%5d%5d%19.3f%%%19.3f%%" % (k,nitem,recall * 100,precision * 100))if __name__=='__main__':    testUserCF()

UserCF和ItemCF的综合比较:

  1. UserCF(适用新闻推荐等)
    给用户推荐那些和他相同兴趣爱好的用户喜欢的物品,反映用户所在的小型兴趣群体中物品的热门程度。

  2. ItemCF(适用图书、电商、电影网站等)
    给用户推荐那些和他之前喜欢的物品类似的物品,更加个性化,反映了用户自己的兴趣传承。

UserCF和ItemCF优缺点对比

原创粉丝点击