SimpleTag_TFIDF++

来源:互联网 发布:刘美麟因为爱情网络 编辑:程序博客网 时间:2024/06/07 03:35
'''Simple tag-based recommender with TF-IDF style weighting.

Created on 2014-03-05
@Author:Dior

The score of an item for a user is the sum, over the tags the user has
used, of (user-tag count / log(1 + users of the tag)) *
(tag-item count / log(1 + users of the item)).
'''
import random
import math
import operator


class SimpleTagBased(object):
    """Tag-based recommender evaluated on a random train/test split.

    Constructing an instance runs the whole pipeline: load the data file,
    split it, build the co-occurrence tables and print precision/recall
    for several recommendation list lengths.
    """

    def __init__(self, filename):
        """Run the full experiment on the tab-separated file *filename*."""
        self.filename = filename
        self.loadData()
        self.randomlySplitData(0.2)
        self.initStat()
        self.testRecommend()

    def loadData(self):
        """Read the data file into ``self.records``.

        The first line is treated as a header and skipped. Every other
        non-blank line must hold four tab-separated fields
        (user id, item id, tag id, timestamp); the three ids are converted
        to zero-based ints and stored as ``records[uid][iid] -> [tag, ...]``.
        """
        print("##################load data begin#######################")
        self.records = {}
        record_count = 0
        # "with" guarantees the handle is closed even if a line fails to parse.
        with open(self.filename) as fi:
            for line_num, line in enumerate(fi, 1):
                if line_num == 1:
                    continue  # header line
                if not line.strip():
                    continue  # tolerate blank lines (e.g. at end of file)
                uid, iid, tag, _timestamp = line.split('\t')
                uid = int(uid) - 1
                iid = int(iid) - 1
                tag = int(tag) - 1
                self.records.setdefault(uid, {}).setdefault(iid, []).append(tag)
                record_count += 1
        # Report the number of data rows, not the raw line count (which
        # would include the header).
        print("Load data success.The total records is %d." % (record_count))
        print("The total records number is %d." % (len(self.records)))
        print("##################load data end#######################\n")

    def randomlySplitData(self, ratio, seed=100):
        """Split ``self.records`` into ``self.train`` and ``self.test``.

        Each (user, item) pair goes to the test set with probability
        *ratio*, otherwise to the training set. The module-level random
        generator is seeded with *seed* so the split is reproducible.
        """
        print("################beginning to split data#####################")
        random.seed(seed)
        self.train = dict()
        self.test = dict()
        for u in self.records.keys():
            for i in self.records[u].keys():
                target = self.test if random.random() < ratio else self.train
                # Store a copy so the split never aliases the raw records.
                target.setdefault(u, {})[i] = list(self.records[u][i])
        print("Split data complete.")
        print("The length of train set is %d,the length of test set is %d." % (len(self.train), len(self.test)))
        print("##################split data end#######################\n")

    def initStat(self):
        """Build the count tables used by :meth:`recommend` from ``self.train``.

        Creates ``user_tags``, ``tag_items``, ``user_items``, ``tag_users``
        and ``item_users``; each maps a key to a dict of co-occurrence counts.
        """
        print("##################initstat begin#######################")
        records = self.train
        self.user_tags = dict()
        self.tag_items = dict()
        self.user_items = dict()
        self.tag_users = dict()
        self.item_users = dict()
        for u, items in records.items():
            for i, tags in items.items():
                for tag in tags:
                    self._addValueToMat(self.user_tags, u, tag, 1)
                    self._addValueToMat(self.tag_items, tag, i, 1)
                    self._addValueToMat(self.user_items, u, i, 1)
                    self._addValueToMat(self.tag_users, tag, u, 1)
                    self._addValueToMat(self.item_users, i, u, 1)
        print("Initialize state complete.")
        print("The length of the user_tags is %d,the length of the tag_items is %d,the length of the user_items is %d" % (len(self.user_tags), len(self.tag_items), len(self.user_items)))
        print("##################initstat end#######################\n")

    def _addValueToMat(self, mat, index, item, value=1):
        """Add *value* to ``mat[index][item]``, creating entries as needed."""
        row = mat.setdefault(index, {})
        row[item] = row.get(item, 0) + value

    def precisionAndRecall(self, N):
        """Return ``(precision, recall)`` of top-*N* recommendations.

        Only test users that also appear in the training set are evaluated.
        A metric whose denominator is zero (no evaluable user, or N == 0)
        is reported as 0.0 instead of raising ZeroDivisionError.
        """
        hit = 0
        h_recall = 0
        h_precision = 0
        for user, items in self.test.items():
            if user not in self.train:
                continue
            rank = self.recommend(user, N)
            for item, _score in rank:
                if item in items:
                    hit += 1
            h_recall += len(items)
            h_precision += N
        precision = hit / (h_precision * 1.0) if h_precision else 0.0
        recall = hit / (h_recall * 1.0) if h_recall else 0.0
        return precision, recall

    def recommend(self, user, N):
        """Return up to *N* ``(item, score)`` pairs for *user*, best first.

        Items the user already tagged in the training data are excluded.
        A user without any training data gets an empty list.
        """
        recommend_items = dict()
        tagged_items = self.user_items.get(user, {})
        for tag, wut in self.user_tags.get(user, {}).items():
            # Down-weight tags that are used by many users.
            wut = wut * 1.0 / math.log(1 + len(self.tag_users[tag]))
            for item, wti in self.tag_items[tag].items():
                if item in tagged_items:
                    continue
                # Down-weight items that were tagged by many users.
                wti = wti * 1.0 / math.log(1 + len(self.item_users[item]))
                if item not in recommend_items:
                    recommend_items[item] = wut * wti
                else:
                    recommend_items[item] += wut * wti
        return sorted(recommend_items.items(), key=operator.itemgetter(1), reverse=True)[0:N]

    def testRecommend(self):
        """Print recall and precision for a fixed set of list lengths."""
        print("##################testRecommend begin#######################")
        print("%3s%20s%20s" % ('K', "recall", 'precision'))
        for n in [5, 10, 20, 40, 60, 80, 160]:
            precision, recall = self.precisionAndRecall(n)
            print("%3d%19.3f%%%19.3f%%" % (n, recall * 100, precision * 100))
        print("##################testRecommend end#######################\n")


if __name__ == '__main__':
    stb = SimpleTagBased("E:\\RecommenderSystem\\datasets\\hetrec2011-delicious-2k\\user_taggedbookmarks-timestamps.dat")

0 0