【知识发现】隐语义模型LFM算法python实现(二)
来源:互联网 发布:快速排序算法 java 编辑:程序博客网 时间:2024/06/06 16:28
http://blog.csdn.net/fjssharpsword/article/details/78015956
基于该篇文章中的代码优化,主要是在生成负样例上提高执行速度,代码参考如下:
# -*- coding: utf-8 -*-
'''
Latent factor model (LFM) for implicit-feedback top-k recommendation.

Created on 2017-10-16

@author: Administrator
'''
import numpy as np
import pandas as pd
from math import exp
import time
import math


class LFM:
    """Latent factor model trained by stochastic gradient descent.

    Every (user, item) pair present in the training data is a positive
    sample (label 1).  For each user, negative samples (label 0) are taken
    from the most popular items the user has not interacted with.

    Attributes populated by ``initModel``:
        userID:  list of distinct user ids (order of first appearance).
        itemID:  list of distinct item ids (order of first appearance).
        itemLen: Series mapping item id -> number of training rows,
                 sorted by that count in descending order.
    """

    def __init__(self, lclass, iters, alpha, lamda, topk, ratio, traindata):
        """Store the hyperparameters and the training data.

        Args:
            lclass: number of latent classes (factors).
            iters: number of passes over the samples.
            alpha: initial gradient-descent step size.
            lamda: L2 regularisation coefficient.
            topk: number of items returned by ``recommend``.
            ratio: negative samples drawn per positive item of a user.
            traindata: DataFrame with at least 'userid' and 'itemid' columns.
        """
        self.lclass = lclass
        self.iters = iters
        self.alpha = alpha
        self.lamda = lamda
        self.topk = topk
        self.ratio = ratio
        self.traindata = traindata

    # ---- initialisation -------------------------------------------------

    def getUserPositiveItem(self, userid):
        """Return the list of item ids that ``userid`` has in the training data."""
        traindata = self.traindata
        return traindata.loc[traindata['userid'] == userid, 'itemid'].tolist()

    def getUserNegativeItem(self, userid):
        """Return negative samples for ``userid``.

        Walks the items from most to least popular (``self.itemLen``, so
        ``initModel`` must have run) and keeps those the user has not
        interacted with, up to ``int(ratio * number_of_positive_items)``.
        """
        seen = set(self.getUserPositiveItem(userid))
        # Truncate to an int so a fractional ratio still yields a finite
        # quota; the original "count == 0" test was skipped by fractions.
        quota = int(self.ratio * len(seen))
        negativeItemList = []
        for item in self.itemLen.index:
            if len(negativeItemList) >= quota:
                break
            if item not in seen:
                negativeItemList.append(item)
        return negativeItemList

    def initUserItem(self, userid):
        """Return ``{itemid: label}`` for one user (1 = positive, 0 = negative)."""
        itemDict = {item: 1 for item in self.getUserPositiveItem(userid)}
        for item in self.getUserNegativeItem(userid):
            itemDict[item] = 0
        return itemDict

    def initModel(self):
        """Build the random factor matrices and the labelled samples.

        Returns:
            (p, q, userItem) where ``p`` is a users x lclass DataFrame,
            ``q`` is an lclass x items DataFrame (both uniform in [0, 1)),
            and ``userItem`` is a list of ``{userid: {itemid: label}}``.
        """
        traindata = self.traindata
        lclass = self.lclass
        userID = traindata['userid'].unique().tolist()
        itemID = traindata['itemid'].unique().tolist()
        self.userID = userID
        self.itemID = itemID
        # Popularity = number of training rows per item.  A single groupby
        # replaces one full-table scan per item; the stable sort makes the
        # ranking of equally popular items deterministic.
        self.itemLen = (traindata.groupby('itemid').size()
                        .sort_values(ascending=False, kind='mergesort'))
        p = pd.DataFrame(np.random.rand(len(userID), lclass),
                         index=userID, columns=range(0, lclass))
        q = pd.DataFrame(np.random.rand(lclass, len(itemID)),
                         index=range(0, lclass), columns=itemID)
        userItem = [{userid: self.initUserItem(userid)} for userid in userID]
        return p, q, userItem

    # ---- model ----------------------------------------------------------

    def sigmod(self, x):
        """Logistic sigmoid: maps ``x`` into [0, 1].

        Uses the numerically stable form so that large negative ``x`` does
        not overflow ``exp``.
        """
        if x >= 0:
            return 1.0 / (1.0 + exp(-x))
        e = exp(x)
        return e / (1.0 + e)

    def lfmPredict(self, p, q, userID, itemID):
        """Predict the interest of ``userID`` in ``itemID`` from ``p`` and ``q``."""
        score = float(np.dot(p.loc[userID].values, q[itemID].values))
        return self.sigmod(score)

    def latenFactorModel(self):
        """Train the model and return the learned ``(p, q)`` DataFrames.

        For every sample the user factors are updated first and the item
        factors are then updated with the already-updated user factors.
        The step size is multiplied by 0.9 after each full pass.
        """
        alpha = self.alpha
        lamda = self.lamda
        p, q, userItem = self.initModel()
        # Train on plain arrays: chained DataFrame indexing is slow and is
        # not guaranteed to write through to the frame.
        P = np.array(p.values, dtype=float)
        Q = np.array(q.values, dtype=float)
        userRow = {user: pos for pos, user in enumerate(p.index)}
        itemCol = {item: pos for pos, item in enumerate(q.columns)}
        for step in range(0, self.iters):
            for user in userItem:
                for userID, samples in user.items():
                    pu = P[userRow[userID]]          # view: updates write into P
                    for itemID, rui in samples.items():
                        qi = Q[:, itemCol[itemID]]   # view: updates write into Q
                        eui = rui - self.sigmod(float(np.dot(pu, qi)))
                        pu += alpha * (eui * qi - lamda * pu)
                        qi += alpha * (eui * pu - lamda * qi)
            alpha *= 0.9  # learning-rate decay
        return (pd.DataFrame(P, index=p.index, columns=p.columns),
                pd.DataFrame(Q, index=q.index, columns=q.columns))

    def recommend(self, userid, p, q):
        """Return the ``topk`` highest-scoring items for ``userid``.

        All known items are scored, including those the user already has.
        The result is a Series of scores indexed by item id, best first.
        """
        itemID = self.itemID
        raw = np.dot(p.loc[userid].values, q[itemID].values)
        scores = pd.Series([self.sigmod(value) for value in raw], index=itemID)
        return scores.sort_values(ascending=False).iloc[:self.topk]

    # ---- evaluation -----------------------------------------------------

    def recallAndPrecision(self, p, q):
        """Return ``(recall, precision)`` of the recommendations on the training data.

        A hit is a recommended item that is among the user's distinct
        training items.  Both values are 0.0 when there is nothing to
        evaluate.
        """
        hit = 0
        recall = 0
        precision = 0
        for userid in self.userID:
            # A set of item *values*: "in" on a Series would test the index.
            trueItem = set(self.getUserPositiveItem(userid))
            preItem = list(self.recommend(userid, p, q).index)
            hit += sum(1 for item in preItem if item in trueItem)
            recall += len(trueItem)
            precision += len(preItem)
        return (hit / (recall * 1.0) if recall else 0.0,
                hit / (precision * 1.0) if precision else 0.0)

    def coverage(self, p, q):
        """Return the fraction of training items that get recommended to someone."""
        recommend_items = set()
        all_items = set()
        for userid in self.userID:
            all_items.update(self.getUserPositiveItem(userid))
            recommend_items.update(self.recommend(userid, p, q).index)
        if not all_items:
            return 0.0
        return len(recommend_items) / (len(all_items) * 1.0)

    def popularity(self, p, q):
        """Return the mean of ``log(1 + popularity)`` over all recommended items."""
        itemLen = self.itemLen
        ret = 0
        n = 0
        for userid in self.userID:
            for item in self.recommend(userid, p, q).index:
                ret += math.log(1 + itemLen.loc[item])
                n += 1
        if n == 0:
            return 0.0
        return ret / (n * 1.0)


if __name__ == "__main__":
    # time.clock() no longer exists on current Python versions.
    start = time.perf_counter()

    # Load the ratings; only the user and item columns are used.
    df_sample = pd.read_csv("D:\\tmp\\ratings.csv", names=['userid', 'itemid', 'ratings'], header=0)
    traindata = df_sample[['userid', 'itemid']]
    for ratio in [1, 2, 3, 5, 10, 20]:
        for lclass in [5, 10, 20, 30, 50]:
            lfm = LFM(lclass, 2, 0.02, 0.01, 10, ratio, traindata)
            p, q = lfm.latenFactorModel()
            # Model evaluation
            print("%3s%20s%20s%20s%20s%20s" % ('ratio', 'lcalss', "recall", 'precision', 'coverage', 'popularity'))
            recall, precision = lfm.recallAndPrecision(p, q)
            coverage = lfm.coverage(p, q)
            popularity = lfm.popularity(p, q)
            print("%3d%20d%19.3f%%%19.3f%%%19.3f%%%20.3f" % (ratio, lclass, recall * 100, precision * 100, coverage * 100, popularity))

    end = time.perf_counter()
    print('finish all in %s' % str(end - start))
关注三点:
1)性能受正负样例比率、隐类数量这两个超参数影响最大,需要通过实验调参找出最佳取值。
2)对于梯度下降的收敛条件(即迭代次数):初始步长固定为0.02(代码中每轮迭代后乘以0.9衰减),最佳迭代次数n需要通过实验确定。
3)对于增量数据的训练:保存p、q矩阵,对于增量样本集,可以在已保存的p、q基础上继续训练,从而避免每次全量训练的性能开销;该做法有待实践验证。
阅读全文
0 0
- 【知识发现】隐语义模型LFM算法python实现(二)
- 【知识发现】隐语义模型LFM算法python实现(三)
- 【知识发现】【知识发现】隐语义模型LFM算法python实现(一)
- 隐语义模型LFM
- LFM隐语义模型
- 隐语义模型(LFM)
- 推荐系统之隐语义模型(LFM)
- 大数据推荐算法之隐语义模型(lfm)进行Top-N推荐
- 使用LFM(Latent factor model)隐语义模型进行Top-N推荐
- 使用LFM(Latent factor model)隐语义模型进行Top-N推荐
- 使用LFM(Latent factor model)隐语义模型进行Top-N推荐
- 使用LFM(Latent factor model)隐语义模型进行Top-N推荐
- 使用LFM(Latent factor model)隐语义模型进行Top-N推荐
- 使用LFM(Latent factor model)隐语义模型进行Top-N推荐
- LFM(Latent factor model)隐语义模型的思想和伪代码
- 使用LFM(Latent factor model)隐语义模型进行Top-N推荐
- 使用LFM(Latent factor model)隐语义模型进行Top-N推荐
- 使用LFM(Latent factor model)隐语义模型进行Top-N推荐
- Unity3d鼠标拖拽控制物体720°旋转
- C#.NET MVC 导出记事本(txt)
- 使用Scrapy框架中如何避免被Ban
- 命令行,在ros中使用qt
- 我眼中的SAML (Security Assertion Markup Language)
- 【知识发现】隐语义模型LFM算法python实现(二)
- C语言学习笔记(五)----高精度的加减法
- aix从光盘装系统直接进入diagnostic原因
- ARM处理器和架构
- 思维定势导致犯的低级错误(碎碎念)一
- 合并有序数组
- 有符号整型和无符号整型的计算
- Android开发资料收集
- angularJs服务 Service_Provider