svdRec 源码

来源:互联网 发布:yum install .rpm 编辑:程序博客网 时间:2024/06/07 03:00
#!/usr/python/bin
#-*- coding:utf-8 -*-
# Item-based collaborative filtering with optional SVD dimensionality
# reduction, plus a small SVD image-compression demo.
#
# Rating matrices are user x item (rows are users, columns are items) and a
# rating of 0 means "not rated".  The estimators work on numpy.matrix
# objects; plain arrays / nested lists are converted on entry.
from numpy import *
from numpy import linalg as la
import numpy as np

# Side length of the square 0/1 images handled by printMat / imgCompress.
_IMG_SIZE = 32


def loadExData():
    """Return a small 7x5 example rating matrix as a list of rows."""
    return [[0, 0, 0, 2, 2],
            [0, 0, 0, 3, 3],
            [0, 0, 0, 1, 1],
            [1, 1, 1, 0, 0],
            [2, 2, 2, 0, 0],
            [5, 5, 5, 0, 0],
            [1, 1, 1, 0, 0]]


def loadExData2():
    """Return a sparser 11x11 example rating matrix as a list of rows."""
    return [[0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
            [0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
            [0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
            [3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
            [5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
            [0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
            [4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
            [0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
            [0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
            [0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
            [1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0]]


def ecludSim(inA, inB):
    """Euclidean-distance similarity, mapped into (0, 1].

    Identical vectors give 1.0; the value shrinks towards 0 as the distance
    between inA and inB grows.
    """
    return 1.0 / (1.0 + la.norm(inA - inB))


def pearsSim(inA, inB):
    """Pearson-correlation similarity, rescaled from [-1, 1] to [0, 1].

    With fewer than 3 samples the correlation is not meaningful, so the
    vectors are treated as perfectly similar (1.0).
    """
    if len(inA) < 3:
        return 1.0
    # corrcoef returns the 2x2 correlation matrix; entry [0][1] is the
    # correlation between the two input vectors.
    return 0.5 + 0.5 * corrcoef(inA, inB, rowvar=0)[0][1]


def cosSim(inA, inB):
    """Cosine similarity, rescaled from [-1, 1] to [0, 1].

    Accepts column matrices (as passed by the estimators) or 1-D arrays.
    Returns 0.0 when either vector has zero norm, because the cosine is
    undefined there and a NaN would poison the weighted sums of the callers.
    """
    vecA = np.asarray(inA, dtype=float).ravel()
    vecB = np.asarray(inB, dtype=float).ravel()
    denom = la.norm(vecA) * la.norm(vecB)
    if denom == 0:
        return 0.0
    num = float(np.dot(vecA, vecB))
    return 0.5 + 0.5 * (num / denom)


def standEst(dataMat, user, simMeas, item):
    """Estimate the rating `user` would give `item`.

    For every other item the user has rated, compute its similarity to
    `item` using only the users who rated both items, then return the
    similarity-weighted average of the user's ratings.  Items that are more
    similar to `item` therefore influence the estimate more.

    Args:
        dataMat: user x item rating matrix (0 means unrated).
        user: row index of the user.
        simMeas: similarity function taking two column vectors.
        item: column index of the item to estimate.

    Returns:
        The estimated rating, or 0 when no similarity information exists.
    """
    dataMat = np.asmatrix(dataMat)
    n = shape(dataMat)[1]
    simTotal = 0.0
    ratSimTotal = 0.0
    for j in range(n):
        userRating = dataMat[user, j]
        if userRating == 0:
            continue
        # Rows (users) that rated both `item` and item j.
        overLap = nonzero(logical_and(dataMat[:, item].A > 0,
                                      dataMat[:, j].A > 0))[0]
        if len(overLap) == 0:
            similarity = 0
        else:
            similarity = simMeas(dataMat[overLap, item], dataMat[overLap, j])
        print('the %d and %d similarity is: %f' % (item, j, similarity))
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    return ratSimTotal / simTotal


def svdEst(dataMat, user, simMeas, item, numSV=4):
    """Estimate the rating `user` would give `item` in an SVD-reduced space.

    The items are projected onto the leading singular vectors of the rating
    matrix, and item-to-item similarity is measured between those reduced
    vectors instead of between raw rating columns.

    Args:
        dataMat: user x item rating matrix (0 means unrated).
        user: row index of the user.
        simMeas: similarity function taking two column vectors.
        item: column index of the item to estimate.
        numSV: number of leading singular values to keep (default 4);
            clamped to the number of singular values available.

    Returns:
        The estimated rating, or 0 when no similarity information exists.
    """
    dataMat = np.asmatrix(dataMat)
    n = shape(dataMat)[1]
    simTotal = 0.0
    ratSimTotal = 0.0
    U, Sigma, VT = la.svd(dataMat)
    sigma = np.asarray(Sigma).ravel()
    k = min(numSV, len(sigma))
    # Diagonal matrix built from the k largest singular values.
    sigK = np.asmatrix(np.diag(sigma[:k]))
    # Project the items into the k-dimensional space: one row per item.
    # NOTE: sigK.I divides by the singular values, so near-zero singular
    # values amplify noise in the reduced representation.
    xformedItems = dataMat.T * np.asmatrix(U)[:, :k] * sigK.I
    for j in range(n):
        userRating = dataMat[user, j]
        # Skip items the user has not rated, and the target item itself.
        if userRating == 0 or j == item:
            continue
        similarity = simMeas(xformedItems[item, :].T, xformedItems[j, :].T)
        print('the %d and %d similarity is: %f' % (item, j, similarity))
        simTotal += similarity
        # Accumulate (the original overwrote this on every iteration).
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    return ratSimTotal / simTotal


def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst):
    """Return the top-N (item, estimated rating) pairs for `user`.

    Only items the user has not rated yet are scored.  If the user has rated
    every item, the string 'you rated everything' is returned instead.
    """
    dataMat = np.asmatrix(dataMat)
    # Column indices of the items this user has not rated.
    unratedItems = nonzero(dataMat[user, :].A == 0)[1]
    if len(unratedItems) == 0:
        return 'you rated everything'
    itemScores = [(item, estMethod(dataMat, user, simMeas, item))
                  for item in unratedItems]
    return sorted(itemScores, key=lambda jj: jj[1], reverse=True)[:N]


def printMat(inMat, thresh=0.8):
    """Print the top-left 32x32 part of inMat as an image of 0s and 1s.

    Because reconstructed values are floats, every entry greater than
    `thresh` is printed as 1 and everything else as 0.
    """
    for i in range(_IMG_SIZE):
        print(' '.join('1' if float(inMat[i, k]) > thresh else '0'
                       for k in range(_IMG_SIZE)))
    print('')


def imgCompress(numSV=3, thresh=0.8):
    """Compress the 32x32 image in '0_5.txt' using `numSV` singular values.

    The image is read as 32 rows of 0/1 characters, printed, decomposed with
    SVD, reconstructed from the leading `numSV` singular values and printed
    again.  Storing only the truncated U, Sigma and VT factors needs less
    space than storing the full image.
    """
    myl = []
    with open('0_5.txt') as imgFile:
        for line in imgFile:
            myl.append([int(line[i]) for i in range(_IMG_SIZE)])
    myMat = np.asmatrix(myl)
    print("****original matrix****")
    printMat(myMat, thresh)
    U, Sigma, VT = la.svd(myMat)
    # Diagonal matrix holding the numSV largest singular values.
    SigRecon = np.asmatrix(np.diag(np.asarray(Sigma).ravel()[:numSV]))
    reconMat = U[:, :numSV] * SigRecon * VT[:numSV, :]
    print("****reconstructed matrix using %d singular values****" % numSV)
    printMat(reconMat, thresh)


皮尔逊相似度计算公式