#!/usr/bin/python
"""Minimal user-based and item-based collaborative filtering helpers.

Ratings are held in nested dictionaries of the form
``{outer_id: {inner_id: rating}}``.  For user-based recommendations the outer
key is the user id and the inner key the item id; for item-based similarity
the two are swapped.

The similarity functions are defined before the functions that use them as
default arguments: default values are evaluated when the ``def`` statement
runs, so the referenced function must already exist at that point.
"""
from math import sqrt


# ---------------------------------------------------------------------------
# Similarity functions
# ---------------------------------------------------------------------------

def euclidean(v1, v2):
    """Return a similarity in (0, 1] derived from squared Euclidean distance.

    Only the first ``min(len(v1), len(v2))`` components are compared.

    Args:
        v1: first sequence of numbers.
        v2: second sequence of numbers.

    Returns:
        ``1 / (1 + sum of squared differences)``, or 0 when either vector is
        empty.
    """
    pairs = list(zip(v1, v2))
    if not pairs:
        return 0
    squared = sum(pow(a - b, 2) for a, b in pairs)
    # Note: the squared distance is used directly (no square root).
    return 1 / float(1 + squared)


def cosine(v1, v2):
    """Return the cosine of the angle between ``v1`` and ``v2``.

    Only the first ``min(len(v1), len(v2))`` components are compared.

    Args:
        v1: first sequence of numbers.
        v2: second sequence of numbers.

    Returns:
        The cosine similarity, or 0 when either vector is empty or has zero
        magnitude (which would otherwise divide by zero).
    """
    pairs = list(zip(v1, v2))
    if not pairs:
        return 0
    dot = sum(a * b for a, b in pairs)
    norm1 = sum(a * a for a, _ in pairs)  # squared magnitude of v1
    norm2 = sum(b * b for _, b in pairs)  # squared magnitude of v2
    if norm1 == 0 or norm2 == 0:
        return 0
    return dot / (sqrt(norm1) * sqrt(norm2))


def pearson(v1, v2):
    """Return the Pearson correlation coefficient of ``v1`` and ``v2``.

    Only the first ``min(len(v1), len(v2))`` components are compared.

    Args:
        v1: first sequence of numbers.
        v2: second sequence of numbers.

    Returns:
        The correlation, or 0 when either vector is empty or has zero
        variance (which would otherwise divide by zero).
    """
    pairs = list(zip(v1, v2))
    if not pairs:
        return 0
    count = float(len(pairs))  # float so the divisions below are never integer
    mean1 = sum(a for a, _ in pairs) / count
    mean2 = sum(b for _, b in pairs) / count

    cov = 0   # covariance of v1 and v2
    var1 = 0  # variance of v1
    var2 = 0  # variance of v2
    for a, b in pairs:
        diff1 = a - mean1
        diff2 = b - mean2
        cov += diff1 * diff2
        var1 += diff1 * diff1
        var2 += diff2 * diff2
    cov /= count
    var1 /= count
    var2 /= count
    if var1 == 0 or var2 == 0:
        return 0
    return cov / sqrt(var1 * var2)


# ---------------------------------------------------------------------------
# Loading ratings
# ---------------------------------------------------------------------------

def _load_ratings(file, by_item):
    """Parse a tab-separated ratings file into a nested dictionary.

    Each non-blank line must start with ``user<TAB>item<TAB>rating``; any
    further fields are ignored.  Whitespace-only lines are skipped.

    Args:
        file: path of the ratings file.
        by_item: when true the outer key is the item id, otherwise the user id.

    Returns:
        ``{outer_id: {inner_id: rating}}`` with all values converted to int.

    Raises:
        ValueError: if a non-blank line has fewer than three fields or a
            field is not an integer.
    """
    ratings = {}
    # ``with`` guarantees the file is closed even when a line fails to parse.
    with open(file) as handle:
        for line in handle:
            if not line.strip():
                continue
            user, item, rate = line.split('\t')[0:3]
            user, item = int(user), int(item)
            outer, inner = (item, user) if by_item else (user, item)
            ratings.setdefault(outer, {})[inner] = int(rate)
    return ratings


def genUserBasedMap(file='u.data'):
    """Load ratings keyed by user: ``{user: {item: rating}}``.

    Args:
        file: path of a tab-separated file whose lines start with
            ``user<TAB>item<TAB>rating``.

    Returns:
        Nested dictionary of int ids to int ratings.

    Raises:
        ValueError: if a non-blank line is malformed.
    """
    return _load_ratings(file, by_item=False)


def genItemBasedMap(file='u.data'):
    """Load ratings keyed by item: ``{item: {user: rating}}``.

    Args:
        file: path of a tab-separated file whose lines start with
            ``user<TAB>item<TAB>rating``.

    Returns:
        Nested dictionary of int ids to int ratings.

    Raises:
        ValueError: if a non-blank line is malformed.
    """
    return _load_ratings(file, by_item=True)


# ---------------------------------------------------------------------------
# Recommendations
# ---------------------------------------------------------------------------

def distance(map, p1, p2, similarity=cosine):
    """Return the similarity of two entries over the keys they both rated.

    Args:
        map: nested ratings dictionary ``{outer_id: {inner_id: rating}}``.
        p1: first outer key.
        p2: second outer key.
        similarity: function taking two equal-length rating lists and
            returning a number.

    Returns:
        ``similarity(v1, v2)`` for the ratings of the shared inner keys, or 0
        when the two entries share no keys.

    Raises:
        KeyError: if ``p1`` is not in ``map``, or ``p2`` is not in ``map``
            while ``p1`` has at least one rating.
    """
    shared = [key for key in map[p1] if key in map[p2]]
    if not shared:
        return 0
    v1 = [map[p1][key] for key in shared]
    v2 = [map[p2][key] for key in shared]
    return similarity(v1, v2)


def userBased(map, person, n=5, similarity=pearson):
    """Recommend items for ``person`` from the ratings of similar users.

    Every other user with a strictly positive similarity to ``person``
    contributes a similarity-weighted vote for each item that ``person`` has
    not rated.  The predicted rating of an item is the weighted mean of those
    votes.

    Args:
        map: user-keyed ratings dictionary ``{user: {item: rating}}``.
        person: user id to recommend for.
        n: maximum number of recommendations to return.
        similarity: function passed through to :func:`distance`.

    Returns:
        Up to ``n`` ``(predicted_rating, item)`` tuples, highest first.

    Raises:
        KeyError: if ``person`` is missing from ``map`` (raised while
            comparing against another user that has ratings).
    """
    weighted = {}  # item -> sum of similarity * rating
    weights = {}   # item -> sum of similarity, used to normalise
    for other in map:
        if other == person:
            continue
        score = distance(map, other, person, similarity)
        if score <= 0:
            continue
        for item in map[other]:
            # Item ids 0 and None are never recommended (kept from the
            # original behaviour; presumably placeholder ids).
            if item is None or item == 0 or item in map[person]:
                continue
            weighted[item] = weighted.get(item, 0) + score * map[other][item]
            weights[item] = weights.get(item, 0) + score

    # Normalise each total by the similarity mass that produced it.  Every
    # contributing score is > 0, so the divisor is never zero.
    rankings = [(total / weights[item], item)
                for item, total in weighted.items()]
    return sorted(rankings, reverse=True)[0:n]


def itemBased(map, item, n=5, similarity=pearson):
    """Return the entries of ``map`` most similar to ``item``.

    Args:
        map: item-keyed ratings dictionary ``{item: {user: rating}}``.
        item: item id to find neighbours for.
        n: maximum number of neighbours to return.
        similarity: function passed through to :func:`distance`.

    Returns:
        Up to ``n`` ``(similarity, other_item)`` tuples, highest first.

    Raises:
        KeyError: if ``item`` is missing from ``map`` and ``map`` contains
            any other entry.
    """
    scores = [(distance(map, item, other, similarity), other)
              for other in map if other != item]
    return sorted(scores, reverse=True)[0:n]