读《Programming Collective Intelligence》chapter 2

来源:互联网 发布:淘宝宝贝竖图怎么弄 编辑:程序博客网 时间:2024/05/29 19:18


第一次读英文原著,刚开始有些担心自己的能力,但是坚持读了30分钟后,觉得自己没有问题。

这章以电影推荐为例,不仅带领新手认识了Python编程,还很清楚地演示了两种用于推荐打分的相似度算法:欧几里得距离和皮尔逊相关系数。这两种常用的算法都很易懂。
这本书使用的是Python 2.7,而我使用的是Python 3.x,并且用PyCharm作为开发环境。


在验证这章的实例时,我掉进了一个大坑,折腾了2天。关于pydelicious.py的使用,我参考了网络上高人的意见后,成功加入了该库,并使其可以在Python 3.x上运行。但是,由于该库所连接的网址有问题,这部分代码无法进行测试。

下面是这一章的思维导图(引用)

        下面是我练习的代码,与原书有一点不一样。


         
# A dictionary of movie critics and their ratings of a small
# set of movies
from math import sqrt

critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0, 'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5, 'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5, 'Superman Returns': 5.0,
        'The Night Listener': 3.0, 'You, Me and Dupree': 3.5,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5, 'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
        'The Night Listener': 4.5, 'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0, 'Superman Returns': 3.0,
        'The Night Listener': 3.0, 'You, Me and Dupree': 2.0,
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0, 'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
    },
    'Toby': {
        'Snakes on a Plane': 4.5, 'You, Me and Dupree': 1.0,
        'Superman Returns': 4.0,
    },
}


def _shared_items(prefs, p1, p2):
    """Return the items rated by both p1 and p2, in p1's iteration order."""
    return [item for item in prefs[p1] if item in prefs[p2]]


def sim_distance(prefs, person1, person2):
    """Return a distance-based similarity score for person1 and person2.

    The score is 1 / (1 + sum of squared rating differences) over the items
    both people rated, so it lies in (0, 1] and equals 1 for identical
    ratings. Returns 0 when the two have no rated items in common.
    """
    shared = _shared_items(prefs, person1, person2)
    if not shared:
        return 0

    sum_of_squares = sum(
        pow(prefs[person1][item] - prefs[person2][item], 2) for item in shared
    )
    return 1 / (1 + sum_of_squares)


def sim_pearson(prefs, p1, p2):
    """Return the Pearson correlation coefficient for p1 and p2.

    Only items rated by both are considered. Returns 0 when there are no
    common items or when either person's common ratings have no variance.
    """
    shared = _shared_items(prefs, p1, p2)
    if not shared:
        return 0
    n = len(shared)

    # Sums of the ratings, of their squares, and of their products
    sum1 = sum(prefs[p1][it] for it in shared)
    sum2 = sum(prefs[p2][it] for it in shared)
    sum1_sq = sum(pow(prefs[p1][it], 2) for it in shared)
    sum2_sq = sum(pow(prefs[p2][it], 2) for it in shared)
    p_sum = sum(prefs[p1][it] * prefs[p2][it] for it in shared)

    num = p_sum - (sum1 * sum2 / n)
    variance_product = (sum1_sq - pow(sum1, 2) / n) * (sum2_sq - pow(sum2, 2) / n)
    # Floating-point rounding can push a zero variance slightly negative,
    # which would make sqrt() raise ValueError; treat it as "no variance".
    if variance_product <= 0:
        return 0
    return num / sqrt(variance_product)


def sim_tanimoto(prefs, p1, p2):
    """Return the Tanimoto coefficient for p1 and p2 over their shared items.

    Computed as A.B / (|A|^2 + |B|^2 - A.B), which is 1 for identical rating
    vectors. Returns 0 when there are no common items or the denominator
    is 0.
    """
    shared = _shared_items(prefs, p1, p2)
    if not shared:
        return 0

    sum1_sq = sum(pow(prefs[p1][it], 2) for it in shared)
    sum2_sq = sum(pow(prefs[p2][it], 2) for it in shared)
    p_sum = sum(prefs[p1][it] * prefs[p2][it] for it in shared)

    # The denominator uses the squared norms themselves, not their roots.
    den = sum1_sq + sum2_sq - p_sum
    if den == 0:
        return 0
    return p_sum / den


def topMatches(prefs, person, n=5, similarity=sim_pearson):
    """Return the best matches for person from the prefs dictionary.

    The result is a list of at most n (score, other) tuples, highest score
    first. The number of results and the similarity function are optional.
    """
    scores = [(similarity(prefs, person, other), other)
              for other in prefs if other != person]
    scores.sort(reverse=True)
    return scores[0:n]


def getRecommendations(prefs, person, similarity=sim_pearson):
    """Get recommendations for a person using a weighted average of every
    other user's rankings.

    Returns a list of (predicted_rating, item) tuples, highest first, for
    items the person has not rated (or has rated 0).
    """
    totals = {}
    simSums = {}
    for other in prefs:
        # Don't compare me to myself
        if other == person:
            continue
        sim = similarity(prefs, person, other)
        # Ignore scores of zero or lower
        if sim <= 0:
            continue
        for item in prefs[other]:
            # Only score items I haven't rated yet
            if item not in prefs[person] or prefs[person][item] == 0:
                # Similarity * Score
                totals[item] = totals.get(item, 0) + prefs[other][item] * sim
                # Sum of similarities
                simSums[item] = simSums.get(item, 0) + sim

    # Create the normalized list, sorted from highest to lowest
    rankings = [(total / simSums[item], item) for item, total in totals.items()]
    rankings.sort(reverse=True)
    return rankings


def transformPrefs(prefs):
    """Flip a {person: {item: rating}} mapping into {item: {person: rating}}."""
    result = {}
    for person in prefs:
        for item in prefs[person]:
            result.setdefault(item, {})
            # Flip item and person
            result[item][person] = prefs[person][item]
    return result


def calculateSimilarItems(prefs, n=10):
    """Create a dictionary of items showing which other items they are
    most similar to.

    Each value is the topMatches() list (using sim_distance) of up to n
    (similarity, item) tuples. Prints a progress line every 100 items.
    """
    result = {}
    # Invert the preference matrix to be item-centric
    itemPrefs = transformPrefs(prefs)
    for c, item in enumerate(itemPrefs, start=1):
        # Status updates for large datasets
        if c % 100 == 0:
            print("%d / %d" % (c, len(itemPrefs)))
        # Find the most similar items to this one
        result[item] = topMatches(itemPrefs, item, n=n, similarity=sim_distance)
    return result


def getRecommendedItems(prefs, itemMatch, user):
    """Recommend items for user from a precomputed item-similarity table.

    itemMatch maps each item to a list of (similarity, other_item) tuples,
    as produced by calculateSimilarItems(). Returns (predicted_rating, item)
    tuples, highest first, for items the user has not rated. A user with no
    ratings gets an empty list.
    """
    userRatings = prefs[user]
    scores = {}
    totalSim = {}
    # Loop over items rated by this user
    for item, rating in userRatings.items():
        # Loop over items similar to this one
        for similarity, item2 in itemMatch[item]:
            # Ignore if this user has already rated this item
            if item2 in userRatings:
                continue
            # Weighted sum of rating times similarity
            scores[item2] = scores.get(item2, 0) + similarity * rating
            # Sum of all the similarities
            totalSim[item2] = totalSim.get(item2, 0) + similarity

    # Divide each total score by total weighting to get an average. This is
    # done once, after the loop, so it also works when the user has rated
    # nothing. Candidates whose similarities sum to 0 carry no information
    # and would divide by zero, so they are skipped.
    rankings = [(score / totalSim[item], item)
                for item, score in scores.items() if totalSim[item] != 0]
    # Return the rankings from highest to lowest
    rankings.sort(reverse=True)
    return rankings


def loadMovieLens(path='E:/PythonProgram/data/movielens'):
    """Load the MovieLens data set found under path.

    Reads '<path>/u.item' (pipe-separated; the first two fields are used as
    id and title) and '<path>/u.data' (tab-separated user, movie id, rating,
    timestamp), both decoded as ISO-8859-1. Returns {user: {title: rating}}
    with ratings converted to float.
    """
    # Get movie titles
    movies = {}
    with open(path + '/u.item', mode='r', encoding="ISO-8859-1") as item_file:
        for line in item_file:
            if not line.strip():
                continue
            movie_id, title = line.split('|')[0:2]
            movies[movie_id] = title

    # Load data
    prefs = {}
    with open(path + '/u.data', mode='r', encoding="ISO-8859-1") as data_file:
        for line in data_file:
            if not line.strip():
                continue
            user, movieid, rating, _ts = line.split('\t')
            prefs.setdefault(user, {})
            prefs[user][movies[movieid]] = float(rating)
    return prefs