基于用户(User-Based)的协同过滤推荐系统,Python 实现

来源:互联网 发布:数据治理 权威定义 编辑:程序博客网 时间:2024/04/30 00:05

同学让帮忙写的,临时写出来的,可能有bug,贴着存档吧,不解释了

# -*- coding: utf-8 -*-
# Originally tested on Python 2.7.11; written to run on Python 2.7 and 3.x.
"""User-based collaborative filtering on a dense rating matrix.

Rows of the matrix are users, columns are items, and a value of 0 marks a
missing rating. Missing ratings are predicted from the k most similar users
(Pearson similarity) and written back into the matrix.
"""
import csv
import math
import random
import sys


def _open_csv(file_name, mode):
    """Open *file_name* the way the csv module expects on this interpreter.

    mode is 'r' or 'w'. Python 3 needs text mode with newline=''; Python 2
    needs binary mode.
    """
    if sys.version_info[0] >= 3:
        return open(file_name, mode, newline='')
    return open(file_name, mode + 'b')


def read_csv(file_name='train.csv'):
    """Read a CSV file of integers into a list of rows.

    Blank lines are skipped so they do not become empty rows.
    Raises ValueError if a cell cannot be parsed by int().
    """
    with _open_csv(file_name, 'r') as csvfile:
        return [[int(x) for x in row] for row in csv.reader(csvfile) if row]


def avg(vector):
    """Return the mean of the positive entries of *vector* (0 = missing).

    Returns 0 when there is no positive entry. The division is done in
    floating point so the result is not truncated on Python 2.
    """
    rated = [a for a in vector if a > 0]
    if not rated:
        return 0
    return float(sum(rated)) / len(rated)


def common_items(x, y):
    """Return the entries of *x* and *y* at positions where both are > 0.

    The result is a pair of equally long lists (commx, commy). Positions
    beyond the end of the shorter input are ignored.
    """
    pairs = [(a, b) for a, b in zip(x, y) if a > 0 and b > 0]
    commx = [a for a, _ in pairs]
    commy = [b for _, b in pairs]
    return commx, commy


def pearson(x, y):
    """Return the Pearson-style similarity between rating vectors x and y.

    Deviations are taken from each vector's mean over all of its own positive
    entries (see avg), but only co-rated positions enter the sums. Returns 0
    when there is no co-rated position or the denominator is zero.
    """
    avgx = avg(x)
    avgy = avg(y)
    commx, commy = common_items(x, y)
    if not commx:
        return 0
    dx = [a - avgx for a in commx]
    dy = [b - avgy for b in commy]
    up = sum(a * b for a, b in zip(dx, dy))
    down = math.sqrt(sum(a ** 2 for a in dx) * sum(b ** 2 for b in dy))
    if down:
        return up / down
    return 0


def pearson_for_matrix(matrix):
    """Return the symmetric user-by-user similarity matrix for *matrix*.

    Entry [i][j] is pearson(matrix[i], matrix[j]). The diagonal is left at 0.
    """
    size = len(matrix)
    pearson_matrix = [[0] * size for _ in range(size)]
    for i in range(size):
        for j in range(i + 1, size):
            # Similarity is symmetric, so compute once and mirror it.
            similarity = pearson(matrix[i], matrix[j])
            pearson_matrix[i][j] = similarity
            pearson_matrix[j][i] = similarity
    return pearson_matrix


def k_sim_user(sim_matrix, u, k):
    """Return the (at most) k users most similar to user *u*.

    Returns (users_id_list, users_value): the neighbour indices in order of
    decreasing similarity, and their similarities to *u*. Ties are broken in
    favour of the higher index. User *u* is never returned as its own
    neighbour.
    """
    candidates = [(sim, i) for i, sim in enumerate(sim_matrix[u]) if i != u]
    candidates.sort(reverse=True)
    top = candidates[:k]
    users_id_list = [i for _, i in top]
    users_value = [sim for sim, _ in top]
    return users_id_list, users_value


def knn_fill_rate_matrix(matrix, sim_matrix, k=3):
    """Fill the missing (0) ratings of *matrix* in place and return it.

    For user i and item j the prediction is

        avg(i) + sum(sim(i, u) * (r[u][j] - avg(u))) / sum(|sim(i, u)|)

    taken over those of the k nearest neighbours u that have rated item j.
    When no such neighbour carries any similarity weight, the user's own
    mean rating is used instead.
    """
    if not matrix:
        return matrix
    # Predict from a snapshot of the real ratings: values filled in for one
    # user must not be read back as ratings when predicting for another.
    original = [list(row) for row in matrix]
    user_avgs = [avg(row) for row in original]
    col_length = len(matrix[0])
    for i, row in enumerate(original):
        neighbours, _ = k_sim_user(sim_matrix, i, k)
        user_i_avg = user_avgs[i]
        for j in range(col_length):
            if row[j] != 0:
                continue
            # A 0 in a neighbour's row means "not rated", so that neighbour
            # has nothing to say about item j.
            raters = [u for u in neighbours if original[u][j] > 0]
            # Normalise by absolute similarity so that negative similarities
            # cannot cancel the weight or flip the sign of the adjustment.
            z = sum(abs(sim_matrix[i][u]) for u in raters)
            if z:
                weighted = sum(
                    sim_matrix[i][u] * (original[u][j] - user_avgs[u])
                    for u in raters)
                matrix[i][j] = user_i_avg + weighted / z
            else:
                matrix[i][j] = user_i_avg
            if matrix[i][j] == 0:
                # The cell is still empty after prediction.
                print('again')
    return matrix


def saveMatrix(matrix, file_name='matrix.csv'):
    """Write *matrix* to *file_name* as CSV, one row per line."""
    with _open_csv(file_name, 'w') as csvfile:
        csv.writer(csvfile).writerows(matrix)


# Script entry point; guarded so that importing this module does not read
# or write any file.
if __name__ == '__main__':
    rate_matrix = read_csv()
    sim_matrix = pearson_for_matrix(rate_matrix)
    filled = knn_fill_rate_matrix(rate_matrix, sim_matrix, k=3)
    saveMatrix(filled, 'result.csv')


0 0
原创粉丝点击