Video Favorites Recommendation
Writing Python that is not TensorFlow code inevitably feels like a step backwards, so treat this as a casual filler post.
An example of a graph model used in a recommender system can be found at the following link:
http://www.cnblogs.com/zhangchaoyang/articles/5470763.html
In essence it is a kind of model built on the stationary distribution of a Markov chain.
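To make this concrete: the PersonalRank iteration implemented later in this post is a random walk with restart, whose per-step update is

r_{t+1}(v) = \alpha \sum_{u \to v} \frac{r_t(u)}{|\mathrm{out}(u)|} + (1 - \alpha)\,\mathbb{1}[v = \mathrm{root}]

where \alpha is the walk-continuation probability and root is the user being recommended to; the rank vector this converges to is exactly the stationary distribution of the restarted walk.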
One purpose of writing this post is to look at how such recommendation-oriented graph models are constructed and used, in order to explore possible scenarios and approaches for Spark GraphX.
(A real production scenario would basically never use Python.)
Below, the graph model above is applied to Bilibili video-favorites data.
First, the spider code that collects the data:
# coding: utf-8
import json
from scrapy.spiders import Spider
from scrapy.http import Request
from scrapy.exceptions import CloseSpider


class SpiderBiliBili(Spider):
    name = "SpiderBiliBili"
    user_favorites_url_format = "https://api.bilibili.com/x/v2/fav/video?vmid={}&ps=30&fid={}&tid=0&keyword=&pn={}&order=fav_time&jsonp=jsonp"
    followers_url_format = "https://api.bilibili.com/x/relation/followers?vmid={}&pn={}&ps=20&jsonp=jsonp"
    get_fid_url_format = "https://api.bilibili.com/x/v2/fav/folder?vmid={}&jsonp=jsonp"
    # stop after this many users have been collected
    funs_num = 10000
    # crawl at most 5 pages of followers per uploader
    followers_page_upper = 5
    vmid = 0  # mid of the recommendation target
    fid = 0   # favorites-folder id of the recommendation target
    main_focus = []  # uploaders appearing in the target's favorites
    funs_dict = {}   # mid -> set of favorited-video dict strings

    def check_close(self):
        if len(self.funs_dict) > self.funs_num:
            raise CloseSpider()
        elif len(self.funs_dict) > 0 and len(self.funs_dict) % 100 == 0:
            self.logger.info("num: {}".format(len(self.funs_dict)) + "_" * 10)

    def closed(self, reason):
        # eval the stored dict strings back into dicts and dump everything
        with open("users.json", "w") as f:
            require_dict = {}
            for k, v in self.funs_dict.items():
                require_dict[k] = list(eval(dict_string) for dict_string in v)
            json.dump(require_dict, f)

    def start_requests(self):
        return [Request(url=self.user_favorites_url_format.format(self.vmid, self.fid, 1),
                        meta={"page": 2})]

    def parse(self, response):
        # page through the target's favorites collecting uploaders;
        # once exhausted, fan out to each uploader's followers
        jsonData = json.loads(response.text)
        jsonList = jsonData["data"]["archives"]
        if jsonList:
            for archive in jsonList:
                self.main_focus.append(archive["owner"])
            yield Request(url=self.user_favorites_url_format.format(self.vmid, self.fid, response.meta["page"]),
                          meta={"page": response.meta["page"] + 1}, callback=self.parse)
        else:
            for page in range(1, self.followers_page_upper + 1):
                for url in [self.followers_url_format.format(owner["mid"], page) for owner in self.main_focus]:
                    yield Request(url=url, callback=self.parseFollower)

    def parseFollower(self, response):
        # for every follower, look up their favorites folders
        self.check_close()
        if response.text:
            jsonData = json.loads(response.text)
            jsonList = jsonData["data"]["list"]
            if jsonList:
                for url in [self.get_fid_url_format.format(user["mid"]) for user in jsonList]:
                    yield Request(url=url, callback=self.parseFid)

    def parseFid(self, response):
        # take the follower's default (first) favorites folder
        self.check_close()
        if response.text:
            jsonData = json.loads(response.text)
            if jsonData["data"]:
                defaultDict = jsonData["data"][0]
                mid = defaultDict["mid"]
                fid = defaultDict["fid"]
                yield Request(url=self.user_favorites_url_format.format(mid, fid, 1),
                              meta={"page": 2, "mid": mid, "fid": fid}, callback=self.parseFuns)

    def parseFuns(self, response):
        # accumulate {aid, tname, title} per user and page onward
        self.check_close()
        jsonData = json.loads(response.text)
        jsonList = jsonData["data"]["archives"]
        if jsonList:
            SetRequire = set([])
            for archive in jsonList:
                SetRequire.add(str({"aid": archive["aid"], "tname": archive["tname"], "title": archive["title"]}))
            if self.funs_dict.get(response.meta["mid"]):
                self.funs_dict[response.meta["mid"]] = self.funs_dict[response.meta["mid"]].union(SetRequire)
            else:
                self.funs_dict[response.meta["mid"]] = SetRequire
            yield Request(url=self.user_favorites_url_format.format(response.meta["mid"], response.meta["fid"], response.meta["page"]),
                          callback=self.parseFuns,
                          meta={"page": response.meta["page"] + 1, "mid": response.meta["mid"], "fid": response.meta["fid"]})
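For completeness, a minimal runner sketch (spider_bilibili.py is an assumed filename for the class above, and the vmid/fid values are placeholders that must point at the target user and favorites folder):

# run_spider.py -- minimal runner (sketch; spider_bilibili.py is an assumed filename)
from scrapy.crawler import CrawlerProcess
from spider_bilibili import SpiderBiliBili

SpiderBiliBili.vmid = 19678608  # target user's mid (placeholder)
SpiderBiliBili.fid = 0          # target favorites-folder id (fill in)

process = CrawlerProcess(settings={"DOWNLOAD_DELAY": 1})  # throttle API requests
process.crawl(SpiderBiliBili)
process.start()  # blocks until CloseSpider fires or the crawl finishes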
One way to speed up an iterative graph model is to reduce the number of graph nodes it iterates over, so the spider above is set up to crawl only the favorites of users who follow the uploaders appearing in the recommendation target's favorites folder.
About 7,000 users' worth of data was crawled, from which 500 users were sampled out (small_users.json). (Since the crawled population all follows the same uploaders as the recommendation target, the sampling has little effect.)
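For reference, the sampling step can be as simple as the following sketch (users.json is the spider output above, and 500 is the sample size used here):

# sample_users.py -- draw 500 users from the full spider output (sketch)
import json
import random

with open("users.json", "r") as f:
    users = json.load(f)

sampled_mids = random.sample(list(users.keys()), 500)
with open("small_users.json", "w") as f:
    json.dump({mid: users[mid] for mid in sampled_mids}, f)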
Running the iterative graph model directly on the 7,000+ samples, you find that all of the probability mass gets transferred onto the items, leaving no usable description of user relevance.
This hints at a likely trait of iterative graph models:
when users and items are numerically unbalanced (here, 7,000+ users against 70-odd items), this kind of probability "dissipation" is likely, with the users condensing their mass onto the relatively dense set of items.
So it makes sense to filter the 500 users further to balance the data.
Consider using Affinity Propagation (AP) to cluster the 500 users and keep only those in the same cluster as the recommendation target.
The sampling above is tied to the use of AP: running AP directly on the 7,000+ users (even with hyperparameter tuning) converges poorly.
It either fails to converge, collapses sample points into singleton clusters, or lumps almost everything into a single cluster. Still, AP's relatively objective character (no need to fix the number of clusters in advance) is worth keeping.
The preprocessing step that applies AP ahead of the graph model goes as follows:
# coding: utf-8
import json
import pickle
import time
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import cosine_similarity, rbf_kernel

users_json_name = "small_users.json"
with open("target_name_set.pkl", "rb") as f:
    target_name_set = pickle.load(f)  # set of all tname categories
with open(users_json_name, "r", encoding="UTF-8") as f:
    users_data = json.load(f)
with open("me.json", "r", encoding="UTF-8") as f:
    me_data = json.load(f)  # the recommendation target's own favorites
key, value = list(me_data.items())[0]
users_data[key] = value
me_mid = 19678608

# feature-processing switches
logistic = True
logistic_alpha = 0.8
use_threshold = False
upper_threshold = 0.95
lower_threshold = 0.01

def process_by_threshold(array):
    # clip near-zero ratios to 0 and near-one ratios to 1
    listRequire = []
    for v in array:
        if v < lower_threshold:
            listRequire.append(0)
        elif v > upper_threshold:
            listRequire.append(1)
        else:
            listRequire.append(v)
    return np.asarray(listRequire, np.float32)

target_name_list = list(target_name_set)

def return_count_array(values):
    # count how many favorited videos fall into each tname category
    require_list = [0] * len(target_name_list)
    for value in values:
        target_index = target_name_list.index(value["tname"])
        require_list[target_index] += 1
    return np.asarray(require_list, np.float32)

users_dict_arrays = dict()
for k, v in users_data.items():
    count_array = return_count_array(v)
    if use_threshold:
        users_dict_arrays[k] = process_by_threshold(count_array / np.sum(count_array))
    elif logistic:
        users_dict_arrays[k] = 1 / (1 + logistic_alpha * np.exp(-1 * count_array / np.sum(count_array)))
    else:
        users_dict_arrays[k] = count_array / np.sum(count_array)

store_array = np.empty((len(users_dict_arrays), 1 + len(list(users_dict_arrays.values())[0])), dtype=object)
for index, (k, v) in enumerate(users_dict_arrays.items()):
    store_array[index] = np.asarray([k] + list(v), object)

users_tname_array = store_array[:, 1:]
users_tname_similarity = cosine_similarity(users_tname_array)
#users_tname_similarity = rbf_kernel(users_tname_array)

# cluster users by the similarity of their category distributions
clf = AffinityPropagation(damping=0.5, verbose=True)  # consider max_iter=1000
clf.fit(users_tname_similarity)
labels = clf.labels_
print(np.unique(labels, return_counts=True))

me_label = None
label_with_mid = dict()
for (index, label) in enumerate(labels):
    if str(store_array[index, 0]) == str(me_mid):
        me_label = label
    if label_with_mid.get(label):
        label_with_mid[label].append(store_array[index, 0])
    else:
        label_with_mid[label] = [store_array[index, 0]]

# keep only the users clustered together with the target,
# and switch to threshold processing for the graph stage
use_threshold = True
correlation_dict = dict([(uid, users_data[uid]) for uid in label_with_mid[me_label]])

serialize_json = False
if serialize_json:
    # export aid lists and all in-user aid pairs for the Spark GraphX job
    json_dict = dict([(uid, [v["aid"] for v in users_data[uid]]) for uid in label_with_mid[me_label]])
    with open("linear.json", "w") as f:
        json.dump(json_dict, f)
    from itertools import combinations
    with open("linear.json", "r") as f:
        json_dict = json.load(f)
    list_require = []
    for k, v in json_dict.items():
        list_require.extend([[tuple2[0], tuple2[1]] for tuple2 in combinations(v, 2)])
    with open("combine_linear.json", "w") as f:
        json.dump(list_require, f)

correlation_users_dict_arrays = dict()
for k, v in correlation_dict.items():
    count_array = return_count_array(v)
    if use_threshold:
        correlation_users_dict_arrays[k] = process_by_threshold(count_array / np.sum(count_array))
    elif logistic:
        correlation_users_dict_arrays[k] = 1 / (1 + logistic_alpha * np.exp(-1 * count_array / np.sum(count_array)))
    else:
        correlation_users_dict_arrays[k] = count_array / np.sum(count_array)

correlation_store_array = np.empty((len(correlation_users_dict_arrays),
                                    1 + len(list(correlation_users_dict_arrays.values())[0])), dtype=object)
for index, (k, v) in enumerate(correlation_users_dict_arrays.items()):
    correlation_store_array[index] = np.asarray([k] + list(v), object)

me_index = list(np.asarray(correlation_store_array[:, 0], np.int64)).index(me_mid)
correlation_users_tname_array = correlation_store_array[:, 1:]
correlation_users_tname_similarity = cosine_similarity(correlation_users_tname_array)
most_close_sort = np.argsort(correlation_users_tname_similarity[me_index])
print(list(np.asarray(correlation_store_array[:, 0], np.int64)[most_close_sort][::-1]))

def generate_graph_dict(store_array):
    # bipartite graph: user -> item edges weighted by processed ratios,
    # item -> user back edges with weight 1
    user_graph_dict = dict()
    for row in store_array:
        temp_dict = dict()
        for index, v in enumerate(row[1:]):
            if v > lower_threshold:
                temp_dict[str(index)] = v
        user_graph_dict[str(row[0])] = temp_dict
    item_graph_dict = dict()
    for mid, v in user_graph_dict.items():
        for item_id, vv in v.items():
            if vv > lower_threshold:
                if item_graph_dict.get(item_id):
                    item_graph_dict[item_id][mid] = 1
                else:
                    item_graph_dict[item_id] = {mid: 1}
    return dict(list(user_graph_dict.items()) + list(item_graph_dict.items()))

G = generate_graph_dict(correlation_store_array)

def PersonalRank(G, alpha, root, max_step):
    # random walk with restart from root, iterated max_step times
    rank = {x: 0 for x in G.keys()}
    rank[root] = 1
    begin = time.time()
    for k in range(max_step):
        tmp = {x: 0 for x in G.keys()}
        for i, ri in G.items():
            for j, wij in ri.items():
                tmp[j] += alpha * rank[i] / (1.0 * len(ri))
        tmp[root] += (1 - alpha)
        rank = tmp
    end = time.time()
    print('use time', end - begin)
    li = sorted(rank.items(), key=lambda variable: variable[1], reverse=True)
    for ele in li:
        print("%s:%.3f, \t" % (ele[0], ele[1]))
    return rank

alpha = 0.8
PersonalRank(G, alpha, str(me_mid), 100)
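Since the loop above does 100 full sweeps over the whole graph, it is worth noting (my addition, not part of the original post) that the same fixed point can be obtained in a single sparse solve: the iteration converges to r = (1 - alpha) (I - alpha M^T)^{-1} e_root, where M[i][j] = 1/out(i) for each edge i -> j. A sketch assuming scipy is available:

# personal_rank_solve: closed-form PersonalRank via one sparse linear solve (sketch)
import numpy as np
from scipy.sparse import lil_matrix, identity
from scipy.sparse.linalg import spsolve

def personal_rank_solve(G, alpha, root):
    nodes = list(G.keys())
    idx = {n: i for i, n in enumerate(nodes)}
    M = lil_matrix((len(nodes), len(nodes)))
    for i, ri in G.items():  # row-stochastic transition matrix of the walk
        for j in ri:
            M[idx[i], idx[j]] = 1.0 / len(ri)
    e_root = np.zeros(len(nodes))
    e_root[idx[root]] = 1.0
    A = identity(len(nodes), format="csr") - alpha * M.tocsr().T
    rank = spsolve(A.tocsc(), (1 - alpha) * e_root)
    return dict(zip(nodes, rank))

# should agree with PersonalRank(G, 0.8, str(me_mid), 100) up to convergence error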
With process_by_threshold applied in the first stage as well (logistic = False), the mid-related part of the output is:
13156831:0.045,3335043:0.044,7577870:0.043,14037227:0.037,99306016:0.033,8513717:0.032,34692687:0.029,111674358:0.012,
With logistic = True, i.e. using the logistic transform in the first (clustering) stage, the output becomes:
12777203:0.009,6518065:0.009,76843110:0.008,29936562:0.007,44476348:0.007,14037227:0.007,8406409:0.007,9060244:0.007,18653144:0.007,32372159:0.007,8016053:0.007,49959769:0.007,34118388:0.007,539619:0.006,25636327:0.006,21494757:0.006,99306016:0.006,30230802:0.006,689792:0.006,10695207:0.006,680012:0.006,4144022:0.006,16998987:0.006,7321936:0.006,26426749:0.006,20660706:0.005,345361:0.005,7773751:0.005,9969183:0.005,16864217:0.005,13692142:0.005,34801726:0.005,12743531:0.005,9025026:0.005,101336036:0.005,38857506:0.005,21738804:0.005,20822284:0.005,111144756:0.005,34692687:0.005,14981904:0.005,19959681:0.005,23798570:0.005,1814736:0.005,22318597:0.005,5655170:0.005,12134561:0.004,22949680:0.004,23187308:0.004,137769:0.004,68450848:0.004,14633020:0.004,29505433:0.004,85724722:0.004,109344934:0.004,60095442:0.003,12014777:0.003,11522571:0.003,22346648:0.003,23433997:0.003,30733158:0.003,4282763:0.003,25422512:0.003,11026519:0.003,19045421:0.003,28235287:0.002,59436778:0.002,
Setting serialize_json = True in the script above exports linear.json (each user's favorited aids) and combine_linear.json (every pair of aids co-occurring in one user's favorites); the latter feeds the following Spark GraphX PageRank job:
package Graph

import net.sf.json.JSONArray
import org.apache.log4j.{Level, Logger}
import org.apache.spark.graphx._
import org.apache.spark.sql.SparkSession
import scala.io.Source
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StructField, StructType, LongType, DoubleType}
import org.apache.spark.sql.Row
import scala.collection.JavaConversions._

/**
 * Created by dell on 2017/7/24.
 */
object pageRank_example {
  //Logger.getLogger("org").setLevel(Level.ERROR)
  val spark = SparkSession.builder().master("local[*]").appName("pagerank").getOrCreate()
  val sc = spark.sparkContext
  val json_path = "combine_linear.json"

  def main(args: Array[String]): Unit = {
    val json_array = JSONArray.fromObject(Source.fromFile(json_path).getLines().mkString)
    val nest_array = (0 until json_array.size()).map(json_array.getJSONArray).map {
      case inner_array: JSONArray => Array(inner_array.getLong(0), inner_array.getLong(1))
    }.toArray
    val edges = nest_array.map {
      case inner_array: Array[Long] => Edge(inner_array(0), inner_array(1), 1)
    }
    val users = nest_array.flatten.toSet
    val users_rdd: RDD[(VertexId, Long)] = sc.parallelize(
      users.toArray.map { case aid: Long => (aid, aid) }
    )
    val edges_rdd: RDD[Edge[Int]] = sc.parallelize(edges)
    val graph = Graph(users_rdd, edges_rdd)
    val ranks = graph.pageRank(0.0001).vertices
    //val ranks = graph.staticPageRank(1000).vertices
    val rows = ranks.toLocalIterator.map {
      case (id, rank) => Row(id.toLong, rank)
    }.toList
    val rank_struct = StructType(
      StructField("aid", LongType, false) ::
      StructField("rank", DoubleType, false) :: Nil
    )
    val rank_df = spark.createDataFrame(rows, rank_struct)
    rank_df.createGlobalTempView("rank_df")
    val rank_desc_df = spark.sql("SELECT * FROM global_temp.rank_df ORDER BY rank DESC")
    rank_desc_df.show()
  }
}
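Two remarks on the GraphX calls: graph.pageRank(0.0001) is the dynamic variant, iterating until the ranks converge to within the 0.0001 tolerance, while the commented-out staticPageRank(1000) would run a fixed 1000 iterations regardless of convergence. Intuitively, videos that co-occur in many users' favorites pick up many edges here and therefore accumulate rank.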
In the threshold-only setting, the aid PageRank comes out as follows:
+--------+------------------+
|     aid|              rank|
+--------+------------------+
| 1543810|36.857786196129645|
| 1137202|25.437858061268024|
| 2550973|19.923127673583593|
| 4014829|16.237562202477054|
| 4873874|13.981142227076205|
|  866155|13.821319835630888|
|10307977|13.750193546631364|
| 3865391|10.894418368425686|
| 6164935|10.009169606141201|
|  836461| 9.649273705397425|
| 5903999|  8.98509256107174|
| 6665034| 8.777060649987597|
| 2301501| 7.679566291514307|
|11963092| 7.624034734532023|
| 3289154| 7.471009479125614|
| 2642350| 6.726639466091617|
| 1171410| 6.516281094515306|
|10099135| 6.159356876775848|
| 1163949|5.9982829135971905|
| 2340796| 5.707696215674478|
+--------+------------------+
This can serve as one form of recommendation result over av-numbered videos.
Substituting the first column into
https://www.bilibili.com/video/av{}/
yields the video links.
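A trivial loop makes the substitution explicit (the aids here are just the top rows of the table above):

# build watchable links from the top-ranked aids (taken from the table above)
top_aids = [1543810, 1137202, 2550973]
for aid in top_aids:
    print("https://www.bilibili.com/video/av{}/".format(aid))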
It should be pointed out that this only sketches a fairly rough approach: in the logistic setting, the initial data volume is too large and sampling becomes unavoidable.