UBM - Implementation by Spark/Scala
来源:互联网 发布:孚盟软件logo 编辑:程序博客网 时间:2024/04/29 23:06
For the modelling details, see "Derivation of UBM".
Local version
The idea of updating the numerator and the denominator separately is borrowed from varepsilon @ Yandex, who implemented it in Python.
/**
 * Single-machine EM trainer for the UBM click model.
 *
 * Parameters are held in dense arrays that are sized once, at construction time, from the
 * `max_*` / `browsingModes` fields below:
 *  - `alpha(u)(q)`    : per (url, query) parameter, initialised to 0.5
 *  - `gamma(r)(d)(m)` : per (rank, distance, browsing mode) parameter, 0.5 where `d <= r`
 *  - `mu(q)(m)`       : per (query, browsing mode) weight, initialised uniformly
 *
 * Every update accumulates a numerator and a denominator separately and divides at the end.
 */
class LocalModeler {
  var browsingModes = 1
  var max_urls = 4000
  var max_queries = 500
  // NOTE(review): the value was lost in the published listing; 10 is an assumption
  // (a typical result-page size) -- confirm against the original source.
  var max_url_per_query = 10

  // u -> q -> alpha
  val alpha = Array.tabulate(max_urls, max_queries) { (u, q) => 0.5 }
  // r -> d -> m -> gamma
  val gamma = Array.tabulate(max_url_per_query, max_url_per_query, browsingModes) { (r, d, m) =>
    if (d <= r) 0.5 else 0.0
  }
  // q -> m -> mu
  val mu = Array.tabulate(max_queries, browsingModes) { (q, m) => 1.0 / browsingModes }

  /**
   * Annotates each position of a click vector with its rank and its distance to the
   * previous click.
   *
   * @param cs click flags in rank order
   * @return one `(clicked, rank, distance)` triple per position, where
   *         `distance = rank - previousClickRank - 1` and the previous click rank is -1
   *         until the first click has been seen
   */
  def click2distance(cs: Seq[Boolean]): Seq[(Boolean, Int, Int)] = {
    var pre_click = -1
    cs.zipWithIndex.map { case (c, r) =>
      val d = r - pre_click - 1
      if (c) pre_click = r
      (c, r, d)
    }
  }

  /**
   * Runs `maxIter` EM iterations over the given sessions, updating `alpha`, `gamma` and
   * `mu` in place.
   *
   * @param sessions one `(query, urls, clicks)` triple per session; `urls` and `clicks`
   *                 are zipped position by position
   * @param maxIter  number of iterations to run
   * @return the (mutated) `alpha`, `gamma` and `mu` arrays of this instance
   */
  def train(sessions: Seq[(Int, Seq[Int], Seq[Boolean])], maxIter: Int)
      : (Array[Array[Double]], Array[Array[Array[Double]]], Array[Array[Double]]) = {
    // Collapse identical observations into (observation, count) pairs. The result must be
    // a Seq: the per-observation updates below are keyed by (q, u, r, d) only, so keeping
    // a Map would silently merge the clicked and the unclicked variant of the same key.
    val data = sessions.flatMap { case (q, url, click) =>
      val distance = click2distance(click)
      url.zip(distance).map { case (u, (c, r, d)) => (q, u, r, d, c) }
    }.groupBy(identity).map { case (obs, group) => (obs, group.length) }.toSeq

    for (i <- 0 until maxIter) {
      // Per-observation (numerator, denominator) contributions, all computed from the
      // parameters of the previous iteration before any of them is overwritten.
      val updates = data.map { case ((q, u, r, d, c), cnt) =>
        val alpha_uq = alpha(u)(q)
        val gamma_rd = gamma(r)(d)
        val mu_q = mu(q)
        val mu_gamma = mu_q.zip(gamma_rd).map { case (x, y) => x * y }
        val dot_prod_mu_gamma = mu_gamma.sum

        val Q_m_a1_e1_c1 = mu_gamma.map { _ / dot_prod_mu_gamma }
        val Q_m_e1_c1 = Q_m_a1_e1_c1
        val Q_m_c1 = Q_m_a1_e1_c1
        val Q_a1_c1 = 1.0
        val Q_a1_c0 = alpha_uq * (1 - dot_prod_mu_gamma) / (1 - alpha_uq * dot_prod_mu_gamma)
        val Q_m_e1_c0 = mu_gamma.map { _ * (1 - alpha_uq) / (1 - alpha_uq * dot_prod_mu_gamma) }
        val Q_m_c0 = gamma_rd.map { gamma_rdm => 1 - alpha_uq * gamma_rdm }.zip(mu_q).map {
          case (x, y) => x * y / (1 - alpha_uq * dot_prod_mu_gamma)
        }

        val alpha_fraction =
          if (c) (Q_a1_c1 * cnt, cnt) else (Q_a1_c0 * cnt, cnt)
        val gamma_fraction =
          if (c) Q_m_e1_c1.map { _ * cnt }.zip(Q_m_c1.map { _ * cnt })
          else Q_m_e1_c0.map { _ * cnt }.zip(Q_m_c0.map { _ * cnt })
        val mu_fraction =
          if (c) Q_m_c1.map { q_m_c => (q_m_c * cnt, cnt) }
          else Q_m_c0.map { q_m_c => (q_m_c * cnt, cnt) }

        ((q, u, r, d), (alpha_fraction, gamma_fraction, mu_fraction))
      }

      // update alpha
      updates.groupBy { case ((q, u, _, _), _) => (u, q) }.foreach { case ((u, q), group) =>
        val (num, den) = group.map { _._2._1 }.reduce[(Double, Int)] {
          case (x, y) => (x._1 + y._1, x._2 + y._2)
        }
        alpha(u)(q) = num / den
      }

      // update gamma
      updates.groupBy { case ((_, _, r, d), _) => (r, d) }.foreach { case ((r, d), group) =>
        val sums = group.map { _._2._2 }.reduce[Array[(Double, Double)]] { case (xs, ys) =>
          xs.zip(ys).map { case (x, y) => (x._1 + y._1, x._2 + y._2) }
        }
        sums.zipWithIndex.foreach { case ((num, den), m) =>
          // A zero denominator would store NaN and poison later iterations; keep the
          // previous value instead.
          if (den > 0.0) gamma(r)(d)(m) = num / den
        }
      }

      // update mu
      updates.groupBy { case ((q, _, _, _), _) => q }.foreach { case (q, group) =>
        val sums = group.map { _._2._3 }.reduce[Array[(Double, Int)]] { case (xs, ys) =>
          xs.zip(ys).map { case (x, y) => (x._1 + y._1, x._2 + y._2) }
        }
        sums.zipWithIndex.foreach { case ((num, den), m) => mu(q)(m) = num / den }
      }
    }

    (alpha, gamma, mu)
  }
}
Distributed version
Updating α is inconvenient with Graph.aggregateMessages, which is why this version is not implemented using GraphX.
/**
 * Distributed EM trainer for the UBM click model on Spark RDDs.
 *
 * `alpha` (per query/url) and `mu` (per query) are kept as RDDs; `gamma` (per
 * rank/distance) is collected to the driver and shipped to the executors as a broadcast
 * variable that is refreshed after every iteration.
 */
object SparkUBModeler extends Logging {

  /** Training configuration; `maxIter` and `minDelta` are the two stopping criteria. */
  case class Conf(
    max_queries: Long,
    max_url_per_query: Int,
    browsingModes: Int,
    maxIter: Int,
    minDelta: Double,
    numPartitions: Int
  )

  /** Attaches the current `alpha` and `mu` values to every training record, keyed by query. */
  private def joinParams(
      train_data: RDD[((Long, Long, Int, Int), Seq[(Boolean, Int)])],
      alpha: RDD[((Long, Long), Double)],
      mu: RDD[(Long, Array[Double])]
  ): RDD[(Long, ((Long, Int, Int, Seq[(Boolean, Int)], Double), Array[Double]))] =
    train_data.map { case ((q, u, r, d), cnts) =>
      ((q, u), (r, d, cnts))
    }.join(alpha).map { case ((q, u), ((r, d, cnts), alpha_qu)) =>
      (q, (u, r, d, cnts, alpha_qu))
    }.join(mu)

  /**
   * Runs EM until `conf.maxIter` iterations have been executed or the largest absolute
   * parameter change of an iteration drops to `conf.minDelta` or below.
   *
   * @param train_data records keyed by `(query, url, rank, distance)`; each value is a list
   *                   of `(clicked, count)` pairs for that key. The RDD is traversed several
   *                   times per iteration and is not cached here.
   * @param conf       training configuration
   * @return a [[UBMModel]] holding the final `gamma` map and the (cached) `alpha` / `mu` RDDs
   */
  def train(train_data: RDD[((Long, Long, Int, Int), Seq[(Boolean, Int)])], conf: Conf): UBMModel = {
    // logInfo("train data: " + train_data.count())
    println("train data: " + train_data.count())
    val Conf(max_queries, max_url_per_query, browsingModes, maxIter, minDelta, numPartitions) = conf
    val sc = train_data.context

    // gamma lives on the driver; a fresh immutable map is built per iteration so a value
    // that has already been broadcast is never mutated behind the executors' back.
    var gamma: Map[(Int, Int), Array[Double]] = train_data.map {
      case ((q, u, r, d), _) => (r, d)
    }.distinct().collect().map {
      case (r, d) => ((r, d), Array.fill(browsingModes)(0.5))
    }.toMap
    var gamma_br = sc.broadcast(gamma)

    var alpha = train_data.map {
      case ((q, u, r, d), _) => (q, u)
    }.distinct(numPartitions).map { (_, 0.5) }.cache()
    var mu = train_data.map { _._1._1 }.distinct().map { q =>
      (q, Array.tabulate(browsingModes) { _ => 1.0 / browsingModes })
    }.cache()

    var delta = Double.PositiveInfinity
    var joined_data = joinParams(train_data, alpha, mu)

    var i = 0
    while (i < maxIter && delta > minDelta) {
      // Capture the broadcast in a val so the task closure does not hold on to the var.
      val current_gamma_br = gamma_br

      // Per-record (numerator, denominator) contributions for all three parameter sets.
      val updates = joined_data.flatMap { case (q, ((u, r, d, cnts, alpha_qu), mu_q)) =>
        val gamma_rd = current_gamma_br.value((r, d))
        val mu_gamma = mu_q.zip(gamma_rd).map { case (x, y) => x * y }
        val dot_prod_mu_gamma = mu_gamma.sum

        val Q_m_a1_e1_c1 = mu_gamma.map { _ / dot_prod_mu_gamma }
        val Q_m_e1_c1 = Q_m_a1_e1_c1
        val Q_m_c1 = Q_m_a1_e1_c1
        val Q_a1_c1 = 1.0
        val Q_a1_c0 = alpha_qu * (1 - dot_prod_mu_gamma) / (1 - alpha_qu * dot_prod_mu_gamma)
        val Q_m_e1_c0 = mu_gamma.map { _ * (1 - alpha_qu) / (1 - alpha_qu * dot_prod_mu_gamma) }
        val Q_m_c0 = gamma_rd.map { gamma_rdm => 1 - alpha_qu * gamma_rdm }.zip(mu_q).map {
          case (x, y) => x * y / (1 - alpha_qu * dot_prod_mu_gamma)
        }

        cnts.map { case (c, cnt) =>
          val alpha_fraction =
            if (c) (Q_a1_c1 * cnt, cnt) else (Q_a1_c0 * cnt, cnt)
          val gamma_fraction =
            if (c) Q_m_e1_c1.map { _ * cnt }.zip(Q_m_c1.map { _ * cnt })
            else Q_m_e1_c0.map { _ * cnt }.zip(Q_m_c0.map { _ * cnt })
          val mu_fraction =
            if (c) Q_m_c1.map { q_m_c => (q_m_c * cnt, cnt) }
            else Q_m_c0.map { q_m_c => (q_m_c * cnt, cnt) }
          ((q, u, r, d), (alpha_fraction, gamma_fraction, mu_fraction))
        }
      }.cache()

      // update alpha
      val new_alpha = updates.map { case ((q, u, r, d), fractions) =>
        ((q, u), fractions._1)
      }.reduceByKey { case (lhs, rhs) =>
        (lhs._1 + rhs._1, lhs._2 + rhs._2)
      }.mapValues { case (num, den) => num / den }.cache()
      val delta_alpha = alpha.join(new_alpha).values.map { case (x, y) => math.abs(x - y) }.max()

      // update mu
      val new_mu = updates.map { case ((q, u, r, d), fractions) =>
        (q, fractions._3)
      }.reduceByKey { case (x, y) =>
        x.zip(y).map { case (lhs, rhs) => (lhs._1 + rhs._1, lhs._2 + rhs._2) }
      }.mapValues { _.map { case (num, den) => num / den } }.cache()
      val delta_mu = mu.join(new_mu).values.map { case (lhs, rhs) =>
        lhs.zip(rhs).map { case (x, y) => math.abs(x - y) }.max
      }.max()
      delta = math.max(delta_alpha, delta_mu)

      // update gamma
      val gamma_sums = updates.map { case ((q, u, r, d), fractions) =>
        ((r, d), fractions._2)
      }.reduceByKey { case (x, y) =>
        x.zip(y).map { case (lhs, rhs) => (lhs._1 + rhs._1, lhs._2 + rhs._2) }
      }.collect()
      val new_gamma = gamma_sums.map { case (rd, sums) =>
        val old_rd = gamma(rd)
        val fresh_rd = sums.zipWithIndex.map { case ((num, den), m) =>
          // A zero denominator would yield NaN; keep the previous value instead.
          if (den > 0.0) num / den else old_rd(m)
        }
        fresh_rd.zip(old_rd).foreach { case (x, y) =>
          delta = math.max(delta, math.abs(y - x))
        }
        (rd, fresh_rd)
      }
      gamma = gamma ++ new_gamma

      // Publish the new gamma and release everything that belongs to the old iteration,
      // including the superseded broadcast (otherwise one copy per iteration piles up
      // on the executors).
      val stale_gamma_br = gamma_br
      gamma_br = sc.broadcast(gamma)
      updates.unpersist()
      stale_gamma_br.unpersist()
      alpha.unpersist()
      mu.unpersist()
      joined_data.unpersist()

      alpha = new_alpha
      mu = new_mu
      joined_data = joinParams(train_data, alpha, mu).cache()

      val fresh_gamma_br = gamma_br
      val perplexity = joined_data.flatMap { case (q, ((u, r, d, cnts, alpha_qu), mu_q)) =>
        val gamma_rd = fresh_gamma_br.value((r, d))
        cnts.map { case (c, cnt) =>
          val p_c1 = alpha_qu * gamma_rd.zip(mu_q).map { case (x, y) => x * y }.sum
          (if (c) -cnt * log2(p_c1) else -cnt * log2(1 - p_c1), cnt)
        }
      }.reduce { (x, y) => (x._1 + y._1, x._2 + y._2) }

      // logInfo(f"iteration $i: delta = $delta%.6f")
      println(f"iteration $i: delta = $delta%.6f, " +
        f"perplexity = ${math.pow(2, perplexity._1 / perplexity._2)}%.6f")

      i += 1
    }

    // The joined view is only needed while iterating; alpha and mu stay cached for the
    // returned model.
    joined_data.unpersist()

    val model = new UBMModel(max_queries, max_url_per_query, browsingModes)
    model.gamma = Some(gamma)
    model.alpha = Some(alpha)
    model.mu = Some(mu)
    model
  }

  /** Base-2 logarithm. */
  def log2(x: Double): Double = math.log(x) / math.log(2)

  /**
   * Container for trained parameters; each field is `None` until `train` fills it in.
   *
   * @param max_queries       copied from [[Conf]]
   * @param max_url_per_query copied from [[Conf]]
   * @param browsingModes     number of browsing modes, i.e. the length of every `gamma`
   *                          and `mu` array
   */
  class UBMModel(
    val max_queries: Long,
    val max_url_per_query: Int,
    val browsingModes: Int = 1) {
    var gamma: Option[Map[(Int, Int), Array[Double]]] = None
    var alpha: Option[RDD[((Long, Long), Double)]] = None
    var mu: Option[RDD[(Long, Array[Double])]] = None
  }
}
0 0
- UBM - Implementation by Spark/Scala
- PageRank Spark implementation
- spark,scala
- spark implementation hadoop setup,cleanup
- scala 开发spark程序
- scala for spark
- scala spark开发模式
- spark scala wordcout
- Scala and Spark Begin
- Spark+Scala课程包
- 初学spark--scala--45
- Spark与Scala学习
- 实战Scala & Spark (1)
- 实战Scala & Spark (2)
- 实战Scala & Spark (3)
- Scala && Spark 安装
- Spark Scala 范例
- Scala Spark 求众数
- 如何在Windows下利用Oracle VM VirtualBox虚拟机下安装linux
- 在jsp中如何实现网页的自动更新
- 2015-08-24 part2 bootloader前期准备
- Opencv 16位深度图片显示并保存
- 6 ZigZag Conversion
- UBM - Implementation by Spark/Scala
- 【NOIP2008TG/codevs1169】 传纸条 解题报告
- kafka集群搭建和使用Java写kafka生产者消费者
- 反渗透设备:反渗透水处理设备特点介绍
- HTTP状态码(HTTP Status Code)
- C#中泛型List的定义与用法以及常用函数
- 操作系统几个概念
- struts2学习笔记——03
- 最老程序员创业开发实训1---Android应用架构之MVC