Spark组件之GraphX学习10--PageRank学习和使用（From examples）

来源：互联网发布：淘宝科技有限公司编辑：程序博客网时间：2024/05/18 04:13

更多代码请见：https://github.com/xubo245/SparkLearning

1.解释：

PageRank 的原理在参考【3】中讲得很详细，其中也包括 MapReduce 下的实现方式。

源码：

  /**
   * Computes PageRank dynamically by delegating to `PageRank.runUntilConvergence` on this graph.
   *
   * @param tol the convergence tolerance forwarded to `PageRank.runUntilConvergence`
   * @param resetProb the random reset probability forwarded to the same call; defaults to 0.15
   * @return a graph whose vertex attributes contain the PageRank and whose edge attributes
   *         contain the normalized edge weight
   *
   * @see [[org.apache.spark.graphx.lib.PageRank$#runUntilConvergence]]
   */
  def pageRank(tol: Double, resetProb: Double = 0.15): Graph[Double, Double] =
    PageRank.runUntilConvergence(graph, tol, resetProb)

调用了：

/**
 * Run a dynamic version of PageRank returning a graph with vertex attributes containing the
 * PageRank and edge attributes containing the normalized edge weight.
 *
 * This is a convenience overload that calls [[runUntilConvergenceWithOptions]] without a
 * source vertex.
 *
 * @tparam VD the original vertex attribute (not used)
 * @tparam ED the original edge attribute (not used)
 *
 * @param graph the graph on which to compute PageRank
 * @param tol the tolerance allowed at convergence (smaller => more accurate); must be >= 0
 * @param resetProb the random reset probability (alpha); must be in [0, 1]
 *
 * @return the graph with each vertex containing the PageRank and each edge
 *         containing the normalized weight.
 * @throws IllegalArgumentException if `tol` or `resetProb` is out of range
 */
def runUntilConvergence[VD: ClassTag, ED: ClassTag](
    graph: Graph[VD, ED], tol: Double, resetProb: Double = 0.15): Graph[Double, Double] = {
  runUntilConvergenceWithOptions(graph, tol, resetProb)
}

/**
 * Run a dynamic version of PageRank returning a graph with vertex attributes containing the
 * PageRank and edge attributes containing the normalized edge weight.
 *
 * @tparam VD the original vertex attribute (not used)
 * @tparam ED the original edge attribute (not used)
 *
 * @param graph the graph on which to compute PageRank
 * @param tol the tolerance allowed at convergence (smaller => more accurate); must be >= 0
 * @param resetProb the random reset probability (alpha); must be in [0, 1]
 * @param srcId the source vertex for a Personalized Page Rank (optional)
 *
 * @return the graph with each vertex containing the PageRank and each edge
 *         containing the normalized weight.
 * @throws IllegalArgumentException if `tol` or `resetProb` is out of range
 */
def runUntilConvergenceWithOptions[VD: ClassTag, ED: ClassTag](
    graph: Graph[VD, ED], tol: Double, resetProb: Double = 0.15,
    srcId: Option[VertexId] = None): Graph[Double, Double] = {
  // A negative tolerance makes the `delta > tol` test in sendMessage true forever, so the
  // computation would never stop sending messages; reject it up front.
  require(tol >= 0, s"Tolerance must be no less than 0, but got $tol")
  require(resetProb >= 0 && resetProb <= 1,
    s"Random reset probability must belong to [0, 1], but got $resetProb")

  // Initialize the pagerankGraph with each edge attribute having weight 1/outDegree
  // and each vertex with attribute (0.0, 0.0).
  val pagerankGraph: Graph[(Double, Double), Double] = graph
    // Associate the out-degree with each vertex (0 when the vertex has no entry in outDegrees)
    .outerJoinVertices(graph.outDegrees) {
      (vid, vdata, deg) => deg.getOrElse(0)
    }
    // Set the weight on the edges based on the degree
    .mapTriplets( e => 1.0 / e.srcAttr )
    // Set the vertex attributes to (initialPR, delta = 0)
    .mapVertices( (id, attr) => (0.0, 0.0) )
    .cache()

  val personalized = srcId.isDefined
  // -1L is the placeholder compared against vertex ids when no source vertex was given.
  val src: VertexId = srcId.getOrElse(-1L)

  // Define the three functions needed to implement PageRank in the GraphX
  // version of Pregel

  // Adds the damped message sum to the previous rank; the second tuple element is the
  // change in rank, which sendMessage compares against tol.
  def vertexProgram(id: VertexId, attr: (Double, Double), msgSum: Double): (Double, Double) = {
    val oldPR = attr._1
    val newPR = oldPR + (1.0 - resetProb) * msgSum
    (newPR, newPR - oldPR)
  }

  // Personalized variant: only the source vertex carries its previous rank forward;
  // every other vertex starts each update from 0.
  def personalizedVertexProgram(id: VertexId, attr: (Double, Double),
    msgSum: Double): (Double, Double) = {
    val oldPR = attr._1
    val delta = if (src == id) 1.0 else 0.0
    val teleport = oldPR * delta
    val newPR = teleport + (1.0 - resetProb) * msgSum
    (newPR, newPR - oldPR)
  }

  // Forward the source's rank change along the edge, scaled by the edge weight, but only
  // while that change still exceeds the tolerance.
  def sendMessage(edge: EdgeTriplet[(Double, Double), Double]): Iterator[(VertexId, Double)] = {
    if (edge.srcAttr._2 > tol) {
      Iterator((edge.dstId, edge.srcAttr._2 * edge.attr))
    } else {
      Iterator.empty
    }
  }

  def messageCombiner(a: Double, b: Double): Double = a + b

  // The initial message received by all vertices in PageRank
  val initialMessage = resetProb / (1.0 - resetProb)

  // Execute a dynamic version of Pregel.
  val vp = if (personalized) {
    (id: VertexId, attr: (Double, Double), msgSum: Double) =>
      personalizedVertexProgram(id, attr, msgSum)
  } else {
    (id: VertexId, attr: (Double, Double), msgSum: Double) =>
      vertexProgram(id, attr, msgSum)
  }

  Pregel(pagerankGraph, initialMessage, activeDirection = EdgeDirection.Out)(
    vp, sendMessage, messageCombiner)
    // Keep only the rank, dropping the delta used for the convergence test
    .mapVertices((vid, attr) => attr._1)
} // end of runUntilConvergenceWithOptions

2.代码：

/**
 * @author xubo
 * ref http://spark.apache.org/docs/1.5.2/graphx-programming-guide.html
 * time 20160503
 */
package org.apache.spark.graphx.learning

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.Graph.graphToGraphOps
import org.apache.spark.graphx.VertexId
import org.apache.spark.graphx.util.GraphGenerators
import org.apache.spark.graphx.GraphLoader

/**
 * Example program: loads a follower graph from an edge-list file, runs PageRank on it,
 * joins the ranks with user names and prints one (username, rank) pair per line.
 */
object PageRank {

  /**
   * Runs the example on a local 4-thread Spark master.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("PageRank").setMaster("local[4]")
    val sc = new SparkContext(conf)

    // Stop the context even when the job fails, so it is not left running.
    try {
      // Load the edges as a graph
      val graph = GraphLoader.edgeListFile(sc, "file/data/graphx/input/followers.txt")

      // Run PageRank
      val ranks = graph.pageRank(0.0001).vertices

      // Join the ranks with the usernames; each users.txt line is "id,username[,...]"
      val users = sc.textFile("file/data/graphx/input/users.txt").map { line =>
        val fields = line.split(",")
        (fields(0).toLong, fields(1))
      }
      val ranksByUsername = users.join(ranks).map {
        case (id, (username, rank)) => (username, rank)
      }

      // Print the result
      println(ranksByUsername.collect().mkString("\n"))
    } finally {
      sc.stop()
    }
  }
}

直接调用

    val ranks = graph.pageRank(0.0001).vertices

就行了。0.0001 是收敛容差（tol）：某个顶点前后两次迭代的 PageRank 变化量不超过该阈值时，它就不再向邻居发送消息；当所有顶点都不再发送消息时计算结束。该值越小，精度越高。

数据：

followers.txt：

2 1
4 1
1 2
6 3
7 3
7 6
6 7
3 7

users.txt：

1,BarackObama,Barack Obama
2,ladygaga,Goddess of Love
3,jeresig,John Resig
4,justinbieber,Justin Bieber
6,matei_zaharia,Matei Zaharia
7,odersky,Martin Odersky
8,anonsys

图：

<img src="http://img.blog.csdn.net/20160504185030083?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQv/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/Center" alt="" />

3.结果：

(justinbieber,0.15)
(matei_zaharia,0.7013599933629602)
(ladygaga,1.390049198216498)
(BarackObama,1.4588814096664682)
(jeresig,0.9993442038507723)
(odersky,1.2973176314422592)

第8个顶点（anonsys）没有出现在 followers.txt 的任何一条边中，因此它不在图里，也就没有对应的 rank；而 users.join(ranks) 是内连接，所以结果中没有它。

参考

【1】 http://spark.apache.org/docs/1.5.2/graphx-programming-guide.html

【2】https://github.com/xubo245/SparkLearning

【3】http://blog.jobbole.com/71431/

0 0