Spark GraphX相关使用方法
来源:互联网 发布:淘宝网店助理 编辑:程序博客网 时间:2024/06/05 18:26
Spark GraphX是一个分布式图处理框架,它基于Spark平台,为图计算和图挖掘提供了简洁易用且丰富的接口,极大地满足了大家对分布式图处理的需求。由于底层基于Spark,Spark GraphX天然就是一个分布式的图处理系统。图的分布式(并行)处理,其实就是把一张图拆分成很多子图,然后分别对这些子图进行计算;计算时可以分阶段迭代进行,即对图进行并行计算。
Spark GraphX基本操作:
- import org.apache.spark.SparkContext
- import org.apache.spark._
- import org.apache.spark.graphx._
- import org.apache.spark.graphx.Graph
- import org.apache.spark.graphx.Edge
- import org.apache.spark.graphx.VertexRDD
- import org.apache.spark.graphx.util.GraphGenerators
- import org.apache.spark.graphx.GraphLoader
- import org.apache.spark.storage.StorageLevel
- import org.apache.spark.rdd.RDD
/**
 * Walk-through of the basic GraphX operations: building a property graph,
 * filtering vertices/edges, triplets, degrees, subgraphs, structural and
 * join operators, caching, loading an edge list with GraphLoader, and the
 * built-in PageRank / ConnectedComponents / TriangleCount algorithms.
 */
object SparkGraphx1 {

  /**
   * Runs every example in sequence against the cluster master configured below.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("spark://centos.host1:7077", "Spark Graphx")
    // Stop the context even when one of the jobs below throws.
    try {
      // Vertex RDD: (id, (name, position)).
      val users: RDD[(VertexId, (String, String))] = sc.parallelize(Array(
        (3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")),
        (5L, ("franklin", "prof")), (2L, ("istoica", "prof"))))
      // Edge RDD: the edge attribute is the relationship name.
      val relationships: RDD[Edge[String]] = sc.parallelize(Array(
        Edge(3L, 7L, "collab"), Edge(5L, 3L, "advisor"),
        Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi")))
      // Default attribute for vertices referenced by an edge but missing from `users`.
      val defaultUser = ("John Doe", "Missing")
      // Build the property graph.
      val graph = Graph(users, relationships, defaultUser)

      // Filtering the vertex RDD and the edge RDD.
      val fcount1 = graph.vertices.filter { case (_, (_, pos)) => pos == "postdoc" }.count
      println(s"postdocs users count: $fcount1")
      val fcount2 = graph.edges.filter(edge => edge.srcId > edge.dstId).count
      println(s"srcId > dstId edges count: $fcount2")
      // Same predicate as fcount2, written with the Edge extractor.
      val fcount3 = graph.edges.filter { case Edge(src, dst, _) => src > dst }.count
      println(s"srcId > dstId edges count: $fcount3")

      // Triplets carry source id/attribute, destination id/attribute and the edge attribute.
      val triplets: RDD[String] = graph.triplets.map { t =>
        s"${t.srcId}-${t.srcAttr._1}-${t.attr}-${t.dstId}-${t.dstAttr._1}"
      }
      triplets.collect().foreach(println(_))

      // Degree, in-degree and out-degree of every vertex.
      val degrees: VertexRDD[Int] = graph.degrees
      degrees.collect().foreach(println)
      val inDegrees: VertexRDD[Int] = graph.inDegrees
      inDegrees.collect().foreach(println)
      val outDegrees: VertexRDD[Int] = graph.outDegrees
      outDegrees.collect().foreach(println)

      // Subgraph that drops the vertices filled in with the default ("Missing") attribute.
      val subGraph = graph.subgraph(vpred = (id, attr) => attr._2 != "Missing")
      subGraph.vertices.collect().foreach(println(_))
      subGraph.triplets
        .map(t => s"${t.srcAttr._1} is the ${t.attr} of ${t.dstAttr._1}")
        .collect().foreach(println(_))

      // Map operators derive a new graph with the same structure. The two graphs
      // below are logically equivalent, but the GraphX guide notes that the first
      // form does not preserve the structural indices and so misses GraphX's
      // optimizations.
      val newVertices = graph.vertices.map { case (id, attr) => (id, (attr._1 + "-1", attr._2 + "-2")) }
      val newGraph1 = Graph(newVertices, graph.edges)
      // mapVertices must return only the new attribute; returning (id, attr) here
      // would nest the id inside the attribute and break the equivalence.
      val newGraph2: Graph[(String, String), String] =
        graph.mapVertices((id, attr) => (attr._1 + "-1", attr._2 + "-2"))

      // New graph whose vertex attribute is the out-degree (0 when the vertex has none).
      val inputGraph: Graph[Int, String] =
        graph.outerJoinVertices(graph.outDegrees)((vid, _, degOpt) => degOpt.getOrElse(0))
      // PageRank-style initialisation: edge weight 1/outDegree(src), vertex value 1.0.
      // The source of a triplet always has out-degree >= 1, so the division is safe.
      val outputGraph: Graph[Double, Double] =
        inputGraph.mapTriplets(triplet => 1.0 / triplet.srcAttr).mapVertices((id, _) => 1.0)

      // reverse flips every edge direction; vertex/edge attributes and the number
      // of edges are unchanged.
      val rGraph = graph.reverse

      // mask restricts a graph to the vertices and edges also present in another graph:
      // here the connected-components result is limited to the valid-user subgraph.
      val ccGraph = graph.connectedComponents()
      val validCCGraph = ccGraph.mask(subGraph)

      // Join: outer-join the out-degrees onto the graph; vertices without
      // outgoing edges get 0.
      val degreeGraph2 = graph.outerJoinVertices(outDegrees) { (id, attr, outDegreeOpt) =>
        outDegreeOpt.getOrElse(0)
      }

      // Caching. The original article notes that cached graphs are evicted under
      // memory pressure in LRU order.
      graph.cache()
      graph.persist(StorageLevel.MEMORY_ONLY)
      graph.unpersistVertices(blocking = true)

      // Build a graph from an edge-list file with GraphLoader.
      val path = "/user/hadoop/data/temp/graph/graph.txt"
      val minEdgePartitions = 1
      // When true, edges are oriented so that srcId < dstId.
      val canonicalOrientation = false
      val graph1 = GraphLoader.edgeListFile(sc, path, canonicalOrientation, minEdgePartitions,
        StorageLevel.MEMORY_ONLY, StorageLevel.MEMORY_ONLY)
      val verticesCount = graph1.vertices.count
      println(s"verticesCount: $verticesCount")
      graph1.vertices.collect().foreach(println)
      val edgesCount = graph1.edges.count
      println(s"edgesCount: $edgesCount")
      graph1.edges.collect().foreach(println)

      // PageRank, run until the given tolerance is reached.
      val pageRankGraph = graph1.pageRank(0.001)
      pageRankGraph.vertices.sortBy(_._2, ascending = false)
        .saveAsTextFile("/user/hadoop/data/temp/graph/graph.pr")
      pageRankGraph.vertices.top(5)(Ordering.by(_._2)).foreach(println)

      // Connected components.
      val connectedComponentsGraph = graph1.connectedComponents()
      connectedComponentsGraph.vertices.sortBy(_._2, ascending = false)
        .saveAsTextFile("/user/hadoop/data/temp/graph/graph.cc")
      connectedComponentsGraph.vertices.top(5)(Ordering.by(_._2)).foreach(println)

      // Triangle counting (commonly used for community detection). Per the GraphX
      // guide it needs canonically oriented edges (srcId < dstId) and a graph
      // partitioned with Graph.partitionBy.
      val graph2 = GraphLoader.edgeListFile(sc, path, canonicalOrientation = true)
        .partitionBy(PartitionStrategy.RandomVertexCut)
      val triangleCountGraph = graph2.triangleCount()
      triangleCountGraph.vertices.sortBy(_._2, ascending = false)
        .saveAsTextFile("/user/hadoop/data/temp/graph/graph.tc")
      triangleCountGraph.vertices.top(5)(Ordering.by(_._2)).foreach(println)
    } finally {
      sc.stop()
    }
  }
}
—————————————————————————————————————————————————————————————————————————————
Spark GraphX的一些其他有用操作:
- import org.apache.spark._
- import org.apache.spark.SparkContext
- import org.apache.spark.graphx._
- import org.apache.spark.graphx.Graph
- import org.apache.spark.graphx.util.GraphGenerators
- import org.apache.spark.rdd.RDD
/**
 * Further GraphX examples: generating a random graph, neighbourhood
 * aggregation with mapReduceTriplets, degree reductions, neighbour
 * collection, single-source shortest paths with the Pregel API, and
 * VertexRDD.aggregateUsingIndex.
 */
object SparkGraphx {

  /**
   * Runs every example in sequence against the cluster master configured below.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("spark://centos.host1:7077", "Spark Graphx")
    // Stop the context even when one of the jobs below throws.
    try {
      // Build a random graph with GraphGenerators; the vertex attribute ("age")
      // is simply the vertex id converted to Double.
      val numVertices = 100
      val numEParts = 2
      val mu = 4.0
      val sigma = 1.3
      val graph: Graph[Double, Int] = GraphGenerators.logNormalGraph(
        sc, numVertices, numEParts, mu, sigma).mapVertices((id, _) => id.toDouble)
      graph.triplets.collect.foreach { t =>
        println(s"${t.srcId}-${t.srcAttr}-${t.attr}-${t.dstId}-${t.dstAttr}")
      }

      // mapReduceTriplets example: for every vertex, count the followers that are
      // older than it and sum their ages.
      // NOTE(review): later Spark releases replace mapReduceTriplets with
      // aggregateMessages; migrate when upgrading — confirm against the target version.
      val olderFollowers: VertexRDD[(Int, Double)] = graph.mapReduceTriplets[(Int, Double)](
        // Map: send (1, srcAge) to the destination when the source is older.
        triplet => {
          if (triplet.srcAttr > triplet.dstAttr) {
            Iterator((triplet.dstId, (1, triplet.srcAttr)))
          } else {
            Iterator.empty
          }
        },
        // Reduce: add up counts and ages.
        (a, b) => (a._1 + b._1, a._2 + b._2)
      )
      // Average age of the older followers. Only vertices that received at least
      // one message are present, so count is never 0.
      val avgAgeOfOlderFollowers: VertexRDD[Double] =
        olderFollowers.mapValues((id, value) => value match { case (count, totalAge) => totalAge / count })
      avgAgeOfOlderFollowers.collect.foreach(println(_))

      // Reduce function that keeps the (vertex, degree) pair with the larger degree;
      // on a tie the second argument wins.
      def higherDegree(a: (VertexId, Int), b: (VertexId, Int)): (VertexId, Int) =
        if (a._2 > b._2) a else b

      val maxInDegree: (VertexId, Int) = graph.inDegrees.reduce(higherDegree)
      println(s"maxInDegree: $maxInDegree")
      val maxOutDegree: (VertexId, Int) = graph.outDegrees.reduce(higherDegree)
      println(s"maxOutDegree: $maxOutDegree")
      val maxDegrees: (VertexId, Int) = graph.degrees.reduce(higherDegree)
      println(s"maxDegrees: $maxDegrees")

      // Neighbour collection. The original article warns that these operators are
      // expensive (they duplicate information and need heavy communication) and
      // recommends expressing the same computation with mapReduceTriplets instead.
      val neighborIds: VertexRDD[Array[VertexId]] = graph.collectNeighborIds(EdgeDirection.Out)
      val neighbors: VertexRDD[Array[(VertexId, Double)]] = graph.collectNeighbors(EdgeDirection.Out)

      // Pregel API: single-source shortest paths over Double edge weights.
      val graph1 = GraphGenerators.logNormalGraph(sc, numVertices, numEParts, mu, sigma)
        .mapEdges(e => e.attr.toDouble)
      // Source vertex of the search.
      val sourceId: VertexId = 42
      // Distance 0.0 for the source, +infinity for every other vertex.
      val initialGraph = graph1.mapVertices((id, _) => if (id == sourceId) 0.0 else Double.PositiveInfinity)
      // pregel takes two parameter lists. The first holds the initial message
      // (plus, optionally, the maximum number of iterations and the active edge
      // direction); the second holds the vertex program, the send-message function
      // and the merge-message function.
      val sssp = initialGraph.pregel(Double.PositiveInfinity)(
        // Vertex program: keep the shorter distance.
        (id, dist, newDist) => math.min(dist, newDist),
        // Send message: offer a shorter path to the destination, if there is one.
        triplet => {
          if (triplet.srcAttr + triplet.attr < triplet.dstAttr) {
            Iterator((triplet.dstId, triplet.srcAttr + triplet.attr))
          } else {
            Iterator.empty
          }
        },
        // Merge messages: keep the minimum.
        (a, b) => math.min(a, b)
      )
      println(sssp.vertices.collect.mkString("\n"))

      // aggregateUsingIndex: rddB holds two entries (1.0 and 2.0) per id; they are
      // summed per id using setA's index, giving 3.0 for each id.
      val setA: VertexRDD[Int] = VertexRDD(sc.parallelize(0L until 100L).map(id => (id, 1)))
      val rddB: RDD[(VertexId, Double)] = sc.parallelize(0L until 100L).flatMap(id => List((id, 1.0), (id, 2.0)))
      val setB: VertexRDD[Double] = setA.aggregateUsingIndex(rddB, _ + _)
      // Join the two VertexRDDs and add their values.
      val setC: VertexRDD[Double] = setA.innerJoin(setB)((id, a, b) => a + b)
    } finally {
      sc.stop()
    }
  }
}
转自:http://blog.csdn.net/fighting_one_piece/article/details/39668267
http://blog.csdn.net/fighting_one_piece/article/details/39673193,所有权利归原作者所有。 0 0
- Spark GraphX相关使用方法
- spark-graphx以及图的相关介绍
- Spark GraphX
- spark graphx
- Spark GraphX
- Spark Graphx
- Spark GraphX
- Spark中的GraphX
- Spark GraphX(一)
- spark graphx文章整理
- Spark GraphX原理介绍
- spark-graphx之pagerank
- spark-graphx pagerank
- Spark GraphX学习笔记
- spark graphx创建
- Spark GraphX新手入门
- Spark GraphX实现PageRank
- 谈谈Spark GraphX吧!
- NOIP2000 方格取数
- VS2013无法进入断点进行调试
- LeetCode之Sqrt(x)
- 用两个队列实现一个栈
- 学习笔记之数据结构与算法(三)
- Spark GraphX相关使用方法
- 值类型和引用类型
- POJ 题目3744(矩阵优化概率DP)
- LeetCode之Unique Binary Search Trees & Unique Binary Search Trees II
- Eclipse中在web项目中添加hibernate
- 求字符串中指定开始和结束字符串的子串数量
- 设计模式问答系列(2)----工厂模式
- Android 的消息机制(Handler消息传递机制)
- opencv学习日常之Mat的代数运算