Getting Started with Spark GraphX


The code in this document is excerpted from Chapter 7, "Analyzing Co-occurrence Networks with GraphX", of the early-release edition of O'Reilly's Advanced Analytics with Spark (OReilly.Advanced.Analytics.with.Spark.Early.Release.Edition.2014.11). After reading the chapter I verified everything here on our test Spark cluster, so it can be run directly. The jar and the code below differ somewhat from the PDF: the dependency on the sample code provided by Cloudera has been removed and replaced with the jar I supply in the attachment. The document covers essentially everything a GraphX beginner needs; I encourage newcomers to test and work through it line by line.

 

The "data preparation" step below has already been completed on the Spark test cluster whose MASTER node is bjlg-40p103-hadoop27.bfdabc.com: the data sits in HDFS under /usr/hadoop/medline_data/medline. To test there, skip "data preparation" and run the rest of the code directly.

 

// Data preparation

mkdir medline_data

cd medline_data

wget ftp://ftp.nlm.nih.gov/nlmdata/sample/medline/*.gz

gunzip *.gz

hadoop fs -mkdir medline

hadoop fs -put *.xml medline

 

// Environment setup

MASTER=spark://bjlg-40p103-hadoop27.xxx.com:7077

spark-shell --jars /opt/hadoop/huxl/medline_data/spark-graphx-test.jar
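The Spark 1.x shell picks up the MASTER environment variable set above; an equivalent, more explicit invocation (same cluster URL and jar) would be:

spark-shell --master spark://bjlg-40p103-hadoop27.xxx.com:7077 --jars /opt/hadoop/huxl/medline_data/spark-graphx-test.jar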

 

// The program

import org.apache.spark.SparkContext

import org.apache.spark.SparkContext._

import org.apache.spark.SparkConf

import org.apache.spark.rdd.RDD

import org.apache.spark.rdd.RDD._

import org.apache.spark.graphx._

import org.apache.hadoop.io.{Text, LongWritable}

import org.apache.hadoop.conf.Configuration

import scala.xml._

import com.google.common.hash.Hashing

import com.baifendian.spark.graphx.common._

 

// Preprocessing: extract the MeSH topic data from the XML citation records

def loadMedline(sc: SparkContext, path: String) = {
  @transient val conf = new Configuration()
  // One record per citation; the trailing space in the start tag matters
  // because <MedlineCitation> elements carry attributes
  conf.set(XmlInputFormat.START_TAG_KEY, "<MedlineCitation ")
  conf.set(XmlInputFormat.END_TAG_KEY, "</MedlineCitation>")
  val in = sc.newAPIHadoopFile(path, classOf[XmlInputFormat],
    classOf[LongWritable], classOf[Text], conf)
  in.map(line => line._2.toString)
}
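XmlInputFormat (bundled in the jar loaded above; the implementation originally comes from the Apache Mahout project) splits the input files on the configured start and end tags, so each element of the returned RDD is the complete text of one <MedlineCitation> element.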

val medline_raw = loadMedline(sc, "hdfs://xxx/user/hadoop/medline")

val raw_xml = medline_raw.take(1)(0)

val elem = XML.loadString(raw_xml)

elem.label

elem.attributes

elem \ "MeshHeadingList"

(elem \\ "Year").map(_.text)

val mxml: RDD[Elem] = medline_raw.map(XML.loadString)

 

def majorTopics(elem: Elem): Seq[String] = {
  // Keep only the MeSH descriptors flagged as major topics of the article
  val dn = elem \\ "DescriptorName"
  val mt = dn.filter(n => (n \ "@MajorTopicYN").text == "Y")
  mt.map(n => n.text)
}
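For readers new to scala.xml: \ selects direct children, \\ selects all descendants, and \ "@attr" reads an attribute value. A toy example (not from the dataset) showing the same pattern as majorTopics:

val x = <a><b t="Y">hello</b><b t="N">bye</b></a>
(x \\ "b").filter(n => (n \ "@t").text == "Y").map(_.text)  // List(hello)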

// Analyzing co-occurrence

majorTopics(elem)

val medline: RDD[Seq[String]] = mxml.map(majorTopics).cache()

medline.take(1)(0)

medline.count()

val topics: RDD[String] = medline.flatMap(mesh => mesh)

val topicCounts = topics.countByValue()

topicCounts.size

val tcSeq = topicCounts.toSeq

tcSeq.sortBy(_._2).reverse.take(10).foreach(println)
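Note that countByValue is an action: it brings a plain Map[String, Long] back to the driver, which is why tcSeq can be sorted with ordinary Scala collection methods rather than RDD operations. A toy illustration:

sc.parallelize(Seq("a", "b", "a")).countByValue()  // Map(a -> 2, b -> 1)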

// The long tail of the topic distribution

val valueDist = topicCounts.groupBy(_._2).mapValues(_.size)

valueDist.toSeq.sorted.take(10).foreach(println)

 

// Combinations and permutations of a list

val list = List(1, 2, 3)

val combs = list.combinations(2)

combs.foreach(println)

val permu = list.permutations

permu.foreach(println)
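combinations respects the order of the underlying list, which is why topicPairs below sorts each document's topic list first: that way ("A", "B") and ("B", "A") map to the same key in reduceByKey. Compare:

List(3, 1, 2).combinations(2).toList         // List(List(3, 1), List(3, 2), List(1, 2))
List(3, 1, 2).sorted.combinations(2).toList  // List(List(1, 2), List(1, 3), List(2, 3))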

 

// Co-occurrence pairs

val topicPairs = medline.flatMap(t => t.sorted.combinations(2))

val cooccurs = topicPairs.map(p => (p, 1)).reduceByKey(_+_)

cooccurs.cache()

cooccurs.count()

val ord = Ordering.by[(Seq[String], Int), Int](_._2)

cooccurs.top(10)(ord).foreach(println)

 

// Building the graph from the vertex and edge sets

def hashId(str: String) = {

Hashing.md5().hashString(str).asLong()

}
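Guava's MD5 hash turns each topic string into a stable 64-bit value usable as a GraphX VertexId. Depending on the Guava version on your classpath (an assumption about your environment), hashString may require an explicit charset: Hashing.md5().hashString(str, com.google.common.base.Charsets.UTF_8).asLong(). The check below verifies that no two topics hashed to the same id.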

 

val vertices = topics.map(topic => (hashId(topic), topic))

// Sanity check for hash collisions: the two counts should match if every topic got a unique id
val uniqueHashes = vertices.map(_._1).countByValue()

val uniqueTopics = vertices.map(_._2).countByValue()

uniqueHashes.size == uniqueTopics.size

 

val edges = cooccurs.map(p => {
  val (topics, cnt) = p
  // Sort the two hashed ids so every edge gets a canonical orientation (srcId < dstId)
  val ids = topics.map(hashId).sorted
  Edge(ids(0), ids(1), cnt)
})

 

val topicGraph = Graph(vertices, edges)

topicGraph.cache()

 

vertices.count()

topicGraph.vertices.count()
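Graph(vertices, edges) deduplicates vertices that share a VertexId, so topicGraph.vertices.count() comes out far smaller than vertices.count(), which still counts every occurrence of every topic.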

 

// Connected components

val connectedComponentGraph: Graph[VertexId, Int] =

topicGraph.connectedComponents()

 

def sortedConnectedComponents(connectedComponents: Graph[VertexId, _]): Seq[(VertexId, Long)] = {
  val componentCounts = connectedComponents.vertices.map(_._2).countByValue
  componentCounts.toSeq.sortBy(_._2).reverse
}
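connectedComponents labels each vertex with the smallest VertexId in its component, so counting those labels with countByValue yields the size of every component.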

 

val componentCounts = sortedConnectedComponents(

connectedComponentGraph)

componentCounts.size

componentCounts.take(10).foreach(println)

 

val nameCID = topicGraph.vertices.

innerJoin(connectedComponentGraph.vertices) {

(topicId, name, componentId) => (name, componentId)

}

// componentCounts(1) is the second-largest component (index 0 is the giant component)
val c1 = nameCID.filter(x => x._2._2 == componentCounts(1)._1)

c1.collect().foreach(x => println(x._2._1))

 

// Degree distribution of the graph

val degrees: VertexRDD[Int] = topicGraph.degrees.cache()

degrees.map(_._2).stats()

val sing = medline.filter(x => x.size == 1)

sing.count()

val singTopic = sing.flatMap(topic => topic).distinct()

singTopic.count()

val topic2 = topicPairs.flatMap(p => p)

singTopic.subtract(topic2).count()
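This sequence accounts for part of the gap between the vertex and degree counts: sing counts the citations with exactly one major topic, and singTopic.subtract(topic2).count() counts the topics that only ever appear alone. Such topics never enter any co-occurrence pair, so they are isolated vertices, and GraphX's degrees RDD contains only vertices with at least one edge.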

 

def topNamesAndDegrees(degrees: VertexRDD[Int],
    topicGraph: Graph[String, Int]): Array[(String, Int)] = {
  val namesAndDegrees = degrees.innerJoin(topicGraph.vertices) {
    (topicId, degree, name) => (name, degree)
  }
  val ord = Ordering.by[(String, Int), Int](_._2)
  namesAndDegrees.map(_._2).top(10)(ord)
}

 

topNamesAndDegrees(degrees, topicGraph).foreach(println)

 

// Filtering out noisy edges with the chi-squared statistic

val T = medline.count()

val topicCountsRdd = topics.map(x => (hashId(x), 1)).reduceByKey(_+_)

val topicCountGraph = Graph(topicCountsRdd, topicGraph.edges)

 

def chiSq(YY: Int, YB: Int, YA: Int, T: Long): Double = {
  val NB = T - YB
  val NA = T - YA
  val YN = YA - YY
  val NY = YB - YY
  val NN = T - NY - YN - YY
  // Yates' continuity correction applies to the absolute value of the cross-product difference
  val inner = math.abs(YY * NN - YN * NY) - T / 2.0
  T * math.pow(inner, 2) / (YA * NA * YB * NB)
}
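For each edge, YY is the number of documents tagged with both topics, YA and YB the number tagged with each topic individually, and T the total number of documents; YN, NY, and NN fill in the remaining cells of the 2x2 contingency table. The value computed above is the chi-squared statistic with Yates' continuity correction: T * (|YY*NN - YN*NY| - T/2)^2 / (YA * NA * YB * NB).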

 

val chiSquaredGraph = topicCountGraph.mapTriplets(triplet => {
  chiSq(triplet.attr, triplet.srcAttr, triplet.dstAttr, T)
})

chiSquaredGraph.edges.map(x => x.attr).stats()

val interesting = chiSquaredGraph.subgraph(triplet => triplet.attr > 19.5)

interesting.edges.count
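The 19.5 cutoff corresponds to a p-value of roughly 10^-5 for a chi-squared distribution with one degree of freedom, so only strongly associated topic pairs survive the filter.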

// Connected components after filtering out roughly one third of the edges as noise

val interestingComponentCounts = sortedConnectedComponents(

interesting.connectedComponents())

interestingComponentCounts.size

interestingComponentCounts.take(10).foreach(println)

// Degree distribution after filtering

val interestingDegrees = interesting.degrees.cache()

interestingDegrees.map(_._2).stats()

 

// Small-world networks

val triCountGraph = interesting.triangleCount()

triCountGraph.vertices.map(x => x._2).stats()

val maxTrisGraph = interesting.degrees.mapValues(d => d * (d - 1) / 2.0)

val clusterCoefGraph = triCountGraph.vertices.innerJoin(maxTrisGraph) {
  (vertexId, triCount, maxTris) => {
    // Guard against division by zero for degree-0 and degree-1 vertices
    if (maxTris == 0) 0 else triCount / maxTris
  }
}

clusterCoefGraph.map(_._2).sum() / interesting.vertices.count()
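That last line is the network-average clustering coefficient: each vertex's local coefficient is its actual triangle count divided by the d(d-1)/2 triangles its degree d would allow. One caveat: GraphX's triangleCount expects edges in canonical orientation (srcId < dstId), which the sorted hash ids used when building the edges earlier already guarantee.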
