使用 Spark GraphX 构建图

来源:互联网 发布:delphi登陆淘宝联盟 编辑:程序博客网 时间:2024/05/18 13:23
import org.apache.spark.graphx.{Edge, Graph}
import utility.Helpers

import scala.collection.mutable

/**
  * Builds a GraphX graph whose edges carry the character-level generalized Jaccard
  * similarity between every pair of input lines.
  *
  * Created by fhqplzj on 2017/7/20.
  */
object SemanticNormalization {

  /**
    * Counts how often each character occurs in `s`.
    *
    * @param s the text to analyse
    * @return a mutable map from character to occurrence count; looking up a character
    *         that does not occur in `s` yields 0 instead of throwing
    */
  def word_count(s: String): mutable.Map[Char, Int] = {
    val counts = new mutable.HashMap[Char, Int]().withDefaultValue(0)
    s.foreach(c => counts(c) += 1)
    counts
  }

  /**
    * Generalized (multiset) Jaccard similarity of two strings, computed over their
    * character counts: sum of per-character minimum counts divided by the sum of
    * per-character maximum counts.
    *
    * @param s1 first string
    * @param s2 second string
    * @return a value in [0.0, 1.0]; 1.0 when both strings are empty (two empty multisets
    *         are treated as identical, which also avoids the NaN that 0 / 0 would produce)
    */
  def jaccard_similarity(s1: String, s2: String): Double = {
    val m1 = word_count(s1)
    val m2 = word_count(s2)
    val ks1 = m1.keySet
    val ks2 = m2.keySet
    // Map over an iterator, not over the Set itself: mapping a Set would collapse equal
    // counts into a single element and corrupt the sums.
    val numerator = (ks1 & ks2).iterator.map(c => math.min(m1(c), m2(c))).sum
    // Characters present in only one string contribute via the maps' default value 0.
    val denominator = (ks1 | ks2).iterator.map(c => math.max(m1(c), m2(c))).sum
    if (denominator == 0) 1.0 else numerator.toDouble / denominator
  }

  /**
    * Reads tab-separated lines from `file:///tmp/graph.txt`, assigns every line a numeric
    * vertex id, connects every pair of lines with edges weighted by the Jaccard similarity
    * of their first column, and prints the number of vertices of the resulting graph.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val sc = Helpers.getSc
    try {
      val vertices = sc.textFile("file:///tmp/graph.txt").map { line =>
        // Limit -1 keeps trailing empty fields, so a line whose second column is empty
        // (or missing) no longer fails with ArrayIndexOutOfBoundsException.
        val parts = line.split("\t", -1)
        val second = if (parts.length > 1) parts(1) else ""
        (parts(0), second)
      }.zipWithIndex().map(_.swap)

      val similarities = vertices.cartesian(vertices).flatMap {
        // Only handle each unordered pair once; the similarity is symmetric, so the
        // reverse edge is emitted here instead of being recomputed for (id2, id1).
        case ((id1, (content1, _)), (id2, (content2, _))) if id1 <= id2 =>
          val sim = jaccard_similarity(content1, content2)
          if (id1 == id2) {
            Seq(Edge(id1, id2, sim))
          } else {
            Seq(Edge(id1, id2, sim), Edge(id2, id1, sim))
          }
        case _ =>
          Seq.empty[Edge[Double]]
      }

      val graph = Graph.fromEdges(similarities, 0.0)
      /* The graph is built; from here do whatever is needed with it, e.g. extract its
         connected components. */
      println(graph.vertices.count())
    } finally {
      // Release the Spark resources even if the job fails.
      sc.stop()
    }
  }
}

原创粉丝点击