NLP Processing: HashingTF and CountVectorizer Models in Spark


http://spark.apache.org/docs/latest/ml-features.html#tf-idf


import org.apache.spark.ml.feature._
import org.apache.spark.ml.linalg.SparseVector
import org.apache.spark.sql.SparkSession

import scala.collection.mutable

/**
  * Created by xubc on 2017/6/3.
  */
object TestX {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .master("local[5]")
      .appName(this.getClass.getName.stripSuffix("$"))
      .getOrCreate()

    val sentenceData = spark.createDataFrame(Seq(
      (0.0, "Hi I heard about are Spark"),
      (1.0, "I wish Java could use case spark classes"),
      (2.0, "Logistic regression regression models are neat I")
    )).toDF("label", "sentence")

    // Split each sentence into lowercase words.
    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val wordsData = tokenizer.transform(sentenceData)

    // HashingTF bag-of-words model (hash-based term frequencies):
//    val hashingTF = new HashingTF()
//      .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(100)
//    val featurizedData = hashingTF.transform(wordsData)

    // CountVectorizer bag-of-words model (vocabulary-based term frequencies):
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("words").setOutputCol("rawFeatures")
      .fit(wordsData)
    val featurizedData = cvModel.transform(wordsData)

    // Rescale the raw term frequencies by inverse document frequency.
    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)
    val rescaledData = idfModel.transform(featurizedData)
    rescaledData.printSchema()

    val vocabulary = cvModel.vocabulary
    println(vocabulary.mkString(","))
    rescaledData.show(false)

    rescaledData.foreach(e => {
      val label = e.getAs[Double]("label")
      val str = e.getAs[String]("sentence")
      val words = e.getAs[mutable.WrappedArray[String]]("words").mkString(",")
      val tf = e.getAs[SparseVector]("rawFeatures")
      // Map each non-zero index of the TF vector back to its word via the vocabulary.
      val originWords = tf.indices.map(i => vocabulary(i)).mkString(",")
      val tfidf = e.getAs[SparseVector]("features")
      println(
        s"""$label   $str
           | $words
           | $tf    $originWords
           | $tfidf""".stripMargin)
    })
  }
}
With the CountVectorizer model, the vocabulary lets you trace a high TF-IDF weight back to the specific word it belongs to. HashingTF, on the other hand, uses a hash function, which computes the term frequencies (and hence TF-IDF) more efficiently, but its feature indices cannot be traced back to concrete words.
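A minimal sketch of that difference follows. The vocabulary array here is a hypothetical stand-in for cvModel.vocabulary from the code above, and the RDD-based HashingTF (org.apache.spark.mllib.feature.HashingTF) is used only because it exposes indexOf, which makes the one-way hash mapping visible:

import org.apache.spark.mllib.feature.{HashingTF => RddHashingTF}

object IndexLookupDemo {
  def main(args: Array[String]): Unit = {
    // CountVectorizer: a column index is a position in the learned vocabulary,
    // so it maps straight back to a word.
    val vocabulary = Array("i", "are", "spark", "regression")  // stand-in for cvModel.vocabulary
    println(vocabulary(2))                                     // -> "spark"

    // HashingTF: a column index is hash(term) % numFeatures, a one-way mapping.
    // indexOf shows the forward direction; no API maps the index back to the term.
    val hasher = new RddHashingTF(100)
    println(hasher.indexOf("spark"))                           // some bucket in [0, 100)
  }
}

The trade-off: hashing needs no extra pass over the data to build a vocabulary and uses a fixed amount of memory, but different words can collide into the same bucket.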

1.0   I wish Java could use case spark classes
 i,wish,java,could,use,case,spark,classes
 (16,[0,2,4,5,7,8,13,14],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])    i,spark,could,java,wish,case,classes,use
 (16,[0,2,4,5,7,8,13,14],[0.0,0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453])
2.0   Logistic regression regression models are neat I
 logistic,regression,regression,models,are,neat,i
 (16,[0,1,3,6,9,15],[1.0,2.0,1.0,1.0,1.0,1.0])    i,regression,are,neat,models,logistic
 (16,[0,1,3,6,9,15],[0.0,1.3862943611198906,0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453])
0.0   Hi I heard about are Spark
 hi,i,heard,about,are,spark
 (16,[0,2,3,10,11,12],[1.0,1.0,1.0,1.0,1.0,1.0])    i,spark,are,about,hi,heard
 (16,[0,2,3,10,11,12],[0.0,0.28768207245178085,0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453])
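The IDF values above follow from the formula given in the Spark documentation linked at the top, idf(t) = ln((numDocs + 1) / (docFreq(t) + 1)), here with numDocs = 3. A quick check of the numbers (plain Scala, no Spark API needed):

object IdfCheck {
  def main(args: Array[String]): Unit = {
    // idf(t) = ln((numDocs + 1) / (docFreq(t) + 1)), with numDocs = 3
    println(math.log(4.0 / 2))  // 0.6931... for terms in exactly 1 document
    println(math.log(4.0 / 3))  // 0.2877... for terms in 2 documents (e.g. "spark", "are")
    println(math.log(4.0 / 4))  // 0.0      for terms in all 3 documents ("i")
  }
}

The final weight is tf * idf, which is why "regression" (tf = 2, appearing in one document) ends up at 2 * 0.6931 = 1.3862, and why "i", appearing in every sentence, is weighted down to 0.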


