基于spark的朴素贝叶斯分类器
来源:互联网 发布:英文有声书软件 编辑:程序博客网 时间:2024/05/01 10:59
根据 TDA(应指《Twitter Data Analytics》一书)中的情感分析示例,实现的基于 Spark 的朴素贝叶斯分类器,完整代码如下:
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import scala.util.parsing.json._
import java.util.StringTokenizer
import scala.collection.mutable.HashMap
import java.io.PrintWriter
import java.io.File

/**
 * Emoticon-supervised sentiment word counter and scorer running on Spark.
 *
 * Pipeline implemented by [[SimpleApp.main]]:
 *  1. read one JSON document per line and extract its `"text"` field,
 *  2. drop stop words,
 *  3. label each text as happy (1), sad (-1) or undecided (0) from the first
 *     emoticon it contains,
 *  4. count, per word, how often it occurs in happy and in sad texts,
 *  5. score a sample text against those counts and dump the counts to a file.
 *
 * Word statistics are represented throughout as
 * `HashMap[word, (happyCount, sadCount)]`.
 */
object SimpleApp {

  private val DefaultInputFile = "/home/hrl/spark-0.9.1/apps/sentimenAnalysis/data/ows.json"
  private val DefaultOutputFile = "/home/hrl/spark-0.9.1/apps/sentimenAnalysis/data/output.txt"

  // Built once per JVM instead of once per input line; a Set gives constant-time lookup.
  private val StopWords: Set[String] = Set(
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are",
    "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but",
    "by", "can", "did", "do", "does", "doing", "don", "down", "during", "each", "few", "for",
    "from", "further", "get", "had", "has", "have", "having", "he", "her", "here", "hers",
    "herself", "him", "himself", "his", "how", "i", "if", "im", "i'm", "in", "into", "is", "it",
    "its", "itself", "just", "me", "more", "most", "my", "myself", "no", "nor", "not", "now",
    "of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over",
    "own", "rt", "s", "same", "she", "should", "so", "some", "such", "t", "than", "that", "the",
    "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those",
    "through", "to", "too", "under", "until", "up", "us", "very", "was", "we", "were", "what",
    "when", "where", "which", "while", "who", "whom", "why", "will", "with", "you", "your",
    "yours", "yourself", "yourselves")

  private val HappySmileys: Set[String] = Set(":)", ";)", ":D", ":-)", ":o)", ":-D")
  private val SadSmileys: Set[String] = Set(":(", ":-(", ":'(", ":'-(", "D:")

  /**
   * Runs the whole pipeline on a local Spark context.
   *
   * @param args optional overrides: `args(0)` is the input JSON-lines file and
   *             `args(1)` the output file; each falls back to the original
   *             hard-coded path when absent.
   */
  def main(args: Array[String]): Unit = {
    val owsFile = if (args.length > 0) args(0) else DefaultInputFile
    val outputFile = if (args.length > 1) args(1) else DefaultOutputFile

    val sc = new SparkContext("local", "Simple App", "/home/hrl/spark-0.9.1",
      List("target/scala-2.10/simple-project_2.10-1.0.jar"))
    try {
      val owsData = sc.textFile(owsFile)
      // Keep only the tweet text, then strip stop words.
      val twittText = owsData.map(parseJson _).map(filterStopWord _)
      // (text, 1 | 0 | -1)
      val twittTextWithSent = twittText.map(parseSentiment _)
      // One per-tweet map of word -> (happyCount, sadCount).
      val twittWord = twittTextWithSent.map(mapSentWord _)
      // Merged on the driver: this is a plain HashMap, not an RDD.
      val twittWordGroup = groupWordNumber(twittWord)
      // (totalWordNumber, happyProb, sadProb)
      val classiferPara: (Int, Double, Double) = classiferParameters(twittWordGroup)

      val twitt = "thing NYC could do to #Occupy is what they are doing right now. " +
        "Suppression always always has the opposite effect"
      val classProbs = classify(twitt, twittWordGroup, classiferPara)
      // The scores used to be computed and silently discarded.
      println("happy score: " + classProbs._1 + ", sad score: " + classProbs._2)

      writeToFile(outputFile, twittWordGroup)
      println("job done")
    } finally {
      // Release the context even when a stage fails.
      sc.stop()
    }
  }

  /** Wraps a `StringTokenizer` (default whitespace delimiters) as a Scala iterator. */
  private def tokenize(text: String): Iterator[String] = {
    val tokenizer = new StringTokenizer(text)
    new Iterator[String] {
      def hasNext: Boolean = tokenizer.hasMoreTokens
      def next(): String = tokenizer.nextToken()
    }
  }

  /**
   * Removes stop words (compared case-insensitively) from a line.
   *
   * @param line whitespace-separated text
   * @return the surviving words in their original order and case, each followed
   *         by a single space (so a non-empty result ends with a space)
   */
  def filterStopWord(line: String): String = {
    val kept = new StringBuilder
    line.split("\\s+").foreach { word =>
      if (!StopWords.contains(word.toLowerCase)) kept.append(word).append(' ')
    }
    kept.toString
  }

  /**
   * Writes one `word,(happyCount,sadCount)` line per entry to `path`.
   *
   * @param path   destination file; created or truncated
   * @param vector word statistics to dump
   */
  def writeToFile(path: String, vector: HashMap[String, (Int, Int)]): Unit = {
    val writer = new PrintWriter(new File(path))
    try {
      vector.foreach { case (key, value) => writer.println(key + "," + value) }
    } finally {
      // Close the file even if iteration or formatting throws.
      writer.close()
    }
  }

  /**
   * Extracts the `"text"` field from a single JSON document.
   *
   * @param line one JSON document
   * @return the text, or the sentinel `"failed"` when the line is not valid
   *         JSON, is not a JSON object, or has no string-valued `"text"` field
   */
  def parseJson(line: String): String =
    JSON.parseFull(line) match {
      case Some(fields: Map[_, _]) =>
        // JSON object keys are always strings, so only the value type needs checking.
        fields.asInstanceOf[Map[String, Any]].get("text") match {
          case Some(text: String) => text
          case _ => "failed"
        }
      case _ => "failed"
    }

  /**
   * Labels a text by the first emoticon token it contains.
   *
   * @param twittText text to label
   * @return the unchanged text paired with 1 (happy emoticon found first),
   *         -1 (sad emoticon found first) or 0 (no known emoticon)
   */
  def parseSentiment(twittText: String): (String, Int) = {
    val label = tokenize(twittText).collectFirst {
      case token if HappySmileys.contains(token) => 1
      case token if SadSmileys.contains(token) => -1
    }.getOrElse(0)
    (twittText, label)
  }

  /**
   * Turns a labelled text into per-word counts.
   *
   * @param sentTwitt (text, label) as produced by [[parseSentiment]]
   * @return word -> (happyCount, sadCount); the first slot is incremented per
   *         occurrence when the label is 1, the second when it is -1, and
   *         every word maps to (0, 0) for any other label
   */
  def mapSentWord(sentTwitt: (String, Int)): HashMap[String, (Int, Int)] = {
    val (text, sentiment) = sentTwitt
    val (happyInc, sadInc) = sentiment match {
      case 1 => (1, 0)
      case -1 => (0, 1)
      case _ => (0, 0)
    }
    val counts = HashMap[String, (Int, Int)]()
    tokenize(text).foreach { token =>
      val (happy, sad) = counts.getOrElse(token, (0, 0))
      counts(token) = (happy + happyInc, sad + sadInc)
    }
    counts
  }

  /**
   * Adds the counts of `b` into `aVal` and drops words whose two counts are both zero.
   *
   * NOTE: `aVal` is updated in place and is also the returned instance; `b` is
   * left untouched.
   *
   * @param aVal accumulator, mutated
   * @param b    counts to add
   * @return `aVal` after merging and pruning
   */
  def addMap(aVal: HashMap[String, (Int, Int)], b: HashMap[String, (Int, Int)]): HashMap[String, (Int, Int)] = {
    b.foreach { case (word, (happy, sad)) =>
      val (accHappy, accSad) = aVal.getOrElse(word, (0, 0))
      aVal(word) = (accHappy + happy, accSad + sad)
    }
    aVal.retain((word, counts) => counts._1 != 0 || counts._2 != 0)
    aVal
  }

  /**
   * Merges all per-tweet maps of the RDD into a single map on the driver.
   *
   * `reduce` fails on an empty RDD, so the input must contain at least one element.
   *
   * @param sentWordRDD per-tweet word counts
   * @return combined word -> (happyCount, sadCount) without all-zero entries
   */
  def groupWordNumber(sentWordRDD: RDD[HashMap[String, (Int, Int)]]): HashMap[String, (Int, Int)] = {
    val merged = sentWordRDD.reduce((x, y) => addMap(x, y))
    // With exactly one element reduce never calls addMap, so prune here as well
    // to keep the result free of all-zero entries in every case.
    merged.retain((word, counts) => counts._1 != 0 || counts._2 != 0)
    merged
  }

  /**
   * Sums happy and sad counts over all words.
   *
   * @param twittWordGroup word -> (happyCount, sadCount)
   * @return total number of labelled word occurrences
   */
  def countTotalWord(twittWordGroup: HashMap[String, (Int, Int)]): Int =
    twittWordGroup.valuesIterator.map { case (happy, sad) => happy + sad }.sum

  /**
   * Derives the global parameters used by [[classify]].
   *
   * @param twittWordGroup word -> (happyCount, sadCount)
   * @return (total labelled occurrences, happy share, sad share); the shares
   *         are 0.0 rather than NaN when there are no labelled occurrences
   */
  def classiferParameters(twittWordGroup: HashMap[String, (Int, Int)]): (Int, Double, Double) = {
    val (happyNum, sadNum) = twittWordGroup.valuesIterator.foldLeft((0, 0)) {
      case ((happyAcc, sadAcc), (happy, sad)) => (happyAcc + happy, sadAcc + sad)
    }
    val totalWordNumber = happyNum + sadNum
    if (totalWordNumber == 0) (0, 0.0, 0.0)
    else (totalWordNumber, happyNum.toDouble / totalWordNumber, sadNum.toDouble / totalWordNumber)
  }

  /**
   * Scores a text against the collected word statistics.
   *
   * For every token present in `twittWordGroup`, each class whose count for
   * that token is positive has its score multiplied by
   * `classCount / (happyCount + sadCount) * classShare`. Tokens unknown to the
   * map, and classes with a zero count for a token, contribute nothing.
   *
   * NOTE(review): the per-word factor is the class's share of that word's
   * occurrences, not the usual naive Bayes likelihood count(word, class) /
   * count(class), and the class share is applied once per word. The formula is
   * kept as written; confirm the intended model before relying on the scores.
   * The plain product can also underflow to 0.0 on long texts.
   *
   * @param twitt          text to score (tokenised on whitespace, case-sensitive)
   * @param twittWordGroup word -> (happyCount, sadCount)
   * @param paras          (total occurrences, happy share, sad share) from
   *                       [[classiferParameters]]; only the two shares are used
   * @return (happy score, sad score); a score is 0.0 when no token of the text
   *         has a positive count for that class
   */
  def classify(twitt: String, twittWordGroup: HashMap[String, (Int, Int)], paras: (Int, Double, Double)): (Double, Double) = {
    val (_, pHappy, pSad) = paras
    var happyScore = 1.0
    var sadScore = 1.0
    var sawHappy = false
    var sawSad = false

    tokenize(twitt).foreach { token =>
      twittWordGroup.get(token).foreach { case (happyCount, sadCount) =>
        val occurrences = (happyCount + sadCount).toDouble
        if (happyCount > 0) {
          sawHappy = true
          happyScore *= (happyCount / occurrences) * pHappy
        }
        if (sadCount > 0) {
          sawSad = true
          sadScore *= (sadCount / occurrences) * pSad
        }
      }
    }

    (if (sawHappy) happyScore else 0.0, if (sawSad) sadScore else 0.0)
  }
}
0 0
- 基于spark的朴素贝叶斯分类器
- 基于朴素贝叶斯的文本分类器
- 基于朴素贝叶斯分类器的文本分类(下)
- 基于朴素贝叶斯分类器的文本分类算法
- 基于朴素贝叶斯分类器的文本分类算法
- 基于朴素贝叶斯分类器的文本分类算法
- 基于朴素贝叶斯分类器的文本分类算法
- 基于朴素贝叶斯分类器的文本分类算法
- 基于朴素贝叶斯分类器的文本分类
- 文本分类基于朴素贝叶斯分类器
- 基于的朴素贝叶斯的文本分类(附完整代码(spark/java)
- Spark中组件Mllib的学习31之朴素贝叶斯分类器(多项式朴素贝叶斯)
- Spark中组件Mllib的学习32之朴素贝叶斯分类器(伯努利朴素贝叶斯)*
- 基于朴素贝叶斯的文本分类算法
- 基于概率论的分类方法:朴素贝叶斯
- 基于朴素贝叶斯的文本分类算法
- 基于朴素贝叶斯的文本分类算法
- 基于概率论的分类方法:朴素贝叶斯
- js中apply与call的区别
- hadoop YARN主要思想和架构
- 显示初始界面——C# WPF小白学习心得1
- xmmintrin.h与SSE指令集
- DAG图的拓扑排序 python
- 基于spark的朴素贝叶斯分类器
- lampp环境下Mysql和网站备份脚本
- 很怀念那年我们写代码的日子。
- andriod游戏音效
- iOS通过app读取通讯录信息(整理)
- 滴滴打车更名背后的商业效应:内涵更丰富,品牌更亲民
- [projecteuler]Names scores
- 安全测试--SQL注入攻击
- 禁用Enter键表单自动提交