基于spark的朴素贝叶斯分类器

来源：互联网　发布：英文有声书软件　编辑：程序博客网　时间：2024/05/01 10:59

根据 TDA 里面的情感分析实现的基于 Spark 的朴素贝叶斯分类器。

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import scala.util.parsing.json._
import java.util.StringTokenizer
import scala.collection.mutable.HashMap
import java.io.PrintWriter
import java.io.File

/**
 * Emoticon-supervised sentiment word counter and scorer running on Spark.
 *
 * Pipeline implemented by [[SimpleApp.main]]:
 *  1. read one JSON document per line and keep its "text" field,
 *  2. drop stop words,
 *  3. label each text as happy (1), sad (-1) or undecided (0) from the emoticons it contains,
 *  4. turn each labelled text into per-word `(happyCount, sadCount)` pairs,
 *  5. merge all per-text tables into a single table,
 *  6. derive class priors from that table and score a sample sentence,
 *  7. dump the merged table to a text file.
 *
 * Throughout this object a count pair is always `(happy, sad)`.
 */
object SimpleApp {

  /** Input used when no command-line argument is given. */
  private val DefaultInputFile = "/home/hrl/spark-0.9.1/apps/sentimenAnalysis/data/ows.json"

  /** Output used when fewer than two command-line arguments are given. */
  private val DefaultOutputFile = "/home/hrl/spark-0.9.1/apps/sentimenAnalysis/data/output.txt"

  /** Lower-case words removed by [[filterStopWord]]. Built once instead of once per line. */
  private val StopWords: Set[String] = Set(
    "a", "about", "above", "after", "again", "against", "all",
    "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below",
    "between", "both", "but", "by", "can", "did", "do", "does", "doing", "don", "down",
    "during", "each", "few", "for", "from", "further", "get", "had", "has", "have", "having",
    "he", "her", "here", "hers", "herself", "him", "himself", "his", "how",
    "i", "if", "im", "i'm", "in", "into", "is", "it", "its", "itself", "just",
    "me", "more", "most", "my", "myself", "no", "nor", "not", "now", "of", "off",
    "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over",
    "own", "rt", "s", "same", "she", "should", "so", "some", "such", "t", "than", "that",
    "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this",
    "those", "through", "to", "too", "under", "until", "up", "us", "very", "was", "we", "were",
    "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with",
    "you", "your", "yours", "yourself", "yourselves")

  /** Tokens that mark a text as happy. */
  private val HappySmileys: Set[String] = Set(":)", ";)", ":D", ":-)", ":o)", ":-D")

  /** Tokens that mark a text as sad. */
  private val SadSmileys: Set[String] = Set(":(", ":-(", ":'(", ":'-(", "D:")

  /**
   * Runs the whole pipeline.
   *
   * @param args optional overrides: `args(0)` is the input JSON-lines file and `args(1)` the
   *             output file; when absent the original hard-coded paths are used.
   */
  def main(args: Array[String]): Unit = {
    val owsFile = if (args.length > 0) args(0) else DefaultInputFile
    val outputFile = if (args.length > 1) args(1) else DefaultOutputFile

    val sc = new SparkContext("local", "Simple App", "/home/hrl/spark-0.9.1",
      List("target/scala-2.10/simple-project_2.10-1.0.jar"))
    try {
      val owsData = sc.textFile(owsFile)

      // Keep only the tweet content, without stop words.
      val twittTextPrimitive = owsData.map(parseJson _)
      val twittText = twittTextPrimitive.map(filterStopWord)

      // (text, sentiment) with sentiment in {1, 0, -1}.
      val twittTextWithSent = twittText.map(parseSentiment _)

      // One word -> (happy, sad) table per text.
      val twittWord = twittTextWithSent.map(mapSentWord _)

      // The merged table is a plain local HashMap, not an RDD.
      val twittWordGroup = groupWordNumber(twittWord)

      // (totalWordNumber, happyProb, sadProb)
      val classiferPara: (Int, Double, Double) = classiferParameters(twittWordGroup)

      val twitt = "thing NYC could do to #Occupy is what they are doing right now. Suppression always  always has the opposite effect"
      val classProbs = classify(twitt, twittWordGroup, classiferPara)
      // The scores used to be computed and silently discarded; report them.
      println("class scores (happy, sad): " + classProbs)

      writeToFile(outputFile, twittWordGroup)
      println("job done")
    } finally {
      // Release the context even when a stage fails.
      sc.stop()
    }
  }

  /**
   * Removes stop words from a line.
   *
   * The line is split on runs of whitespace; a token is dropped when its lower-cased form is a
   * stop word. Every kept token is followed by a single space, so a non-empty result always ends
   * with a trailing space (and starts with one if the line begins with whitespace, because the
   * split then yields an empty first token).
   *
   * @param line text to filter
   * @return the kept tokens, each followed by one space
   */
  def filterStopWord(line: String): String = {
    val kept = new StringBuilder
    line.split("\\s+").foreach { word =>
      if (!StopWords.contains(word.toLowerCase)) kept.append(word).append(' ')
    }
    kept.toString
  }

  /**
   * Writes one `word,(happy,sad)` line per entry of `vector` to `path`, replacing the file.
   *
   * @param path   destination file
   * @param vector word counts to dump; iteration order is the map's own order
   */
  def writeToFile(path: String, vector: HashMap[String, (Int, Int)]): Unit = {
    val writer = new PrintWriter(new File(path))
    try {
      vector.foreach { case (key, value) => writer.println(key + "," + value) }
    } finally {
      // Close even if formatting an entry throws, so the file handle is not leaked.
      writer.close()
    }
  }

  /**
   * Extracts the "text" field from one JSON document.
   *
   * @param line a single JSON document
   * @return the string value of "text", or the sentinel "failed" when the line is not valid JSON,
   *         is not a JSON object, has no "text" member, or "text" is not a string. Returning the
   *         sentinel keeps one malformed record from aborting the whole job.
   */
  def parseJson(line: String): String =
    JSON.parseFull(line) match {
      case Some(fields: Map[String, Any] @unchecked) =>
        fields.get("text") match {
          case Some(text: String) => text
          case _ => "failed"
        }
      case _ => "failed"
    }

  /**
   * Labels a text from the first emoticon token it contains.
   *
   * @param twittText text to inspect; tokens are separated by `StringTokenizer`'s default
   *                  delimiters
   * @return the unchanged text paired with 1 (happy), -1 (sad) or 0 (no known emoticon)
   */
  def parseSentiment(twittText: String): (String, Int) = {
    val tokens = new StringTokenizer(twittText)
    var sentiment = 0
    // Stop at the first decisive token: the earliest emoticon wins.
    while (sentiment == 0 && tokens.hasMoreTokens()) {
      val token = tokens.nextToken()
      if (HappySmileys.contains(token)) sentiment = 1
      else if (SadSmileys.contains(token)) sentiment = -1
    }
    (twittText, sentiment)
  }

  /**
   * Builds the per-word count table of one labelled text.
   *
   * Every occurrence of a token adds one to the happy count when the label is 1 and one to the
   * sad count when it is -1. With label 0 each token is still registered, with counts `(0, 0)`.
   *
   * @param sentTwitt `(text, label)` as produced by [[parseSentiment]]
   * @return a fresh map from token to `(happy, sad)`
   * @throws MatchError if the label is not 1, 0 or -1
   */
  def mapSentWord(sentTwitt: (String, Int)): HashMap[String, (Int, Int)] = {
    val (happyInc, sadInc) = sentTwitt._2 match {
      case 1 => (1, 0)
      case -1 => (0, 1)
      case 0 => (0, 0)
    }
    val result: HashMap[String, (Int, Int)] = HashMap()
    val tokens = new StringTokenizer(sentTwitt._1)
    while (tokens.hasMoreTokens()) {
      val token = tokens.nextToken()
      val (happy, sad) = result.getOrElse(token, (0, 0))
      result(token) = (happy + happyInc, sad + sadInc)
    }
    result
  }

  /**
   * Merges `b` into `aVal` by adding counts per word, then drops every word whose merged counts
   * are both zero.
   *
   * `aVal` is modified in place and is also the returned instance; `b` is only read.
   *
   * @param aVal accumulator table (mutated)
   * @param b    table to add
   * @return `aVal` after the merge
   */
  def addMap(aVal: HashMap[String, (Int, Int)], b: HashMap[String, (Int, Int)]): HashMap[String, (Int, Int)] = {
    b.foreach { case (key, (bHappy, bSad)) =>
      val (aHappy, aSad) = aVal.getOrElse(key, (0, 0))
      aVal(key) = (aHappy + bHappy, aSad + bSad)
    }
    // Words never seen with an emoticon carry no signal; remove them.
    aVal.retain { (_, counts) => counts._1 != 0 || counts._2 != 0 }
    aVal
  }

  /**
   * Reduces all per-text tables into one with [[addMap]].
   *
   * NOTE(review): this relies on `RDD.reduce`, which is expected to fail on an empty RDD, and a
   * single-element RDD is returned as-is without the zero-count cleanup done by [[addMap]].
   *
   * @param sentWordRDD per-text word tables
   * @return the merged word table, collected locally
   */
  def groupWordNumber(sentWordRDD: RDD[HashMap[String, (Int, Int)]]): HashMap[String, (Int, Int)] =
    sentWordRDD.reduce((x, y) => addMap(x, y))

  /**
   * Sums the happy and sad counts of every word.
   *
   * @param twittWordGroup word table
   * @return total number of counted word occurrences
   */
  def countTotalWord(twittWordGroup: HashMap[String, (Int, Int)]): Int =
    twittWordGroup.values.foldLeft(0) { (total, counts) => total + counts._1 + counts._2 }

  /**
   * Derives the classifier parameters from the merged word table.
   *
   * @param twittWordGroup word table
   * @return `(totalWordNumber, happyProb, sadProb)` where the probabilities are each class's share
   *         of all counted occurrences; `(0, 0.0, 0.0)` for a table without any counts (the
   *         division would otherwise yield NaN)
   */
  def classiferParameters(twittWordGroup: HashMap[String, (Int, Int)]): (Int, Double, Double) = {
    val (happyNum, sadNum) = twittWordGroup.values.foldLeft((0, 0)) {
      case ((happy, sad), (wordHappy, wordSad)) => (happy + wordHappy, sad + wordSad)
    }
    val totalWordNumber = happyNum + sadNum
    if (totalWordNumber == 0) (0, 0.0, 0.0)
    else (totalWordNumber, happyNum.toDouble / totalWordNumber, sadNum.toDouble / totalWordNumber)
  }

  /**
   * Scores a text against both classes.
   *
   * For every token present in the table, and for each class in which the token has a non-zero
   * count, the class score is multiplied by `count(class) / (happy + sad)` and by the class
   * prior. A class for which no token contributed scores 0.0. Tokens missing from the table are
   * ignored.
   *
   * NOTE(review): the per-token factor is the class share of the word rather than a per-class
   * word likelihood, and the prior is applied once per token instead of once per text. The
   * original arithmetic is kept unchanged here; confirm whether that is intended.
   *
   * @param twitt          text to score
   * @param twittWordGroup merged word table
   * @param paras          `(totalWordNumber, happyProb, sadProb)` from [[classiferParameters]]
   * @return `(happyScore, sadScore)`
   */
  def classify(twitt: String, twittWordGroup: HashMap[String, (Int, Int)], paras: (Int, Double, Double)): (Double, Double) = {
    val (_, pClass1, pClass2) = paras
    val tokens = new StringTokenizer(twitt)
    var firstProb = 1.0
    var secProb = 1.0
    var foundClass1 = false
    var foundClass2 = false
    while (tokens.hasMoreTokens()) {
      twittWordGroup.get(tokens.nextToken()).foreach { case (happyCount, sadCount) =>
        val occurrences = happyCount + sadCount
        if (happyCount > 0) {
          foundClass1 = true
          firstProb *= (happyCount.toDouble / occurrences) * pClass1
        }
        if (sadCount > 0) {
          foundClass2 = true
          secProb *= (sadCount.toDouble / occurrences) * pClass2
        }
      }
    }
    (if (foundClass1) firstProb else 0.0, if (foundClass2) secProb else 0.0)
  }
}

0 0