垃圾邮件分类(Scala 版本)

来源:互联网 发布:网络任务源码 编辑:程序博客网 时间:2024/06/07 01:01
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by DengNi on 2016/9/21.
  *
  * Spam vs. normal e-mail classification with Spark MLlib.
  *
  * Reads two text files (one e-mail per line), hashes the space-separated
  * tokens of each line into a 10000-dimensional term-frequency vector, trains
  * a logistic regression model with SGD on the labelled vectors, and prints
  * the model's prediction for two hard-coded sample sentences.
  */
object spam_normal {

  /**
    * Program entry point: trains the classifier and prints two predictions.
    *
    * @param args command-line arguments (not used)
    */
  def main(args: Array[String]): Unit = {
    // Only show Spark's own log output at ERROR level.
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)

    val conf = new SparkConf().setAppName("scala").setMaster("local[*]")
    val sc = new SparkContext(conf)

    // Stop the SparkContext even when loading or training fails, so the
    // local executors and listening ports are always released.
    try {
      val spam = sc.textFile("spam.txt")
      // NOTE(review): "noraml.txt" looks like a misspelling of "normal.txt";
      // the path is kept as-is because it must match the file on disk.
      val norm = sc.textFile("noraml.txt")

      // Create a HashingTF instance that maps e-mail text to vectors of 10000 features.
      val tf = new HashingTF(numFeatures = 10000)

      // Each e-mail is split into words, and each word is mapped to one feature.
      val spamFeatures = spam.map(email => tf.transform(email.split(" ")))
      val normFeatures = norm.map(email => tf.transform(email.split(" ")))

      // Build LabeledPoint datasets: label 1 for spam, label 0 for normal mail.
      val positiveExample = spamFeatures.map(features => LabeledPoint(1, features))
      val negativeExample = normFeatures.map(features => LabeledPoint(0, features))
      val trainingData = positiveExample.union(negativeExample)

      // Logistic regression is an iterative algorithm, so cache the training set.
      trainingData.cache()

      // Run logistic regression using the SGD optimizer.
      val model = new LogisticRegressionWithSGD().run(trainingData)

      // Check the model against one spam-like and one normal-looking sentence.
      val psTest = tf.transform("fuck you love sex cheap by sending money fund".split(" "))
      val negTest = tf.transform("Hi hwo do you good to see you want to spark".split(" "))

      println(model.predict(psTest)) // should be 1
      println(model.predict(negTest)) // should be 0
    } finally {
      sc.stop()
    }
  }
}


"C:\Program Files\Java\jdk1.7.0_80\bin\java" -Didea.launcher.port=7533 "-Didea.launcher.bin.path=C:\Program Files (x86)\JetBrains\IntelliJ IDEA Community Edition 2016.1.3\bin" -Dfile.encoding=UTF-8 -classpath "C:\Program Files\Java\jdk1.7.0_80\jre\lib\charsets.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\deploy.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\access-bridge-64.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\dnsns.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\jaccess.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\localedata.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\sunec.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\sunjce_provider.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\sunmscapi.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\ext\zipfs.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\javaws.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\jce.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\jfr.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\jfxrt.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\jsse.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\management-agent.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\plugin.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\resources.jar;C:\Program Files\Java\jdk1.7.0_80\jre\lib\rt.jar;D:\bigdataworkspaces\recommder\out\production\recommder;F:\scala\lib\scala-actors-migration.jar;F:\scala\lib\scala-actors.jar;F:\scala\lib\scala-library.jar;F:\scala\lib\scala-reflect.jar;F:\scala\lib\scala-swing.jar;D:\bigdataworkspaces\recommder\lib\spark-assembly-1.6.0-hadoop2.6.0.jar;C:\Program Files (x86)\JetBrains\IntelliJ IDEA Community Edition 2016.1.3\lib\idea_rt.jar" com.intellij.rt.execution.application.AppMain spam_normal
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
16/09/21 22:16:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
16/09/21 22:16:15 INFO Slf4jLogger: Slf4jLogger started
16/09/21 22:16:15 INFO Remoting: Starting remoting
16/09/21 22:16:16 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkDriverActorSystem@192.168.184.1:3070]
16/09/21 22:16:18 WARN : Your hostname, root resolves to a loopback/non-reachable address: fe80:0:0:0:0:5efe:c0a8:8c01%17, but we couldn't find any external IP address!
16/09/21 22:16:19 INFO FileInputFormat: Total input paths to process : 1
16/09/21 22:16:19 INFO FileInputFormat: Total input paths to process : 1
16/09/21 22:16:20 INFO deprecation: mapred.tip.id is deprecated. Instead, use mapreduce.task.id
16/09/21 22:16:20 INFO deprecation: mapred.task.id is deprecated. Instead, use mapreduce.task.attempt.id
16/09/21 22:16:20 INFO deprecation: mapred.task.is.map is deprecated. Instead, use mapreduce.task.ismap
16/09/21 22:16:20 INFO deprecation: mapred.task.partition is deprecated. Instead, use mapreduce.task.partition
16/09/21 22:16:20 INFO deprecation: mapred.job.id is deprecated. Instead, use mapreduce.job.id
16/09/21 22:16:20 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
16/09/21 22:16:20 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
1.0
0.0

16/09/21 22:16:24 INFO RemoteActorRefProvider$RemotingTerminator: Shutting down remote daemon.

Process finished with exit code 0

0 0