Spark分类模型--来源Spark机器学习
来源:互联网 发布:安卓手机仿windows桌面 编辑:程序博客网 时间:2024/05/01 11:57
import org.apache.spark.mllib.classification.{ClassificationModel, LogisticRegressionWithSGD, NaiveBayes, SVMWithSGD}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.feature.StandardScaler
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.optimization.{SquaredL2Updater, Updater}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.configuration.Algo
import org.apache.spark.mllib.tree.impurity.{Entropy, Gini, Impurity}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Binary-classification walkthrough (from the book "Machine Learning with Spark").
 *
 * Loads a TSV training file from HDFS, trains four model families on it
 * (logistic regression, linear SVM, naive Bayes, decision tree), compares
 * their training-set accuracy, then explores feature standardisation and
 * parameter tuning (tree depth, NB lambda, L2 regularisation) using AUC.
 *
 * Created by zgr on 2017/3/14.
 */
object ClassificationDemo {

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("classification").setMaster("spark://10.149.252.106:7077")
    val sc = new SparkContext(sparkConf)
    try {
      // Read the raw training data and split each line on tabs.
      val rawData = sc.textFile("hdfs://10.149.252.106:9000/input/train_classification.tsv")
      val records = rawData.map(_.split("\t")) // fix: was an unnecessary `var`

      // Clean the data: strip surrounding quotes; the last column is the
      // integer 0/1 label, columns 4 .. n-2 are numeric features where "?"
      // marks a missing value (replaced with 0.0).
      val data = records.map { r =>
        val trimmed = r.map(_.replaceAll("\"", ""))
        val label = trimmed(r.size - 1).toInt
        val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
        LabeledPoint(label, Vectors.dense(features))
      }
      // fix: this RDD is traversed many times below — cache it so the file
      // is not re-read and re-parsed on every pass.
      data.cache()
      val numdata = data.count()
      print(numdata + "=================================================")

      // Naive Bayes requires non-negative feature values, so additionally
      // clamp negative features to 0.
      val nbData = records.map { r =>
        val trimmed = r.map(_.replaceAll("\"", ""))
        val label = trimmed(r.size - 1).toInt
        val features = trimmed
          .slice(4, r.size - 1)
          .map(d => if (d == "?") 0.0 else d.toDouble)
          .map(d => if (d < 0) 0.0 else d)
        LabeledPoint(label, Vectors.dense(features))
      }
      nbData.cache() // fix: also reused several times below

      // Iteration count for logistic regression / SVM, maximum depth for the tree.
      val numIterations = 10
      val maxTreeDepth = 5

      // Train one model of each family on the raw (unscaled) features.
      val logisticModel = LogisticRegressionWithSGD.train(data, numIterations)
      println("======logistic===" + logisticModel)
      val svmModel = SVMWithSGD.train(data, numIterations)
      println(svmModel)
      val nbModel = NaiveBayes.train(nbData)
      println(nbModel)
      val dtModel = DecisionTree.train(data, Algo.Classification, Entropy, maxTreeDepth)
      println(dtModel)

      // Training-set accuracy: fraction of points whose prediction matches the label.
      val lrTotalCorrect = data.map { point =>
        if (logisticModel.predict(point.features) == point.label) 1 else 0
      }.sum()
      val lrAccuracy = lrTotalCorrect / numdata
      println("**********************lrAccuracy=" + lrAccuracy + "========================================")
      // Accuracy of the remaining models.
      val svmTotalCorrect = data.map { point =>
        if (svmModel.predict(point.features) == point.label) 1 else 0
      }.sum
      val nbTotalCorrect = nbData.map { point =>
        if (nbModel.predict(point.features) == point.label) 1 else 0
      }.sum
      val dtTotalCorrect = data.map { point =>
        val score = dtModel.predict(point.features)
        // The decision tree emits a raw score; threshold it at 0.5.
        val predicted = if (score > 0.5) 1 else 0
        if (predicted == point.label) 1 else 0
      }.sum
      val svmAccuracy = svmTotalCorrect / numdata // observed: 0.5146720757268425
      val nbAccuracy = nbTotalCorrect / numdata   // observed: 0.5803921568627451
      val dtAccuracy = dtTotalCorrect / numdata   // observed: 0.6482758620689655
      println("**********************svmAccuracy=" + svmAccuracy + "========================================")
      println("**********************nbAccuracy=" + nbAccuracy + "========================================")
      println("**********************dtAccuracy=" + dtAccuracy + "========================================")

      // --- Improving performance: feature standardisation ------------------
      // Represent the feature vectors as a distributed RowMatrix; its column
      // summary exposes per-column statistics (min, max, mean, variance, ...)
      // used here for inspection.
      val vectors = data.map(lp => lp.features)
      val matrix = new RowMatrix(vectors)
      val matrixSummary = matrix.computeColumnSummaryStatistics()

      // Standardise every feature to zero mean and unit standard deviation:
      // subtract the column mean, then divide by the column standard deviation
      // (the square root of the column variance).
      // StandardScaler(withMean, withStd) controls the two steps independently.
      val scaler = new StandardScaler(withMean = true, withStd = true).fit(vectors)
      val scaledData = data.map(lp => LabeledPoint(lp.label, scaler.transform(lp.features)))
      scaledData.cache() // fix: reused for training, accuracy and metrics passes

      // Retrain logistic regression on the standardised data (decision trees
      // and naive Bayes are not affected by standardisation).
      val lrModelScaled = LogisticRegressionWithSGD.train(scaledData, numIterations)
      val lrTotalCorrectScaled = scaledData.map { point =>
        if (lrModelScaled.predict(point.features) == point.label) 1 else 0
      }.sum
      val lrAccuracyScaled = lrTotalCorrectScaled / numdata
      println("**********************标准话前:=" + lrAccuracy + "========================================")
      println("=====================标准化后logistic:" + lrAccuracyScaled)
      val lrPredictionsVsTrue = scaledData.map { point =>
        (lrModelScaled.predict(point.features), point.label)
      }
      val lrMetricsScaled = new BinaryClassificationMetrics(lrPredictionsVsTrue)
      val lrPr = lrMetricsScaled.areaUnderPR
      val lrRoc = lrMetricsScaled.areaUnderROC
      println(f"${lrModelScaled.getClass.getSimpleName}\nAccuracy: ${lrAccuracyScaled * 100}%2.4f%%\nArea under PR: ${lrPr * 100.0}%2.4f%%\nArea under ROC: ${lrRoc * 100.0}%2.4f%%")

      // Impact of the categorical feature in column 3, 1-of-K encoded
      // (14 distinct categories observed) — kept disabled as in the original:
      // val categories = records.map(r => r(3)).distinct.collect.zipWithIndex.toMap
      // val numCategories = categories.size // numCategories: Int = 14
      // val dataCategories = records.map { r =>
      //   val trimmed = r.map(_.replaceAll("\"", ""))
      //   val label = trimmed(r.size - 1).toInt
      //   val categoryIdx = categories(r(3))
      //   val categoryFeatures = Array.ofDim[Double](numCategories)
      //   categoryFeatures(categoryIdx) = 1.0
      //   val otherFeatures = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
      //   val features = categoryFeatures ++ otherFeatures
      //   LabeledPoint(label, Vectors.dense(features))
      // }
      // println("===========================dataCategories.first:" + dataCategories.first)

      // --- Parameter tuning -------------------------------------------------
      // Decision tree: effect of the maximum depth, Entropy impurity.
      val dtResultsEntropy = Seq(1, 2, 3, 4, 5, 10, 20).map { param =>
        val model = trainDTWithParams(data, param, Entropy)
        val scoreAndLabels = data.map { point =>
          val score = model.predict(point.features)
          (if (score > 0.5) 1.0 else 0.0, point.label)
        }
        val metrics = new BinaryClassificationMetrics(scoreAndLabels)
        (s"$param tree depth", metrics.areaUnderROC)
      }
      dtResultsEntropy.foreach { case (param, auc) =>
        println(f"$param, AUC = ${auc * 100}%2.2f%%")
      }
      // Same depth sweep using Gini impurity.
      val dtResultsGini = Seq(1, 2, 3, 4, 5, 10, 20).map { param =>
        val model = trainDTWithParams(data, param, Gini)
        val scoreAndLabels = data.map { point =>
          val score = model.predict(point.features)
          (if (score > 0.5) 1.0 else 0.0, point.label)
        }
        val metrics = new BinaryClassificationMetrics(scoreAndLabels)
        (s"$param tree depth", metrics.areaUnderROC)
      }
      dtResultsGini.foreach { case (param, auc) =>
        println(f"$param, AUC = ${auc * 100}%2.2f%%")
      }
      // Naive Bayes: effect of the additive-smoothing parameter lambda.
      val nbResults = Seq(0.001, 0.01, 0.1, 1.0, 10.0).map { param =>
        val model = trainNBWithParams(nbData, param)
        val scoreAndLabels = nbData.map { point =>
          (model.predict(point.features), point.label)
        }
        val metrics = new BinaryClassificationMetrics(scoreAndLabels)
        (s"$param lambda", metrics.areaUnderROC)
      }
      nbResults.foreach { case (param, auc) =>
        println(f"$param, AUC = ${auc * 100}%2.2f%%")
      }
      println("============================贝叶斯调参=======================================")

      // --- Train/test evaluation --------------------------------------------
      // 60% / 40% train/test split with a fixed seed, then sweep the L2
      // regularisation parameter of logistic regression on the held-out test set.
      val trainTestSplit = data.randomSplit(Array(0.6, 0.4), 123)
      val train = trainTestSplit(0)
      val test = trainTestSplit(1)
      val regResultsTest = Seq(0.0, 0.001, 0.0025, 0.005, 0.01).map { param =>
        val model = trainWithParams(train, param, numIterations, new SquaredL2Updater, 1.0)
        createMetrics(s"$param L2 regularization parameter", test, model)
      }
      regResultsTest.foreach { case (param, auc) =>
        println(f"$param, AUC = ${auc * 100}%2.6f%%")
      }
      println("============================测试集=========================")
      // Observed output: AUC = 50.168509% for every regParam value above.

      // Same sweep evaluated on the training set (observed AUC = 50.124190%
      // for every value) — kept disabled as in the original:
      // val regResultsTrain = Seq(0.0, 0.001, 0.0025, 0.005, 0.01).map { param =>
      //   val model = trainWithParams(train, param, numIterations, new SquaredL2Updater, 1.0)
      //   createMetrics(s"$param L2 regularization parameter", train, model)
      // }
      // regResultsTrain.foreach { case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.6f%%") }
      // println("============================训练集=========================")
    } finally {
      sc.stop() // fix: the original never released the SparkContext
    }
  }

  /**
   * Trains a logistic-regression model with explicit optimiser settings.
   *
   * @param regParam      regularisation strength
   * @param numIterations number of SGD iterations
   * @param updater       gradient updater (e.g. [[SquaredL2Updater]] for L2)
   * @param stepSize      SGD step size
   */
  def trainWithParams(input: RDD[LabeledPoint], regParam: Double, numIterations: Int,
                      updater: Updater, stepSize: Double) = {
    val lr = new LogisticRegressionWithSGD
    lr.optimizer
      .setNumIterations(numIterations)
      .setUpdater(updater)
      .setRegParam(regParam)
      .setStepSize(stepSize)
    lr.run(input)
  }

  /** Scores `data` with `model` and returns (label, area under the ROC curve). */
  def createMetrics(label: String, data: RDD[LabeledPoint], model: ClassificationModel) = {
    val scoreAndLabels = data.map { point =>
      (model.predict(point.features), point.label)
    }
    // fix: was a misspelt, never-reassigned `var metries`
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    (label, metrics.areaUnderROC())
  }

  /** Trains a classification decision tree with the given maximum depth and impurity. */
  def trainDTWithParams(input: RDD[LabeledPoint], maxDepth: Int, impurity: Impurity) =
    DecisionTree.train(input, Algo.Classification, impurity, maxDepth)

  /** Trains a naive Bayes model with additive-smoothing parameter `lambda`. */
  def trainNBWithParams(input: RDD[LabeledPoint], lambda: Double) = {
    val nb = new NaiveBayes
    nb.setLambda(lambda)
    nb.run(input)
  }
}
0 0
- Spark分类模型--来源Spark机器学习
- Spark聚类模型K-Means----来源Spark机器学习
- spark机器学习笔记:(四)用Spark Python构建分类模型(上)
- spark机器学习笔记:(五)用Spark Python构建分类模型(下)
- spark机器学习笔记:(五)用Spark Python构建分类模型(下)
- Spark机器学习之分类与回归
- spark机器学习库之决策树分类
- Spark构建分类模型
- spark机器学习构建回归模型
- Spark-机器学习模型持久化
- Spark的学习资料来源
- MLlib分类算法实战演练--Spark学习(机器学习)
- MLlib分类算法实战演练--Spark学习(机器学习)
- MLlib分类算法实战演练--Spark学习(机器学习)
- 分类解读Spark下的39个机器学习库
- Spark 机器学习实践 :Iris数据集的分类
- spark学习笔记-spark上做kaggle的机器学习分类任务
- Spark 机器学习《一》
- 数据库应用优化(1)
- 《一本书读懂TCP/IP》读后感——第四章TCP/IP网络机构
- python 生成器和lambda的故事
- linux下的压缩与解压命令以及磁盘加载
- 二分法查询(某个网站的面试题)
- Spark分类模型--来源Spark机器学习
- 数据库应用优化(2)
- CppPrimer--顶层const与底层const
- linux网络编程之用select函数实现io复用(基于TCP)引发的思考
- PyCharm 使用 tricks
- python AES对称加密示例
- 学习算法(3)——查找2个数组中的相同元素
- Codeforces Round #402 (Div. 2) A题
- Spark聚类模型K-Means----来源Spark机器学习