【scala】【spark】基于随机梯度下降的简单线性回归编程实现:Linear Regression - SGD

来源:互联网 发布:梦幻西游175魔化生数据 编辑:程序博客网 时间:2024/05/08 02:05

基于 Spark + Scala 编程实现基于随机梯度下降的简单线性回归算法;

数据集来自于 Spark 源码包;

调用 org.jblas.DoubleMatrix 包;

实现过程避免直接调用 Spark.MLlib 或 Spark.ml ,旨在深入理解“线性回归 - 随机梯度下降法”的基本原理。

package org.lily.optimization.test

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.MLUtils
import org.jblas.DoubleMatrix
import scala.collection.mutable.ArrayBuffer

/** Simple linear regression trained with stochastic gradient descent (SGD),
  * implemented directly on top of RDDs + jblas (deliberately avoiding
  * spark.mllib / spark.ml regression APIs) to expose the algorithm's mechanics.
  *
  * Update rule per randomly sampled point i:  w += alpha * (y_i - w.x_i) * x_i
  * After each step the full-dataset loss (1/2n) * sum((y_j - w.x_j)^2) is
  * recomputed; training stops early once loss <= 1.0 or after numIterations.
  */
object LinearRegressionSGDDemo {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("simpletest")
    val sc = new SparkContext(conf)

    // Load training data in LIBSVM format.
    // NOTE(review): hard-coded Windows path from the Spark source distribution's
    // sample data — adjust for your environment.
    val sourcedata = MLUtils.loadLibSVMFile(sc, "D:/data/mllib/sample_linear_regression_data.txt")

    // Hyper-parameters.
    val addIntercept: Boolean = true // prepend a constant-1 feature so the model learns a bias term
    val alpha = 0.01                 // learning rate
    val numIterations = 2000         // maximum number of SGD iterations
    var loss = 10.0                  // current loss; initialised above the early-stop threshold (1.0)

    // Convert each LabeledPoint to (label, featureArray); when addIntercept is
    // set, prepend the constant 1.0 so the intercept is just another weight.
    val data = if (addIntercept) { // data: RDD[(Double, Array[Double])]
      sourcedata.map(x => (x.label, 1.0 +: x.features.toArray))
    } else {
      sourcedata.map(x => (x.label, x.features.toArray))
    }

    // Initialise the weight vector to zeros (one extra slot for the intercept
    // when enabled). For the sample data numFeatures = 10.
    val numFeatures = sourcedata.first().features.toArray.length
    val initialWeights = new Array[Double](numFeatures)
    val initialWeightsWithIntercept = if (addIntercept) {
      0.0 +: initialWeights
    } else {
      initialWeights
    }

    val numExamples = data.count().toInt // sample count (501 for the sample data)
    var weights = new DoubleMatrix(initialWeightsWithIntercept.length, 1, initialWeightsWithIntercept: _*)
    println("initial weights: " + weights)

    // Materialise the (small) training set on the driver so SGD can index
    // arbitrary rows; this only works because the demo dataset fits in memory.
    val label = data.map(x => x._1).collect()    // Array[Double], length numExamples
    val features = data.map(x => x._2).collect() // Array[Array[Double]], numExamples rows

    // FIX: create the RNG once instead of `new util.Random` inside every
    // iteration — re-instantiating per step is wasteful and the behaviour is
    // otherwise identical (uniform draw in [0, numExamples)).
    val rng = new util.Random

    var hypothesis = 0.0
    var midError = 0.0

    // SGD loop with early stopping once the loss drops to 1.0 or below.
    for (k <- 0 until numIterations if loss > 1.0) {
      // Pick one random training example.
      val i = rng.nextInt(numExamples)
      val variable = new DoubleMatrix(features(i).length, 1, features(i): _*)

      // Single-point gradient step: w += alpha * (y_i - w.x_i) * x_i
      hypothesis = variable.dot(weights)
      midError = label(i) - hypothesis
      weights = weights.add(variable.mul(alpha * midError))
      println("The current weights: " + weights)

      // Recompute the full-dataset loss: (1/2n) * sum of squared residuals.
      // FIX: compute each residual once instead of calling weights.dot twice.
      var cacheLoss = 0.0
      for (j <- 0 until numExamples) {
        val multiplier = new DoubleMatrix(features(j).length, 1, features(j): _*)
        val residual = label(j) - weights.dot(multiplier)
        cacheLoss += residual * residual
      }
      loss = 0.5 * cacheLoss / numExamples
      println("The current loss: " + loss)
    }

    sc.stop()
  }
}

参考文章1:http://blog.csdn.net/yangguo_2011/article/details/33859337

参考文章2:http://blog.csdn.net/springlustre/article/details/48828507

1 0