【Mastering Machine Learning with scikit-learn (python+spark版)】Chapter2 Linear Regression
来源:互联网 发布:java闰年判断case 编辑:程序博客网 时间:2024/05/18 16:16
源码下载地址:
https://www.packtpub.com/big-data-and-business-intelligence/mastering-machine-learning-scikit-learn
启动ipython notebook:
cd E:\DM\bookcode\mastering-machine-learning-scikit-learn
ipython notebook
python版预测pizza直径与价格的关系:
from sklearn.linear_model import LinearRegression

# Training data: one single-feature row per pizza in X, one price row per pizza in y.
X = [[6], [8], [10], [14], [18]]
y = [[7], [9], [13], [17.5], [18]]

# Create and fit the model
model = LinearRegression()
model.fit(X, y)

# predict() takes a 2-D array of samples, so the single 12" sample is wrapped
# as [[12]]. Because y is 2-D the prediction is 2-D as well; [0][0] extracts
# the scalar price. The parenthesised print works on Python 2 and Python 3.
print('A 12" pizza should cost: $%.2f' % model.predict([[12]])[0][0])
spark版预测pizza直径与价格的关系:
package com.bbw5.ml.spark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.{ LinearRegressionModel => LRModel }
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SQLContext
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.PolynomialExpansion

/**
 * Pizza price plotted against diameter
 * X=Diameter in inches
 * Y=Price in dollars
 */
object SparkLinearRegresionSample1 {

  /**
   * Runs three experiments with the ML (DataFrame) API: simple linear
   * regression on one feature, multiple linear regression on two features,
   * and polynomial regression on the expanded two-feature data.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SparkLinearRegresionSample1")
    val sc = new SparkContext(sparkConf)
    try {
      val sqlContext = new SQLContext(sc)
      import sqlContext.implicits._

      // Builds a two-column DataFrame (`featuresCol`, "label") from parallel
      // sequences of feature rows and labels.
      def toFrame(features: Seq[Array[Double]], labels: Seq[Double], featuresCol: String): DataFrame = {
        val rows = features.zip(labels).map { case (row, label) => Vectors.dense(row) -> label }
        sc.makeRDD(rows).toDF(featuresCol, "label")
      }

      val X = Seq(Array(6.0), Array(8.0), Array(10.0), Array(14.0), Array(18.0))
      val y = Seq(7.0, 9.0, 13.0, 17.5, 18.0)
      val X_test = Seq(Array(8.0), Array(9.0), Array(11.0), Array(16.0), Array(12.0))
      val y_test = Seq(11.0, 8.5, 15.0, 18.0, 11.0)

      // simple linear regression
      train4ML(toFrame(X, y, "features"), toFrame(X_test, y_test, "features"))

      val X2 = Seq(Array(6.0, 2.0), Array(8.0, 1.0), Array(10.0, 0.0), Array(14.0, 2.0), Array(18.0, 0.0))
      val y2 = Seq(7.0, 9.0, 13.0, 17.5, 18.0)
      val X2_test = Seq(Array(8.0, 2.0), Array(9.0, 0.0), Array(11.0, 2.0), Array(16.0, 2.0), Array(12.0, 0.0))
      val y2_test = Seq(11.0, 8.5, 15.0, 18.0, 11.0)

      // multiple linear regression
      train4ML(toFrame(X2, y2, "features"), toFrame(X2_test, y2_test, "features"))

      // polynomial linear regression: the raw features go into "lastFeatures"
      // so that the expansion can write its output to "features".
      val degree = 5
      val polynomialExpansion = new PolynomialExpansion()
        .setInputCol("lastFeatures")
        .setOutputCol("features")
        .setDegree(degree)
      val polyTraining = polynomialExpansion.transform(toFrame(X2, y2, "lastFeatures"))
      val polyTesting = polynomialExpansion.transform(toFrame(X2_test, y2_test, "lastFeatures"))
      train4ML(polyTraining, polyTesting)
    } finally {
      // Release the context even when one of the experiments throws.
      sc.stop()
    }
  }

  /**
   * Fits a linear regression with the ML API (maxIter = 10, regParam = 0.3),
   * prints the training summary, then prints the prediction for the first
   * test row and the rmse / mse / r2 / mae metrics on the test set.
   *
   * @param training DataFrame with "features" and "label" columns
   * @param testing  DataFrame with "features" and "label" columns
   */
  def train4ML(training: DataFrame, testing: DataFrame): Unit = {
    val lr = new LinearRegression().setMaxIter(10).setRegParam(0.3)
    val lrModel = lr.fit(training)

    // Print the coefficients and intercept, then summarize the model over
    // the training set and print out some metrics.
    val trainingSummary = lrModel.summary
    println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")
    println(s"numIterations: ${trainingSummary.totalIterations}")
    println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}")
    trainingSummary.residuals.show()
    println(s"train rmse: ${trainingSummary.rootMeanSquaredError}")
    println(s"train r2: ${trainingSummary.r2}")
    //println(s"train pValues: ${trainingSummary.pValues}")
    //println(s"train tValues: ${trainingSummary.tValues}")

    // predict
    val predictDF = lrModel.transform(testing)
    // The explicit type argument matters: without it the compiler is expected
    // to infer getAs[Nothing], whose cast would fail at runtime.
    println("A 8\" pizza should cost: " + predictDF.first().getAs[Double]("prediction"))

    val evaluator = new RegressionEvaluator()
    for (metric <- Seq("rmse", "mse", "r2", "mae")) {
      println(s"test $metric:" + evaluator.setMetricName(metric).evaluate(predictDF))
    }
  }

  /**
   * Trains with the ML API over a small grid of maxIter and regParam values
   * and prints ((maxIter, regParam), training RMSE, training r2) for every
   * combination, ordered by RMSE from largest to smallest.
   *
   * @param training DataFrame with "features" and "label" columns
   */
  def train4ML2(training: DataFrame): Unit = {
    val evaluations = for {
      iter <- Array(1, 5, 10)
      lambda <- Array(0.0001, 0.01, 1.0)
    } yield {
      val summary = new LinearRegression().setMaxIter(iter).setRegParam(lambda).fit(training).summary
      ((iter, lambda), summary.rootMeanSquaredError, summary.r2)
    }
    evaluations.sortBy(_._2).reverse.foreach(println)
  }

  /** Placeholder; intentionally does nothing. */
  def train4ML3(training: DataFrame, testing: DataFrame): Unit = {
  }

  /**
   * Computes the mean squared error of `model` over the given points.
   *
   * @param testing labeled points to evaluate on
   * @param model   trained MLlib linear regression model
   * @return mean of (label - prediction)^2 over all points
   */
  def calculateMSE(testing: RDD[LabeledPoint], model: LRModel): Double = {
    val valuesAndPreds = testing.map { point =>
      (point.label, model.predict(point.features))
    }
    valuesAndPreds.map { case (v, p) => math.pow(v - p, 2) }.mean()
  }

  /**
   * Trains and predicts with the MLlib API.
   * The quality of the model parameters is judged by how close the predicted
   * price of a 12" pizza is to about $13.68.
   *
   * @param sc Spark context used to build the training RDD
   */
  def train4MLLib(sc: SparkContext): Unit = {
    val diameters = Array(6, 8, 10, 14, 18.toDouble)
    val prices = Array(7, 9, 13, 17.5, 18)
    val points = prices.zip(diameters).map { case (price, diameter) =>
      LabeledPoint(price, Vectors.dense(diameter))
    }
    val training = sc.makeRDD(points.toSeq)

    // Train with the default parameters (stepSize = 1).
    val defaultModel = LinearRegressionWithSGD.train(training, 100)
    println(s"weight:${defaultModel.weights},intercept:${defaultModel.intercept}")
    // The result is very poor: A 12" pizza should cost: -8.001325844886018E135
    println("A 12\" pizza should cost: " + defaultModel.predict(Vectors.dense(12)))

    // Evaluate every parameter combination by its MSE on the training data.
    val evaluations = for {
      numIterations <- Array(1, 5, 10, 100)
      stepSize <- Array(0.0001, 0.01, 1.0)
      miniBatchFraction <- Array(0.1, 0.5, 1.0)
    } yield {
      val candidate = LinearRegressionWithSGD.train(training, numIterations, stepSize, miniBatchFraction)
      ((numIterations, stepSize, miniBatchFraction), calculateMSE(training, candidate))
    }

    // Sort ascending by MSE (once) and print every combination.
    val ranked = evaluations.sortBy(_._2)
    ranked.foreach(println)

    // Retrain with the best (smallest-MSE) combination and predict.
    val (bestIterations, bestStepSize, bestFraction) = ranked.head._1
    val bestModel = LinearRegressionWithSGD.train(training, bestIterations, bestStepSize, bestFraction)
    println(s"weight:${bestModel.weights},intercept:${bestModel.intercept}")
    // A 12" pizza should cost: 13.544867556960533
    println("A 12\" pizza should cost: " + bestModel.predict(Vectors.dense(12)))
  }
}
0 0
- 【Mastering Machine Learning with scikit-learn (python+spark版)】Chapter2 Linear Regression
- chapter2 of OReilly.Hands-On.Machine.Learning.with.Scikit-Learn.and.TensorFlow
- Python Machine Learning---scikit-learn
- Machine Learning:Linear Regression With One Variable
- Machine Learning:Linear Regression With Multiple Variables
- Machine Learning - Linear Regression with One Variable
- Machine Learning - Linear Regression with Multiple Variables
- Learning Scikit-learn Machine Learning in Python
- 【machine learning】linear regression
- scikit-learn: machine learning in Python
- Machine Learning in Python (Scikit-learn)-(转)
- 【Stanford Machine Learning】Lecture 2--Linear Regression with Multiple Variables
- Machine Learning by Andrew Ng ---Linear Regression with one variable
- Machine Learning by Andrew Ng---Linear Regression with multiple variables
- Machine Learning week 1 quiz: Linear Regression with One Variable
- Machine Learning week 1 quiz: Linear Regression with One Variable
- Machine Learning week 2 quiz: Linear Regression with Multiple Variables
- Machine Learning -- Linear Regression with Multiple Variables(Andrew Ng)
- 修改oracle database中user的密码遇到ORA-28003和ORA-20001
- Spring整合RabbitMQ进行消息队列开发
- AIDL随写
- Zookeeper异常:FAILED TO WRITE PID与Permission denied
- 使用Vitamio打造自己的Android万能播放器(1)——准备
- 【Mastering Machine Learning with scikit-learn (python+spark版)】Chapter2 Linear Regression
- 如何使用echart中获取canvas绘制到自己的canvas上去
- MFC library Hierarchy Chart
- C#5.0 采用CancellationTokenSource方式取消Task
- Android事件分发机制以及滑动冲突处理
- 惭愧吧!!!
- MySQL存储引擎--MyISAM与InnoDB区别
- 从现有仓库克隆并开发项目
- 【JAVA】19、多维数组