Spark聚类模型K-Means----来源Spark机器学习
来源:互联网 发布:安卓手机仿windows桌面 编辑:程序博客网 时间:2024/05/01 16:22
import breeze.linalg.DenseVector
import breeze.numerics.pow
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.recommendation.{ALS, Rating}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Created by zgr on 2017/3/14.
 *
 * K-Means clustering example on the MovieLens 100k files (`u.item`, `u.genre`, `u.data`).
 *
 * The job trains an ALS model on the ratings, uses the resulting movie and user
 * factor vectors as K-Means input, prints the movies closest to each cluster centre,
 * and finally compares the WCSS obtained for several values of K on a held-out split.
 */
object Clusters {

  /**
   * Runs the whole pipeline against the hard-coded Spark master and HDFS paths.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setAppName("Clusters")
      .setMaster("spark://10.149.252.106:7077")
    val sc = new SparkContext(sparkConf)

    // Always release the context, even when one of the stages below fails.
    try {
      val movies = sc.textFile("hdfs://10.149.252.106:9000/input/ml-100k/u.item")
      println("====================================================================")
      println(movies.first())
      // Sample line:
      // 1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0

      val genres = sc.textFile("hdfs://10.149.252.106:9000/input/ml-100k/u.genre")
      genres.take(5).foreach(println)

      // Each non-empty u.genre line is split on '|'; the second field becomes the
      // key and the first field the value (genre index -> genre name).
      val genreMap = genres
        .filter(!_.isEmpty)
        .map(_.split("\\|"))
        .map(fields => (fields(1), fields(0)))
        .collectAsMap()
      println(genreMap)

      // Build an RDD keyed by movie id holding (title, assigned genre names).
      val titlesAndGenres = movies.map(_.split("\\|")).map { fields =>
        // Everything from column 5 onwards is a per-genre flag.
        val genreFlags = fields.toSeq.slice(5, fields.size)
        // The flags are Strings, so they must be compared with the String "1";
        // comparing against the Char '1' is always false and drops every genre.
        val genresAssigned = genreFlags.zipWithIndex.collect {
          case (flag, idx) if flag == "1" => genreMap(idx.toString)
        }
        (fields(0).toInt, (fields(1), genresAssigned))
      }
      // println(titlesAndGenres.first())

      // Run the ALS model to generate movie and user factors.
      val rawData = sc.textFile("hdfs://10.149.252.106:9000/input/ml-100k/u.data")
      val rawRatings = rawData.map(_.split("\t").take(3))
      val ratings = rawRatings.map { case Array(user, movie, rating) =>
        Rating(user.toInt, movie.toInt, rating.toDouble)
      }
      ratings.cache()
      val alsModel = ALS.train(ratings, 50, 10, 0.01)

      // The ALS model exposes two key-value RDDs (userFeatures and productFeatures),
      // keyed by user id / movie id with the factor array as value. Convert the
      // factors to MLlib vectors so they can be used as clustering input.
      val movieFactors = alsModel.productFeatures.map { case (id, factor) =>
        (id, Vectors.dense(factor))
      }
      val movieVectors = movieFactors.map(_._2)
      val userFactors = alsModel.userFeatures.map { case (id, factor) =>
        (id, Vectors.dense(factor))
      }
      val userVectors = userFactors.map(_._2)

      // Inspect the per-column statistics to judge whether normalisation is needed.
      val movieMatrix = new RowMatrix(movieVectors)
      val userMatrix = new RowMatrix(userVectors)
      val movieMatrixSummary = movieMatrix.computeColumnSummaryStatistics()
      val userMatrixSummary = userMatrix.computeColumnSummaryStatistics()
      println("Movie factors mean: " + movieMatrixSummary.mean)
      println("Movie factors variance: " + movieMatrixSummary.variance)
      println("User factors mean: " + userMatrixSummary.mean)
      println("User factors variance: " + userMatrixSummary.variance)

      // Train the K-Means models.
      val numClusters = 5    // K
      val numIterations = 10 // maximum number of iterations
      val numRuns = 3        // number of training runs
      // K-Means on the movie factor vectors.
      val movieClusterModel = KMeans.train(movieVectors, numClusters, numIterations, numRuns)
      // K-Means on the user factor vectors.
      val userClusterModel = KMeans.train(userVectors, numClusters, numIterations, numRuns)

      // Predict with the trained model: first a single vector, then the whole RDD.
      val movie1 = movieVectors.first()
      val movieCluster = movieClusterModel.predict(movie1)
      println(movieCluster)
      val predictions = movieClusterModel.predict(movieVectors)
      println(predictions.take(10).mkString(","))

      // K-Means minimises the sum of squared distances between each sample and its
      // cluster centre. For every movie, compute the (squared) distance between its
      // factor vector and the centre of the cluster it is assigned to.
      val titlesWithFactors = titlesAndGenres.join(movieFactors)
      // Resulting tuple: (movie id, title, genres, cluster index, distance to centre).
      val moviesAssigned = titlesWithFactors.map { case (id, ((title, genres), vector)) =>
        val pred = movieClusterModel.predict(vector)
        val clusterCentre = movieClusterModel.clusterCenters(pred)
        val dist = computeDistance(
          DenseVector(clusterCentre.toArray),
          DenseVector(vector.toArray))
        (id, title, genres.mkString(" "), pred, dist)
      }
      // Group the movies by the cluster they were assigned to.
      val clusterAssignments = moviesAssigned
        .groupBy { case (_, _, _, cluster, _) => cluster }
        .collectAsMap()

      // For each cluster, print the 20 movies closest to the cluster centre.
      for ((k, v) <- clusterAssignments.toSeq.sortBy(_._1)) {
        println(s"Cluster $k:")
        val m = v.toSeq.sortBy(_._5) // order by distance to the centre
        println(m.take(20).map { case (_, title, genres, _, d) => (title, genres, d) }.mkString("\n"))
        println("======================================================")
      }

      // computeCost returns the WCSS (within-cluster sum of squares) for an RDD[Vector].
      val movieCost = movieClusterModel.computeCost(movieVectors)
      val userCost = userClusterModel.computeCost(userVectors)
      println("WCSS for movies: " + movieCost)
      println("WCSS for users: " + userCost)

      // Parameter tuning: for K-Means this means varying K.
      // Hold-out validation for the movie clustering.
      val trainTestSplitMovies = movieVectors.randomSplit(Array(0.6, 0.4), 123)
      val trainMovies = trainTestSplitMovies(0) // training set
      val testMovies = trainTestSplitMovies(1)  // test set
      val costsMovies = Seq(2, 3, 4, 5, 10, 20).map { k =>
        (k, KMeans.train(trainMovies, k, numIterations, numRuns).computeCost(testMovies))
      }
      println("Movie clustering cross-validation:")
      costsMovies.foreach { case (k, cost) => println(f"WCSS for K=$k id $cost%2.2f") }

      // Hold-out validation for the user clustering.
      val trainTestSplitUsers = userVectors.randomSplit(Array(0.6, 0.4), 123)
      val trainUsers = trainTestSplitUsers(0)
      val testUsers = trainTestSplitUsers(1)
      val costsUsers = Seq(2, 3, 4, 5, 10, 20).map { k =>
        // Argument order is (data, k, maxIterations, runs); passing numIterations
        // before k would train 10 clusters every time and never vary K.
        (k, KMeans.train(trainUsers, k, numIterations, numRuns).computeCost(testUsers))
      }
      println("User clustering cross-validation:")
      costsUsers.foreach { case (k, cost) => println(f"WCSS for K=$k id $cost%2.2f") }
    } finally {
      sc.stop()
    }
  }

  /**
   * Sum of the squared element-wise differences between two vectors, i.e. the
   * squared Euclidean distance (no square root is taken).
   *
   * @param v1 first vector
   * @param v2 second vector
   * @return the sum over all components of `(v1 - v2)` squared
   */
  def computeDistance(v1: DenseVector[Double], v2: DenseVector[Double]): Double =
    pow(v1 - v2, 2).sum
}
0 0
- Spark聚类模型K-Means----来源Spark机器学习
- Spark分类模型--来源Spark机器学习
- 机器学习算法之K-means-spark
- 【Spark 机器学习】K-means聚类算法(理论篇)
- Spark 机器学习-实例演示- K-Means《二》
- Spark 机器学习-实例演示- K-Means《二》
- 机器学习预研spark.ml-K-Means
- Spark K-Means
- Spark-K-Means算法
- 初试Spark之K-Means聚类算法实现
- Spark MLlib之K-Means聚类算法
- 初试Spark之K-Means聚类算法实现
- spark平台 mllib K-Means聚类算法 实现
- 机器学习:k-Means聚类算法
- 机器学习----聚类之k-means
- 机器学习-K-means聚类算法
- <转>Spark机器学习6·聚类模型
- spark机器学习笔记:(七)用Spark Python构建聚类模型
- linux网络编程之用select函数实现io复用(基于TCP)引发的思考
- PyCharm 使用 tricks
- python AES对称加密示例
- 学习算法(3)——查找2个数组中的相同元素
- Codeforces Round #402 (Div. 2) A题
- Spark聚类模型K-Means----来源Spark机器学习
- python爬虫设置请求消息头(headers)
- CppPrimer--数组名与指针& 函数名与函数指针
- 95. Unique Binary Search Trees II
- Android-TextView及其子类(TextView、EditText、Button)
- 随记
- Android事件传递机制
- 解决Python的print中文在windows命令行乱码问题
- js--函数知识点