Spark Clustering with K-Means (from Machine Learning with Spark)
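
This post walks through clustering the MovieLens 100k movies with MLlib's K-means: it loads the movie and genre metadata, trains an ALS recommendation model to obtain latent factor vectors for movies and users, runs K-means on those factor vectors, inspects the clusters by listing the movies closest to each centre, and finally tunes K by comparing the WCSS (within-cluster sum of squares) on a held-out split.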

import breeze.linalg.DenseVector
import breeze.numerics.pow
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.recommendation.{ALS, Rating}
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by zgr on 2017/3/14.
  */
object Clusters {

  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("Clusters").setMaster("spark://10.149.252.106:7077")
    val sc = new SparkContext(sparkConf)

    val movies = sc.textFile("hdfs://10.149.252.106:9000/input/ml-100k/u.item")
    println("====================================================================")
    println(movies.first)
    // 1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0

    val genres = sc.textFile("hdfs://10.149.252.106:9000/input/ml-100k/u.genre")
    genres.take(5).foreach(println)
    // u.genre lines have the form "genre|index"; build a map of index -> genre name
    val genreMap = genres.filter(!_.isEmpty).map(_.split("\\|")).map(array => (array(1), array(0))).collectAsMap()
    println(genreMap)

    // Build a new RDD from the movie data and the genre mapping, containing
    // movie ID, title and the movie's assigned genres
    val titlesAndGenres = movies.map(_.split("\\|")).map { array =>
      val genres = array.toSeq.slice(5, array.size)
      val genresAssigned = genres.zipWithIndex.filter { case (g, idx) =>
        g == "1" // the genre flags are Strings, so compare against "1"
      }.map { case (g, idx) =>
        genreMap(idx.toString)
      }
      (array(0).toInt, (array(1), genresAssigned))
    }
    // println(titlesAndGenres.first())

    // Run an ALS model to generate the movie and user factors
    val rawData = sc.textFile("hdfs://10.149.252.106:9000/input/ml-100k/u.data")
    val rawRatings = rawData.map(_.split("\t").take(3))
    val ratings = rawRatings.map { case Array(user, movie, rating) =>
      Rating(user.toInt, movie.toInt, rating.toDouble)
    }
    ratings.cache()
    val alsModel = ALS.train(ratings, 50, 10, 0.01)

    // The Alternating Least Squares (ALS) model returns two key-value RDDs
    // (userFeatures and productFeatures), keyed by user ID or movie ID, with
    // the latent factors as values. Extract the factors and convert them to
    // MLlib Vectors to serve as training input for the clustering model.
    val movieFactors = alsModel.productFeatures.map { case (id, factor) => (id, Vectors.dense(factor)) }
    val movieVectors = movieFactors.map(_._2) // just the Vectors.dense(factor) part
    val userFactors = alsModel.userFeatures.map { case (id, factor) => (id, Vectors.dense(factor)) }
    val userVectors = userFactors.map(_._2)

    // Check whether the data needs normalisation: compute column summary
    // statistics for the two factor matrices
    val movieMatrix = new RowMatrix(movieVectors)
    val userMatrix = new RowMatrix(userVectors)
    val movieMatrixSummary = movieMatrix.computeColumnSummaryStatistics()
    val userMatrixSummary = userMatrix.computeColumnSummaryStatistics()
    // Print the per-column mean and variance
    println("Movie factors mean: " + movieMatrixSummary.mean)
    println("Movie factors variance: " + movieMatrixSummary.variance)
    println("User factors mean: " + userMatrixSummary.mean)
    println("User factors variance: " + userMatrixSummary.variance)

    // Train the K-means clustering models
    val numClusters = 5    // K
    val numIterations = 10 // number of iterations
    val numRuns = 3        // number of training runs
    // Run K-means on the movie factor vectors
    val movieClusterModel = KMeans.train(movieVectors, numClusters, numIterations, numRuns)
    // Train a K-means model on the user factor vectors
    val userClusterModel = KMeans.train(userVectors, numClusters, numIterations, numRuns)

    // Use the trained K-means model to make predictions
    val movie1 = movieVectors.first
    val movieCluster = movieClusterModel.predict(movie1)
    println(movieCluster)
    val predictions = movieClusterModel.predict(movieVectors)
    println(predictions.take(10).mkString(","))

    // The objective K-means minimises is the sum of squared Euclidean
    // distances from each sample to its cluster centre.
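    // Sanity check (an illustrative addition, not part of the book's
    // listing): the cluster predicted for movie1 should be the centre
    // nearest to it under computeDistance.
    val distsToCentres = movieClusterModel.clusterCenters.map { centre =>
      computeDistance(DenseVector(centre.toArray), DenseVector(movie1.toArray))
    }
    println(s"Nearest centre: ${distsToCentres.zipWithIndex.minBy(_._1)._2}, predicted: $movieCluster")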
    // For each movie, compute the distance between its factor vector and the
    // centre of its assigned cluster
    val titlesWithFactors = titlesAndGenres.join(movieFactors)
    val moviesAssigned = titlesWithFactors.map { case (id, ((title, genres), vector)) =>
      val pred = movieClusterModel.predict(vector)
      val clusterCentre = movieClusterModel.clusterCenters(pred)
      val dist = computeDistance(DenseVector(clusterCentre.toArray), DenseVector(vector.toArray))
      // per movie: ID, title, genres, cluster index, distance to the centre
      (id, title, genres.mkString(" "), pred, dist)
    }
    // Group the movies by assigned cluster
    val clusterAssignments = moviesAssigned.groupBy { case (id, title, genres, cluster, dist) =>
      cluster
    }.collectAsMap

    // Enumerate the clusters and print the 20 movies closest to each centre
    for ((k, v) <- clusterAssignments.toSeq.sortBy(_._1)) {
      println(s"Cluster $k:")
      val m = v.toSeq.sortBy(_._5) // sort by distance to the centre
      println(m.take(20).map { case (_, title, genres, _, d) => (title, genres, d) }.mkString("\n"))
      println("======================================================")
    }

    // MLlib's computeCost conveniently computes the WCSS for a given RDD[Vector]
    val movieCost = movieClusterModel.computeCost(movieVectors)
    val userCost = userClusterModel.computeCost(userVectors)
    println("WCSS for movies: " + movieCost)
    println("WCSS for users: " + userCost)

    // Tuning the clustering model: for K-means this means varying K.
    // Cross-validation for the movie clustering
    val trainTestSplitMovies = movieVectors.randomSplit(Array(0.6, 0.4), 123)
    val trainMovies = trainTestSplitMovies(0) // training set
    val testMovies = trainTestSplitMovies(1)  // test set
    val costsMovies = Seq(2, 3, 4, 5, 10, 20).map { k =>
      (k, KMeans.train(trainMovies, k, numIterations, numRuns).computeCost(testMovies))
    }
    println("Movie clustering cross-validation:")
    costsMovies.foreach { case (k, cost) => println(f"WCSS for K=$k is $cost%2.2f") }

    // Cross-validation for the user clustering
    val trainTestSplitUsers = userVectors.randomSplit(Array(0.6, 0.4), 123)
    val trainUsers = trainTestSplitUsers(0)
    val testUsers = trainTestSplitUsers(1)
    val costsUsers = Seq(2, 3, 4, 5, 10, 20).map { k =>
      (k, KMeans.train(trainUsers, k, numIterations, numRuns).computeCost(testUsers))
    }
    println("User clustering cross-validation:")
    costsUsers.foreach { case (k, cost) => println(f"WCSS for K=$k is $cost%2.2f") }
  }

  // Squared Euclidean distance between two vectors
  def computeDistance(v1: DenseVector[Double], v2: DenseVector[Double]): Double =
    pow(v1 - v2, 2).sum
}
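
Note that computeDistance returns the squared Euclidean distance (no square root), which is exactly the per-point quantity K-means minimises. A minimal standalone sketch of its behaviour on toy vectors, using only Breeze:

import breeze.linalg.DenseVector
import breeze.numerics.pow

object DistanceCheck {
  // Same helper as in the listing above
  def computeDistance(v1: DenseVector[Double], v2: DenseVector[Double]): Double =
    pow(v1 - v2, 2).sum

  def main(args: Array[String]) {
    val v1 = DenseVector(1.0, 2.0, 3.0)
    val v2 = DenseVector(2.0, 4.0, 6.0)
    // (1-2)^2 + (2-4)^2 + (3-6)^2 = 1 + 4 + 9
    println(computeDistance(v1, v2)) // 14.0
  }
}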
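
For reference, the WCSS that computeCost reports for a K-means model is

WCSS = \sum_{j=1}^{K} \sum_{x_i \in C_j} \lVert x_i - \mu_j \rVert^2

where C_j is the set of points assigned to cluster j and \mu_j is its centre. Because WCSS keeps decreasing as K grows, the cross-validation output above is usually read with the "elbow" heuristic: pick the K after which the held-out WCSS stops improving sharply, rather than simply the K with the lowest cost.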
