零售户聚类改善

来源:互联网 发布:买个淘宝皇冠店多少钱 编辑:程序博客网 时间:2024/05/15 23:42

增加了特征标准化程序

package class6

import org.apache.spark.mllib.clustering.{KMeansModel, KMeans}
import org.apache.spark.mllib.feature.StandardScaler
import org.apache.spark.mllib.linalg.{Vector => MLVector, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.sql.Row
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkContext, SparkConf}

/**
 * Created by root on 16-1-22.
 *
 * Clusters retailers by their total sales quantity and total stock quantity.
 *
 * Original author's notes (translated):
 *  - Joining the two tables produced overlapping data; consider reading one
 *    table at a time and using the RDD join method to build the feature matrix.
 *  - A recorded run reported "Within Set Sum of Squared Errors" =
 *    2.6105260195375473E10 (NOTE(review): presumably measured before the
 *    standardization step was added - confirm).
 *
 * Usage: `tobacco_kmeans <output-path>` where `<output-path>` receives one text
 * record per retailer: `com_name sale_qty store_qty cluster`.
 */
object tobacco_kmeans {

  /** Number of clusters used for every model trained here. */
  private val NumClusters = 3

  /** Iteration cap of the final model. */
  private val NumIterations = 20

  /** `runs` argument handed to `KMeans.train`. */
  private val Runs = 2

  /** Iteration caps compared in the evaluation sweep. */
  private val IterationSweep: Array[Int] = Array(10, 15, 20, 25, 30, 35, 40)

  /**
   * Builds the two-dimensional (sales, stock) feature vector of one retailer.
   *
   * The SQL sums arrive as untyped `Row` fields, so they are converted through
   * their string form exactly as the original code did.
   *
   * @param saleQty  value of the `sale_qty` column
   * @param storeQty value of the `store_qty` column
   * @return dense vector `[sale_qty, store_qty]`
   */
  private def toFeatures(saleQty: Any, storeQty: Any): MLVector =
    Vectors.dense(Array[Double](saleQty.toString.toDouble, storeQty.toString.toDouble))

  /**
   * Entry point: loads sales and stock totals from Hive, standardizes them,
   * evaluates k-means for several iteration caps, trains the final model and
   * writes the cluster assignment of every retailer to `args(0)`.
   *
   * @param args `args(0)` is the output directory passed to `saveAsTextFile`
   * @throws IllegalArgumentException if no output path is supplied
   */
  def main(args: Array[String]): Unit = {
    // Fail before any cluster work starts instead of after all models are trained.
    require(args.nonEmpty, "usage: tobacco_kmeans <output-path>")
    val outputPath = args(0)

    val sparkConf = new SparkConf().setAppName("tobacco_kmeans") //.setMaster("local[4]")
    val sc = new SparkContext(sparkConf)
    try {
      val hiveContext = new HiveContext(sc)

      // Sales data: total ordered quantity per retailer.
      val saledata = hiveContext.sql(
        "select com_name ,sum(qty_ord) sale_qty from hhsale_data where puh_time is " +
          "not null group by com_name")

      // Stock data: total positive ordered quantity per retailer.
      val storedata = hiveContext.sql(
        "select com_name ,sum(qty_ord) store_qty from hhstore_data where item_code is not " +
          "null and qty_ord >0 group by com_name")

      // Joined columns: com_name, sale_qty, store_qty.
      val data = saledata.join(storedata, "com_name")

      val parsedData = data.map {
        case Row(_, sale_qty, store_qty) => toFeatures(sale_qty, store_qty)
      }.cache()

      // Feature standardization (zero mean, unit standard deviation).
      val scaler = new StandardScaler(withMean = true, withStd = true).fit(parsedData)

      // Cached because every training run and cost computation below re-reads it.
      val scaledVectors = scaler.transform(parsedData).cache()

      // Compare the cost obtained with different iteration caps (random initialization).
      IterationSweep.foreach { maxIterations =>
        val sweepModel: KMeansModel =
          KMeans.train(scaledVectors, NumClusters, maxIterations, Runs, "random")
        val ssd = sweepModel.computeCost(scaledVectors)
        println("sum of squared distances of points to their nearest center when itr=" +
          maxIterations + " -> " + ssd)
      }

      val model = KMeans.train(scaledVectors, NumClusters, NumIterations, runs = Runs)

      // Print the cluster centers of the final model.
      println("---------------------------------------------------------------" +
        "Cluster centers:" +
        "---------------------------------------------------------------------")
      model.clusterCenters.foreach(c => println(" " + c.toString))

      // Evaluate the model with the sum of squared errors on the training set.
      val cost = model.computeCost(scaledVectors)
      println("--------------------------------------------------------------------" +
        "Within Set Sum of Squared Errors=-----------------------------------------" + cost)

      // Classify every input row with the model and write the result.
      // Original author's note: the partition count is not set, so the output is
      // 200 small files; merge and download them with `bin/hdfs dfs -getmerge`.
      // NOTE(review): each record keeps its trailing "\n" as before; since
      // saveAsTextFile terminates records itself, this leaves an empty line
      // after every record - confirm whether that is intended.
      data.map {
        case Row(com_name, sale_qty, store_qty) =>
          // Apply the same standardization that was used for training.
          val scaledLine = scaler.transform(toFeatures(sale_qty, store_qty))
          val prediction = model.predict(scaledLine)
          s"$com_name $sale_qty $store_qty $prediction\n"
      }.saveAsTextFile(outputPath)

      System.out.println("-----------------------------")
    } finally {
      // Release the context even when a job fails.
      sc.stop()
    }
  }
}

改变迭代次数,随机选取初始点(run=2)时的评估结果:

 1 sum of squared distances of points to their nearest center when itr=10 -> 68.45241255775106            2 sum of squared distances of points to their nearest center when itr=15 -> 68.45241255775106  3 sum of squared distances of points to their nearest center when itr=20 -> 68.45241255775107  4 sum of squared distances of points to their nearest center when itr=25 -> 68.38946484451297  5 sum of squared distances of points to their nearest center when itr=30 -> 69.15875531327036  6 sum of squared distances of points to their nearest center when itr=35 -> 68.5020394304827  7 sum of squared distances of points to their nearest center when itr=40 -> 68.64494935350622  8 ---------------------------------------------------------------Cluster centers:----------------------    -----------------------------------------------  9  [-0.3977231394410828,-0.08638511951423264] 10  [-0.3525021012551558,1.5702237448594607] 11  [1.4603723091512353,-0.8149743960785426] 12 --------------------------------------------------------------------Within Set Sum of Squared Errors=    -----------------------------------------68.64494935350622 13 -----------------------------~        sum of squared distances of points to their nearest center when itr=10 -> 68.45241255775107sum of squared distances of points to their nearest center when itr=15 -> 68.52141419538006sum of squared distances of points to their nearest center when itr=20 -> 68.38946484451294sum of squared distances of points to their nearest center when itr=25 -> 69.15875531327033sum of squared distances of points to their nearest center when itr=30 -> 68.64494935350619sum of squared distances of points to their nearest center when itr=35 -> 68.64494935350619sum of squared distances of points to their nearest center when itr=40 -> 68.45241255775107                                

随机选初始点,改变run的次数

sum of squared distances of points to their nearest center when run=1 -> 68.64494935350622sum of squared distances of points to their nearest center when run=2 -> 78.26550176498397sum of squared distances of points to their nearest center when run=3 -> 68.2653607410948sum of squared distances of points to their nearest center when run=4 -> 68.52141419538006sum of squared distances of points to their nearest center when run=5 -> 68.2653607410948sum of squared distances of points to their nearest center when run=6 -> 68.2653607410948sum of squared distances of points to their nearest center when run=7 -> 68.38946484451297sum of squared distances of points to their nearest center when run=8 -> 68.28913722951712sum of squared distances of points to their nearest center when run=9 -> 68.2653607410948sum of squared distances of points to their nearest center when run=10 -> 68.26536074109478-----------------------------

kmeans++选初始点,改变run的次数

  1 sum of squared distances of points to their nearest center when run=1 -> 68.28913722951711             2 sum of squared distances of points to their nearest center when run=2 -> 72.48322271456834  3 sum of squared distances of points to their nearest center when run=3 -> 68.45241255775106  4 sum of squared distances of points to their nearest center when run=4 -> 68.6449493535062  5 sum of squared distances of points to their nearest center when run=5 -> 68.26536074109477  6 sum of squared distances of points to their nearest center when run=6 -> 68.45241255775105  7 sum of squared distances of points to their nearest center when run=7 -> 68.45241255775106  8 sum of squared distances of points to their nearest center when run=8 -> 68.26536074109477  9 sum of squared distances of points to their nearest center when run=9 -> 68.26536074109477 10 sum of squared distances of points to their nearest center when run=10 -> 68.28913722951711 11 -----------------------------~                                             

迭代20次,run2次的评估结果:

sum of squared distances of points to their nearest center when itr=10 -> 68.45241255775107sum of squared distances of points to their nearest center when itr=15 -> 72.28822380601336sum of squared distances of points to their nearest center when itr=20 -> 72.48322271456834sum of squared distances of points to their nearest center when itr=25 -> 68.64494935350622sum of squared distances of points to their nearest center when itr=30 -> 68.38946484451297sum of squared distances of points to their nearest center when itr=35 -> 68.28913722951712sum of squared distances of points to their nearest center when itr=40 -> 70.36885180809672-----------------------------

迭代20次,run2次的kmeans++
聚类结果

[094]兴关店 18706 57706.000000 0[012]贵钢店 63320 40860.000000 0[056]观水店 28934 81498.000000 1[043]云阳店 49752 51101.000000 0[027]湘雅店 116073 32931.000000 2[077]凤凰翠堤 50637 42147.000000 0[054]O六一店 55564 62501.000000 1[073]和平店 37079 60840.000000 1[075]四方河店 53378 54411.000000 0[065]比兰德店 76568 48998.000000 2[051]新威店 44724 51807.000000 0[085]摩卡店 21160 56871.000000 0[024]金狮店A 43576 49296.000000 0[017]欣歆店 32897 59049.000000 0[046]贝地店 44829 48188.000000 0[063]金果园店 55705 31281.000000 0[019]黔灵店 38509 81074.000000 1[060]筑兴店 73269 30190.000000 2[036]华阳店 47707 51533.000000 0[037]小石城 52363 50968.000000 0[034]交校店 21708 37380.000000 0[079]万江店 44458 48095.000000 0[096]鸿通城 38592 50089.000000 0[072]吉奥店 45314 51248.000000 0[067]二中店 5713 2870.000000 0[093]新光店 14395 43151.000000 0[040]玉田店 73169 31059.000000 2[076]三桥北店 66542 49079.000000 0[070]军区店 31491 45743.000000 0[084]中天店 48581 50627.000000 0[042]马王店 49864 51891.000000 0[001]白云一店 94509 60483.000000 2[055]城基店 46682 55537.000000 0[068]枣山店 50042 58374.000000 0[087]警校店 18675 37316.000000 0[078]世纪新城 33088 38337.000000 0[069]松竹苑店 44036 46503.000000 0[050]世纪园店 39734 38059.000000 0[008]省委店 41826 91960.000000 1[026]501店 42622 59411.000000 0[010]教育学院店 82740 33065.000000 2[035]曦阳店 55683 41977.000000 0[038]振华店 41467 71864.000000 1[071]枫丹店 138029 27649.000000 2[015]清水江店 73585 25374.000000 2[089]蟠桃宫店 23821 54760.000000 0[013]瑞和店 33334 43215.000000 0[022]083店A 43406 56510.000000 0[049]贵龙店 84094 34512.000000 2[066]六广门店 33312 63732.000000 1[029]大理石店 96701 58521.000000 2[095]叠翠谷店 33754 55371.000000 0[014]虹桥店 82285 36892.000000 2[006]月亮岩 65030 52661.000000 0[098]浦江店 15519 75004.000000 1[011]凤凰店 41914 50276.000000 0[086]东新店 33324 56329.000000 0[081]福楼旺邸店 41874 50275.000000 0[062]头桥店 53363 53378.000000 0[041]万东店 100295 32096.000000 2[007]家乐店 60275 51752.000000 0[028]威清店 21674 37084.000000 0[074]十二中店 50059 38261.000000 0[053]嘉怡店 45155 33315.000000 0[032]宅吉店 60291 53308.000000 0[030]东门店 73298 40233.000000 2[061]太慈店 90692 30359.000000 2[059]中北店 49589 67715.000000 
1[021]183店 62289 26584.000000 2[033]新发店 182588 50403.000000 2[020]贵医店 50190 64561.000000 1[047]宝山南店 48689 46983.000000 0[090]保利温泉店 29566 53804.000000 0[002]白云二店 72947 61744.000000 1[092]龙宇店 29702 52608.000000 0[025]宏福店 16933 48769.000000 0
0 0
原创粉丝点击