Spark数据统计

来源:互联网 发布:电脑软件开发公司 编辑:程序博客网 时间:2024/05/20 16:11

        在《Spark高级数据分析》一书中看到一种不错的对数据进行初步统计分析的方法。实践之后,效果果然不错。在此记录并分享给大家,也勉励自己不断学习新知识。


      

 // Summary statistics (with missing-value counts) over the "linkage" CSV files using Spark.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.util.StatCounter

import scala.util.Try

/** Entry point and helpers for the record-linkage statistics example. */
object SparkDataStats {

  /** Returns true when `line` is a CSV header row, i.e. it contains the column name "id_1". */
  def isHeader(line: String): Boolean = line.contains("id_1")

  /**
   * Parses `s` as a Double.
   *
   * @return the parsed value, or `Double.NaN` when `s` cannot be parsed
   *         (unparseable fields are treated as missing values downstream)
   */
  def toDouble(s: String): Double = Try(s.toDouble).getOrElse(Double.NaN)

  /**
   * One parsed input record.
   *
   * @param id1     integer taken from field 0
   * @param id2     integer taken from field 1
   * @param scores  fields 2..10 parsed with [[toDouble]]; NaN marks a missing value
   * @param matched boolean taken from field 11
   */
  case class MatchData(id1: Int, id2: Int, scores: Array[Double], matched: Boolean)

  /**
   * Splits a comma-separated line into a [[MatchData]].
   *
   * Field indices are 0-based: 0 and 1 are the ids, 2..10 (slice end 11 is exclusive)
   * are the scores, and 11 is the matched flag.
   *
   * @throws NumberFormatException          if an id field is not an integer
   * @throws IllegalArgumentException       if field 11 is not a boolean
   * @throws ArrayIndexOutOfBoundsException if the line has fewer than 12 fields
   */
  def parseData(line: String): MatchData = {
    val fields = line.split(",")
    val id1 = fields(0).toInt
    val id2 = fields(1).toInt
    val matched = fields(11).toBoolean
    val scores = fields.slice(2, 11).map(toDouble)
    MatchData(id1, id2, scores, matched)
  }

  /**
   * Running statistics for one column that also counts missing (NaN) values.
   * Instances are mutable: `add` and `merge` update the receiver and return it.
   */
  class VariableStats extends Serializable {
    /** Statistics over the non-NaN values seen so far. */
    val stats: StatCounter = new StatCounter()

    /** Number of NaN values seen so far. */
    var m_lMissing: Long = 0L

    /** Records `x`: NaN increments the missing counter, any other value is merged into `stats`. */
    def add(x: Double): VariableStats = {
      if (x.isNaN) m_lMissing += 1
      else stats.merge(x)
      this
    }

    /** Folds the statistics and missing count of `other` into this instance. */
    def merge(other: VariableStats): VariableStats = {
      stats.merge(other.stats)
      m_lMissing += other.m_lMissing
      this
    }

    override def toString: String = "stats:" + stats.toString() + "NaN" + m_lMissing
  }

  object VariableStats extends Serializable {
    /** Creates a new instance that has already recorded `x` (so the seed value is not lost). */
    def apply(x: Double): VariableStats = new VariableStats().add(x)
  }

  /**
   * Computes per-column [[VariableStats]] over an RDD of equally sized score arrays.
   *
   * Each partition builds one array of accumulators, seeded from its first record and
   * updated with the remaining ones; the per-partition arrays are then merged pairwise.
   *
   * @param rdd rows of scores, one array element per column
   * @return one [[VariableStats]] per column
   * @throws UnsupportedOperationException if `rdd` contains no rows (from `RDD.reduce`)
   */
  def statsWithMissing(rdd: RDD[Array[Double]]): Array[VariableStats] = {
    val nastats = rdd.mapPartitions { iter =>
      // An empty partition contributes nothing; calling next() on it would throw.
      if (!iter.hasNext) Iterator.empty
      else {
        val nas: Array[VariableStats] = iter.next().map(d => VariableStats(d))
        iter.foreach { arr =>
          nas.zip(arr).foreach { case (n, d) => n.add(d) }
        }
        Iterator(nas)
      }
    }
    nastats.reduce { (n1, n2) =>
      n1.zip(n2).map { case (a, b) => a.merge(b) }
    }
  }

  /** Replaces NaN with 0.0 and returns every other value unchanged. */
  def naz(d: Double): Double = if (d.isNaN) 0.0 else d

  /** A record paired with the score computed for it by [[getScores]]. */
  case class Scored(md: MatchData, score: Double)

  /**
   * Scores every record as the sum of its NaN-zeroed scores at indices 2, 5, 6, 7 and 8,
   * keeps the records whose score is at least 4, and prints how many of those are
   * matched vs. not matched.
   */
  def getScores(mds: RDD[MatchData]): Unit = {
    val ct = mds.map { md =>
      val score = Array(2, 5, 6, 7, 8).map(i => naz(md.scores(i))).sum
      Scored(md, score)
    }
    ct.filter(s => s.score >= 4)
      .map(s => s.md.matched)
      .countByValue()
      .foreach(println)
  }

  /**
   * Reads every file under `linkage/`, drops header lines, parses the records and prints,
   * for each score column, the total missing count and the difference between the mean
   * of matched and non-matched records.
   */
  def DataAnalysis(): Unit = {
    val conf = new SparkConf().setAppName("dataAnalysis").setMaster("local[*]")
    val sc = new SparkContext(conf)
    try {
      // Read all files in the linkage directory.
      val filePath = "linkage/*"
      val textContext = sc.textFile(filePath)
      // Drop header rows.
      val noHeader = textContext.filter(x => !isHeader(x))
      // Cached because the parsed records are scanned twice below.
      val mds = noHeader.map(x => parseData(x)).cache()

      val statsm = statsWithMissing(mds.filter(_.matched).map(_.scores))
      val statsn = statsWithMissing(mds.filter(!_.matched).map(_.scores))
      statsm.zip(statsn).map { case (m, n) =>
        (m.m_lMissing + n.m_lMissing, m.stats.mean - n.stats.mean)
      }.foreach(println)
    } finally {
      // Always release the context, even when a job fails.
      sc.stop()
    }
  }

  /** Runs the analysis and prints "OK" when it completes. */
  def main(args: Array[String]): Unit = {
    DataAnalysis()
    println("OK")
  }
}

原创粉丝点击