A Spark operator for counting unique, missing, and single-occurrence values per column (Scala implementation)


Implementing this with traditional Spark SQL, looping over the columns and running one query per column, is too inefficient. Instead, we borrow the basic WordCount idea: treat each "columnName:value" pair, i.e. the column name joined to the cell value by a colon, as a unique word, concatenate the words with a special separator, and a single pass over the data then yields all the required counts.
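For contrast, here is a minimal sketch of the rejected per-column approach (the helper name naiveStats and the temp view name stats_input are illustrative assumptions, not from the original post):

import org.apache.spark.sql.{DataFrame, SparkSession}

// Sketch of the rejected approach: one SQL query, and therefore one full
// scan of the data, per column -- N columns cost N passes.
def naiveStats(spark: SparkSession, df: DataFrame): Unit = {
  df.createOrReplaceTempView("stats_input") // view name is arbitrary
  for (col <- df.columns) {
    val row = spark.sql(
      s"""SELECT COUNT(DISTINCT `$col`) AS unique_cnt,
         |       COUNT(*) - COUNT(`$col`) AS missing_cnt
         |FROM stats_input""".stripMargin).first()
    println(s"$col: unique=${row.getLong(0)}, missing=${row.getLong(1)}")
  }
  // Counting single-occurrence values would need yet another
  // GROUP BY ... HAVING COUNT(*) = 1 query per column.
}

The single-pass operator is implemented as follows: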

import org.apache.commons.lang3.StringUtils // assuming commons-lang3 on the classpath
import org.apache.spark.sql.DataFrame
import org.slf4j.LoggerFactory

val log = LoggerFactory.getLogger("getStatistics") // logger assumed by the original code

// Returns (unique-value counts, missing-value counts, single-occurrence counts), keyed by column name.
// Caveat: column names must not contain ":" and cell values must not contain the separator "_0_".
def getStatistics(data: DataFrame): (java.util.HashMap[String, Long], java.util.HashMap[String, Long], java.util.HashMap[String, Long]) = {
  val colUnique  = new java.util.HashMap[String, Long] // distinct non-null values per column
  val colMissing = new java.util.HashMap[String, Long] // null (missing) cells per column
  val colSingle  = new java.util.HashMap[String, Long] // values occurring exactly once per column
  val allColArr = data.columns
  val dtypes    = data.dtypes
  val colSize   = allColArr.size
  val separator = "_0_" // separator between "column:value" words
  val len       = separator.length

  // Single pass over the data: emit one "column:value" word per cell
  // (empty value part for null), then count each word, WordCount-style.
  val rddHandle = data.rdd.map { row =>
    val str = new StringBuilder
    for (i <- 0 until colSize) {
      if (row.get(i) == null) {
        str.append(dtypes(i)._1 + ":" + separator)
      } else {
        str.append(dtypes(i)._1 + ":" + row.get(i) + separator)
      }
    }
    str.toString().substring(0, str.length - len) // drop the trailing separator
  }.flatMap(_.split(separator)).map((_, 1L)).reduceByKey(_ + _)

  rddHandle.persist() // reused once per column below

  for (col <- allColArr) {
    log.info("____******col:" + col)
    // split(":", 2) keeps any ":" inside the value with the value part
    val colResult = rddHandle.filter(_._1.split(":", 2)(0).equals(col))
    val uniqueResult = colResult.filter(t => StringUtils.isNotEmpty(t._1.split(":", 2)(1))).count()
    val missingResult = colResult.filter(t => StringUtils.isEmpty(t._1.split(":", 2)(1)))
    val singleResult = colResult.filter(t => StringUtils.isNotEmpty(t._1.split(":", 2)(1))).filter(_._2 == 1).count()
    colUnique.put(col, uniqueResult)
    if (missingResult.isEmpty()) {
      colMissing.put(col, 0L)
    } else {
      // after reduceByKey the "col:" key is unique, so first() holds the null count
      colMissing.put(col, missingResult.first()._2)
    }
    colSingle.put(col, singleResult)
  }
  rddHandle.unpersist()

  (colUnique, colMissing, colSingle)
}
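A minimal usage sketch (the SparkSession setup and sample data are assumptions added here for illustration, not part of the original post):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("ColumnStats") // app name chosen for illustration
  .master("local[*]")
  .getOrCreate()
import spark.implicits._

// Column "a" holds x, y, x; column "b" holds 1, null, 2.
val df = Seq[(String, Option[Int])](
  ("x", Some(1)),
  ("y", None),
  ("x", Some(2))
).toDF("a", "b")

val (unique, missing, single) = getStatistics(df)
// unique:  {a=2, b=2}  -- distinct non-null values
// missing: {a=0, b=1}  -- null cells
// single:  {a=1, b=2}  -- values occurring exactly once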

