Scala Example 6

This example reads a JSON log of store visits with Spark SQL and, for each day in the data, computes a jump-out (bounce) rate, a deep-visit rate, the average stay time, and new/old/total visitor counts, writing one JSON line per day to a result file.

import java.io.{File, PrintWriter}

import org.apache.spark.sql.SparkSession

object base_day_analyse {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("base_day_analyse")
      .getOrCreate()

    // Load the visit records and register them as a temporary view.
    // Fields used below: mac, in_time (Unix seconds), stay_time (seconds).
    val df1 = spark.read.json("/spark_data/visit_records.json")
    df1.createOrReplaceTempView("visit")

    // Helper: run a query and return the first column of the first row as Long.
    // COUNT(...) and JSON-inferred integer columns are LongType in Spark SQL,
    // so they must be read with getLong, not getInt.
    def firstLong(sql: String): Long = spark.sql(sql).collect()(0).getLong(0)

    // Earliest and latest visit timestamps bound the range of days to analyse.
    val min_time = firstLong("SELECT `in_time` FROM visit ORDER BY `in_time` LIMIT 1")
    val max_time = firstLong("SELECT `in_time` FROM visit ORDER BY `in_time` DESC LIMIT 1")

    val result_string = new StringBuilder
    var now_time = min_time
    var last_customer_num = 0L

    // Walk the time range one day (86400 s) at a time.
    while (now_time <= max_time) {
      val time1 = now_time
      val time2 = now_time + 86400

      // Distinct visitors within this day.
      val interval_customer_num = firstLong(
        s"SELECT COUNT(DISTINCT mac) FROM visit WHERE `in_time` >= $time1 AND `in_time` <= $time2 AND stay_time > 0")
      // Cumulative distinct visitors from the start of the data to the end of this day.
      val now_customer_num = firstLong(
        s"SELECT COUNT(DISTINCT mac) FROM visit WHERE `in_time` >= $min_time AND `in_time` <= $time2 AND stay_time > 0")

      // New customers = today's cumulative total minus yesterday's cumulative total;
      // old customers = today's visitors who are not new.
      val new_customer_num = now_customer_num - last_customer_num
      val old_customer_num = interval_customer_num - new_customer_num

      // Jump-outs stay at most 180 s; deep visits last at least 1200 s.
      val jump_num = firstLong(
        s"SELECT COUNT(*) FROM visit WHERE `in_time` >= $time1 AND `in_time` <= $time2 AND stay_time <= 180")
      val deep_in_num = firstLong(
        s"SELECT COUNT(*) FROM visit WHERE `in_time` >= $time1 AND `in_time` <= $time2 AND stay_time >= 1200")

      // Total visits and average stay time for the day. AVG(...) is DoubleType
      // and NULL when the day has no visits, so guard before reading it.
      val row = spark.sql(
        s"SELECT COUNT(*) visit_num, AVG(stay_time) avg_stay_time FROM visit WHERE `in_time` >= $time1 AND `in_time` <= $time2").collect()(0)
      val visit_num = row.getLong(0)
      val avg_stay_time = if (row.isNullAt(1)) 0.0 else row.getDouble(1)

      // Rates as fractions of the day's visits (0.0 when there were no visits).
      val jump_rate = if (visit_num == 0) 0f else jump_num.toFloat / visit_num
      val deep_in_rate = if (visit_num == 0) 0f else deep_in_num.toFloat / visit_num
      val format_jump_rate = f"$jump_rate%1.2f"
      val format_deep_in_rate = f"$deep_in_rate%1.2f"

      // Append this day's metrics to the result set as one JSON line.
      val day_string =
        s"""{"time":$time1,"jump_out_rate":$format_jump_rate,"deep_in_rate":$format_deep_in_rate,"avg_stay_time":$avg_stay_time,"new_num":$new_customer_num,"old_num":$old_customer_num,"customer_num":$visit_num}""" + "\n"
      result_string.append(day_string)

      now_time += 86400
      last_customer_num = now_customer_num
    }

    // Write the result set to a local file on the driver.
    val writer = new PrintWriter(new File("/spark_data/base_day_analyse.json"))
    writer.write(result_string.toString)
    writer.close()

    spark.stop()
  }
}
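The loop above launches several Spark jobs for every day in the range, so the table is scanned many times. A rough alternative sketch (not from the original post; it assumes the same `visit` view, that `in_time` is a Unix timestamp in seconds, and that `spark` and `min_time` are in scope as above) produces the per-day counters in a single scan by grouping on a day index:

// Sketch only: per-day counters in one aggregation instead of several
// queries per day. Assumes `spark`, `min_time`, and the `visit` view
// from the program above.
val daily = spark.sql(
  s"""SELECT FLOOR((`in_time` - $min_time) / 86400)               AS day_idx,
     |       COUNT(*)                                             AS visit_num,
     |       AVG(stay_time)                                       AS avg_stay_time,
     |       SUM(CASE WHEN stay_time <= 180  THEN 1 ELSE 0 END)   AS jump_num,
     |       SUM(CASE WHEN stay_time >= 1200 THEN 1 ELSE 0 END)   AS deep_in_num,
     |       COUNT(DISTINCT CASE WHEN stay_time > 0 THEN mac END) AS interval_customer_num
     |FROM visit
     |GROUP BY FLOOR((`in_time` - $min_time) / 86400)
     |ORDER BY day_idx""".stripMargin)
daily.show(false)

Each row of `daily` then corresponds to one `day_string` in the loop; only the new/old customer split still depends on the previous day's cumulative distinct count, so that part would remain a running computation over the ordered rows.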
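The input schema is never shown in the original post; it is only implied by the queries (a `mac` address per visitor, `in_time` as a Unix timestamp in seconds, `stay_time` in seconds). A minimal local smoke test under that assumption, with made-up records (all values below are hypothetical examples):

// Sketch: exercise the same SQL locally with made-up records.
// The schema (mac, in_time, stay_time) is assumed from the queries above.
import org.apache.spark.sql.SparkSession

object visit_smoke_test {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("visit_smoke_test")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    val sample = Seq(
      ("aa:bb:cc:dd:ee:01", 1600000000L, 90L),   // short visit: a jump-out (<= 180 s)
      ("aa:bb:cc:dd:ee:02", 1600003600L, 1500L), // long visit: a deep visit (>= 1200 s)
      ("aa:bb:cc:dd:ee:02", 1600090000L, 300L)   // same mac the next day: an "old" customer
    ).toDF("mac", "in_time", "stay_time")
    sample.createOrReplaceTempView("visit")

    // One of the per-day queries from base_day_analyse, run against the sample.
    spark.sql(
      "SELECT COUNT(*) visit_num, AVG(stay_time) avg_stay_time FROM visit " +
      "WHERE `in_time` >= 1600000000 AND `in_time` <= 1600086400").show()

    spark.stop()
  }
}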