10. Spark SQL: Built-in Functions and Daily UV


Java version:
package cn.spark.study.sql;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import scala.Tuple2;

/**
 * Spark SQL: built-in functions and daily UV
 */
public class DailyUV {

    public static void main(String[] args) {
        // Create the SparkConf; run locally
        SparkConf conf = new SparkConf().setMaster("local").setAppName("DailyUV");
        // Create the JavaSparkContext and the SQLContext
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        // Simulate a user-access log and create a DataFrame from it.
        // Each log line is comma-separated: the first column is the date, the second is the user id.
        List<String> userAccessLog = Arrays.asList(
                "2015-10-01,1122",
                "2015-10-01,1122",
                "2015-10-01,1123",
                "2015-10-01,1124",
                "2015-10-01,1124",
                "2015-10-01,1128",
                "2015-10-01,1129",
                "2015-10-02,1122",
                "2015-10-02,1121",
                "2015-10-02,1123",
                "2015-10-02,1125",
                "2015-10-02,1123");

        // Create an RDD from the local collection by calling parallelize() on the SparkContext
        JavaRDD<String> userAccessLogRDD = sc.parallelize(userAccessLog);

        // Parse each log line into a (date, userId) tuple
        JavaRDD<Tuple2<String, Long>> userAccessLogMapRDD = userAccessLogRDD.map(
                new Function<String, Tuple2<String, Long>>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<String, Long> call(String line) throws Exception {
                        return new Tuple2<String, Long>(line.split(",")[0], Long.valueOf(line.split(",")[1]));
                    }
                });

        // Convert the tuples into a JavaRDD<Row>, in preparation for building the DataFrame
        JavaRDD<Row> userAccessLogRowRDD = userAccessLogMapRDD.map(
                new Function<Tuple2<String, Long>, Row>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Row call(Tuple2<String, Long> tuple2) throws Exception {
                        return RowFactory.create(tuple2._1, tuple2._2);
                    }
                });

        // Define the schema, also in preparation for building the DataFrame
        List<StructField> structFields = Arrays.asList(
                DataTypes.createStructField("date", DataTypes.StringType, true),
                DataTypes.createStructField("userID", DataTypes.LongType, true));
        StructType structType = DataTypes.createStructType(structFields);

        // Create the DataFrame
        DataFrame userAccessLogDF = sqlContext.createDataFrame(userAccessLogRowRDD, structType);

        // Register the DataFrame as a temporary table so it can be queried with SQL
        userAccessLogDF.registerTempTable("userAccessLogDF");

        // For each day, count the distinct users who logged in (the daily UV)
        DataFrame everyDayUVDF = sqlContext.sql(
                "SELECT date, COUNT(DISTINCT(userID)) AS count FROM userAccessLogDF GROUP BY date");

        // Convert the result DataFrame back into a JavaRDD so the rows can be traversed
        JavaRDD<Tuple2<String, Long>> userAccessLogSumRDD = everyDayUVDF.javaRDD().map(
                new Function<Row, Tuple2<String, Long>>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<String, Long> call(Row row) throws Exception {
                        return new Tuple2<String, Long>(row.getString(0), row.getLong(1));
                    }
                });

        // Traverse and print the aggregated result
        userAccessLogSumRDD.foreach(new VoidFunction<Tuple2<String, Long>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<String, Long> tt) throws Exception {
                System.out.println(tt._1 + " uv: " + tt._2);
            }
        });

        // Close the SparkContext
        sc.close();
    }
}
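The Java version above computes the daily UV with a SQL statement against the temporary table. The same aggregation can also be written with the DataFrame API and the countDistinct built-in function, which is what the Scala version below demonstrates. The following is a minimal sketch of that variant in Java, reusing the userAccessLogDF DataFrame from the listing above (the everyDayUVDF2 name and the "uv" alias are purely illustrative):

import static org.apache.spark.sql.functions.countDistinct;

// Group by date, then count distinct user ids per group via the built-in function
DataFrame everyDayUVDF2 = userAccessLogDF
        .groupBy("date")
        .agg(countDistinct(userAccessLogDF.col("userID")).alias("uv"));

// Print the per-day UV in tabular form
everyDayUVDF2.show();

Either form should report a UV of 5 for 2015-10-01 and 4 for 2015-10-02 with the sample log used here.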

Scala version:
package cn.spark.study.sql

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.IntegerType
// Manually import the built-in functions
import org.apache.spark.sql.functions._

/**
 * Lecture 83: Spark SQL built-in functions and the daily UV example
 */
object DailyUV {

  def main(args: Array[String]) {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("DailyUV")

    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Important: to use Spark SQL's built-in functions, the implicit conversions
    // defined on the SQLContext must be imported here
    import sqlContext.implicits._

    // Simulate a user-access log and create a DataFrame from it.
    // Each log line is comma-separated: the first column is the date, the second is the user id.
    val userAccessLog = Array(
        "2015-10-01,1122",
        "2015-10-01,1122",
        "2015-10-01,1123",
        "2015-10-01,1124",
        "2015-10-01,1124",
        "2015-10-02,1122",
        "2015-10-02,1121",
        "2015-10-02,1123",
        "2015-10-02,1125",
        "2015-10-02,1123")
    val userAccessLogRDD = sc.parallelize(userAccessLog, 5)

    // Convert the simulated access-log RDD into a DataFrame.
    // First, turn the plain RDD into an RDD whose elements are Rows.
    val userAccessLogRowRDD = userAccessLogRDD
      .map { log => Row(log.split(",")(0), log.split(",")(1).toInt) }

    // Define the schema of the DataFrame
    val structType = StructType(Array(
        StructField("date", StringType, true),
        StructField("userid", IntegerType, true)))

    // Create the DataFrame with the SQLContext
    val userAccessLogRowDF = sqlContext.createDataFrame(userAccessLogRowRDD, structType)

    // A quick word on what UV means as a business metric:
    // many users visit every day, and each user may visit many times per day,
    // so UV (unique visitors) is the number of visits after de-duplicating by user.

    // Here we use the countDistinct built-in function provided in Spark 1.5.x.
    // How the aggregate functions are used:
    // first, call groupBy() on the DataFrame to group by a column;
    // then call agg(): its first argument must be a column that appeared in groupBy(),
    // and the following argument is a built-in function provided by Spark, such as
    // countDistinct, sum or first. Columns passed to built-in functions are likewise
    // written with a single-quote prefix.
    userAccessLogRowDF.groupBy("date")
      .agg('date, countDistinct('userid))
      .map { row => Row(row(1), row(2)) }
      .collect()
      .foreach(println)
  }
}
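With this sample data, the job should print one Row per day holding the date and its UV: 3 distinct users on 2015-10-01 (1122, 1123, 1124) and 4 on 2015-10-02 (1121, 1122, 1123, 1125); the order of the printed rows is not guaranteed. Note that the Scala sample log omits the two 2015-10-01 entries for users 1128 and 1129 that the Java version includes, which is why the Java run reports 5 for that day instead.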

