大数据IMF传奇行动绝密课程第75-79课:Spark SQL基于网站Log的综合案例实战
来源:互联网 发布:mac系统剪切快捷键 编辑:程序博客网 时间:2024/05/01 03:54
Spark SQL基于网站Log的综合案例实战
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Random;

/**
 * Forum user-log generator. Each record is one tab-separated line:
 *
 *   date       yyyy-MM-dd (always "yesterday" relative to the run)
 *   timestamp  epoch millis at generation time
 *   userID     random id in [0, numberItems), or the literal "null" for
 *              unregistered visitors (~1 in 8 records)
 *   pageID     random id in [0, numberItems)
 *   channel    one of the forum channel names below
 *   action     "register" or "view"
 *
 * Usage: java SparkSQLDataManually [numberItems [outputDir]]
 * Writes <outputDir>/userlog.log (defaults: 5000 records, current dir).
 */
public class SparkSQLDataManually {

    static String yesterday = yesterday();
    static String[] channelNames = new String[] {
        "Spark", "Scala", "Kafka", "Flink", "Hadoop",
        "Storm", "Hive", "Impala", "HBase", "ML"
    };
    static String[] actionNames = new String[] { "register", "view" };

    public static void main(String[] args) {
        // Generate a data set of the requested size at the requested path.
        long numberItems = 5000;
        String path = ".";
        if (args.length > 0) {
            numberItems = Long.parseLong(args[0]);
            // Bug fix: the original read args[1] whenever args.length > 0,
            // throwing ArrayIndexOutOfBoundsException for a single argument.
            if (args.length > 1) {
                path = args[1];
                System.out.println(path);
            }
        }
        System.out.println("User log number is : " + numberItems);
        userlogs(numberItems, path);
    }

    /**
     * Builds all records in memory, then writes them to userlog.log
     * under {@code path} in one shot.
     */
    private static void userlogs(long numberItems, String path) {
        Random random = new Random();
        // StringBuilder: single-threaded use, no need for StringBuffer locking.
        StringBuilder userLogBuffer = new StringBuilder();
        for (int i = 0; i < numberItems; i++) {
            long timestamp = System.currentTimeMillis();
            // ~1/8 of records come from unregistered users -> null userID.
            Long userID = (random.nextInt(8) == 0)
                    ? null
                    : (long) random.nextInt((int) numberItems);
            long pageID = random.nextInt((int) numberItems);
            // Use the array length instead of the original magic constants.
            String channel = channelNames[random.nextInt(channelNames.length)];
            String action = actionNames[random.nextInt(actionNames.length)];
            userLogBuffer.append(yesterday).append("\t")
                    .append(timestamp).append("\t")
                    .append(userID).append("\t")
                    .append(pageID).append("\t")
                    .append(channel).append("\t")
                    .append(action).append("\n");
        }
        // File(parent, child) is portable; the original hard-coded the
        // Windows separator ("\\userlog.log") and broke on Unix.
        File outFile = new File(path, "userlog.log");
        // try-with-resources: the original closed pw in a finally block that
        // NPE'd whenever the stream constructor itself had failed.
        try (PrintWriter pw = new PrintWriter(
                new OutputStreamWriter(new FileOutputStream(outFile),
                        StandardCharsets.UTF_8))) {
            System.out.println(outFile.getPath());
            pw.write(userLogBuffer.toString());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** @return yesterday's date formatted as yyyy-MM-dd. */
    private static String yesterday() {
        SimpleDateFormat date = new SimpleDateFormat("yyyy-MM-dd");
        Calendar cal = Calendar.getInstance();
        cal.setTime(new Date());
        cal.add(Calendar.DATE, -1);
        return date.format(cal.getTime());
    }
}
/** * 计算PV、UV、热门板块、跳出率、新用户注册比率 */package com.tom.spark.SparkApps.sql;import java.text.SimpleDateFormat;import java.util.Calendar;import java.util.Date;import org.apache.spark.SparkConf;import org.apache.spark.api.java.JavaSparkContext;import org.apache.spark.sql.hive.HiveContext;/** * Table in hive database creation: * sqlContext.sql("create table userlogs(date string, timestamp bigint, userID bigint, pageID bigint, channel string, action string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n'") * */public class SparkSQLUserlogsOps { /** * @param args */ public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("SparkSQLUserlogsOps").setMaster("spark://Master:7077"); JavaSparkContext sc = new JavaSparkContext(conf); sc.setLogLevel("WARN"); HiveContext hiveContext = new HiveContext(sc); String yesterday = getYesterday(); pvStat(hiveContext, yesterday); //PV uvStat(hiveContext, yesterday); //UV hotChannel(hiveContext, yesterday); //热门板块 jumpOutStat(hiveContext, yesterday); //跳出率 newUserRegisterPercentStat(hiveContext, yesterday); //新用户注册的比例 } private static void newUserRegisterPercentStat(HiveContext hiveContext, String yesterday) { // TODO Auto-generated method stub hiveContext.sql("use hive"); String newUserSQL = "select count(*) " + "from userlogs " + "where action = 'View' and date='"+ yesterday+"' and userID is NULL "// + "limit 10" ; String RegisterUserSQL = "SELECT count(*) " + "from userlogs" + "where action = 'Register' and date='"+ yesterday+"' "// + "limit 10" ; Object newUser = hiveContext.sql(newUserSQL).collect()[0].get(0); Object RegisterUser = hiveContext.sql(RegisterUserSQL).collect()[0].get(0); double total = Double.valueOf(newUser.toString()); double register = Double.valueOf(RegisterUser.toString()); System.out.println("模拟新用户注册比例:" + register / total); } private static void jumpOutStat(HiveContext hiveContext, String yesterday) { // TODO Auto-generated method stub hiveContext.sql("use hive"); 
String totalPvSQL = "select count(*) " + "from " + "userlogs " + "where action = 'View' and date='"+ yesterday+"' "// + "limit 10" ; String pv2OneSQL = "SELECT count(*) " + "from " + "(SELECT count(*) totalNumber from userlogs " + "where action = 'View' and date='"+ yesterday+"' " + "group by userID " + "having totalNumber = 1) subquery "// + "limit 10" ; Object totalPv = hiveContext.sql(totalPvSQL).collect()[0].get(0); Object pv2One = hiveContext.sql(pv2OneSQL).collect()[0].get(0); double total = Double.valueOf(totalPv.toString()); double pv21 = Double.valueOf(pv2One.toString()); System.out.println("跳出率为" + pv21 / total); } private static void uvStat(HiveContext hiveContext, String yesterday) { // TODO Auto-generated method stub hiveContext.sql("use hive"); String sqlText = "select date, pageID, uv " + "from " + "(select date, pageID, count(distinct(userID)) uv from userlogs " + "where action = 'View' and date='"+ yesterday+"' " + "group by date, pageID) subquery " + "order by uv desc "// + "limit 10" ; hiveContext.sql(sqlText).show(); } private static void hotChannel(HiveContext hiveContext, String yesterday) { // TODO Auto-generated method stub hiveContext.sql("use hive"); String sqlText = "select date, pageID, pv " + "from " + "(select date, pageID, count(1) pv from userlogs " + "where action = 'View' and date='"+ yesterday+"' " + "group by date, pageID) subquery " + "order by pv desc "// + "limit 10" ; hiveContext.sql(sqlText).show(); } private static void pvStat(HiveContext hiveContext, String yesterday) { // TODO Auto-generated method stub hiveContext.sql("use hive"); String sqlText = "select date, channel, channelpv " + "from " + "(select date, channel, count(*) channelpv from userlogs " + "where action = 'View' and date='"+ yesterday+"' " + "group by date, channel) subquery " + "order by channelpv desc "// + "limit 10" ; hiveContext.sql(sqlText).show(); //把执行结果放到数据库或Hive中 //select date, pageID, pv from (select date, pageID, count(1) pv from userlogs where 
action = 'View' and //date='2017-03-10' group by date, pageID) subquery order by pv desc limit 10 } private static String getYesterday() { SimpleDateFormat date = new SimpleDateFormat("yyyy-MM-dd"); Calendar cal = Calendar.getInstance(); cal.setTime(new Date()); cal.add(Calendar.DATE, -2); Date yesterday = cal.getTime(); return date.format(yesterday); }}
0 0
- 大数据IMF传奇行动绝密课程第75-79课:Spark SQL基于网站Log的综合案例实战
- 大数据IMF传奇行动绝密课程第67课:spark SQL案例综合实战
- 大数据IMF传奇行动绝密课程第80课:Spark SQL网站搜索综合案例实战
- 大数据IMF传奇行动绝密课程第100-101课:使用Spark Streaming+Spark SQL+Kafka+FileSystem综合案例
- 大数据IMF传奇行动绝密课程第69课:Spark SQL通过Hive数据源实战
- 大数据IMF传奇行动绝密课程第73课:Spark SQL Thrift Server实战
- 大数据IMF传奇行动绝密课程第85课:基于HDFS的SparkStreaming案例实战和内幕源码解密
- 大数据IMF传奇行动绝密课程第104-114课:Spark Streaming电商广告点击综合案例
- 大数据IMF传奇行动绝密课程第61课:Spark SQL数据加载和保存内幕深度解密实战
- 大数据IMF传奇行动绝密课程第90课:SparkStreaming基于Kafka Receiver案例实战和内幕源码解密
- 大数据IMF传奇行动绝密课程第91课:SparkStreaming基于Kafka Direct案例实战和内幕源码解密
- 大数据IMF传奇行动绝密课程第57课:Spark SQL on Hive配置及实战
- 大数据IMF传奇行动绝密课程第70课:Spark SQL内置函数解密与实战
- 大数据IMF传奇行动绝密课程第71课:Spark SQL窗口函数解密与实战
- 大数据IMF传奇行动绝密课程第72课:Spark SQL UDF和UDAF解密与实战
- 大数据IMF传奇行动绝密课程第65课:Spark SQL下Parquet深入进阶
- 大数据IMF传奇行动绝密课程第68课:Spark SQL通过JDBC操作MySQL
- 大数据IMF传奇行动绝密课程第44课:真正的Spark功力:性能优化!
- linux开发环境运维建议
- 根据他人源码练习下拉刷新遇到的问题
- 【斯坦福大学-机器学习】2.单变量线性回归(二)
- SQLite 入门教程(一)基本控制台(终端)命令
- 在navicat中建外键时,保存后就全部消失的解决方法
- 大数据IMF传奇行动绝密课程第75-79课:Spark SQL基于网站Log的综合案例实战
- Android 开发:(八)初识GridView(九宫格布局)
- rom定制笔记(一)
- QT 交叉编译环境
- xcode8.2 免证书手机调试详解
- bzoj 4008: [HNOI2015]亚瑟王 (概率与期望DP)
- 一致性哈希算法(consistent hashing)
- 146 LRU Cache
- 编程语言学习记录1