[1.1.0] Writing the SparkUtils Utility Class
Scenario
Extract the Spark code shared by all business sub-modules into a single utility class. It covers two things:
- Mocking the data of the relevant business tables in the production Hive warehouse, for local development and testing.
The business data a Spark application processes is usually large and lives in a Hive warehouse. This project touches two Hive tables: the user-action table user_visit_action and the user-information table user_info. To test the finished Spark code locally, we therefore have to write code that mocks the data of those tables by hand.
- Creating the SQLContext and related objects.
By changing a few parameters in the configuration file, the same Application switches automatically between local test mode and cluster mode in the production environment; a sketch of such a switch follows this list.
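The post does not show ConfigurationManager or the configuration file it reads; it only shows SparkUtils calling ConfigurationManager.getBoolean(Constants.LOCAL_DEPLOY). As a rough idea of how the switch can be wired, here is a minimal sketch: the file name my.properties, the key spark.local, and everything inside the class are assumptions, not project code; only the getBoolean(String) signature is taken from the SparkUtils listing below.

# my.properties (hypothetical) - flip to false before submitting to the cluster
spark.local=true

package cool.pengych.sparker.conf;

import java.io.InputStream;
import java.util.Properties;

/**
 * Hypothetical sketch of the configuration loader; only the
 * getBoolean(String) signature is taken from the SparkUtils code below.
 */
public class ConfigurationManager
{
    private static final Properties PROPS = new Properties();

    static
    {
        // Load the properties file once from the classpath
        try (InputStream in = ConfigurationManager.class
                .getClassLoader().getResourceAsStream("my.properties"))
        {
            PROPS.load(in);
        }
        catch (Exception e)
        {
            throw new ExceptionInInitializerError(e);
        }
    }

    public static boolean getBoolean(String key)
    {
        // A missing key defaults to false, i.e. cluster mode
        return Boolean.parseBoolean(PROPS.getProperty(key, "false"));
    }
}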
Code
- SparkUtils.java
package cool.pengych.sparker.util;

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.hive.HiveContext;

import com.alibaba.fastjson.JSONObject;

import cool.pengych.sparker.conf.ConfigurationManager;
import cool.pengych.sparker.constant.Constants;

/**
 * Spark code shared by all business modules of the application.
 * @author pengyucheng
 */
public class SparkUtils
{
    private static final boolean IS_LOCAL = ConfigurationManager.getBoolean(Constants.LOCAL_DEPLOY);

    /**
     * Deploy mode: local or cluster.
     */
    public static void setMaster(SparkConf conf)
    {
        if(IS_LOCAL)
        {
            conf.setMaster("local");
        }
    }

    /**
     * Use a HiveContext in production, a plain SQLContext for local tests.
     */
    public static SQLContext getSQLContext(SparkContext sc)
    {
        if(IS_LOCAL)
        {
            return new SQLContext(sc);
        }
        return new HiveContext(sc);
    }

    /**
     * Mock the Hive warehouse data; only runs in the local development environment.
     */
    public static void mockData(JavaSparkContext sc, SQLContext sqlContext)
    {
        if(IS_LOCAL)
        {
            LocalDataGenerator.mockData(sc, sqlContext);
        }
    }

    /**
     * Fetch the user actions that fall inside the date range carried by the task parameters.
     * @return JavaRDD<Row>
     */
    public static JavaRDD<Row> getActionRDDByRange(SQLContext sqlContext, JSONObject taskParam)
    {
        String startDate = taskParam.getString(Constants.PARAM_START_DATE);
        String endDate = taskParam.getString(Constants.PARAM_END_DATE);
        // date is a string column (yyyy-MM-dd), so the bounds must be quoted;
        // lexicographic comparison then matches chronological order
        String sql = "select * from user_visit_action "
                + "where date >= '" + startDate + "' and date <= '" + endDate + "'";
        return sqlContext.sql(sql).toJavaRDD();
    }
}
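For context, here is how a business module might call these helpers end to end. This is a hypothetical driver, not project code: the class name, app name, and the JSON keys "startDate"/"endDate" are illustrative stand-ins for whatever Constants.PARAM_START_DATE and Constants.PARAM_END_DATE actually hold.

package cool.pengych.sparker.util;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

import com.alibaba.fastjson.JSONObject;

/**
 * Hypothetical driver showing the intended call sequence of SparkUtils.
 */
public class SparkUtilsDemo
{
    public static void main(String[] args)
    {
        SparkConf conf = new SparkConf().setAppName("UserSessionAnalysis");
        SparkUtils.setMaster(conf);               // sets "local" only in local mode

        JavaSparkContext jsc = new JavaSparkContext(conf);
        SQLContext sqlContext = SparkUtils.getSQLContext(jsc.sc());

        SparkUtils.mockData(jsc, sqlContext);     // registers mock tables; no-op on the cluster

        // The real parameter keys live in Constants; these are illustrative
        JSONObject taskParam = JSONObject.parseObject(
                "{\"startDate\":\"2016-06-01\",\"endDate\":\"2016-06-28\"}");
        JavaRDD<Row> actionRDD = SparkUtils.getActionRDDByRange(sqlContext, taskParam);
        System.out.println("actions in range: " + actionRDD.count());

        jsc.stop();
    }
}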
- LocalDataGenerator.java
package cool.pengych.sparker.util;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.Random;
import java.util.UUID;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

/**
 * User action types: search, click, order and pay.
 * @author pengyucheng
 */
enum ActionType
{
    SEARCH, CLICK, ORDER, PAY;
}

/**
 * Test-data generator for running the Spark application locally.
 * @author pengyucheng
 */
public class LocalDataGenerator
{
    /**
     * Mock the production Hive table data; local testing only.
     */
    public static void mockData(JavaSparkContext jsc, SQLContext sqlContext)
    {
        mockUserInfo(jsc, sqlContext);
        mockUserSessionInfo(jsc, sqlContext);
    }

    /**
     * Mock the data of the user_info table in Hive.
     */
    public static void mockUserInfo(JavaSparkContext jsc, SQLContext sqlContext)
    {
        /*
         * 1. Build a List<Row> and declare the type of each column:
         *    this turns the raw values into structured rows.
         */
        List<Row> rows = new ArrayList<Row>();
        String[] sexStrs = new String[]{"man", "woman"};
        Random random = new Random(666);
        for (int i = 0; i < 100; i++)
        {
            long userId = random.nextInt(100);
            String username = "username" + userId;
            String name = "name" + userId;
            int age = random.nextInt(100);
            String professional = "professional" + userId;
            String city = "city" + userId;
            String sex = sexStrs[random.nextInt(2)];
            rows.add(RowFactory.create(userId, username, name, age, professional, city, sex));
        }
        JavaRDD<Row> rowRDD = jsc.parallelize(rows);
        StructType st = DataTypes.createStructType(Arrays.asList(
                DataTypes.createStructField("user_id", DataTypes.LongType, false),
                DataTypes.createStructField("username", DataTypes.StringType, true),
                DataTypes.createStructField("name", DataTypes.StringType, true),
                DataTypes.createStructField("age", DataTypes.IntegerType, true),
                DataTypes.createStructField("professional", DataTypes.StringType, true),
                DataTypes.createStructField("city", DataTypes.StringType, true),
                // each row carries a sex value, so the schema must declare the column
                DataTypes.createStructField("sex", DataTypes.StringType, true)));
        /*
         * 2. Turn the JavaRDD<Row> into a DataFrame.
         */
        DataFrame df = sqlContext.createDataFrame(rowRDD, st);
        df.printSchema();
        /*
         * 3. Register the in-memory data as a temporary table.
         */
        df.registerTempTable("user_info");
    }

    /**
     * Mock the data of the user_visit_action table in Hive.
     */
    public static void mockUserSessionInfo(JavaSparkContext jsc, SQLContext sqlContext)
    {
        List<Row> rows = new ArrayList<Row>();
        Random random = new Random();
        String date = DateUtils.formatDate(new Date());
        String[] searchKeyWords = new String[]{"钢琴", "吉他", "hadoop", "spark"};
        // 100 users x 10 sessions x 10 actions = 10,000 mocked rows
        for (int i = 0; i < 100; i++)
        {
            long userId = random.nextInt(100);
            for (int j = 0; j < 10; j++)
            {
                String sessionId = UUID.randomUUID().toString();
                for (int k = 0; k < 10; k++)
                {
                    String searchKeyWord = null;
                    Long clickCategoryId = null;
                    Long clickProductId = null;
                    String orderCategoryIds = null;
                    String orderProductIds = null;
                    String payCategoryIds = null;
                    String payProductIds = null;
                    long pageId = random.nextInt(10);
                    String actionTime = date + " " + random.nextInt(24)
                            + ":" + random.nextInt(60) + ":" + random.nextInt(60);
                    // Pick a random action type and fill in only the columns it uses
                    ActionType actionType = ActionType.values()[random.nextInt(ActionType.values().length)];
                    switch (actionType)
                    {
                        case SEARCH:
                            searchKeyWord = searchKeyWords[random.nextInt(4)];
                            break;
                        case CLICK:
                            clickCategoryId = Long.valueOf(random.nextInt(100));
                            clickProductId = Long.valueOf(random.nextInt(100));
                            break;
                        case ORDER:
                            orderCategoryIds = getRandomStringArrs();
                            orderProductIds = getRandomStringArrs();
                            break;
                        case PAY:
                            payCategoryIds = getRandomStringArrs();
                            payProductIds = getRandomStringArrs();
                            break;
                        default:
                            break;
                    }
                    rows.add(RowFactory.create(date, userId, sessionId, pageId, actionTime,
                            searchKeyWord, clickCategoryId, clickProductId,
                            orderCategoryIds, orderProductIds, payCategoryIds, payProductIds));
                }
            }
        }
        StructType type = DataTypes.createStructType(Arrays.asList(
                DataTypes.createStructField("date", DataTypes.StringType, false),
                DataTypes.createStructField("user_id", DataTypes.LongType, true),
                DataTypes.createStructField("session_id", DataTypes.StringType, true),
                DataTypes.createStructField("page_id", DataTypes.LongType, true),
                DataTypes.createStructField("action_time", DataTypes.StringType, true),
                DataTypes.createStructField("search_keyword", DataTypes.StringType, true),
                DataTypes.createStructField("click_category_id", DataTypes.LongType, true),
                DataTypes.createStructField("click_product_id", DataTypes.LongType, true),
                DataTypes.createStructField("order_category_ids", DataTypes.StringType, true),
                DataTypes.createStructField("order_product_ids", DataTypes.StringType, true),
                DataTypes.createStructField("pay_category_ids", DataTypes.StringType, true),
                DataTypes.createStructField("pay_product_ids", DataTypes.StringType, true)));
        DataFrame df = sqlContext.createDataFrame(jsc.parallelize(rows), type);
        df.registerTempTable("user_visit_action");

        /************** sanity check: print one mocked row **************/
        for (Row row : df.toJavaRDD().take(1))
        {
            System.out.println(row);
        }
    }

    /**
     * Build a comma-separated string of a random number (0-6) of ids,
     * e.g. a handful of click_category_id values.
     */
    private static String getRandomStringArrs()
    {
        Random random = new Random();
        StringBuilder sb = new StringBuilder();
        int ids = random.nextInt(7);
        for (int i = 0; i < ids; i++)
        {
            sb.append(random.nextInt(100));
            if (i < ids - 1)
            {
                sb.append(",");
            }
        }
        return sb.toString();
    }
}
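LocalDataGenerator also calls a DateUtils.formatDate helper that the post never shows. Judging from the printed row below ("2016-06-28"), it renders today's date as yyyy-MM-dd; a minimal sketch under that assumption:

package cool.pengych.sparker.util;

import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * Hypothetical sketch of the DateUtils helper; the yyyy-MM-dd pattern is
 * inferred from the output printed below, not taken from the project.
 */
public class DateUtils
{
    public static String formatDate(Date date)
    {
        return new SimpleDateFormat("yyyy-MM-dd").format(date);
    }
}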
Execution output of LocalDataGenerator
16/06/28 00:05:04 INFO BlockManagerMaster: Registered BlockManager
root
 |-- user_id: long (nullable = false)
 |-- username: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- professional: string (nullable = true)
 |-- city: string (nullable = true)
16/06/28 00:05:08 INFO SparkContext: Starting job: take at LocalDataGenerator.java:193
16/06/28 00:05:09 INFO DAGScheduler: Job 0 finished: take at LocalDataGenerator.java:193, took 1.128053 s
[2016-06-28,60,e689c129-2bac-4fc0-82f4-2c17bee21066,0,2016-06-28 1:44:29,null,null,null,,65,66,40,12,43,74,null,null]
16/06/28 00:05:09 INFO SparkContext: Invoking stop() from shutdown hook
Summary
00:07 Good night, Spark.