[1.1.0] Writing the SparkUtils Utility Class


Scenario

Extract the Spark code shared by all business sub-modules into a single utility class. It mainly covers:

  • Mocking the business-table data of the production Hive warehouse, for local development and testing
    The business data a Spark application processes is usually large and lives in the Hive warehouse. This project uses two Hive tables: the user visit action table user_visit_action and the user information table user_info. To test the finished Spark application code locally, we have to write code by hand that mocks the data of these tables.
  • Creating the SQLContext and related objects
    By changing a parameter in the configuration file, the same application switches automatically between local test mode and production cluster mode (a configuration sketch follows this list).
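The ConfigurationManager and Constants classes used by SparkUtils below are not listed in this post. As a rough, non-authoritative sketch of how such a config-driven switch can be implemented (the property file name my.properties and the key spark.local are assumptions, not the project's real names):

package cool.pengych.sparker.conf;

import java.io.InputStream;
import java.util.Properties;

/**
 * Minimal sketch of the configuration lookup behind
 * ConfigurationManager.getBoolean(Constants.LOCAL_DEPLOY).
 * The file name "my.properties" and the key "spark.local" are assumptions;
 * the project's real names may differ.
 */
public class ConfigurationManager
{
    private static final Properties PROPS = new Properties();

    static
    {
        // my.properties on the classpath, e.g. containing: spark.local=true
        try (InputStream in = ConfigurationManager.class
                .getClassLoader().getResourceAsStream("my.properties"))
        {
            PROPS.load(in);
        }
        catch (Exception e)
        {
            throw new RuntimeException("failed to load my.properties", e);
        }
    }

    public static boolean getBoolean(String key)
    {
        return Boolean.parseBoolean(PROPS.getProperty(key, "false"));
    }
}

Switching the same application between a local test run and a cluster run is then just a matter of flipping that one property; no application code changes.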

Code

  • The SparkUtils.java class
package cool.pengych.sparker.util;

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.hive.HiveContext;

import com.alibaba.fastjson.JSONObject;

import cool.pengych.sparker.conf.ConfigurationManager;
import cool.pengych.sparker.constant.Constants;

/**
 * Spark code shared by the business modules of the application
 * @author pengyucheng
 */
public class SparkUtils
{
    private static final boolean IS_LOCAL = ConfigurationManager.getBoolean(Constants.LOCAL_DEPLOY);

    /**
     * Deploy mode: local or cluster
     */
    public static void setMaster(SparkConf conf)
    {
        if(IS_LOCAL)
        {
            conf.setMaster("local");
        }
    }

    /**
     * Use HiveContext in production, plain SQLContext for local testing
     * @param sc
     * @return
     */
    public static SQLContext getSQLContext(SparkContext sc)
    {
        if(IS_LOCAL)
        {
            return new SQLContext(sc);
        }
        else
        {
            return new HiveContext(sc);
        }
    }

    /**
     * Mock the Hive warehouse data; used only for local development and testing
     * @param sc
     * @param sqlContext
     */
    public static void mockData(JavaSparkContext sc, SQLContext sqlContext)
    {
        if(IS_LOCAL)
        {
            LocalDataGenerator.mockData(sc, sqlContext);
        }
    }

    /**
     * Fetch the user action data within the given date range
     * @param sqlContext
     * @param taskParam
     * @return JavaRDD<Row>
     */
    public static JavaRDD<Row> getActionRDDByRange(SQLContext sqlContext, JSONObject taskParam)
    {
        String startDate = taskParam.getString(Constants.PARAM_START_DATE);
        String endDate = taskParam.getString(Constants.PARAM_END_DATE);
        // date is a string column, so the literals must be quoted
        String sql = "select * from user_visit_action "
                + "where date >= '" + startDate + "' and date <= '" + endDate + "'";
        return sqlContext.sql(sql).toJavaRDD();
    }
}
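For reference, here is a usage sketch showing how a business module might wire these helpers together. The package, class name, application name and the task-parameter JSON keys are assumptions made for illustration, not taken from the project:

package cool.pengych.sparker.session;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

import com.alibaba.fastjson.JSONObject;

import cool.pengych.sparker.util.SparkUtils;

/**
 * Hypothetical driver showing how a business module could call SparkUtils.
 */
public class UserSessionAnalyzeSpark
{
    public static void main(String[] args)
    {
        SparkConf conf = new SparkConf().setAppName("UserSessionAnalyzeSpark");
        SparkUtils.setMaster(conf);  // sets the local master only when LOCAL_DEPLOY is true

        JavaSparkContext jsc = new JavaSparkContext(conf);
        // SQLContext for local tests, HiveContext on the cluster
        SQLContext sqlContext = SparkUtils.getSQLContext(jsc.sc());

        // populates user_info and user_visit_action; a no-op outside local mode
        SparkUtils.mockData(jsc, sqlContext);

        // task parameters normally come from a task table; hard-coded here, and the
        // key names are assumed to match Constants.PARAM_START_DATE / PARAM_END_DATE
        JSONObject taskParam = JSONObject.parseObject(
                "{\"startDate\":\"2016-06-01\",\"endDate\":\"2016-06-30\"}");
        JavaRDD<Row> actionRDD = SparkUtils.getActionRDDByRange(sqlContext, taskParam);
        System.out.println("actions in range: " + actionRDD.count());

        jsc.close();
    }
}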
  • The LocalDataGenerator.java class
package cool.pengych.sparker.util;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.UUID;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

import scala.util.Random;

/**
 * User action types:
 * search, click, order and pay
 * @author pengyucheng
 */
enum ActionType
{
    SEARCH, CLICK, ORDER, PAY;
}

/**
 * Test-data generator for local runs of the Spark application
 * @author pengyucheng
 */
public class LocalDataGenerator
{
    /**
     * Mock the production Hive table data; local testing only
     * @param jsc
     * @param sqlContext
     */
    public static void mockData(JavaSparkContext jsc, SQLContext sqlContext)
    {
        mockUserInfo(jsc, sqlContext);
        mockUseSessionInfo(jsc, sqlContext);
    }

    /**
     * Mock the data of the user_info table in Hive
     * @param jsc
     * @param sqlContext
     */
    public static void mockUserInfo(JavaSparkContext jsc, SQLContext sqlContext)
    {
        /*
         * 1. Build a List<Row> and define the type of every column of the rows:
         *    this turns the unstructured data into structured data
         */
        List<Row> rows = new ArrayList<Row>();
        Row row = null;
        long userId = 0;
        String username = null;
        String name = null;
        int age = 0;
        String professional = null;
        String city = null;
        String[] sexStrs = new String[]{"man", "woman"};
        String sex = null;
        Random random = new Random(666);
        for (int i = 0; i < 100; i++)
        {
            userId = random.nextInt(100);
            username = "username" + userId;
            name = "name" + userId;
            age = random.nextInt(100);
            professional = "professional" + userId;
            city = "city" + userId;
            sex = sexStrs[random.nextInt(2)];
            row = RowFactory.create(userId, username, name, age, professional, city, sex);
            rows.add(row);
        }
        JavaRDD<Row> rowRDD = jsc.parallelize(rows);
        // the schema must cover all seven columns of the rows above, including "sex"
        StructType st = DataTypes.createStructType(Arrays.asList(
                DataTypes.createStructField("user_id", DataTypes.LongType, false),
                DataTypes.createStructField("username", DataTypes.StringType, true),
                DataTypes.createStructField("name", DataTypes.StringType, true),
                DataTypes.createStructField("age", DataTypes.IntegerType, true),
                DataTypes.createStructField("professional", DataTypes.StringType, true),
                DataTypes.createStructField("city", DataTypes.StringType, true),
                DataTypes.createStructField("sex", DataTypes.StringType, true)
                ));
        /*
         * 2. Convert the rowRDD into a DataFrame
         */
        DataFrame df = sqlContext.createDataFrame(rowRDD, st);
        df.printSchema();
        /*
         * 3. Register the in-memory data as a temporary table
         */
        df.registerTempTable("user_info");
    }

    /**
     * Mock the data of the user_visit_action table in Hive
     * @param jsc
     * @param sqlContext
     */
    public static void mockUseSessionInfo(JavaSparkContext jsc, SQLContext sqlContext)
    {
        List<Row> rows = new ArrayList<Row>();
        Random random = new Random();
        // DateUtils is the project's date helper (same package, hence no import)
        String date = DateUtils.formatDate(new Date());
        String[] searchKeyWords = new String[]{"钢琴", "吉他", "hadoop", "spark"};
        // 100 users, 10 sessions per user, 10 actions per session
        for (int i = 0; i < 100; i++)
        {
            long userId = random.nextInt(100);
            for (int j = 0; j < 10; j++)
            {
                String sessionId = UUID.randomUUID().toString();
                for (int k = 0; k < 10; k++)
                {
                    String searchKeyWord = null;
                    Long clickCategoryId = null;
                    Long clickProductId = null;
                    String orderCategoryIds = null;
                    String orderProductIds = null;
                    String payCategoryIds = null;
                    String payProductIds = null;
                    long pageid = random.nextInt(10);
                    String actionTime = date + " " + random.nextInt(24) + ":" + random.nextInt(60) + ":" + random.nextInt(60);
                    // pick one of the four action types at random and fill in only
                    // the columns that belong to it, leaving the others null
                    ActionType actionType = ActionType.values()[random.nextInt(ActionType.values().length)];
                    switch (actionType)
                    {
                    case SEARCH:
                        searchKeyWord = searchKeyWords[random.nextInt(4)];
                        break;
                    case CLICK:
                        clickCategoryId = Long.valueOf(String.valueOf(random.nextInt(100)));
                        clickProductId = Long.valueOf(String.valueOf(random.nextInt(100)));
                        break;
                    case ORDER:
                        orderCategoryIds = getRandomStringArrs();
                        orderProductIds = getRandomStringArrs();
                        break;
                    case PAY:
                        payCategoryIds = getRandomStringArrs();
                        payProductIds = getRandomStringArrs();
                        break;
                    default:
                        break;
                    }
                    Row row = RowFactory.create(date, userId, sessionId,
                            pageid, actionTime, searchKeyWord,
                            clickCategoryId, clickProductId,
                            orderCategoryIds, orderProductIds,
                            payCategoryIds, payProductIds);
                    rows.add(row);
                }
            }
        }
        StructType type = DataTypes.createStructType(Arrays.asList(
                DataTypes.createStructField("date", DataTypes.StringType, false),
                DataTypes.createStructField("user_id", DataTypes.LongType, true),
                DataTypes.createStructField("session_id", DataTypes.StringType, true),
                DataTypes.createStructField("page_id", DataTypes.LongType, true),
                DataTypes.createStructField("action_time", DataTypes.StringType, true),
                DataTypes.createStructField("search_keyword", DataTypes.StringType, true),
                DataTypes.createStructField("click_category_id", DataTypes.LongType, true),
                DataTypes.createStructField("click_product_id", DataTypes.LongType, true),
                DataTypes.createStructField("order_category_ids", DataTypes.StringType, true),
                DataTypes.createStructField("order_product_ids", DataTypes.StringType, true),
                DataTypes.createStructField("pay_category_ids", DataTypes.StringType, true),
                DataTypes.createStructField("pay_product_ids", DataTypes.StringType, true)));
        DataFrame df = sqlContext.createDataFrame(jsc.parallelize(rows), type);
        df.registerTempTable("user_visit_action");

        /************** for testing only **************/
        List<Row> rows2 = df.toJavaRDD().take(1);
        for (Row row : rows2)
        {
            System.out.println(row);
        }
    }

    /**
     * Build a comma-separated string of a random number of ids,
     * e.g. a list of click_category_id values
     * @return the id string
     */
    private static String getRandomStringArrs()
    {
        Random random = new Random();
        StringBuffer sb = new StringBuffer();
        int ids = random.nextInt(7);
        for (int i = 0; i < ids; i++)
        {
            sb.append(String.valueOf(random.nextInt(100)));
            if (i < ids - 1)
            {
                sb.append(",");
            }
        }
        return sb.toString();
    }
}
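To sanity-check the generator on its own, a small driver like the following can be used. This is only a sketch under the assumption of a local master and the Spark 1.x SQLContext API used above; the class name and the two queries are invented for illustration:

package cool.pengych.sparker.util;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;

/**
 * Hypothetical smoke test for LocalDataGenerator (not part of the project).
 */
public class LocalDataGeneratorTest
{
    public static void main(String[] args)
    {
        SparkConf conf = new SparkConf()
                .setAppName("LocalDataGeneratorTest")
                .setMaster("local");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(jsc.sc());

        // registers the user_info and user_visit_action temp tables
        LocalDataGenerator.mockData(jsc, sqlContext);

        // the temp tables can now be queried like ordinary Hive tables
        sqlContext.sql("select sex, count(*) as cnt from user_info group by sex").show();
        sqlContext.sql("select count(distinct session_id) as sessions from user_visit_action").show();

        jsc.close();
    }
}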

Execution result of the LocalDataGenerator class

16/06/28 00:05:04 INFO BlockManagerMaster: Registered BlockManager
root
 |-- user_id: long (nullable = false)
 |-- username: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- professional: string (nullable = true)
 |-- city: string (nullable = true)
16/06/28 00:05:08 INFO SparkContext: Starting job: take at LocalDataGenerator.java:193
16/06/28 00:05:09 INFO DAGScheduler: Job 0 finished: take at LocalDataGenerator.java:193, took 1.128053 s
[2016-06-28,60,e689c129-2bac-4fc0-82f4-2c17bee21066,0,2016-06-28 1:44:29,null,null,null,,65,66,40,12,43,74,null,null]
16/06/28 00:05:09 INFO SparkContext: Invoking stop() from shutdown hook

Summary

00:07. Good night, Spark.
