将 Java RDD 的结果写入 Hive 表

来源：互联网 ｜ 发布：代码审查工具 php ｜ 编辑：程序博客网 ｜ 时间：2024/06/07 06:15

情况一：只需插入一列

JavaRDD<String> titleParticiple = ....;

/*
 * Save the word-segmentation results to a Hive table so they can be used
 * for data exploration.
 *
 * A single HiveContext both builds the DataFrame and runs the SQL, so the
 * temporary view is registered on the same context that later queries it.
 * HiveContext is a SQLContext, so no separate SQLContext is required.
 */
HiveContext hiveCtx = new HiveContext(jsc);

// Wrap every input string in a single-column Row.
JavaRDD<Row> brandRDD = titleParticiple.map(new Function<String, Row>() {

    private static final long serialVersionUID = 1L;

    @Override
    public Row call(String line) throws Exception {
        return RowFactory.create(line);
    }
});

// 1. Build the DataFrame metadata dynamically. In general, the number of
//    columns and the type of each column may come from a JSON file or a database.
List<StructField> structFields = new ArrayList<StructField>();
structFields.add(DataTypes.createStructField("brand", DataTypes.StringType, true));

// 2. Build the StructType that describes the DataFrame schema.
StructType structType = DataTypes.createStructType(structFields);

// 3. Construct the DataFrame from the schema and the RDD<Row>.
Dataset<Row> brandDF = hiveCtx.createDataFrame(brandRDD, structType);

// 4. Register a temporary view for the SQL statements below
//    (createOrReplaceTempView supersedes the deprecated registerTempTable).
brandDF.createOrReplaceTempView("brands");

hiveCtx.sql("use sousuo"); // switch to the sousuo database
hiveCtx.sql("drop table if exists sousuo.temp_yeqingyun_20170913"); // drop the previous table
hiveCtx.sql("CREATE TABLE IF NOT EXISTS sousuo.temp_yeqingyun_20170913 (brand STRING)"); // create the table

// Copy every row of the "brands" view into temp_yeqingyun_20170913.
hiveCtx.sql("insert into sousuo.temp_yeqingyun_20170913 select brand from brands");

情况二：需要插入多列，且列的类型既有 int 也有 String：

JavaPairRDD<String, String> brandTypeGoodsPair = “...”;

HiveContext hiveCtx = new HiveContext(jsc);

SQLContext sqlCtx = new SQLContext(jsc);

JavaRDD<Row> brandRDD = brandTypeGoodsPair.map(new Function<Tuple2<String, String>, Row>() {
private static final long serialVersionUID = 1L;
int i=0;
public Row call( Tuple2<String, String> pair) throws Exception {
    i++;
    String[] valueArray = pair._2().split(":");
    String value0 = valueArray[0];
    int value1 = Integer.parseInt(valueArray[1]);
return RowFactory.create(i, pair._1, value0, value1);
}
});

List<StructField> structFields = new ArrayList<StructField>();
structFields.add(DataTypes.createStructField( "id", DataTypes.IntegerType, true ));
structFields.add(DataTypes.createStructField( "directory3", DataTypes.StringType, true ));
structFields.add(DataTypes.createStructField( "brandItemModel", DataTypes.StringType, true ));
structFields.add(DataTypes.createStructField( "num", DataTypes.IntegerType, true ));
StructType structType = DataTypes.createStructType( structFields );
Dataset<Row> brandDF = sqlCtx.createDataFrame(brandRDD,structType);
brandDF.registerTempTable("brands_test2");
hiveCtx.sql("use sousuo");
hiveCtx.sql("drop table if exists sousuo.temp_yeqingyun_test2_20170913");
hiveCtx.sql("CREATE TABLE IF NOT EXISTS sousuo.temp_yeqingyun_test2_20170913 (id INT, directory3 STRING, brandItemModel STRING, num INT)");
hiveCtx.sql("insert into sousuo.temp_yeqingyun_test2_20170913 select id,directory3,brandItemModel,num from brands_test2");

阅读全文

0 0