Spark — DataSource (JSON) — Java
Source: Internet · Editor: programming blog network · Time: 2024/06/06 05:47
package com.spark.sparksql.datasource.java;

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import scala.Tuple2;

/**
 * Spark SQL (1.x) JSON data-source example.
 *
 * <p>Reads student scores from {@code students.json}, selects students scoring
 * at least 80, looks up their basic info from an in-memory JSON data set,
 * joins the two by name via the RDD API, and writes the combined
 * (name, age, score) records as JSON to the {@code goodStudentJson} directory
 * (overwriting any previous output).
 */
public class JSONDataSource {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("JSONDataSource").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            SQLContext sqlContext = new SQLContext(sc);

            // Load the score file and register it as a temp table so it can be
            // queried with SQL.
            DataFrame studentScoresDF = sqlContext.read().json("students.json");
            studentScoresDF.registerTempTable("student_scores");

            // Names of students whose score is >= 80.
            DataFrame goodStudentNamesDF =
                sqlContext.sql("select name, score from student_scores where score >= 80");

            // The DataFrame rows are Row objects; extract just the name column
            // (index 0, fixed by the SELECT above) into plain Strings.
            List<String> goodStudentNames = goodStudentNamesDF.toJavaRDD()
                .map(new Function<Row, String>() {
                    private static final long serialVersionUID = 1L;
                    @Override
                    public String call(Row row) throws Exception {
                        return row.getString(0);
                    }
                })
                .collect();

            // Guard: with no qualifying students the original code generated the
            // invalid SQL fragment "in ()" and crashed at parse time.
            if (goodStudentNames.isEmpty()) {
                System.out.println("No students with score >= 80; nothing to write.");
                return;
            }

            // Second data set: basic student info as in-memory JSON strings,
            // parsed into a DataFrame and registered as a temp table.
            List<String> studentInfoJSONs = new ArrayList<String>();
            studentInfoJSONs.add("{\"name\":\"Yasaka\",\"age\":18}");
            studentInfoJSONs.add("{\"name\":\"Xuruyun\",\"age\":17}");
            studentInfoJSONs.add("{\"name\":\"Liangyongqi\",\"age\":19}");
            JavaRDD<String> studentInfosRDD = sc.parallelize(studentInfoJSONs);
            DataFrame studentInfosDF = sqlContext.read().json(studentInfosRDD);
            studentInfosDF.registerTempTable("student_infos");

            // Build the IN (...) list with a StringBuilder (the original used
            // String += in a loop, which is O(n^2)).
            // NOTE(review): concatenating values into SQL is injection-prone in
            // general; here the names come from our own data, but a DataFrame
            // join/filter would be the safer idiom.
            StringBuilder sql =
                new StringBuilder("select name, age from student_infos where name in (");
            for (int i = 0; i < goodStudentNames.size(); i++) {
                if (i > 0) {
                    sql.append(",");
                }
                sql.append("'").append(goodStudentNames.get(i)).append("'");
            }
            sql.append(")");
            System.out.println(sql);

            DataFrame goodStudentInfosDF = sqlContext.sql(sql.toString());

            // Join the two data sets by student name using the RDD pair API:
            // (name, age) join (name, score) -> (name, (age, score)).
            JavaPairRDD<String, Tuple2<Integer, Integer>> goodStudentsRDD =
                goodStudentInfosDF.javaRDD()
                    .mapToPair(new PairFunction<Row, String, Integer>() {
                        private static final long serialVersionUID = 1L;
                        @Override
                        public Tuple2<String, Integer> call(Row row) throws Exception {
                            // Column order fixed by "select name, age": 0=name, 1=age.
                            return new Tuple2<String, Integer>(
                                String.valueOf(row.get(0)),
                                Integer.valueOf(String.valueOf(row.get(1))));
                        }
                    })
                    .join(studentScoresDF.javaRDD()
                        .mapToPair(new PairFunction<Row, String, Integer>() {
                            private static final long serialVersionUID = 1L;
                            @Override
                            public Tuple2<String, Integer> call(Row row) throws Exception {
                                // Raw JSON DataFrame: assumes column 0 is name and
                                // column 1 is score — TODO confirm against the
                                // inferred schema of students.json.
                                return new Tuple2<String, Integer>(
                                    String.valueOf(row.get(0)),
                                    Integer.valueOf(String.valueOf(row.get(1))));
                            }
                        }));

            // Flatten the joined pairs back into Rows of (name, age, score).
            JavaRDD<Row> goodStudentRowsRDD = goodStudentsRDD.map(
                new Function<Tuple2<String, Tuple2<Integer, Integer>>, Row>() {
                    private static final long serialVersionUID = 1L;
                    @Override
                    public Row call(Tuple2<String, Tuple2<Integer, Integer>> tuple)
                            throws Exception {
                        return RowFactory.create(tuple._1, tuple._2._1, tuple._2._2);
                    }
                });

            // Explicit schema for the result DataFrame.
            List<StructField> fields = new ArrayList<StructField>();
            fields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
            fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
            fields.add(DataTypes.createStructField("score", DataTypes.IntegerType, true));
            StructType structType = DataTypes.createStructType(fields);

            DataFrame goodStudentDF = sqlContext.createDataFrame(goodStudentRowsRDD, structType);
            goodStudentDF.write().format("json").mode(SaveMode.Overwrite).save("goodStudentJson");
        } finally {
            // The original leaked the context; always stop it so the local
            // Spark runtime shuts down cleanly.
            sc.close();
        }
    }
}
Read full article
0 0
- Spark---Datasource(JSON)---java
- Spark---Datasource(JSON)---Scala
- Spark---Datasource(JDBC)---java
- Spark--Datasource(Hive)
- Spark---Datasource(JDBC)---Scala
- java datasource
- JAVA:如何创建 DataSource
- DataSource Java 官方 DBCP
- Java DataSource 访问数据库
- Spark SQL之External DataSource外部数据源
- Spring中使用Spark连接的DataSource
- Spark中External Datasource实现数据迁移
- java建立临时数据源DataSource
- DataSource
- DataSource
- datasource
- dataSource
- dataSource
- Android开发--全局类的实现(用于保存使用的全局变量)
- jQuery选择器
- COCOS抽奖转盘实现
- 微信jssdk使用
- 随机森林算法(RandomForest,RF)
- Spark---Datasource(JSON)---java
- 5. AsyncTask
- GBDT的小结(来自论文greedy function approximation: a gradient boosting machine)
- 功夫小子实践开发-Menu家族学习及开始菜单场景的实现
- 小白学tkinter(entry的验证函数)
- 数据结构之链表
- Fragment里ScrollView嵌套RecyclerView显示不全的问题
- 基于H5+ HBuilder开发app android离线打包步骤 记录比较关键点
- [离散数学]偏序与全序的区别、解释