Two ways to load mysql tables into hdfs via spark
There are two ways to load MySQL tables into HDFS via Spark and then process the data.
- Load mysql tables: use JdbcRDD directly
package org.apache.spark.examples.sql

import java.sql.{ Connection, DriverManager, ResultSet }

import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.{ SparkConf, SparkContext }
object LoadFromMysql {

  // Replace characters that would break the tab-separated output:
  // tabs are the field delimiter and newlines are the record delimiter
  def escape(ori: String): String = {
    if (ori != null) {
      ori.replace("\t", " ").replace("\n", " ")
    } else {
      ori
    }
  }
  def main(args: Array[String]) {
    if (args.length != 6) {
      System.err.println("Usage: LoadFromMysql <url> <username> <password> <table> <id> <output>")
      System.exit(1)
    }
    val Array(url, username, password, table, id, output) = args

    val sparkConf = new SparkConf().setAppName("LoadFromMysql")
    val sc = new SparkContext(sparkConf)

    val lines_each_part = 2000000 // number of rows each part file should contain
    Class.forName("com.mysql.jdbc.Driver").newInstance
    val connection = DriverManager.getConnection(url, username, password)
    // for partitions, get lower_bound and upper_bound
    ......
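    // The bound computation is elided above. A possible sketch (an assumption,
    // not the original author's code): take MIN/MAX of the partition column over
    // JDBC and derive the partition count from lines_each_part.
    val rs = connection.createStatement().executeQuery(
      "select min(" + id + "), max(" + id + ") from " + table)
    rs.next()
    val lower_bound = rs.getLong(1)
    val upper_bound = rs.getLong(2)
    val partitions = ((upper_bound - lower_bound) / lines_each_part + 1).toInt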
    // JdbcRDD splits the id range [lower_bound, upper_bound] into `partitions` sub-ranges;
    // the two '?' placeholders are bound to each sub-range's lower and upper id
    val myRDD = new JdbcRDD(sc, () => DriverManager.getConnection(url, username, password),
      "select * from " + table + " where " + id + " >= ? and " + id + " <= ?",
      lower_bound, upper_bound, partitions, r => {
        // Concatenate all columns of the row into one tab-separated line
        var result = escape(r.getString(1))
        var i = 2
        while (i <= r.getMetaData.getColumnCount) {
          result = result + "\t" + escape(r.getString(i))
          i += 1
        }
        result
      })
    myRDD.saveAsTextFile(output)
  }
}
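The saved output is plain tab-separated text, so it can be exposed to Hive as an external table and queried from hive-shell or spark-sql. A minimal sketch using HiveContext, assuming a hypothetical table name mysql_import, placeholder columns and output path (the real schema must match the MySQL table):
import org.apache.spark.sql.hive.HiveContext

val hiveContext = new HiveContext(sc)
// Map the tab-separated dump to an external Hive table (columns and LOCATION are placeholders)
hiveContext.sql(
  """CREATE EXTERNAL TABLE IF NOT EXISTS mysql_import (
    |  id BIGINT,
    |  name STRING
    |)
    |ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    |LOCATION '/path/to/output'""".stripMargin)
// Query the external table like any other Hive table
hiveContext.sql("SELECT count(*) FROM mysql_import").collect().foreach(println)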
Process data: create a Hive external table over the output (as sketched above) and process it with hive-shell or spark-sql commands.
- Load mysql tables: use SQLContext.load and save the table in Parquet format
The SQLContext approach is also based on JdbcRDD underneath; Spark simply provides extra Parquet support through SQLContext.
package org.apache.spark.examples.sql

import java.sql.{ Connection, DriverManager, ResultSet }
import java.util.HashMap

import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.SparkConf

/**
 * @author ChenFangFang
 */
object LoadFromMysql_SqlContext {
  def main(args: Array[String]) {
    if (args.length != 6) {
      System.err.println("Usage: LoadFromMysql_SqlContext <url> <username> <password> <table> <id> <output>")
      System.exit(1)
    }
    val Array(url, username, password, table, id, output) = args

    val lines_each_part = 2000000 // number of rows each part file should contain
    Class.forName("com.mysql.jdbc.Driver").newInstance
    val connection = DriverManager.getConnection(url, username, password)
    // for partitions, get lower_bound and upper_bound (same as above)
    ......

    val sc = new JavaSparkContext(new SparkConf().setAppName("LoadFromMysql"))
    val sqlContext = new SQLContext(sc)

    // JDBC data source options; partitionColumn, the bounds and numPartitions
    // control how the table is split across tasks
    val url_total = url + "?user=" + username + "&password=" + password
    val options: HashMap[String, String] = new HashMap
    options.put("driver", "com.mysql.jdbc.Driver")
    options.put("url", url_total)
    options.put("dbtable", table)
    options.put("lowerBound", lower_bound.toString)
    options.put("upperBound", upper_bound.toString)
    options.put("numPartitions", partitions.toString)
    options.put("partitionColumn", id)

    // Load the table as a DataFrame through the JDBC data source and save it as Parquet
    val jdbcDF = sqlContext.load("jdbc", options)
    jdbcDF.save(output)
  }
}
Process data: use spark-shell directly on the Parquet output:
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
val df = sqlContext.parquetFile(...).toDF()
df.registerTempTable("parquetTable")
sqlContext.sql("SELECT * FROM parquetTable where id=1").collect().foreach(println)