Two ways to load MySQL tables into HDFS via Spark

There are two ways to load MySQL tables into HDFS via Spark and then process the data.
  1. Load MySQL tables: use JdbcRDD directly
    package org.apache.spark.examples.sql

    import java.sql.DriverManager
    import org.apache.spark.rdd.JdbcRDD
    import org.apache.spark.{ SparkConf, SparkContext }

    object LoadFromMysql {

      // escape characters that would break the tab-delimited output
      def escape(ori: String): String = {
        if (ori != null) {
          ori.replace("&", "&amp;").replace("\t", "\\t").replace("\n", "\\n")
        } else {
          ori
        }
      }
     
      def main(args: Array[String]) {
        if (args.length != 6) {
          System.err.println("Usage LoadFromMysql <url> <username> <password> <table> <id> <output>")
          System.exit(1)
        }

        val Array(url, username, password, table, id, output) = args
        val sparkConf = new SparkConf().setAppName("LoadFromMysql")
        val sc = new SparkContext(sparkConf)
        val lines_each_part = 2000000 // number of rows per output part file

        Class.forName("com.mysql.jdbc.Driver").newInstance

        val connection = DriverManager.getConnection(url, username, password)

        // for partitions, get lower_bound and upper_bound
    ......
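        // One possible way to produce lower_bound, upper_bound and partitions for JdbcRDD
        // (a sketch under assumptions, not the original elided code): read MIN/MAX of the
        // id column and size the partitions from lines_each_part.
        val boundsRs = connection.createStatement()
          .executeQuery("select min(" + id + "), max(" + id + ") from " + table)
        boundsRs.next()
        val lower_bound = boundsRs.getLong(1)
        val upper_bound = boundsRs.getLong(2)
        val partitions = (((upper_bound - lower_bound) / lines_each_part) + 1).toInt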
        val myRDD = new JdbcRDD(sc, () => DriverManager.getConnection(url,username, password),
          "select * from " + table + " where "+id+" >= ? and "+id+" <= ?", lower_bound, upper_bound, partitions, r => {
            var result = escape(r.getString(1))
            var i = 2
            while (i <= r.getMetaData.getColumnCount) {
              result = result + "\t" + escape(r.getString(i))
              i += 1
            }
            result
          })

        myRDD.saveAsTextFile(output)
      }

    }
    Process data: create a Hive external table over the tab-separated output and query it with Hive (hive shell or spark-sql), as sketched below.
  2. Load MySQL tables: SQLContext.load, then save the table in Parquet format
    The SQLContext way is also based on JdbcRDD; SQLContext simply adds Parquet support on top of it.
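    A minimal sketch of that step from spark-shell with a HiveContext (the table name mysql_export, its columns and the HDFS location are placeholders, not from the original job):
    val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc)
    // map the tab-separated export onto an external table; the columns must match the MySQL schema
    hiveContext.sql("CREATE EXTERNAL TABLE IF NOT EXISTS mysql_export (id BIGINT, name STRING) " +
      "ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\t' LOCATION '/user/output/mysql_export'")
    hiveContext.sql("SELECT * FROM mysql_export WHERE id = 1").collect().foreach(println)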
    package org.apache.spark.examples.sql

    import org.apache.spark.sql.SQLContext
    import java.sql.DriverManager
    import org.apache.spark.SparkConf
    import java.util.HashMap
    import org.apache.spark.api.java.JavaSparkContext

    /**
    * @author ChenFangFang
    */
    object LoadFromMysql_SqlContext {

      def main(args: Array[String]) {
        if (args.length != 6) {
          System.err.println("Usage LoadFromMysql_SqlContext <url> <username> <password> <table> <id> <output>")
          System.exit(1)
        }

        val Array(url, username, password, table, id, output) = args
        val sparkConf = new SparkConf().setAppName("SqlKeywordCount")
        val lines_each_part = 2000000 //row lines each part file include

        Class.forName("com.mysql.jdbc.Driver").newInstance

        val connection = DriverManager.getConnection(url, username, password)

        // for partitions, get lower_bound and upper_bound
    ...... 
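        // lower_bound, upper_bound and partitions can be derived the same way as in the
        // JdbcRDD example above (the MIN/MAX sketch); concrete values are assumed here.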

        val sc = new JavaSparkContext(sparkConf)
        val sqlContext = new SQLContext(sc)
        val url_total = url + "?user=" + username + "&password=" + password;

        var options: HashMap[String, String] = new HashMap
        options.put("driver", "com.mysql.jdbc.Driver")
        options.put("url", url_total)
        options.put("dbtable", table) 
        options.put("lowerBound", lower_bound.toString())
        options.put("upperBound", upper_bound.toString())
        options.put("numPartitions", partitions.toString());
        options.put("partitionColumn", id);

        val jdbcDF = sqlContext.load("jdbc", options)
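        // save() with no explicit format writes the default data source (Parquet),
        // which is what the spark-shell processing below reads back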
        jdbcDF.save(output)
      }
    }
    Process data: query the Parquet output directly from spark-shell:
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    import sqlContext.implicits._
    val df = sqlContext.parquetFile(...).toDF()
    df.registerTempTable("parquetTable")
    sqlContext.sql("SELECT * FROM parquetTable where id=1").collect().foreach(println)

