Integrating Spark with Elasticsearch

The demos below share one Maven POM; the relevant dependencies (Scala, Spark core/streaming/SQL, elasticsearch-hadoop, plus Kafka, Redis and JSON helpers) are:

<dependencies>
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>${scala.version}</version>
        <scope>compile</scope>
    </dependency>
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-compiler</artifactId>
        <version>${scala.version}</version>
        <scope>compile</scope>
    </dependency>
    <!--
    <dependency>
        <groupId>org.specs2</groupId>
        <artifactId>specs2_${scala.binary.version}</artifactId>
        <version>3.3.1</version>
        <scope>test</scope>
    </dependency>
    -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_${scala.binary.version}</artifactId>
        <version>${spark.version}</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_${scala.binary.version}</artifactId>
        <version>${spark.version}</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming-kafka_${scala.binary.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_${scala.binary.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.elasticsearch</groupId>
        <artifactId>elasticsearch-hadoop</artifactId>
        <version>${elasticsearch.version}</version>
        <exclusions>
            <exclusion>
                <groupId>org.apache.spark</groupId>
                <artifactId>spark-core_2.10</artifactId>
            </exclusion>
            <exclusion>
                <groupId>org.apache.spark</groupId>
                <artifactId>spark-sql_2.10</artifactId>
            </exclusion>
            <exclusion>
                <groupId>org.apache.storm</groupId>
                <artifactId>storm-core</artifactId>
            </exclusion>
            <exclusion>
                <groupId>cascading</groupId>
                <artifactId>cascading-hadoop</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>redis.clients</groupId>
        <artifactId>jedis</artifactId>
        <version>2.8.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-pool2</artifactId>
        <version>2.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka-clients</artifactId>
        <version>0.9.0.1</version>
    </dependency>
    <dependency>
        <groupId>org.codehaus.jettison</groupId>
        <artifactId>jettison</artifactId>
        <version>1.3.7</version>
    </dependency>
</dependencies>
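The snippet refers to version properties defined elsewhere in the POM. A plausible <properties> block is sketched below; the exact versions are assumptions (not from the original post), so pick a Scala 2.10 / Spark 1.x / elasticsearch-hadoop 2.x combination that matches your cluster:

<!-- Assumed versions; adjust to your environment -->
<properties>
    <scala.version>2.10.5</scala.version>
    <scala.binary.version>2.10</scala.binary.version>
    <spark.version>1.6.0</spark.version>
    <elasticsearch.version>2.2.0</elasticsearch.version>
</properties>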


Demo 1: writing Scala Maps to Elasticsearch as documents

package demo.spark.elasticsearch

import org.apache.spark.{SparkConf, SparkContext}
import org.elasticsearch.spark._

/**
  * Created by cao on 16-3-25.
  */
object Demo1 {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("ESDemo1")
    // Have elasticsearch-hadoop create the target index if it does not exist yet
    conf.set("es.index.auto.create", "true")
    val sc = new SparkContext(conf)

    val numbers = Map("one" -> 1, "two" -> 2, "three" -> 3)
    val airports = Map("arrival" -> "Otopeni", "SFO" -> "San Fran")
    // saveToEs (from org.elasticsearch.spark._) writes each Map as one document
    // into the index/type "spark/docs"
    sc.makeRDD(Seq(numbers, airports)).saveToEs("spark/docs")
  }
}
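To produce the output below, the job is submitted and the index is then searched. The commands are a sketch (the jar name is an assumption about this project's build); note that es.* settings passed through spark-submit must carry the spark. prefix:

spark-submit --class demo.spark.elasticsearch.Demo1 \
  --conf spark.es.nodes=localhost \
  --conf spark.es.port=9200 \
  target/spark-es-demo.jar

curl 'localhost:9200/spark/docs/_search'

The response below reflects several runs against the same spark/docs resource (including Demo 2 below, which writes there too): documents are duplicated under different auto-generated _id values, and Elasticsearch returns only the first 10 of the 12 hits by default.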
{"took":2,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":12,"max_score":1.0,"hits":[{"_index":"spark","_type":"docs","_id":"AVOukOOI0OVBGh8ft4am","_score":1.0,"_source":{"one":1,"two":2,"three":3}},{"_index":"spark","_type":"docs","_id":"AVOu-vRa0OVBGh8ft4a9","_score":1.0,"_source":{"one":1,"two":2,"three":3}},{"_index":"spark","_type":"docs","_id":"AVOu_kMq0OVBGh8ft4a_","_score":1.0,"_source":{"departure":"MUC","arrival":"OTP"}},{"_index":"spark","_type":"docs","_id":"AVOvAVuS0OVBGh8ft4bE","_score":1.0,"_source":{"one":1,"two":2,"three":3}},{"_index":"spark","_type":"docs","_id":"AVOujInV0OVBGh8ft4aj","_score":1.0,"_source":{"arrival":"Otopeni","SFO":"San Fran"}},{"_index":"spark","_type":"docs","_id":"AVOujInn0OVBGh8ft4ak","_score":1.0,"_source":{"one":1,"two":2,"three":3}},{"_index":"spark","_type":"docs","_id":"AVOumniH0OVBGh8ft4as","_score":1.0,"_source":{"departure":"MUC","arrival":"OTP"}},{"_index":"spark","_type":"docs","_id":"AVOumniH0OVBGh8ft4at","_score":1.0,"_source":{"departure":"OTP","arrival":"SFO"}},{"_index":"spark","_type":"docs","_id":"AVOu_kMq0OVBGh8ft4a-","_score":1.0,"_source":{"departure":"OTP","arrival":"SFO"}},{"_index":"spark","_type":"docs","_id":"AVOvAVuJ0OVBGh8ft4bD","_score":1.0,"_source":{"arrival":"Otopeni","SFO":"San Fran"}}]}}


Demo 2: indexing case class instances with EsSpark
package demo.spark.elasticsearch

import org.apache.spark.{SparkConf, SparkContext}
import org.elasticsearch.spark.rdd.EsSpark

/**
  * Created by cao on 16-3-26.
  */

// Case classes used in RDDs are best defined at the top level, not inside main
case class Trip(departure: String, arrival: String)

object Demo2 {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("Demo2"))

    val upcomingTrip = Trip("OTP", "SFO")
    val lastWeekTrip = Trip("MUC", "OTP")
    val rdd = sc.makeRDD(Seq(upcomingTrip, lastWeekTrip))
    // EsSpark.saveToEs is the explicit-API equivalent of rdd.saveToEs(...)
    EsSpark.saveToEs(rdd, "spark/docs")
  }
}
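Every document above was stored under an auto-generated _id, which is why re-running the demos piles up duplicates. If the data has a natural key, the es.mapping.id setting tells elasticsearch-hadoop to use one of the document's fields as its _id, so re-runs overwrite instead of duplicate. A minimal sketch, assuming "departure" is unique enough to serve as a key:

// Use the "departure" field as the Elasticsearch _id
// (assumption: one document per departure airport)
EsSpark.saveToEs(rdd, "spark/docs", Map("es.mapping.id" -> "departure"))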


Demo 3: writing a DataFrame to an index

package demo.spark.elasticsearch

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.elasticsearch.spark.sql._

/**
  * Created by cao on 16-3-26.
  */

// One Person per input line: name,surname,age
case class Person(name: String, surname: String, age: Int)

object Demo3 {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("Demo3"))
    // Create the SQLContext
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Build a DataFrame from a local text file and index it into spark/people
    val people = sc.textFile("file:///home/cao/Desktop/poeple.txt")
      .map(_.split(","))
      .map(p => Person(p(0), p(1), p(2).trim.toInt))
      .toDF()
    people.saveToEs("spark/people")
  }
}
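The input file is expected to hold one person per line in name,surname,age form. The contents below are an assumed example (not from the original post), chosen so that the ?q=wang query in Demo 6 has something to match:

wang,wei,25
li,lei,30
han,meimei,28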

Demo 4: reading an index into a DataFrame through the data source API

package demo.spark.elasticsearch

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

/**
  * Created by cao on 16-3-26.
  */
object Demo4 {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("Demo4"))
    // Create the SQLContext
    val sqlContext = new SQLContext(sc)

    // "pushdown" lets Spark SQL translate filters and projections into
    // Elasticsearch queries instead of evaluating them in Spark
    val options = Map("pushdown" -> "true", "es.nodes" -> "localhost", "es.port" -> "9200")
    val spark14DF = sqlContext.read
      .format("org.elasticsearch.spark.sql")
      .options(options)
      .load("spark/people")

    spark14DF.select("name", "age").collect().foreach(println(_))

    // Register the DataFrame as a temp table and query it with SQL
    spark14DF.registerTempTable("people")
    val results = sqlContext.sql("SELECT name FROM people")
    results.map(t => "Name:" + t(0)).collect().foreach(println)
  }
}
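Because pushdown is enabled, filters and projections applied to the DataFrame are translated into Elasticsearch queries rather than evaluated in Spark. A minimal sketch:

// The age predicate is pushed down to Elasticsearch,
// so only matching documents travel back to Spark
spark14DF.filter(spark14DF("age") > 25).select("name").show()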

Demo 5: exposing an index as a Spark SQL temporary table
package demo.spark.elasticsearch

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by cao on 16-3-26.
  */
object Demo5 {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("Demo5"))
    // Create the SQLContext
    val sqlContext = new SQLContext(sc)

    // Expose an Elasticsearch index as a temporary table through the data source DDL
    sqlContext.sql(
      "CREATE TEMPORARY TABLE myPeople " +
        "USING org.elasticsearch.spark.sql " +
        "OPTIONS (resource 'spark/people', nodes 'localhost:9200')")
    sqlContext.sql("SELECT * FROM myPeople").collect().foreach(println)
  }
}
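Once registered, the table can be queried like any other Spark SQL table, and simple predicates still benefit from pushdown. For example:

// The WHERE clause is translated into an Elasticsearch query where possible
sqlContext.sql("SELECT name, age FROM myPeople WHERE age >= 25").show()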

Demo 6: reading with the esDF implicit

package demo.spark.elasticsearch

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
import org.elasticsearch.spark.sql._

/**
  * Created by cao on 16-3-26.
  */
object Demo6 {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("Demo6"))
    // Create the SQLContext
    val sqlContext = new SQLContext(sc)

    // esDF reads an index straight into a DataFrame
    // (implicit added by org.elasticsearch.spark.sql._)
    val people = sqlContext.esDF("spark/people")
    println(people.schema.treeString)

    // A URI query can be passed as the second argument to filter on the ES side
    val wangs = sqlContext.esDF("spark/people", "?q=wang")
    wangs.show()
  }
}
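The same style of implicit exists at the RDD level: importing org.elasticsearch.spark._ adds esRDD to SparkContext, returning (documentId, source) pairs. A minimal sketch:

import org.elasticsearch.spark._

// esRDD yields (_id, source-map) pairs; the URI query filters on the Elasticsearch side
val wangsRDD = sc.esRDD("spark/people", "?q=wang")
wangsRDD.collect().foreach { case (id, doc) => println(s"$id -> $doc") }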

