Spark Scala 范例

来源:互联网 发布:全国城市mysql数据库 编辑:程序博客网 时间:2024/05/17 06:33

1.处理HDFS日志文件中错误日志

// Load the log file from HDFS; `lines` is an RDD of log lines (MappedRDD).
val lines = sc.textFile("hdfs://...")

// Keep only the lines that start with "ERROR" (FilteredRDD).
val errors = lines.filter(_.startsWith("ERROR"))

// Persist the error lines in memory so the actions below reuse them.
errors.cache()

// Action: trigger the computation and count how many ERROR lines there are.
errors.count()

// Count the ERROR lines that mention MySQL.
errors.filter(_.contains("MySQL")).count()

// For ERROR lines mentioning HDFS, take the 4th tab-separated field
// and collect the results to the driver.
errors.filter(_.contains("HDFS")).map(_.split('\t')(3)).collect()

2. SQL RDDRelation

package org.apache.spark.examples.sql

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._

/** Row schema for the example table: a simple (key, value) pair. */
case class Record(key: Int, value: String)

/**
 * Minimal Spark SQL example: builds a DataFrame from an RDD of
 * [[Record]]s and registers it as a temporary table for SQL queries.
 */
object RDDRelation {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("RDDRelation")
    val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)
    // Brings in the rddToDataFrameHolder conversion that provides .toDF().
    import sqlContext.implicits._

    // Fix: the original had an unbalanced extra ')' after the map call,
    // which made this line a syntax error.
    val df = sc.parallelize(1 to 100).map(i => Record(i, s"val_$i")).toDF()

    // Fix: the original ended with a dangling, incomplete `df.register`.
    // Register the DataFrame as a temp table so it can be queried via SQL,
    // e.g. sqlContext.sql("SELECT * FROM records").
    df.registerTempTable("records")

    // Release cluster resources before exiting.
    sc.stop()
  }
}
0 0