Spark Programming Basics

Basic Framework

package week2

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.SparkContext._

object WordCount1 {
  def main(args: Array[String]) {
    if (args.length == 0) {
      System.err.println("Usage: WordCount1 <file1>")
      System.exit(1)
    }
    val conf = new SparkConf().setAppName("WordCount1")
    val sc = new SparkContext(conf)
    sc.stop()
  }
}
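The skeleton above only creates the SparkContext and then stops it. A minimal sketch of how the word-count body might be filled in is shown next; reading the input path from args(0), splitting on spaces, and printing the result on the driver are assumptions for illustration, not part of the original skeleton.

package week2

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.SparkContext._

object WordCount1 {
  def main(args: Array[String]) {
    if (args.length == 0) {
      System.err.println("Usage: WordCount1 <file1>")
      System.exit(1)
    }
    val conf = new SparkConf().setAppName("WordCount1")
    val sc = new SparkContext(conf)
    // assumed word-count body: read the file given on the command line,
    // split each line on spaces, and count every word
    val counts = sc.textFile(args(0))
      .flatMap(_.split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
    counts.collect().foreach(println)   // printing on the driver is an assumption for illustration
    sc.stop()
  }
}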

RDD Methods

// parallelize demo
val num=sc.parallelize(1 to 10)
val doublenum = num.map(_*2)
val threenum = doublenum.filter(_ % 3 == 0)
threenum.collect
threenum.toDebugString
val num1=sc.parallelize(1 to 10,6)
val doublenum1 = num1.map(_*2)
val threenum1 = doublenum1.filter(_ % 3 == 0)
threenum1.collect
threenum1.toDebugString
threenum.cache()
val fournum = threenum.map(x=>x*x)
fournum.collect
fournum.toDebugString
threenum.unpersist()
num.reduce (_ + _)
num.take(5)
num.first
num.count
num.take(5).foreach(println)

// key-value (K-V) demo
val kv1=sc.parallelize(List(("A",1),("B",2),("C",3),("A",4),("B",5)))
kv1.sortByKey().collect // note: the parentheses on sortByKey() cannot be omitted
kv1.groupByKey().collect
kv1.reduceByKey(_+_).collect
val kv2=sc.parallelize(List(("A",4),("A",4),("C",3),("A",4),("B",5)))
kv2.distinct.collect
kv1.union(kv2).collect
val kv3=sc.parallelize(List(("A",10),("B",20),("D",30)))
kv1.join(kv3).collect
kv1.cogroup(kv3).collect
val kv4=sc.parallelize(List(List(1,2),List(3,4)))
kv4.flatMap(x=>x.map(_+1)).collect

// file reading demo
val rdd1 = sc.textFile("hdfs://hadoop1:8000/dataguru/week2/directory/")
rdd1.toDebugString
val words=rdd1.flatMap(_.split(" "))
val wordscount=words.map(x=>(x,1)).reduceByKey(_+_)
wordscount.collect
wordscount.toDebugString
val rdd2 = sc.textFile("hdfs://hadoop1:8000/dataguru/week2/directory/*.txt")
rdd2.flatMap(_.split(" ")).map(x=>(x,1)).reduceByKey(_+_).collect
// gzip-compressed file
val rdd3 = sc.textFile("hdfs://hadoop1:8000/dataguru/week2/test.txt.gz")   // MappedRDD[String]
// flatMap flattens the per-line arrays of words into a single flat RDD
rdd3.flatMap(_.split(" ")).map(x=>(x,1)).reduceByKey(_+_).collect

// log processing demo
// http://download.labs.sogou.com/dl/q.html  full version (2GB), gz format
// fields: access time \t user ID \t [query] \t rank of the URL in the results \t sequence number of the user's click \t URL clicked by the user
// SogouQ1.txt, SogouQ2.txt and SogouQ3.txt were each extracted from the SogouQ log file with head -n or tail -n
// the output is written as multiple partitions -> merge with: bin/hdfs dfs -getmerge <partition file path> <outfile>
// how many records rank 1st in the search results but are the user's 2nd click?
val rdd1 = sc.textFile("hdfs://hadoop1:8000/dataguru/data/SogouQ1.txt")
val rdd2=rdd1.map(_.split("\t")).filter(_.length==6)
rdd2.count()
val rdd3=rdd2.filter(_(3).toInt==1).filter(_(4).toInt==2)
rdd3.count()
rdd3.toDebugString
// ranking of sessions by query count
val rdd4=rdd2.map(x=>(x(1),1)).reduceByKey(_+_).map(x=>(x._2,x._1)).sortByKey(false).map(x=>(x._2,x._1))
rdd4.toDebugString
rdd4.saveAsTextFile("hdfs://hadoop1:8000/dataguru/week2/output1")

// cache() demo: keep the RDD in memory
// check the file's blocks with: bin/hdfs fsck /dataguru/data/SogouQ3.txt -files -blocks -locations
val rdd5 = sc.textFile("hdfs://hadoop1:8000/dataguru/data/SogouQ3.txt")
rdd5.cache()
rdd5.count()    // the first count materializes the cache; the data may not be 100% cached into memory
rdd5.count()    // compare the time with the first count: the data was not fully cached in memory

// join demo
val format = new java.text.SimpleDateFormat("yyyy-MM-dd")
case class Register (d: java.util.Date, uuid: String, cust_id: String, lat: Float, lng: Float)
case class Click (d: java.util.Date, uuid: String, landing_page: Int)
// map each line to a (k, v) pair; here v is a case class instance
val reg = sc.textFile("hdfs://hadoop1:8000/dataguru/week2/join/reg.tsv").map(_.split("\t")).map(r => (r(1), Register(format.parse(r(0)), r(1), r(2), r(3).toFloat, r(4).toFloat)))
val clk = sc.textFile("hdfs://hadoop1:8000/dataguru/week2/join/clk.tsv").map(_.split("\t")).map(c => (c(1), Click(format.parse(c(0)), c(1), c(2).trim.toInt)))
reg.join(clk).take(2)
// Array[(String, (Register,Click))]
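The commands above are meant to be typed into spark-shell against the course's HDFS paths (hdfs://hadoop1:8000/...). As a rough, self-contained sketch of the same keyed-join pattern that runs without HDFS, the snippet below builds the two RDDs from in-memory sample lines; the sample records are invented for illustration and are not part of the original data.

// same (k, v) join pattern as the reg.tsv/clk.tsv demo, but on invented in-memory sample data
val format = new java.text.SimpleDateFormat("yyyy-MM-dd")
case class Register (d: java.util.Date, uuid: String, cust_id: String, lat: Float, lng: Float)
case class Click (d: java.util.Date, uuid: String, landing_page: Int)
val regLines = Seq("2014-03-01\tu1\tc100\t39.9\t116.4", "2014-03-02\tu2\tc200\t31.2\t121.5")
val clkLines = Seq("2014-03-03\tu1\t5", "2014-03-04\tu2\t7")
// key both RDDs by the uuid field, then join: the value side of each result is (Register, Click)
val reg = sc.parallelize(regLines).map(_.split("\t")).map(r => (r(1), Register(format.parse(r(0)), r(1), r(2), r(3).toFloat, r(4).toFloat)))
val clk = sc.parallelize(clkLines).map(_.split("\t")).map(c => (c(1), Click(format.parse(c(0)), c(1), c(2).trim.toInt)))
reg.join(clk).collect()   // Array[(String, (Register, Click))], one entry per uuid present in both RDDs

Note that join keeps only keys present in both RDDs, while cogroup, used earlier in this section, keeps every key from either side.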