Spark 编程基础
来源:互联网 发布:linux arm环境 编辑:程序博客网 时间:2024/06/07 00:02
基本框架
package week2

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.SparkContext._

/**
 * Minimal Spark application skeleton: validates command-line arguments,
 * builds a SparkContext, and shuts it down cleanly.
 *
 * Usage: WordCount1 <file1>
 *
 * NOTE(review): the original scraped line fused `package week2` with the
 * import statements, making it uncompilable; this restores valid syntax.
 * The actual word-count logic is absent in the original — this block only
 * demonstrates context setup and teardown.
 */
object WordCount1 {
  def main(args: Array[String]) {
    // Require at least one argument (the input file path); fail fast otherwise.
    if (args.length == 0) {
      System.err.println("Usage: WordCount1 <file1>")
      System.exit(1)
    }
    val conf = new SparkConf().setAppName("WordCount1")
    val sc = new SparkContext(conf)
    sc.stop()
  }
}
RDD 方法
// ---- parallelize demo ----
// Transcript of spark-shell commands; `sc` is the shell-provided SparkContext.
val num=sc.parallelize(1 to 10)
val doublenum = num.map(_*2)
val threenum = doublenum.filter(_ % 3 == 0)
threenum.collect
threenum.toDebugString
// Same pipeline, but with an explicit 6 partitions.
val num1=sc.parallelize(1 to 10,6)
val doublenum1 = num1.map(_*2)
val threenum1 = doublenum1.filter(_ % 3 == 0)
threenum1.collect
threenum1.toDebugString
// Cache, derive a new RDD, then release the cache.
threenum.cache()
val fournum = threenum.map(x=>x*x)
fournum.collect
fournum.toDebugString
threenum.unpersist()
// Common actions.
num.reduce (_ + _)
num.take(5)
num.first
num.count
num.take(5).foreach(println)
// ---- key-value pair demo ----
val kv1=sc.parallelize(List(("A",1),("B",2),("C",3),("A",4),("B",5)))
kv1.sortByKey().collect // note: the parentheses on sortByKey must not be omitted
kv1.groupByKey().collect
kv1.reduceByKey(_+_).collect
val kv2=sc.parallelize(List(("A",4),("A",4),("C",3),("A",4),("B",5)))
kv2.distinct.collect
kv1.union(kv2).collect
val kv3=sc.parallelize(List(("A",10),("B",20),("D",30)))
kv1.join(kv3).collect
kv1.cogroup(kv3).collect
val kv4=sc.parallelize(List(List(1,2),List(3,4)))
kv4.flatMap(x=>x.map(_+1)).collect
// ---- file-reading demo ----
val rdd1 = sc.textFile("hdfs://hadoop1:8000/dataguru/week2/directory/")
rdd1.toDebugString
val words=rdd1.flatMap(_.split(" "))
val wordscount=words.map(x=>(x,1)).reduceByKey(_+_)
wordscount.collect
wordscount.toDebugString
val rdd2 = sc.textFile("hdfs://hadoop1:8000/dataguru/week2/directory/*.txt")
rdd2.flatMap(_.split(" ")).map(x=>(x,1)).reduceByKey(_+_).collect
// gzip-compressed file (Spark decompresses .gz transparently)
val rdd3 = sc.textFile("hdfs://hadoop1:8000/dataguru/week2/test.txt.gz") //MappedRDD[String]
// flatMap, roughly speaking, flattens everything into one sequence
rdd3.flatMap(_.split(" ")).map(x=>(x,1)).reduceByKey(_+_).collect
// ---- log-processing demo ----
// Full Sogou query log (2GB, gz format): http://download.labs.sogou.com/dl/q.html
// Record layout (tab-separated):
//   access time \t user ID \t [query terms] \t rank of URL in results \t click order \t clicked URL
// SogouQ1.txt, SogouQ2.txt, SogouQ3.txt were cut from the full log with head -n / tail -n.
// To merge the partitioned output: bin/hdfs dfs -getmerge <partition file path> <outfile>
// Question: how many records were ranked 1st in results but clicked 2nd?
// NOTE(review): rdd1/rdd2/rdd3 are re-bound here — legal in the REPL, where each
// `val` shadows the earlier one; this is a transcript, not a compiled file.
val rdd1 = sc.textFile("hdfs://hadoop1:8000/dataguru/data/SogouQ1.txt")
val rdd2=rdd1.map(_.split("\t")).filter(_.length==6)
rdd2.count()
val rdd3=rdd2.filter(_(3).toInt==1).filter(_(4).toInt==2)
rdd3.count()
rdd3.toDebugString
// Ranking of query counts per session: swap to (count, id), sort descending, swap back.
val rdd4=rdd2.map(x=>(x(1),1)).reduceByKey(_+_).map(x=>(x._2,x._1)).sortByKey(false).map(x=>(x._2,x._1))
rdd4.toDebugString
rdd4.saveAsTextFile("hdfs://hadoop1:8000/dataguru/week2/output1")
// ---- cache() demo ----
// Pull the file into memory.
// Inspect its blocks with: bin/hdfs fsck /dataguru/data/SogouQ3.txt -files -blocks -locations
val rdd5 = sc.textFile("hdfs://hadoop1:8000/dataguru/data/SogouQ3.txt")
rdd5.cache()
rdd5.count() // may leave the RDD less than 100% cached
rdd5.count() // compare timings — the data was not fully cached in memory
// ---- join demo ----
val format = new java.text.SimpleDateFormat("yyyy-MM-dd")
case class Register (d: java.util.Date, uuid: String, cust_id: String, lat: Float,lng: Float)
case class Click (d: java.util.Date, uuid: String, landing_page: Int)
// Map each record to (k, v) where v is a case-class instance, keyed by uuid.
val reg = sc.textFile("hdfs://hadoop1:8000/dataguru/week2/join/reg.tsv").map(_.split("\t")).map(r => (r(1), Register(format.parse(r(0)), r(1), r(2), r(3).toFloat, r(4).toFloat)))
val clk = sc.textFile("hdfs://hadoop1:8000/dataguru/week2/join/clk.tsv").map(_.split("\t")).map(c => (c(1), Click(format.parse(c(0)), c(1), c(2).trim.toInt)))
reg.join(clk).take(2) //Array[(String, (Register,Click))]
0 0
- Spark 编程基础
- Spark-RDD编程基础
- Spark学习-RDD编程基础
- 学习spark系列---scala 编程基础
- Spark基础-Scala函数式编程
- Spark基础-Scala集合函数式编程
- Spark基础-Scala类型参数编程
- Spark基础
- spark基础
- spark基础
- spark 基础
- Spark基础
- spark基础
- Spark基础
- Spark基础
- Spark基础
- Spark编程
- Spark编程
- 利用 padding+background & border 为图片设置双边框
- CString字符串分割
- Semaphore and Mutex usages and differences
- ORA-00257: archiver error. Connect internal only, until freed
- 测试参考资料
- Spark 编程基础
- MFC的tab控件用法
- 面试题之螺旋矩阵
- CentOS7 Docker安装操作以及基本命令
- sharepoint 页面登陆缓慢处理的一个方法!
- 【ios开发学习】常见问题积累
- eclipse中删除代码注释
- iOS app 适配问题总结
- 跨站脚本攻击(XSS)——常见网站攻击手段原理与防御