Spark json TempTags Sample
来源:互联网 发布:华泰交易软件下载 编辑:程序博客网 时间:2024/06/05 07:43
Data Frame
→ 取同一编号（即同一家店）评论数最多的前三个标签
77287793{ "reviewPics": null, "extInfoList": [ { "title": "contentTags", "values": [ "高大上", "环境优雅", "价格实惠", "交通便利" ], "desc": "", "defineType": 0 }, { "title": "tagIds", "values": [ "616", "24", "373", "278" ], "desc": "", "defineType": 0 } ], "expenseList": null, "reviewIndexes": [ 2 ], "scoreList": null}
Scala
package com.spark.json

import com.alibaba.fastjson.JSON
import org.apache.spark.{SparkConf, SparkContext}

/**
 * For each business id, computes the three most frequent comment tags from a
 * tab-separated input file of "businessId\treviewJson" lines.
 *
 * Created by wqh on 2017/9/12.
 */
object Test extends App {
  val start = System.currentTimeMillis

  val conf = new SparkConf()
  conf.setAppName("TestsortBykey").setMaster("local[4]")
  val sc = new SparkContext(conf)

  val rdd1 = sc.textFile("/Users/wqh/Desktop/data/temptags.txt", 2)

  /**
   * Parses one "businessId\treviewJson" line into (businessId, "tag1,tag2,...").
   * The tag string is empty when the JSON carries no usable extInfoList/values.
   * NOTE(review): assumes extInfoList(0) is the "contentTags" entry — confirm
   * against the input data.
   */
  def parseJon(line: String): (String, String) = {
    val arr = line.split("\t")
    // split never returns null, so only the length needs checking
    if (arr.length > 1) {
      val jsonObj = JSON.parseObject(arr(1))
      val extInfoList = jsonObj.getJSONArray("extInfoList")
      if (extInfoList != null && extInfoList.size() > 0) {
        val tagsObj = extInfoList.getJSONObject(0)
        val values = tagsObj.getJSONArray("values")
        if (values != null && values.size() > 0) (arr(0), values.toArray().mkString(","))
        else (arr(0), "")
      } else (arr(0), "")
    } else (arr(0), "")
  }

  val rdd2 = rdd1.map(parseJon(_))
  // drop records that produced no tags
  val rdd3 = rdd2.filter(t => t._2 != null && !t._2.equals(""))
  // explode into one (businessId, tag) pair per tag
  val rdd4 = rdd3.flatMapValues(_.split(","))
  // count each (businessId, tag) occurrence
  val rdd5 = rdd4.map(t => (t, 1))
  val rdd6 = rdd5.reduceByKey(_ + _)
  // regroup as businessId -> List((tag, count))
  val rdd7 = rdd6.map(t => (t._1._1, (t._1._2, t._2) :: Nil))
  val rdd8 = rdd7.reduceByKey(_ ++ _)
  // keep the three most frequent tags per business
  val rdd9 = rdd8.map(t => (t._1, t._2.sortBy(-_._2).take(3)))

  rdd9.collect().foreach(println)
  println(System.currentTimeMillis - start)

  // BUG FIX: the original never released the SparkContext
  sc.stop()
}
Java
package com.it18zhang.spark;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.List;

/**
 * For each business id, computes the three most frequent comment tags from a
 * tab-separated input file of "businessId\treviewJson" lines.
 */
public class TempTagGenJava {

    public static void main(String[] args) {
        long start = System.currentTimeMillis();

        SparkConf conf = new SparkConf();
        conf.setAppName("tagGen");
        conf.setMaster("local[4]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // 加载文档 (load the raw document)
        JavaRDD<String> lines = sc.textFile("/Users/wqh/Desktop/data/temptags.txt");

        // 变换: parse each line into (businessId, "tag1,tag2,...")
        JavaRDD<Tuple2<String, String>> parsed = lines.map(TempTagGenJava::parseJson);

        // 过滤空串: drop records without any tags
        JavaRDD<Tuple2<String, String>> nonEmpty =
                parsed.filter(t -> t._2() != null && !t._2().isEmpty());

        // 压扁: explode into one (businessId, tag) pair per tag
        JavaPairRDD<String, String> pairs = nonEmpty.flatMapToPair(t -> {
            List<Tuple2<String, String>> out = new ArrayList<>();
            for (String tag : t._2().split(",")) {
                out.add(new Tuple2<>(t._1(), tag));
            }
            return out.iterator();
        });

        // 聚合: count each (businessId, tag) occurrence
        JavaPairRDD<Tuple2<String, String>, Integer> counted =
                pairs.mapToPair(t -> new Tuple2<>(t, 1))
                     .reduceByKey(Integer::sum);

        // 变换成新对: regroup as businessId -> list of (tag, count)
        JavaPairRDD<String, List<Tuple2<String, Integer>>> grouped = counted
                .mapToPair(t -> {
                    List<Tuple2<String, Integer>> list = new ArrayList<>();
                    list.add(new Tuple2<>(t._1()._2(), t._2()));
                    return new Tuple2<>(t._1()._1(), list);
                })
                .reduceByKey((v1, v2) -> {
                    v1.addAll(v2);
                    return v1;
                });

        // 排序: keep the three most frequent tags per business
        JavaPairRDD<String, List<Tuple2<String, Integer>>> top3 = grouped.mapToPair(t -> {
            List<Tuple2<String, Integer>> list = new ArrayList<>(t._2());
            // BUG FIX: the original compared with (o2._2 - o1._2), which can
            // overflow for large counts; Integer.compare is safe.
            list.sort((o1, o2) -> Integer.compare(o2._2(), o1._2()));
            if (list.size() > 3) {
                list = new ArrayList<>(list.subList(0, 3));
            }
            return new Tuple2<>(t._1(), list);
        });

        for (Tuple2<String, List<Tuple2<String, Integer>>> t : top3.collect()) {
            System.out.println(t);
        }
        System.out.println(System.currentTimeMillis() - start);

        // BUG FIX: the original never released the SparkContext
        sc.stop();
    }

    /**
     * 解析一行json串,返回元组KV — parses one "businessId\treviewJson" line into
     * a (businessId, "tag1,tag2,...") tuple; the tag string is empty when the
     * JSON carries no usable extInfoList/values.
     * NOTE(review): assumes extInfoList[0] is the "contentTags" entry —
     * confirm against the input data.
     *
     * @param line one raw input line
     * @return (businessId, comma-joined tags or "")
     */
    public static Tuple2<String, String> parseJson(String line) {
        // split never returns null, so only the length needs checking;
        // the original dereferenced arr[0] BEFORE its (useless) null check.
        String[] arr = line.split("\t");
        String busiNum = arr[0];
        if (arr.length > 1) {
            JSONObject jo = JSON.parseObject(arr[1]);
            JSONArray extInfoList = jo.getJSONArray("extInfoList");
            if (extInfoList != null && extInfoList.size() > 0) {
                JSONObject tagsObj = extInfoList.getJSONObject(0);
                if (tagsObj != null) {
                    JSONArray values = tagsObj.getJSONArray("values");
                    if (values != null && values.size() > 0) {
                        // BUG FIX: the original concatenated with += in a loop
                        // (quadratic copying) and then trimmed a trailing comma.
                        StringBuilder sb = new StringBuilder();
                        for (int i = 0; i < values.size(); i++) {
                            if (i > 0) {
                                sb.append(',');
                            }
                            sb.append(values.get(i));
                        }
                        return new Tuple2<>(busiNum, sb.toString());
                    }
                }
            }
        }
        return new Tuple2<>(busiNum, "");
    }
}
Result
(83644298,List((体验好,1), (性价比高,1), (服务热情,1)))(82317795,List((味道差,1)))(77705462,List((服务热情,3), (羊肉,2), (价格实惠,2)))(85766086,List((价格实惠,2), (服务热情,2), (味道赞,2)))(74145782,List((服务热情,18), (味道赞,14), (上菜快,13)))(71039150,List((团建,1), (价格实惠,1), (朋友聚会,1)))(70611801,List((干净卫生,4), (回头客,3), (味道赞,2)))(88902676,List((2,2)))(73963176,List((味道赞,15), (价格实惠,12), (分量足,11)))(84270191,List((价格实惠,2), (服务热情,2), (性价比高,2)))(89223651,List((环境优雅,8), (服务热情,8), (技师专业,7)))(82016443,List((分量足,3), (味道赞,2), (主食赞,2)))(77287793,List((干净卫生,29), (环境优雅,26), (音响效果好,26)))(79197522,List((服务热情,2), (价格实惠,1), (放松舒服,1)))(83084036,List((干净卫生,1), (价格实惠,1)))(73879078,List((饮品赞,3), (回头客,2), (味道赞,2)))(88284865,List((价格实惠,1), (价格高,1), (性价比低,1)))(83073343,List((干净卫生,17), (味道赞,16), (环境优雅,15)))(76114040,List((2,3), (5,1), (性价比高,1)))(86913510,List((午餐,1), (分量适中,1)))(88496862,List((回头客,5), (味道赞,4), (服务热情,4)))(78477325,List((味道赞,8), (回头客,7), (干净卫生,5)))(83981222,List((性价比高,4), (干净卫生,3), (价格实惠,3)))(82705919,List((回头客,3), (干净卫生,3), (1,2)))(87994574,List((无推销,12), (价格实惠,8), (服务热情,7)))(77373671,List((菜品差,1), (服务热情,1), (干净卫生,1)))(75144086,List((8239,60), (服务热情,38), (8241,31)))(85648235,List((味道赞,17), (服务热情,15), (干净卫生,13)))(73607905,List((菜品不错,16), (回头客,15), (干净卫生,15)))(76893145,List((服务热情,10), (环境优雅,7), (高大上,5)))(78824187,List((价格实惠,13), (回头客,11), (分量足,10)))865817/09/12 16:10:31 INFO SparkContext: Invoking stop() from shutdown hook
阅读全文
0 0
- Spark json TempTags Sample
- JSON Sample
- Spark First Sample Demo
- spark--transform算子--sample
- 提交spark sample作业失败
- spark学习-20-Spark的sample理解
- Spring 3 MVC JSON Sample
- spark sql读取json
- spark生成json文件
- Spark---Datasource(JSON)---java
- Spark---Datasource(JSON)---Scala
- sample
- !!!sample
- sample
- Flex 4 beta应用实例:Spark Intranet Sample App
- Spark RDD中Transformation的mapValues、subtract、sample、takeSample详解
- 【Spark Java API】Transformation(2)—sample、randomSplit
- Spark算子[15]:sample、takeSample 源码实例详解
- ibatis使用注意
- iOS-音视频采集 by AVFoundation
- break ,continue 的使用:java如何跳出外层循环?Java如何跳出外层循环执行下一次循环?
- js实现图片在线预览
- 堆排序
- Spark json TempTags Sample
- spring下,druid,c3p0,proxool,dbcp四个数据连接池的使用和配置
- git操作(二)分支切换与合并
- linux微妙和秒定时器
- 安装TortoiseSVN报could not write value to key的错误解决方案
- 广播接收器BroadcastReceiver,无序、有序、本地几种广播
- iOS自定义转场动画
- python之禅
- 移动web应用调试汇总