Spark json TempTags Sample

来源:互联网 发布:华泰交易软件下载 编辑:程序博客网 时间:2024/06/05 07:43

Data Frame

目标：对同一编号（即同一商家），统计各评论标签的出现次数，并取出现次数最多的前三个标签。

77287793{    "reviewPics": null,     "extInfoList": [        {            "title": "contentTags",             "values": [                "高大上",                 "环境优雅",                 "价格实惠",                 "交通便利"            ],             "desc": "",             "defineType": 0        },         {            "title": "tagIds",             "values": [                "616",                 "24",                 "373",                 "278"            ],             "desc": "",             "defineType": 0        }    ],     "expenseList": null,     "reviewIndexes": [        2    ],     "scoreList": null}

json data

Scala

package com.spark.json

import com.alibaba.fastjson.JSON
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by wqh on 2017/9/12.
  *
  * Reads tab-separated lines of the form `<businessId>\t<json>`, extracts the
  * `values` array of the first `extInfoList` entry of each JSON document,
  * counts how often every value occurs per business id and prints the three
  * most frequent values for each id, followed by the elapsed time in
  * milliseconds.
  *
  * A plain `main` method is used instead of `extends App`, and the
  * SparkContext is stopped explicitly instead of relying on the JVM
  * shutdown hook.
  */
object Test {

    /** Input file used when no path is given on the command line. */
    private val DefaultInputPath = "/Users/wqh/Desktop/data/temptags.txt"

    /**
      * Parses one input line into `(businessId, commaJoinedTags)`.
      *
      * The tag string is empty when the line has no JSON column, when the
      * JSON has no non-empty `extInfoList`, or when the first entry has no
      * non-empty `values` array.
      *
      * NOTE(review): only `extInfoList(0)` is inspected; this presumably
      * expects the "contentTags" entry to come first -- confirm against the
      * data, since the entry title is never checked.
      *
      * @param line raw input line, expected as `<businessId>\t<json>`
      * @return the business id and its tags joined with ","
      */
    def parseJon(line: String): (String, String) = {
        val arr = line.split("\t")
        // A line consisting only of tabs splits into an empty array;
        // indexing arr(0) would throw, so report it as an empty record.
        if (arr.isEmpty) {
            ("", "")
        } else if (arr.length < 2) {
            (arr(0), "")
        } else {
            // Every step may yield null (missing key, empty JSON text), so
            // each one is wrapped in Option before being dereferenced.
            val tags = for {
                json   <- Option(JSON.parseObject(arr(1)))
                infos  <- Option(json.getJSONArray("extInfoList")) if infos.size() > 0
                first  <- Option(infos.getJSONObject(0))
                values <- Option(first.getJSONArray("values")) if values.size() > 0
            } yield values.toArray().mkString(",")
            (arr(0), tags.getOrElse(""))
        }
    }

    /**
      * Runs the job.
      *
      * @param args optional; `args(0)` overrides the default input path
      */
    def main(args: Array[String]): Unit = {
        val start = System.currentTimeMillis
        val inputPath = if (args.nonEmpty) args(0) else DefaultInputPath

        val conf = new SparkConf()
        conf.setAppName("TestsortBykey").setMaster("local[4]")
        val sc = new SparkContext(conf)
        try {
            val rdd1 = sc.textFile(inputPath, 2)
            // (businessId, "tag1,tag2,...")
            val rdd2 = rdd1.map(parseJon(_))
            // drop records without any tag
            val rdd3 = rdd2.filter(t => t._2 != null && !t._2.equals(""))
            // (businessId, tag) -- one record per tag
            val rdd4 = rdd3.flatMapValues(_.split(","))
            // ((businessId, tag), occurrences)
            val rdd5 = rdd4.map(t => (t, 1))
            val rdd6 = rdd5.reduceByKey(_ + _)
            // (businessId, List((tag, occurrences))) merged per business
            val rdd7 = rdd6.map(t => (t._1._1, (t._1._2, t._2) :: Nil))
            val rdd8 = rdd7.reduceByKey(_ ++ _)
            // keep the three tags with the highest counts
            val rdd9 = rdd8.map(t => {
                (t._1, t._2.sortBy(-_._2).take(3))
            })
            rdd9.collect().foreach(println)
            println(System.currentTimeMillis - start)
        } finally {
            sc.stop()
        }
    }
}

Java

package com.it18zhang.spark;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;

/**
 * Reads tab-separated lines of the form {@code <businessId>\t<json>}, extracts
 * the {@code values} array of the first {@code extInfoList} entry of each JSON
 * document, counts how often every value occurs per business id and prints the
 * most frequent values for each id, followed by the elapsed time in
 * milliseconds.
 */
public class TempTagGenJava {

    /** Input file used when no path is given on the command line. */
    private static final String DEFAULT_INPUT_PATH = "/Users/wqh/Desktop/data/temptags.txt";

    /** Number of most frequent tags kept per business id. */
    private static final int TOP_N = 3;

    /**
     * Runs the job.
     *
     * @param args optional; {@code args[0]} overrides the default input path
     */
    public static void main(String[] args) {
        long start = System.currentTimeMillis();
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;

        SparkConf conf = new SparkConf();
        conf.setAppName("tagGen");
        conf.setMaster("local[4]");

        // try-with-resources stops the context even when the job fails,
        // instead of leaving that to the JVM shutdown hook.
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // Load the input file.
            JavaRDD<String> lines = sc.textFile(inputPath);

            // Transform each line into (businessId, "tag1,tag2,...").
            JavaRDD<Tuple2<String, String>> parsed = lines.map(TempTagGenJava::parseJson);

            // Drop records without any tag.
            JavaRDD<Tuple2<String, String>> withTags =
                    parsed.filter(t -> t._2() != null && !t._2().isEmpty());

            // Flatten into one (businessId, tag) record per tag.
            JavaPairRDD<String, String> tagPerBusiness = withTags.flatMapToPair(t -> {
                List<Tuple2<String, String>> pairs = new ArrayList<>();
                for (String tag : t._2().split(",")) {
                    pairs.add(new Tuple2<>(t._1(), tag));
                }
                return pairs.iterator();
            });

            // Pair every (businessId, tag) with an initial count of 1.
            JavaPairRDD<Tuple2<String, String>, Integer> ones =
                    tagPerBusiness.mapToPair(t -> new Tuple2<>(t, 1));

            // Aggregate the counts per (businessId, tag).
            JavaPairRDD<Tuple2<String, String>, Integer> counts =
                    ones.reduceByKey((a, b) -> a + b);

            // Re-key by businessId with a single-element list of (tag, count).
            JavaPairRDD<String, List<Tuple2<String, Integer>>> singletons = counts.mapToPair(t -> {
                List<Tuple2<String, Integer>> single = new ArrayList<>();
                single.add(new Tuple2<>(t._1()._2(), t._2()));
                return new Tuple2<>(t._1()._1(), single);
            });

            // Merge the lists per businessId. The left list is extended in
            // place; every list here was freshly created by the step above.
            JavaPairRDD<String, List<Tuple2<String, Integer>>> grouped =
                    singletons.reduceByKey((left, right) -> {
                        left.addAll(right);
                        return left;
                    });

            // Sort by count (descending) and keep the top entries.
            JavaPairRDD<String, List<Tuple2<String, Integer>>> top =
                    grouped.mapToPair(t -> new Tuple2<>(t._1(), topTags(t._2(), TOP_N)));

            List<Tuple2<String, List<Tuple2<String, Integer>>>> result = top.collect();
            for (Tuple2<String, List<Tuple2<String, Integer>>> entry : result) {
                System.out.println(entry);
            }
            System.out.println(System.currentTimeMillis() - start);
        }
    }

    /**
     * Returns the {@code limit} entries with the highest counts, in
     * descending count order. The input list is not modified.
     *
     * @param counts (tag, count) pairs of one business
     * @param limit  maximum number of entries to return
     * @return a new list holding at most {@code limit} entries
     */
    private static List<Tuple2<String, Integer>> topTags(List<Tuple2<String, Integer>> counts, int limit) {
        List<Tuple2<String, Integer>> sorted = new ArrayList<>(counts);
        // Integer.compare avoids the overflow that "o2 - o1" can produce.
        sorted.sort((a, b) -> Integer.compare(b._2(), a._2()));
        return new ArrayList<>(sorted.subList(0, Math.min(limit, sorted.size())));
    }

    /**
     * Parses one input line and returns a (businessId, tags) tuple.
     *
     * <p>The tags are the elements of the {@code values} array of the first
     * {@code extInfoList} entry, joined with ",". The tag string is empty when
     * the line has no JSON column or the JSON lacks those fields.
     *
     * <p>NOTE(review): only {@code extInfoList[0]} is inspected; this presumably
     * expects the "contentTags" entry to come first -- confirm against the data,
     * since the entry title is never checked.
     *
     * @param line raw input line, expected as {@code <businessId>\t<json>}
     * @return tuple of business id and comma-joined tags (never {@code null})
     */
    public static Tuple2<String, String> parseJson(String line) {
        String[] arr = line.split("\t");
        // A line consisting only of tabs splits into an empty array; reading
        // arr[0] would throw, so report it as an empty record instead.
        if (arr.length == 0) {
            return new Tuple2<String, String>("", "");
        }
        String busiNum = arr[0];
        StringBuilder cont = new StringBuilder();
        if (arr.length > 1) {
            JSONObject jo = JSON.parseObject(arr[1]);
            JSONArray jarr = (jo == null) ? null : jo.getJSONArray("extInfoList");
            if (jarr != null && jarr.size() > 0) {
                JSONObject jo2 = jarr.getJSONObject(0);
                JSONArray commArr = (jo2 == null) ? null : jo2.getJSONArray("values");
                if (commArr != null) {
                    for (int i = 0; i < commArr.size(); i++) {
                        if (i > 0) {
                            cont.append(',');
                        }
                        cont.append(commArr.get(i));
                    }
                }
            }
        }
        return new Tuple2<String, String>(busiNum, cont.toString());
    }
}

Result

(83644298,List((体验好,1), (性价比高,1), (服务热情,1)))
(82317795,List((味道差,1)))
(77705462,List((服务热情,3), (羊肉,2), (价格实惠,2)))
(85766086,List((价格实惠,2), (服务热情,2), (味道赞,2)))
(74145782,List((服务热情,18), (味道赞,14), (上菜快,13)))
(71039150,List((团建,1), (价格实惠,1), (朋友聚会,1)))
(70611801,List((干净卫生,4), (回头客,3), (味道赞,2)))
(88902676,List((2,2)))
(73963176,List((味道赞,15), (价格实惠,12), (分量足,11)))
(84270191,List((价格实惠,2), (服务热情,2), (性价比高,2)))
(89223651,List((环境优雅,8), (服务热情,8), (技师专业,7)))
(82016443,List((分量足,3), (味道赞,2), (主食赞,2)))
(77287793,List((干净卫生,29), (环境优雅,26), (音响效果好,26)))
(79197522,List((服务热情,2), (价格实惠,1), (放松舒服,1)))
(83084036,List((干净卫生,1), (价格实惠,1)))
(73879078,List((饮品赞,3), (回头客,2), (味道赞,2)))
(88284865,List((价格实惠,1), (价格高,1), (性价比低,1)))
(83073343,List((干净卫生,17), (味道赞,16), (环境优雅,15)))
(76114040,List((2,3), (5,1), (性价比高,1)))
(86913510,List((午餐,1), (分量适中,1)))
(88496862,List((回头客,5), (味道赞,4), (服务热情,4)))
(78477325,List((味道赞,8), (回头客,7), (干净卫生,5)))
(83981222,List((性价比高,4), (干净卫生,3), (价格实惠,3)))
(82705919,List((回头客,3), (干净卫生,3), (1,2)))
(87994574,List((无推销,12), (价格实惠,8), (服务热情,7)))
(77373671,List((菜品差,1), (服务热情,1), (干净卫生,1)))
(75144086,List((8239,60), (服务热情,38), (8241,31)))
(85648235,List((味道赞,17), (服务热情,15), (干净卫生,13)))
(73607905,List((菜品不错,16), (回头客,15), (干净卫生,15)))
(76893145,List((服务热情,10), (环境优雅,7), (高大上,5)))
(78824187,List((价格实惠,13), (回头客,11), (分量足,10)))
8658
17/09/12 16:10:31 INFO SparkContext: Invoking stop() from shutdown hook