RDD常用操作

来源:互联网 发布:mac编辑图片怎么保存 编辑:程序博客网 时间:2024/06/01 18:27
RDD常用操作:
1、关键词分组之后展示(将<String, Iterable<String>>转化为<String, String>)
JavaPairRDD<String, Iterable<String>> productMap = ones.groupByKey();
List<Tuple2<String, String>> reslist = productMap.map(new Function<Tuple2<String, Iterable<String>>, Tuple2<String, String>>() {  
 
           public Tuple2<String, String> call(Tuple2<String, Iterable<String>> integerIterableTuple2) throws Exception {  
               String key = integerIterableTuple2._1();  
               StringBuffer sb = new StringBuffer();  
               Iterable<String> iter = integerIterableTuple2._2();  
               for (String integer : iter) {  
                       sb.append(integer).append(" ");  
               }  
               return new Tuple2(key, sb.toString().trim().split(" ")[0]);  
           }  
       }).collect();
for(Tuple2<String, String> str : reslist) {  
           System.out.println(str._1() + "\t" + str._2() );  

       } 


2、过滤器,筛选出关键词包含“魅言魅语(MeiYanMeiYu)”的键值对


// Predicate that accepts a pair only when its key contains the keyword.
Function<Tuple2<String, String>, Boolean> keywordInKey =
        new Function<Tuple2<String, String>, Boolean>() {
            public Boolean call(Tuple2<String, String> pair) throws Exception {
                String key = pair._1();
                return key.contains("魅言魅语(MeiYanMeiYu)");
            }
        };
// Apply the predicate, then gather the surviving pairs into a Map and print it.
JavaPairRDD<String, String> filterResult = product.filter(keywordInKey);
Map<String, String> onesMap = filterResult.collectAsMap();

System.out.println(onesMap);


3、遍历循环


  JavaRDD<String> brands = jsc.textFile(productFilePath);


   brands.foreach(new VoidFunction<String>(){

           public void call(String brandString) throws Exception{
           final String brandOne = brandString.split("\t")[1];
          System.out.println(brandOne);
          }

   });


4、拼接键值对(将<String> 转化为<String, String>)


   JavaRDD<String> products = jsc.textFile(productFilePath);


   JavaPairRDD<String, String> productPair = products.mapToPair(new PairFunction<String, String, String>() {
           public Tuple2<String, String> call(String s) {
   String[] sArray = s.split("\t");
   String primaryDirectory = sArray[6].split(":")[0].split("@A@")[1];
   String brand = sArray[4];
                 return new Tuple2<String, String>(primaryDirectory, brand);
           }
    }); 
   JavaRDD<String> brands = jsc.textFile(brandFilePath);
   JavaPairRDD<String, String> brandsPair = brands.mapToPair(new PairFunction<String, String, String>() {
         public Tuple2<String, String> call(String s) {
   String[] sArray = s.split("\t");
        String key = sArray[1];
         return new Tuple2<String, String>(key, s);
       }
    }); 


   JavaPairRDD<String, String> brandsUnion = brandsPair.union(productPair);


5、两种放在本机上看结果的程序:
<1>//在集群上进行比对操作,下面的collect操作是将数据又放在了本机了。
// Placeholder: filterResult is produced by earlier transformations.
JavaPairRDD<String, String> filterResult = ....;
// Gather the pairs into a local Map so the result can be inspected on this machine.
Map<String, String> onesMap = filterResult.collectAsMap();
System.out.println(onesMap);
<2>
JavaRDD<String> products = jsc.textFile(productFilePath);
// Turn every tab-separated line into a (primary directory, brand) pair.
PairFunction<String, String, String> lineToPair = new PairFunction<String, String, String>() {
    public Tuple2<String, String> call(String line) {
        String[] fields = line.split("\t");
        // Column 6: the text before the first ':' is split on "@A@" and the second piece is the key.
        String[] directoryParts = fields[6].split(":")[0].split("@A@");
        String primaryDirectory = directoryParts[1];
        String brand = fields[4];
        return new Tuple2<String, String>(primaryDirectory, brand);
    }
};
JavaPairRDD<String, String> productPair = products.mapToPair(lineToPair);
// Collect the pairs locally and print them one per line.
List<Tuple2<String, String>> output = productPair.collect();
for (int i = 0; i < output.size(); i++) {
    System.out.println(output.get(i));
}




6、生成一个只包含不同元素的新RDD,即去除重复的数据
// Replace productPair with an RDD that holds each distinct pair only once.
productPair = productPair.distinct();


7、mapPartitionsToPair操作(将<String> 转化为<String, String>)
/* 这里有两种方案提取原始图书非图书文件,一种mapToPair(),一种mapPartitionsToPair
* 由于mapToPair()每次调用一个String都需要调一下回调函数call,效率较低,目前先注销
* 后期优化用mapPartitionsToPair(f),这样只需调用一遍call回调函数效率更高,但是mapPartitionsToPair要求集群内存要足够,
* 否则容易发生OOM异常。
*/
JavaPairRDD<String, String> productPair = products.mapPartitionsToPair(
new PairFlatMapFunction<Iterator<String>, String, String>(){
public Iterator<Tuple2<String, String>> call(Iterator<String> rows) throws Exception{
   Set<String> localBrandDict = b_BrandDict.getValue();//在集群中调用广播
   Set<String> localGoodDict = b_GoodDict.getValue();//在集群中调用广播
   List<Tuple2<String, String>> resultList=new ArrayList<Tuple2<String, String>>();
   while (rows.hasNext()) {
   try{
   String row=rows.next();
                   String[] sArray = row.split("\t");
      String key = "";
      String title = sArray[8];
      String segResult = "";
      int len = sArray.length;
      String keyString = sArray[len-3];
      //获取key与分词处理结果
      List<String> keyAndValueList = getKeyAndValueOfProduct(keyString, title, localBrandDict, localGoodDict);
      key = keyAndValueList.get(0);
      segResult = keyAndValueList.get(1);
      resultList.add(new Tuple2<String, String>(key, segResult));
   }catch(Exception ex){
   resultList.add(new Tuple2<String,String>("",""));
   }
               }
   Iterator<Tuple2<String, String>> result = resultList.iterator();
   return result;
}
}
).filter(new Function<Tuple2<String,String>, Boolean>() {
public Boolean call(Tuple2<String, String> tuple) throws Exception {
   return tuple._1.isEmpty() ? false : true;
   }
   });


8、flatMap操作
/*
 * De-duplication step: split every value of unionResult on ',' into single words,
 * then count how many different words there are.
 */
JavaRDD<String> wordsFromResult = unionResult.values().flatMap(
        new FlatMapFunction<String, String>() {
            public Iterator<String> call(String csv) throws Exception {
                return Arrays.asList(csv.split(",")).iterator();
            }
        });
Long notRepeatSize = wordsFromResult.distinct().count();


9.两个javaPairRdd的mapPartitionsToPair操作

JavaPairRDD<String, String> shuffleResult = unionResult.mapPartitionsToPair(
new PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, String>(){
public Iterator<Tuple2<String, String>> call(Iterator<Tuple2<String, String>> rows) throws Exception{
List<Tuple2<String, String>> resultList=new ArrayList<Tuple2<String, String>>();
while (rows.hasNext()) {
Tuple2<String, String> row= rows.next();
   resultList.add(row);
               }
Collections.shuffle(resultList);
Iterator<Tuple2<String, String>> result = resultList.iterator();
   return result;
}
}).repartition(1).cache();


10、使用顶级具名类(带自定义参数的)







11、cogroup对多个RDD进行分组:


12、将JavaRDD<String>转化成自己想要的格式的JavaRDD<String>,然后发布到广播,作为集群中的字典使用(利用mapPartitions)



SparkConf conf = new SparkConf()
        .setMaster("local[*]") // required for a local run; comment it out when running on a cluster
        .setAppName("Child And Parent Brand Analysis"); // the application name can be chosen freely
JavaSparkContext jsc = new JavaSparkContext(conf);
// Delete the output path on HDFS if it already exists.
HdfsPathOpt.deleteHdfsPath(jsc.toSparkContext(jsc),writeFilePath);
JavaRDD<String> brands = jsc.textFile(readFilePath1);
// Flatten every "brand1<TAB>brand2" line into two separate brand words.
JavaRDD<String> brandString = brands.mapPartitions(new FlatMapFunction<Iterator<String>, String>() {
    @Override
    public Iterator<String> call(Iterator<String> rows) throws Exception {
        List<String> resultList = new ArrayList<String>();
        while (rows.hasNext()) {
            try {
                String[] brandArray = rows.next().split("\t");
                // Read both columns before adding anything so a short row contributes nothing.
                String brand1 = brandArray[0];
                String brand2 = brandArray[1];
                resultList.add(brand1);
                // Fix: the second column was read but brand1 was added twice.
                resultList.add(brand2);
            } catch (Exception ignored) {
                // Best effort: rows without two columns are skipped. The previous version
                // emitted an "" placeholder here that the filter below removed again.
            }
        }
        return resultList.iterator();
    }
}).filter(new Function<String, Boolean>() {
    @Override
    public Boolean call(String tuple) throws Exception {
        // Drop empty brand words.
        return !tuple.isEmpty();
    }
});
//brandString.repartition(1).saveAsTextFile(writeFilePath);
List<String> brandList = brandString.collect();
Set<String> brandDict = new HashSet<>(brandList); // de-duplicate the brand words; this set is the brand dictionary
final Broadcast<Set<String>> b_BrandDict = jsc.broadcast(brandDict); // publish the dictionary as a broadcast variable for the cluster
jsc.stop();
jsc.close();


13、将JavaPairRDD<String, String>用HashMap发布到广播供集群使用

// Placeholder: the pair RDD to publish is built elsewhere.
JavaPairRDD<String, String> directoryAndBrandsPair = ...;

// Gather the pairs into a Map.
Map<String, String> directoryAndBrandMap= directoryAndBrandsPair.collectAsMap();
Map<String, String> directoryAndBrandMap1 = new HashMap<String, String>(directoryAndBrandMap);// this copy into a HashMap is mandatory; per the original author the Spark program reports an error at run time without it
// Publish the copied map as a broadcast variable.
final Broadcast<Map<String, String>> directoryAndBrandMapDict = jsc.broadcast(directoryAndBrandMap1);


14、将<String, Iterable<String>> 转化为<String>

// Placeholder: the grouped RDD is built elsewhere.
JavaPairRDD<String, Iterable<String>> childAndParentBrand = ...;

// Render every (brand, values) group as one line: "brand<TAB>value1,value2,...".
JavaRDD<String> result = childAndParentBrand.map(new Function<Tuple2<String, Iterable<String>>, String>() {
    @Override
    public String call(Tuple2<String, Iterable<String>> pair) throws Exception {
        StringBuilder line = new StringBuilder();
        line.append(pair._1()).append("\t");
        for (String row : pair._2()) {
            line.append(row).append(",");
        }
        String resultString = line.toString().trim();
        // Only strip a trailing separator that is really there: for a group without values the
        // unconditional substring() used to cut the last character off the key (or throw on "").
        if (resultString.endsWith(",")) {
            resultString = resultString.substring(0, resultString.length() - 1);
        }
        return resultString;
    }
}).filter(new Function<String, Boolean>() {
    @Override
    public Boolean call(String tuple) throws Exception {
        // Drop empty lines.
        return !tuple.isEmpty();
    }
});


15、如果在 List<String> resultList = result.collect(); 之后还要对 resultList 进行一些Java操作(例如去重、删除元素),那么一定要先把它拷贝到一个新的ArrayList中:

                // Collect the RDD content into a local list.
                List<String> resultList = result.collect();
List<String> resultList1 = new ArrayList<String>(resultList);// this copy is essential; per the original author removeDuplicate() cannot be applied to resultList directly
// De-duplicate the copied list.
List<String> resultListByFilter = removeDuplicate(resultList1);


16、key为母品牌,value为很多用逗号隔开的子品牌,将子品牌中包含其他行的母品牌的字段去掉

例如:华为 荣耀

乐际  格力,华为

用下面的程序可以将第二行中的华为去掉(因为华为是第一行的母品牌);如果格力也是其他某一行的母品牌,格力同样会被去掉

/**
* Remove from each value list the entries that are themselves parent brands (keys).

List<String> parentBrands = childAndParentBrand.keys().collect();
Set<String> parentBrandsDict = new HashSet<>(parentBrands);
final Broadcast<Set<String>> parentBrandsBroad = jsc.broadcast(parentBrandsDict);// publish the Set<String> as a broadcast variable

JavaRDD<String> result = childAndParentBrand.map(new Function<Tuple2<String, Iterable<String>>,String>(){
   public String call(Tuple2<String, Iterable<String>> pair) throws Exception{
   Set<String> localParentDict = parentBrandsBroad.getValue();// read the broadcast value on the cluster
   String brand = pair._1();
   Iterator<String> rows = pair._2().iterator();
   StringBuffer result = new StringBuffer();
   result.append(brand+"\t");
   int sizePre = result.toString().trim().length();
   while (rows.hasNext())
   {
   String row=rows.next();
   if(localParentDict.contains(row))
   {
   continue;
   }
   result.append(row+",");
   }
   String resultStirng = result.toString().trim();
   int sizeAfter = resultStirng.length();
   if(sizePre == sizeAfter)// only the key is left, no value was appended
   {
   return "";
   }
   resultStirng = resultStirng.substring(0,resultStirng.length() - 1);
   return resultStirng;
   }

   }).filter(new Function<String, Boolean>() {
public Boolean call(String tuple) throws Exception {
   return (tuple.equals("")||tuple.isEmpty()) ? false : true;
   }

   });*/



17、将JavaPairRDD<String, Iterable<String>> 转为JavaPairRDD<String, String>,且将一行中的数据分解为多行,且统计每行中相同词出现的次数,且去掉单个字母和单个数字


JavaPairRDD<String, Iterable<String>> brandItemModelPair ="...";


JavaPairRDD<String, String> brandTypeGoodsPair = brandItemModelPair.mapPartitionsToPair(new PairFlatMapFunction<Iterator<Tuple2<String, Iterable<String>>>, String, String>(){
        public Iterator<Tuple2<String, String>> call(Iterator<Tuple2<String, Iterable<String>>> rows) throws Exception{
        List<Tuple2<String, String>> resultList=new ArrayList<Tuple2<String, String>>();
        //定义一个map来存放每个元素出现的次数
                Map<String,Integer> elementsCount=new HashMap<String,Integer>();
                String directory3 = "";
        while (rows.hasNext()) {
        Tuple2<String, Iterable<String>> pair = rows.next();
        directory3 = pair._1();
        Iterator<String> rowsOfPair = pair._2().iterator();
                    while (rowsOfPair.hasNext())
                    {
                        String rowOfPair= rowsOfPair.next();
                        String[] rowArray = rowOfPair.split(",");
                        for(String s:rowArray){
                        if(s.matches("[0-9]{1,}") || s.matches("[a-zA-Z]")){
                        continue;//在结果中去掉单个数字或字母
                        }
                            Integer i=elementsCount.get(s);
                            if(i==null){
                                elementsCount.put(s, 1);
                            }else{
                                elementsCount.put(s, i+1);
                            }
                        }
                    }
                    
        }
        int mapSize = elementsCount.size();
                for (String mapKey : elementsCount.keySet()) {
                 String mapResult = mapKey+":"+elementsCount.get(mapKey);
                 resultList.add(new Tuple2<String,String>(directory3,mapResult));
                }
        Iterator<Tuple2<String, String>> result = resultList.iterator();
                return result;
        }
        });