RDD常用操作
来源:互联网 发布:mac编辑图片怎么保存 编辑:程序博客网 时间:2024/06/01 18:27
1、关键词分组之后展示(将<String, Iterable<String>>转化为<String, String>)
// Group the values of `ones` by key, then reduce every group to one display string.
JavaPairRDD<String, Iterable<String>> productMap = ones.groupByKey();
List<Tuple2<String, String>> reslist = productMap.map(
        new Function<Tuple2<String, Iterable<String>>, Tuple2<String, String>>() {
            @Override
            public Tuple2<String, String> call(Tuple2<String, Iterable<String>> group) throws Exception {
                // Concatenate all values of the group, separated by single spaces.
                StringBuilder joined = new StringBuilder();
                for (String value : group._2()) {
                    joined.append(value).append(" ");
                }
                // Only the first space-separated token of the concatenation is kept.
                String firstToken = joined.toString().trim().split(" ")[0];
                // Parameterized constructor instead of the raw Tuple2 type.
                return new Tuple2<String, String>(group._1(), firstToken);
            }
        }).collect();
// Print one "key<TAB>value" line per group.
for (Tuple2<String, String> entry : reslist) {
    System.out.println(entry._1() + "\t" + entry._2());
}
2、过滤器,筛选出键(key)中包含“魅言魅语(MeiYanMeiYu)”的键值对
// NOTE(review): the statement that opens this call is missing from the snippet; presumably it is
// `JavaPairRDD<String, String> filterResult = <some pair RDD>.filter(` — confirm the source RDD.
// Predicate: keeps a pair only when its key contains the keyword below.
new Function<Tuple2<String,String>, Boolean>() {
public Boolean call(Tuple2<String,String> v1)throws Exception {
return v1._1().contains("魅言魅语(MeiYanMeiYu)");
}
}
);
// Collect the remaining pairs into a map and print it.
Map<String, String> onesMap = filterResult.collectAsMap();
System.out.println(onesMap);
3、遍历循环
// Visit every line of `brands` and print its second tab-separated field.
brands.foreach(new VoidFunction<String>() {
    @Override
    public void call(String brandString) throws Exception {
        String[] fields = brandString.split("\t");
        System.out.println(fields[1]);
    }
});
4、拼接键值对(将<String> 转化为<String, String>)
JavaRDD<String> products = jsc.textFile(productFilePath);
public Tuple2<String, String> call(String s) {
String[] sArray = s.split("\t");
String primaryDirectory = sArray[6].split(":")[0].split("@A@")[1];
String brand = sArray[4];
return new Tuple2<String, String>(primaryDirectory, brand);
}
});
JavaRDD<String> brands = jsc.textFile(brandFilePath);
JavaPairRDD<String, String> brandsPair = brands.mapToPair(new PairFunction<String, String, String>() {
public Tuple2<String, String> call(String s) {
String[] sArray = s.split("\t");
String key = sArray[1];
return new Tuple2<String, String>(key, s);
}
});
JavaPairRDD<String, String> brandsUnion = brandsPair.union(productPair);
5、两种放在本机上看结果的程序:
<1>//在集群上进行比对操作,下面的collect操作是将数据又放在了本机了。
// Placeholder: filterResult stands for a pair RDD computed earlier.
JavaPairRDD<String, String> filterResult = ....;
// collectAsMap() returns the pairs as a java.util.Map, which is then printed.
Map<String, String> onesMap = filterResult.collectAsMap();
System.out.println(onesMap);
<2>
JavaRDD<String> products = jsc.textFile(productFilePath);
// Map every product line to a (primaryDirectory, brand) pair. The previous comment described
// a word count, which is not what this code does.
JavaPairRDD<String, String> productPair = products.mapToPair(new PairFunction<String, String, String>() {
    @Override
    public Tuple2<String, String> call(String s) {
        String[] sArray = s.split("\t");
        // Column 6: take the part before the first ":" and, of that, the second "@A@"-separated segment.
        String primaryDirectory = sArray[6].split(":")[0].split("@A@")[1];
        String brand = sArray[4];
        return new Tuple2<String, String>(primaryDirectory, brand);
    }
});
// collect() returns all pairs as a java.util.List, which is printed one tuple per line.
List<Tuple2<String, String>> output = productPair.collect();
for (Tuple2<String, String> tuple : output) {
    System.out.println(tuple);
}
6、生成一个只包含不同元素的新RDD,即去除重复的数据
// distinct() produces an RDD without duplicate pairs; the result replaces the old reference.
productPair = productPair.distinct();
7、mapPartitionsToPair操作(将<String> 转化为<String, String>)
/*
 * Two ways to extract (key, segmentation result) pairs from the raw book / non-book lines:
 * mapToPair() and mapPartitionsToPair().
 * mapToPair() invokes the call() callback once per input String, so it is disabled for now.
 * mapPartitionsToPair() invokes call() once per partition instead. This implementation buffers
 * the results of a whole partition in a list, so the cluster needs enough memory,
 * otherwise an OutOfMemoryError is likely.
 */
JavaPairRDD<String, String> productPair = products.mapPartitionsToPair(
        new PairFlatMapFunction<Iterator<String>, String, String>() {
            @Override
            public Iterator<Tuple2<String, String>> call(Iterator<String> rows) throws Exception {
                // Read the broadcast dictionaries once per partition.
                Set<String> localBrandDict = b_BrandDict.getValue();
                Set<String> localGoodDict = b_GoodDict.getValue();
                List<Tuple2<String, String>> resultList = new ArrayList<Tuple2<String, String>>();
                while (rows.hasNext()) {
                    try {
                        String[] sArray = rows.next().split("\t");
                        String title = sArray[8];
                        // The key source is the third column from the end.
                        String keyString = sArray[sArray.length - 3];
                        // Element 0 is the key, element 1 the segmentation result.
                        List<String> keyAndValueList =
                                getKeyAndValueOfProduct(keyString, title, localBrandDict, localGoodDict);
                        resultList.add(new Tuple2<String, String>(
                                keyAndValueList.get(0), keyAndValueList.get(1)));
                    } catch (Exception ignored) {
                        // Best effort: a row that cannot be parsed is skipped. The earlier version
                        // added an ("", "") placeholder that the filter below removed again.
                    }
                }
                return resultList.iterator();
            }
        }
).filter(new Function<Tuple2<String, String>, Boolean>() {
    @Override
    public Boolean call(Tuple2<String, String> tuple) throws Exception {
        // Drop pairs whose key is empty.
        return !tuple._1().isEmpty();
    }
});
8、flatMap操作
/* De-duplication: split every value of unionResult (head words of books and non-books) on ","
 * and count how many distinct words there are. */
JavaRDD<String> wordsFromResult = unionResult.values().flatMap(
        new FlatMapFunction<String, String>() {
            public Iterator<String> call(String s) throws Exception {
                String[] words = s.split(",");
                return Arrays.asList(words).iterator();
            }
        });
Long notRepeatSize = wordsFromResult.distinct().count();
9、两个JavaPairRDD合并(union)之后的mapPartitionsToPair操作
// Shuffle the pairs inside every partition, then merge everything into a single cached partition.
JavaPairRDD<String, String> shuffleResult = unionResult.mapPartitionsToPair(
        new PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, String>() {
            public Iterator<Tuple2<String, String>> call(Iterator<Tuple2<String, String>> rows) throws Exception {
                // Buffer the partition so that it can be shuffled in memory.
                List<Tuple2<String, String>> buffered = new ArrayList<Tuple2<String, String>>();
                while (rows.hasNext()) {
                    buffered.add(rows.next());
                }
                Collections.shuffle(buffered);
                return buffered.iterator();
            }
        }).repartition(1).cache();
10、使用顶级具名类(带自定义参数的)
11、cogroup对多个RDD进行分组:
12、将JavaRDD<String>转化成自己想要的格式的JavaRDD<String>,然后发布到广播,作为集群中的字典使用(利用mapPartitions)
SparkConf conf = new SparkConf()
        .setMaster("local[*]") // needed for a local run; comment it out when running on a cluster
        .setAppName("Child And Parent Brand Analysis"); // the application name can be chosen freely
JavaSparkContext jsc = new JavaSparkContext(conf);
// Per the original note: removes the output path on HDFS if it already exists.
HdfsPathOpt.deleteHdfsPath(jsc.toSparkContext(jsc), writeFilePath);
JavaRDD<String> brands = jsc.textFile(readFilePath1);
// Emit the first two tab-separated columns of every line as separate brand words.
JavaRDD<String> brandString = brands.mapPartitions(new FlatMapFunction<Iterator<String>, String>() {
    @Override
    public Iterator<String> call(Iterator<String> rows) throws Exception {
        List<String> resultList = new ArrayList<String>();
        while (rows.hasNext()) {
            try {
                String[] brandArray = rows.next().split("\t");
                // Read both columns before adding anything, so a short line contributes nothing.
                String brand1 = brandArray[0];
                String brand2 = brandArray[1];
                resultList.add(brand1);
                // Fix: the second column was read but brand1 was added twice instead.
                resultList.add(brand2);
            } catch (Exception ignored) {
                // Best effort: a malformed line is skipped. The earlier version added an ""
                // placeholder that the filter below removed again.
            }
        }
        return resultList.iterator();
    }
}).filter(new Function<String, Boolean>() {
    @Override
    public Boolean call(String tuple) throws Exception {
        // Drop empty brand words.
        return !tuple.equals("");
    }
});
//brandString.repartition(1).saveAsTextFile(writeFilePath);
List<String> brandList = brandString.collect();
Set<String> brandDict = new HashSet<>(brandList); // de-duplicated brand words form the brand dictionary
final Broadcast<Set<String>> b_BrandDict = jsc.broadcast(brandDict); // broadcast the dictionary for use on the cluster
jsc.stop();
jsc.close();
13、将JavaPairRDD<String, String>用HashMap发布到广播供集群使用
// Placeholder: directoryAndBrandsPair stands for a pair RDD computed earlier.
JavaPairRDD<String, String> directoryAndBrandsPair = ...;
Map<String, String> directoryAndBrandMap= directoryAndBrandsPair.collectAsMap();
// Copy into a plain HashMap before broadcasting. Per the original author this step is mandatory,
// otherwise the Spark program fails at runtime (presumably the map returned by collectAsMap()
// cannot be serialized for the broadcast — TODO confirm).
Map<String, String> directoryAndBrandMap1 = new HashMap<String, String>(directoryAndBrandMap);
final Broadcast<Map<String, String>> directoryAndBrandMapDict = jsc.broadcast(directoryAndBrandMap1);
14、将<String, Iterable<String>> 转化为<String>
// Placeholder: childAndParentBrand stands for a grouped pair RDD computed earlier.
JavaPairRDD<String, Iterable<String>> childAndParentBrand = ...;
// Render every group as "key<TAB>value1,value2,...".
JavaRDD<String> result = childAndParentBrand.map(new Function<Tuple2<String, Iterable<String>>, String>() {
    @Override
    public String call(Tuple2<String, Iterable<String>> pair) throws Exception {
        StringBuilder line = new StringBuilder().append(pair._1()).append("\t");
        boolean hasValue = false;
        for (String row : pair._2()) {
            line.append(row).append(",");
            hasValue = true;
        }
        if (!hasValue) {
            // A key without values yields "", which the filter below drops. Previously the
            // trailing-comma removal cut off the last character of the key (or threw for an
            // empty key) in this case.
            return "";
        }
        String text = line.toString().trim();
        // Remove the trailing comma left by the loop.
        return text.substring(0, text.length() - 1);
    }
}).filter(new Function<String, Boolean>() {
    @Override
    public Boolean call(String tuple) throws Exception {
        // Keep non-empty lines only.
        return !tuple.isEmpty();
    }
});
15、如果在List<String> resultList = result.collect();之后要对结果进行一些JAVA操作(例如去重),那么一定要先把collect()返回的列表复制到一个新的ArrayList中:
List<String> resultList = result.collect();
// Crucial step: copy the collected list into a new ArrayList. Per the original author,
// removeDuplicate() cannot be applied to resultList directly (presumably because the list
// returned by collect() does not support modification — TODO confirm).
List<String> resultList1 = new ArrayList<String>(resultList);
List<String> resultListByFilter = removeDuplicate(resultList1);
16、key为母品牌,value为很多用逗号隔开的子品牌,将子品牌中包含其他行的母品牌的字段去掉
例如:华为 荣耀
乐际 格力,华为
用下面的程序可以将第二行中的华为去掉(格力如果同时是其他行的母品牌,也会被一并去掉)
/*** 将value中包含母品牌的字段去掉
*
List<String> parentBrands = childAndParentBrand.keys().collect();
Set<String> parentBrandsDict = new HashSet<>(parentBrands);
final Broadcast<Set<String>> parentBrandsBroad = jsc.broadcast(parentBrandsDict);//将set<String>发布到广播
JavaRDD<String> result = childAndParentBrand.map(new Function<Tuple2<String, Iterable<String>>,String>(){
public String call(Tuple2<String, Iterable<String>> pair) throws Exception{
Set<String> localParentDict = parentBrandsBroad.getValue();//在集群中调用广播
String brand = pair._1();
Iterator<String> rows = pair._2().iterator();
StringBuffer result = new StringBuffer();
result.append(brand+"\t");
int sizePre = result.toString().trim().length();
while (rows.hasNext())
{
String row=rows.next();
if(localParentDict.contains(row))
{
continue;
}
result.append(row+",");
}
String resultStirng = result.toString().trim();
int sizeAfter = resultStirng.length();
if(sizePre == sizeAfter)//表示只有键没有value
{
return "";
}
resultStirng = resultStirng.substring(0,resultStirng.length() - 1);
return resultStirng;
}
}).filter(new Function<String, Boolean>() {
public Boolean call(String tuple) throws Exception {
return (tuple.equals("")||tuple.isEmpty()) ? false : true;
}
});*/
17、将JavaPairRDD<String, Iterable<String>> 转为JavaPairRDD<String, String>,且将一行中的数据分解为多行,且统计每行中相同词出现的次数,且去掉单个字母和纯数字(任意长度)
JavaPairRDD<String, Iterable<String>> brandItemModelPair ="...";
JavaPairRDD<String, String> brandTypeGoodsPair = brandItemModelPair.mapPartitionsToPair(new PairFlatMapFunction<Iterator<Tuple2<String, Iterable<String>>>, String, String>(){
public Iterator<Tuple2<String, String>> call(Iterator<Tuple2<String, Iterable<String>>> rows) throws Exception{
List<Tuple2<String, String>> resultList=new ArrayList<Tuple2<String, String>>();
//定义一个map来存放每个元素出现的次数
Map<String,Integer> elementsCount=new HashMap<String,Integer>();
String directory3 = "";
while (rows.hasNext()) {
Tuple2<String, Iterable<String>> pair = rows.next();
directory3 = pair._1();
Iterator<String> rowsOfPair = pair._2().iterator();
while (rowsOfPair.hasNext())
{
String rowOfPair= rowsOfPair.next();
String[] rowArray = rowOfPair.split(",");
for(String s:rowArray){
if(s.matches("[0-9]{1,}") || s.matches("[a-zA-Z]")){
continue;//在结果中去掉单个数字或字母
}
Integer i=elementsCount.get(s);
if(i==null){
elementsCount.put(s, 1);
}else{
elementsCount.put(s, i+1);
}
}
}
}
int mapSize = elementsCount.size();
for (String mapKey : elementsCount.keySet()) {
String mapResult = mapKey+":"+elementsCount.get(mapKey);
resultList.add(new Tuple2<String,String>(directory3,mapResult));
}
Iterator<Tuple2<String, String>> result = resultList.iterator();
return result;
}
});
- RDD常用操作
- RDD常用操作
- Spark常用RDD操作汇总
- spark RDD常用函数/操作
- Spark学习笔记三(RDD常用操作)
- Spark学习之RDD常用操作
- Learning Spark——RDD常用操作
- RDD操作
- 操作RDD
- Spark中RDD的常用操作(Python)
- Spark RDD 常用算子
- Spark RDD操作
- RDD特性与操作
- spark RDD keyvalue操作
- Spark RDD transformation操作
- spark RDD transformation操作
- RDD类操作说明
- spark RDD 基本操作
- 培训第十天 部分关键字及内部类
- java List实体排序
- C语言跳转语句
- 获取本地歌曲信息的方法
- leetcode 538. Convert BST to Greater Tree 后序遍历的一个应用
- RDD常用操作
- Base64是什么
- pthon6个必要库
- 蓝桥杯 基础练习 字母图形
- 好书推荐《蚀心者》(剧透高能预警)
- Android 使用 FFmpeg (二)——视屏流播放简单实现
- C语言递归嵌套
- spring注解应用场景
- WPF课堂例子