Spark Operators (4)
Point 1: MapPartitonsWithIndexOperator
package com.spark.operator;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;

public class MapPartitonsWithIndexOperator {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("MapPartitonsWithIndexOperator")
                .setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Prepare some data.
        List<String> names = Arrays.asList("xurunyun", "liangyongqi", "wangfei");
        JavaRDD<String> nameRDD = sc.parallelize(names, 2);
        // Even without passing 2 here, the default parallelism under local[2] is 2 anyway.
        // Setting the parallelism to 2 when calling parallelize simply means numPartitions is 2,
        // so the three names above are split across two partitions.
        // Exactly how they are split is up to Spark.
        // To find out which names end up in the same partition,
        // the mapPartitionsWithIndex operator hands us each partition's index.
        JavaRDD<String> nameWithPartitonIndex = nameRDD.mapPartitionsWithIndex(
                new Function2<Integer, Iterator<String>, Iterator<String>>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public Iterator<String> call(Integer index, Iterator<String> iterator)
                            throws Exception {
                        List<String> list = new ArrayList<String>();
                        while (iterator.hasNext()) {
                            String name = iterator.next();
                            // Tag each record with the index of its partition.
                            String result = index + " : " + name;
                            list.add(result);
                        }
                        return list.iterator();
                    }
                }, true);

        nameWithPartitonIndex.foreach(new VoidFunction<String>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(String result) throws Exception {
                System.out.println(result);
            }
        });

        sc.close();
    }
}
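Since Function2 is a single-abstract-method interface, the same operator can be written more compactly with a Java 8 lambda. A minimal sketch, reusing the nameRDD built above; the sample output is only one possibility, since Spark decides which name lands in which partition:

        JavaRDD<String> tagged = nameRDD.mapPartitionsWithIndex((index, iterator) -> {
            List<String> list = new ArrayList<>();
            while (iterator.hasNext()) {
                // Tag each record with the index of the partition it lives in.
                list.add(index + " : " + iterator.next());
            }
            return list.iterator();
        }, true);
        // Possible output (the actual split is up to Spark):
        // 0 : xurunyun
        // 1 : liangyongqi
        // 1 : wangfei
        tagged.collect().forEach(System.out::println);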
Point 2: MapPartitionsOperator
package com.spark.operator;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.VoidFunction;

// Note why scoreMap is declared final: it is captured by the anonymous inner class below.
public class MapPartitionsOperator {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("MapPartitionsOperator")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Prepare some data.
        List<String> names = Arrays.asList("xurunyun", "liangyongqi", "wangfei");
        JavaRDD<String> nameRDD = sc.parallelize(names);

        final Map<String, Integer> scoreMap = new HashMap<String, Integer>();
        scoreMap.put("xurunyun", 150);
        scoreMap.put("liangyongqi", 100);
        scoreMap.put("wangfei", 90);

        // mapPartitions:
        // the map operator processes a partition's records one at a time;
        // mapPartitions processes all records of a partition in a single call.
        // Recommended usage: if the RDD is not especially large,
        // replacing map with mapPartitions can speed up processing.
        // But with, say, 10 billion records at 1 billion per partition,
        // mapPartitions is not advisable: materializing a whole partition risks an OOM.
        JavaRDD<Integer> scoreRDD = nameRDD.mapPartitions(
                new FlatMapFunction<Iterator<String>, Integer>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public Iterable<Integer> call(Iterator<String> iterator) throws Exception {
                        List<Integer> list = new ArrayList<Integer>();
                        while (iterator.hasNext()) {
                            String name = iterator.next();
                            Integer score = scoreMap.get(name);
                            list.add(score);
                        }
                        return list;
                    }
                });

        scoreRDD.foreach(new VoidFunction<Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Integer score) throws Exception {
                System.out.println(score);
            }
        });

        sc.close();
    }
}
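The usual reason to reach for mapPartitions is amortizing per-partition setup cost, e.g. opening one database connection per partition instead of one per record. A minimal sketch of that pattern under the Spark 1.x Java API used in this article (where FlatMapFunction returns an Iterable; in Spark 2.x it returns an Iterator), reusing nameRDD from the listing above and simulating the expensive setup with a locally built lookup table:

        JavaRDD<Integer> scores = nameRDD.mapPartitions(iterator -> {
            // Setup that runs once per partition, not once per record.
            // In real code this might be a connection or a heavyweight client;
            // here it is simulated with a local lookup table.
            Map<String, Integer> lookup = new HashMap<>();
            lookup.put("xurunyun", 150);
            lookup.put("liangyongqi", 100);
            lookup.put("wangfei", 90);
            List<Integer> out = new ArrayList<>();
            while (iterator.hasNext()) {
                out.add(lookup.get(iterator.next()));
            }
            return out; // a List is an Iterable, which the Spark 1.x API expects here
        });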
Point 3: MapOperator
package com.spark.operator;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;

public class MapOperator {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("MapOperator")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5);
        JavaRDD<Integer> numberRDD = sc.parallelize(numbers);

        // map applies the function to each element individually.
        JavaRDD<Integer> results = numberRDD.map(new Function<Integer, Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Integer call(Integer number) throws Exception {
                return number * 10;
            }
        });

        results.foreach(new VoidFunction<Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Integer result) throws Exception {
                System.out.println(result);
            }
        });

        sc.close();
    }
}
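Function is also a single-abstract-method interface, so under Java 8 the same map collapses to a lambda. A minimal sketch, reusing numberRDD from above; collect() is acceptable here only because the dataset is tiny:

        JavaRDD<Integer> results = numberRDD.map(number -> number * 10);
        // Pull the results back to the driver and print them.
        for (Integer result : results.collect()) {
            System.out.println(result);
        }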