Spark之Transformation
来源:互联网 发布:淘宝计入最低价 编辑:程序博客网 时间:2024/04/28 14:11
package com.java;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import java.util.*;
/**
 * Java examples of the common Spark transformation operators:
 * map, filter, flatMap, sample, union, groupByKey, reduceByKey,
 * join, sortByKey — plus a grouped Top-N example.
 *
 * <p>Written against the Spark 1.x Java API, where
 * {@code FlatMapFunction#call} returns an {@code Iterable}
 * (in Spark 2.x+ it returns an {@code Iterator}, which would
 * require {@code .iterator()} on the lambda results below).
 */
public class JavaSparkTransformation {
    /**
     * Word count pipeline: flatMap -> mapToPair -> reduceByKey,
     * with the result saved back to HDFS.
     *
     * @param ctx the shared Spark context
     */
    public static void demoFlatMap2(JavaSparkContext ctx) {
        JavaRDD<String> lineRDD = ctx.textFile("hdfs://master:9000/spark/input/hello.txt");
        JavaRDD<String> wordRDD = lineRDD.flatMap(line -> Arrays.asList(line.split(" ")));
        JavaPairRDD<String, Integer> pairRDD = wordRDD.mapToPair(word -> new Tuple2<String, Integer>(word, 1));
        JavaPairRDD<String, Integer> countRDD = pairRDD.reduceByKey((v1, v2) -> v1 + v2);
        // countRDD.collect().forEach(System.out::println);
        countRDD.saveAsTextFile("hdfs://master:9000/spark/output/core1/");
    }

    /** 1. map: multiply every element of the collection by 7. */
    public static void demoMap(JavaSparkContext ctx) {
        JavaRDD<Integer> listRDD = ctx.parallelize(Arrays.asList(1, 2, 3, 4, 5));
        JavaRDD<Integer> mapRDD = listRDD.map(num -> num * 7);
        mapRDD.collect().forEach(System.out::println);
    }

    /** 2. filter: keep only the odd numbers from the collection. */
    public static void demoFilter(JavaSparkContext ctx) {
        JavaRDD<Integer> listRDD = ctx.parallelize(Arrays.asList(1, 2, 3, 4, 5));
        // BUG FIX: the original predicate was num % 2 == 0, which kept the
        // EVEN numbers while the comment promised the odd ones.
        JavaRDD<Integer> oddRDD = listRDD.filter(num -> num % 2 != 0);
        oddRDD.collect().forEach(System.out::println);
    }

    /** 3. flatMap: split each line into words (anonymous class vs. lambda). */
    public static void demoFlatMap(JavaSparkContext ctx) {
        JavaRDD<String> lineRDD = ctx.textFile("hdfs://master:9000/spark/input/hello.txt");
        // Variant 1: explicit anonymous FlatMapFunction.
        JavaRDD<String> flatRDD = lineRDD.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterable<String> call(String line) throws Exception {
                return Arrays.asList(line.split(" "));
            }
        });
        // Variant 2: equivalent lambda form. Kept only for illustration;
        // RDDs are lazy, so this variant is never actually evaluated.
        JavaRDD<String> flatRDD2 = lineRDD.flatMap(line -> Arrays.asList(line.split(" ")));
        flatRDD.collect().forEach(System.out::println);
    }

    /** 4. sample: draw a ~1% sample (with replacement) from 0..9999. */
    public static void demoSample(JavaSparkContext ctx) {
        List<Integer> list = new ArrayList<>(10000);
        for (int i = 0; i < 10000; i++) {
            list.add(i);
        }
        JavaRDD<Integer> sampleRDD = ctx.parallelize(list).sample(true, 0.01);
        System.out.println("SampleRDD=>Count:" + sampleRDD.count());
        sampleRDD.collect().forEach(System.out::println);
    }

    /** 5. union: new dataset combining the source RDD and the argument RDD. */
    public static void demoUnion(JavaSparkContext ctx) {
        JavaRDD<Integer> oddRDD = ctx.parallelize(Arrays.asList(1, 3, 5, 7, 9));
        JavaRDD<Integer> evenRDD = ctx.parallelize(Arrays.asList(2, 4, 6, 8, 10));
        JavaRDD<Integer> unionRDD = oddRDD.union(evenRDD);
        unionRDD.collect().forEach(System.out::println);
    }

    /** 6. groupByKey: group (word, 1) pairs by their word key. */
    public static void demoGroupByKey(JavaSparkContext ctx) {
        JavaRDD<String> listRDD = ctx.textFile("hdfs://master:9000/spark/input/hello.txt");
        JavaRDD<String> wordRDD = listRDD.flatMap(line -> Arrays.asList(line.split(" ")));
        // Keep the concrete types; the original erased them to Object for no benefit.
        JavaPairRDD<String, Integer> pairRDD = wordRDD.mapToPair(word -> new Tuple2<String, Integer>(word, 1));
        pairRDD.collect().forEach(v -> System.out.println(v._1() + " :" + v._2()));
        JavaPairRDD<String, Iterable<Integer>> groupRDD = pairRDD.groupByKey();
        groupRDD.foreach(v -> System.out.println(v._1() + ": " + v._2()));
    }

    /**
     * Parses a whitespace-separated record of the shape "name gender count"
     * into a (name, count) pair. Shared by demoReduceByKey and demoJoin.
     */
    private static Tuple2<String, Integer> toNameCount(String line) {
        String[] splits = line.split(" ");
        return new Tuple2<String, Integer>(splits[0], Integer.parseInt(splits[2]));
    }

    /** 7. reduceByKey: sum the head-count per class. */
    public static void demoReduceByKey(JavaSparkContext ctx) {
        JavaRDD<String> lineRDD = ctx.textFile("hdfs://master:9000/spark/input/class.txt");
        JavaPairRDD<String, Integer> pairRDD = lineRDD.mapToPair(JavaSparkTransformation::toNameCount);
        JavaPairRDD<String, Integer> reduceRDD = pairRDD.reduceByKey((v1, v2) -> v1 + v2);
        reduceRDD.foreach(t -> System.out.println(t._1() + ": " + t._2()));
    }

    /** 8. join: inner-join the male and female counts on the class key. */
    public static void demoJoin(JavaSparkContext ctx) {
        List<String> maleList = Arrays.asList(
                "bd_1 male 20",
                "bd_2 male 25",
                "bd_3 male 15");
        List<String> femaleList = Arrays.asList(
                "bd_1 female 2",
                "bd_2 female 10",
                "bd_3 female 5");
        JavaRDD<String> maleRDD = ctx.parallelize(maleList);
        JavaRDD<String> femaleRDD = ctx.parallelize(femaleList);
        JavaPairRDD<String, Integer> malePairRDD = maleRDD.mapToPair(JavaSparkTransformation::toNameCount);
        System.out.println("=============malePairRDD===============");
        malePairRDD.foreach(t -> System.out.println(t));
        JavaPairRDD<String, Integer> femalePairRDD = femaleRDD.mapToPair(JavaSparkTransformation::toNameCount);
        System.out.println("=============femalePairRDD===============");
        femalePairRDD.foreach(t -> System.out.println(t));
        JavaPairRDD<String, Tuple2<Integer, Integer>> joinRDD = malePairRDD.join(femalePairRDD);
        System.out.println("=============joinPairRDD===============");
        joinRDD.foreach(t -> System.out.println(t._1() + " :" + t._2()));
    }

    /** 9. sortByKey: sort students by height (ascending). */
    public static void demoSortByKey(JavaSparkContext ctx) {
        List<String> list = Arrays.asList(
                "zhangsan 176",
                "xiaodingding 175",
                "xiaobao 173",
                "heyajie 174.5",
                "liujun 173",
                "wangxiaoxiong 150");
        JavaRDD<String> lineRDD = ctx.parallelize(list);
        // sortByKey orders by key only, so put the height in the key slot
        // and the name in the value slot (swapped relative to the obvious
        // (name, height) layout).
        JavaPairRDD<Double, String> heightRDD = lineRDD.mapToPair(line -> {
            String[] data = line.split(" ");
            return new Tuple2<Double, String>(Double.parseDouble(data[1]), data[0]);
        });
        heightRDD.sortByKey().foreach(t -> System.out.println(t._2() + " :" + t._1()));
    }

    /**
     * Top-N: for each class, keep the three highest distinct scores.
     * A bounded descending TreeSet per key evicts the smallest element
     * once it grows past three entries.
     *
     * @param ctx the shared Spark context
     */
    public static void demoTopN(JavaSparkContext ctx) {
        List<String> list = Arrays.asList(
                "class1 90",
                "class2 88",
                "class2 80",
                "class1 79",
                "class2 60",
                "class1 66",
                "class2 86",
                "class1 78",
                "class1 82",
                "class2 87");
        JavaRDD<String> listRDD = ctx.parallelize(list);
        JavaPairRDD<String, Double> pairRDD = listRDD.mapToPair(line -> {
            String[] splits = line.split(" ");
            return new Tuple2<String, Double>(splits[0], Double.valueOf(splits[1]));
        });
        JavaPairRDD<String, Iterable<Double>> groupRDD = pairRDD.groupByKey();
        JavaRDD<Tuple2<String, Iterable<Double>>> topRDD = groupRDD.map(m -> {
            TreeSet<Double> top3 = new TreeSet<Double>(new Comparator<Double>() {
                @Override
                public int compare(Double a, Double b) {
                    // BUG FIX: the original used (int)(b - a), which truncates
                    // fractional differences to 0 (e.g. 86.5 vs 86), making
                    // distinct scores compare equal so the TreeSet silently
                    // drops one of them.
                    return Double.compare(b, a); // descending
                }
            });
            for (Double score : m._2()) {
                top3.add(score);
                if (top3.size() > 3) {
                    top3.pollLast(); // evict the current minimum
                }
            }
            return new Tuple2<String, Iterable<Double>>(m._1(), top3);
        });
        groupRDD.foreach(t -> System.out.println(t._1 + ":" + t._2));
        topRDD.foreach(t -> System.out.println(t._1 + ":" + t._2));
    }

    /** Entry point: runs the Top-N demo on a local-mode context. */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setAppName(JavaSparkTransformation.class.getSimpleName());
        conf.setMaster("local");
        JavaSparkContext ctx = new JavaSparkContext(conf);
        demoTopN(ctx);
        ctx.close();
    }
}
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import java.util.*;
/**
 * Java examples of the common Spark transformation operators:
 * map, filter, flatMap, sample, union, groupByKey, reduceByKey,
 * join, sortByKey — plus a grouped Top-N example.
 *
 * <p>Written against the Spark 1.x Java API, where
 * {@code FlatMapFunction#call} returns an {@code Iterable}
 * (in Spark 2.x+ it returns an {@code Iterator}, which would
 * require {@code .iterator()} on the lambda results below).
 */
public class JavaSparkTransformation {
    /**
     * Word count pipeline: flatMap -> mapToPair -> reduceByKey,
     * with the result saved back to HDFS.
     *
     * @param ctx the shared Spark context
     */
    public static void demoFlatMap2(JavaSparkContext ctx) {
        JavaRDD<String> lineRDD = ctx.textFile("hdfs://master:9000/spark/input/hello.txt");
        JavaRDD<String> wordRDD = lineRDD.flatMap(line -> Arrays.asList(line.split(" ")));
        JavaPairRDD<String, Integer> pairRDD = wordRDD.mapToPair(word -> new Tuple2<String, Integer>(word, 1));
        JavaPairRDD<String, Integer> countRDD = pairRDD.reduceByKey((v1, v2) -> v1 + v2);
        // countRDD.collect().forEach(System.out::println);
        countRDD.saveAsTextFile("hdfs://master:9000/spark/output/core1/");
    }

    /** 1. map: multiply every element of the collection by 7. */
    public static void demoMap(JavaSparkContext ctx) {
        JavaRDD<Integer> listRDD = ctx.parallelize(Arrays.asList(1, 2, 3, 4, 5));
        JavaRDD<Integer> mapRDD = listRDD.map(num -> num * 7);
        mapRDD.collect().forEach(System.out::println);
    }

    /** 2. filter: keep only the odd numbers from the collection. */
    public static void demoFilter(JavaSparkContext ctx) {
        JavaRDD<Integer> listRDD = ctx.parallelize(Arrays.asList(1, 2, 3, 4, 5));
        // BUG FIX: the original predicate was num % 2 == 0, which kept the
        // EVEN numbers while the comment promised the odd ones.
        JavaRDD<Integer> oddRDD = listRDD.filter(num -> num % 2 != 0);
        oddRDD.collect().forEach(System.out::println);
    }

    /** 3. flatMap: split each line into words (anonymous class vs. lambda). */
    public static void demoFlatMap(JavaSparkContext ctx) {
        JavaRDD<String> lineRDD = ctx.textFile("hdfs://master:9000/spark/input/hello.txt");
        // Variant 1: explicit anonymous FlatMapFunction.
        JavaRDD<String> flatRDD = lineRDD.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterable<String> call(String line) throws Exception {
                return Arrays.asList(line.split(" "));
            }
        });
        // Variant 2: equivalent lambda form. Kept only for illustration;
        // RDDs are lazy, so this variant is never actually evaluated.
        JavaRDD<String> flatRDD2 = lineRDD.flatMap(line -> Arrays.asList(line.split(" ")));
        flatRDD.collect().forEach(System.out::println);
    }

    /** 4. sample: draw a ~1% sample (with replacement) from 0..9999. */
    public static void demoSample(JavaSparkContext ctx) {
        List<Integer> list = new ArrayList<>(10000);
        for (int i = 0; i < 10000; i++) {
            list.add(i);
        }
        JavaRDD<Integer> sampleRDD = ctx.parallelize(list).sample(true, 0.01);
        System.out.println("SampleRDD=>Count:" + sampleRDD.count());
        sampleRDD.collect().forEach(System.out::println);
    }

    /** 5. union: new dataset combining the source RDD and the argument RDD. */
    public static void demoUnion(JavaSparkContext ctx) {
        JavaRDD<Integer> oddRDD = ctx.parallelize(Arrays.asList(1, 3, 5, 7, 9));
        JavaRDD<Integer> evenRDD = ctx.parallelize(Arrays.asList(2, 4, 6, 8, 10));
        JavaRDD<Integer> unionRDD = oddRDD.union(evenRDD);
        unionRDD.collect().forEach(System.out::println);
    }

    /** 6. groupByKey: group (word, 1) pairs by their word key. */
    public static void demoGroupByKey(JavaSparkContext ctx) {
        JavaRDD<String> listRDD = ctx.textFile("hdfs://master:9000/spark/input/hello.txt");
        JavaRDD<String> wordRDD = listRDD.flatMap(line -> Arrays.asList(line.split(" ")));
        // Keep the concrete types; the original erased them to Object for no benefit.
        JavaPairRDD<String, Integer> pairRDD = wordRDD.mapToPair(word -> new Tuple2<String, Integer>(word, 1));
        pairRDD.collect().forEach(v -> System.out.println(v._1() + " :" + v._2()));
        JavaPairRDD<String, Iterable<Integer>> groupRDD = pairRDD.groupByKey();
        groupRDD.foreach(v -> System.out.println(v._1() + ": " + v._2()));
    }

    /**
     * Parses a whitespace-separated record of the shape "name gender count"
     * into a (name, count) pair. Shared by demoReduceByKey and demoJoin.
     */
    private static Tuple2<String, Integer> toNameCount(String line) {
        String[] splits = line.split(" ");
        return new Tuple2<String, Integer>(splits[0], Integer.parseInt(splits[2]));
    }

    /** 7. reduceByKey: sum the head-count per class. */
    public static void demoReduceByKey(JavaSparkContext ctx) {
        JavaRDD<String> lineRDD = ctx.textFile("hdfs://master:9000/spark/input/class.txt");
        JavaPairRDD<String, Integer> pairRDD = lineRDD.mapToPair(JavaSparkTransformation::toNameCount);
        JavaPairRDD<String, Integer> reduceRDD = pairRDD.reduceByKey((v1, v2) -> v1 + v2);
        reduceRDD.foreach(t -> System.out.println(t._1() + ": " + t._2()));
    }

    /** 8. join: inner-join the male and female counts on the class key. */
    public static void demoJoin(JavaSparkContext ctx) {
        List<String> maleList = Arrays.asList(
                "bd_1 male 20",
                "bd_2 male 25",
                "bd_3 male 15");
        List<String> femaleList = Arrays.asList(
                "bd_1 female 2",
                "bd_2 female 10",
                "bd_3 female 5");
        JavaRDD<String> maleRDD = ctx.parallelize(maleList);
        JavaRDD<String> femaleRDD = ctx.parallelize(femaleList);
        JavaPairRDD<String, Integer> malePairRDD = maleRDD.mapToPair(JavaSparkTransformation::toNameCount);
        System.out.println("=============malePairRDD===============");
        malePairRDD.foreach(t -> System.out.println(t));
        JavaPairRDD<String, Integer> femalePairRDD = femaleRDD.mapToPair(JavaSparkTransformation::toNameCount);
        System.out.println("=============femalePairRDD===============");
        femalePairRDD.foreach(t -> System.out.println(t));
        JavaPairRDD<String, Tuple2<Integer, Integer>> joinRDD = malePairRDD.join(femalePairRDD);
        System.out.println("=============joinPairRDD===============");
        joinRDD.foreach(t -> System.out.println(t._1() + " :" + t._2()));
    }

    /** 9. sortByKey: sort students by height (ascending). */
    public static void demoSortByKey(JavaSparkContext ctx) {
        List<String> list = Arrays.asList(
                "zhangsan 176",
                "xiaodingding 175",
                "xiaobao 173",
                "heyajie 174.5",
                "liujun 173",
                "wangxiaoxiong 150");
        JavaRDD<String> lineRDD = ctx.parallelize(list);
        // sortByKey orders by key only, so put the height in the key slot
        // and the name in the value slot (swapped relative to the obvious
        // (name, height) layout).
        JavaPairRDD<Double, String> heightRDD = lineRDD.mapToPair(line -> {
            String[] data = line.split(" ");
            return new Tuple2<Double, String>(Double.parseDouble(data[1]), data[0]);
        });
        heightRDD.sortByKey().foreach(t -> System.out.println(t._2() + " :" + t._1()));
    }

    /**
     * Top-N: for each class, keep the three highest distinct scores.
     * A bounded descending TreeSet per key evicts the smallest element
     * once it grows past three entries.
     *
     * @param ctx the shared Spark context
     */
    public static void demoTopN(JavaSparkContext ctx) {
        List<String> list = Arrays.asList(
                "class1 90",
                "class2 88",
                "class2 80",
                "class1 79",
                "class2 60",
                "class1 66",
                "class2 86",
                "class1 78",
                "class1 82",
                "class2 87");
        JavaRDD<String> listRDD = ctx.parallelize(list);
        JavaPairRDD<String, Double> pairRDD = listRDD.mapToPair(line -> {
            String[] splits = line.split(" ");
            return new Tuple2<String, Double>(splits[0], Double.valueOf(splits[1]));
        });
        JavaPairRDD<String, Iterable<Double>> groupRDD = pairRDD.groupByKey();
        JavaRDD<Tuple2<String, Iterable<Double>>> topRDD = groupRDD.map(m -> {
            TreeSet<Double> top3 = new TreeSet<Double>(new Comparator<Double>() {
                @Override
                public int compare(Double a, Double b) {
                    // BUG FIX: the original used (int)(b - a), which truncates
                    // fractional differences to 0 (e.g. 86.5 vs 86), making
                    // distinct scores compare equal so the TreeSet silently
                    // drops one of them.
                    return Double.compare(b, a); // descending
                }
            });
            for (Double score : m._2()) {
                top3.add(score);
                if (top3.size() > 3) {
                    top3.pollLast(); // evict the current minimum
                }
            }
            return new Tuple2<String, Iterable<Double>>(m._1(), top3);
        });
        groupRDD.foreach(t -> System.out.println(t._1 + ":" + t._2));
        topRDD.foreach(t -> System.out.println(t._1 + ":" + t._2));
    }

    /** Entry point: runs the Top-N demo on a local-mode context. */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setAppName(JavaSparkTransformation.class.getSimpleName());
        conf.setMaster("local");
        JavaSparkContext ctx = new JavaSparkContext(conf);
        demoTopN(ctx);
        ctx.close();
    }
}
0 0
- Spark之Transformation
- Spark之RDD的Transformation操作
- spark常用的transformation
- Spark的action和transformation
- spark的transformation与action
- 【spark】Spark transformation和action的算子
- Spark transformation
- spark中RDD的transformation&action
- spark中RDD的transformation&action
- Spark中transformation算子的操作
- Spark:Scala实现常用的Transformation操作
- Spark代码2之Transformation:union,distinct,join
- 用Java理解Spark算子之Transformation算子
- spark源码之RDD(2)transformation和action
- Spark:Transformation和Action
- Spark RDD transformation操作
- spark RDD transformation操作
- Spark: Transformation和Action
- C++中的常量的基础知识
- 【jzoj3748】【CF446D】【DZY Loves Games】【矩阵乘法】
- Spring Security学习(一) Getting Started
- [BZOJ]1070 修车
- gensim word2vec
- Spark之Transformation
- 视频中IBP帧的介绍和判定方法
- Unsupported major.minor version 52.0(unable to load class com.cl.business.yx.dao.AnnouncementDAO)
- Office2016永久激活
- java多线程:3、线程通讯
- OpenThreads库的使用-Condition
- 强大的vim配置文件,让编程更随意
- dp专题 第三题 采药
- HDU 1213How Many Tables(并查集)