Spark 的JAVA版 wordCount
来源:互联网 发布:白左圣母毁灭欧洲 知乎 编辑:程序博客网 时间:2024/05/20 03:48
package os.unix;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

/**
 * Classic Spark word-count using the Java API: read a text file, split each
 * line on single spaces, count the occurrences of each word, sort the
 * (word, count) pairs by ascending count, and print them on the driver.
 *
 * <p>Runs with {@code master=local}. The input path may be supplied as the
 * first command-line argument; with no arguments it falls back to the
 * original hard-coded file, so existing invocations behave identically.
 */
public class WordCount {

    public static void main(String[] args) {
        // Generalization: accept the input file as args[0], defaulting to the
        // original hard-coded path for backward compatibility.
        final String inputPath =
                args.length > 0 ? args[0] : "C:\\Users\\os\\Desktop\\test\\word.txt";

        SparkConf conf = new SparkConf().setMaster("local").setAppName("WordCount");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        try {
            // Load the file's contents into linesRDD (minimum 3 partitions).
            JavaRDD<String> linesRDD = jsc.textFile(inputPath, 3);

            // Split every line on spaces, emitting one element per word.
            JavaRDD<String> wordsRDD = linesRDD.flatMap(new FlatMapFunction<String, String>() {
                @Override
                public Iterator<String> call(String s) throws Exception {
                    // s is a single line of linesRDD.
                    String[] split = s.split(" ");
                    List<String> asList = Arrays.asList(split);
                    return asList.iterator();
                }
            });

            // Convert the plain RDD into a key/value RDD of (word, 1).
            // In the Java API this requires mapToPair (not map).
            JavaPairRDD<String, Integer> pairRDD =
                    wordsRDD.mapToPair(new PairFunction<String, String, Integer>() {
                        @Override
                        public Tuple2<String, Integer> call(String s) throws Exception {
                            return new Tuple2<String, Integer>(s, 1);
                        }
                    });

            // Aggregate with reduceByKey: sums the 1s for each distinct word.
            JavaPairRDD<String, Integer> resultRDD =
                    pairRDD.reduceByKey(new Function2<Integer, Integer, Integer>() {
                        @Override
                        public Integer call(Integer a, Integer b) throws Exception {
                            return a + b;
                        }
                    });

            // Sort by occurrence count: swap to (count, word), sortByKey
            // (ascending, same as the original), then swap back to (word, count).
            JavaPairRDD<String, Integer> result = resultRDD
                    .mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
                        @Override
                        public Tuple2<Integer, String> call(Tuple2<String, Integer> t)
                                throws Exception {
                            return new Tuple2<>(t._2, t._1);
                        }
                    })
                    .sortByKey()
                    .mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {
                        @Override
                        public Tuple2<String, Integer> call(Tuple2<Integer, String> t)
                                throws Exception {
                            return new Tuple2<>(t._2, t._1);
                        }
                    });

            // Print each (word, count) tuple. With master=local the executor
            // runs in-process, so the output appears on the driver's stdout.
            result.foreach(new VoidFunction<Tuple2<String, Integer>>() {
                @Override
                public void call(Tuple2<String, Integer> t) throws Exception {
                    System.out.println(t);
                }
            });
        } finally {
            // Bug fix: the original never stopped the context, leaking the
            // SparkContext (threads, UI server, temp dirs) on exit.
            jsc.stop();
        }
    }
}
阅读全文
0 0
- Spark 的JAVA版 wordCount
- Spark基于排序机制的wordcount程序(Java版)
- java调用spark的接口运行WordCount
- 基于Java的Spark WordCount编程实现
- java spark WordCount
- 007-spark的wordCount
- Spark入门的WordCount
- Spark的WordCount详解
- Spark之java操作WordCount
- Spark wordcount - Python, Scala, Java
- Spark Java 单词计数(WordCount)
- Spark之java操作WordCount
- Spark下如何运行Java版本的WordCount
- spark 本地调试运行WordCount(java版local模式)
- spark-wordcount
- Spark-wordcount
- wordcount spark...
- wordCount spark
- 第一篇博客
- SystemUI RingtonePlayer
- ajax跨域(代理)上传文件
- python2.7使用TimeDelta中total_seconds()方法的问题
- Charles 最新版(Charles 4.1.4 ) 破解注册
- Spark 的JAVA版 wordCount
- 我的python学习笔记、写入文件
- SpringBoot定时任务说明
- Mybatis返回值问题
- CIE 颜色空间
- 三招教你div垂直居中
- Linux 下十大命令行下载工具
- 聊聊我最近的面试感受
- SystemUI PowerUI笔记