Spark Java API Word Count ---- Running Directly in Local IDEA


Word count with the Spark Java API, run locally inside IDEA. Because the master is set to "local", the whole job runs in a single JVM, so no cluster is needed.



package lesson03;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.*;
import scala.Tuple2;

import java.util.Arrays;
import java.util.Iterator;

/**
 * Created by Administrator on 2017/7/31.
 */
public class JavaWordCount7 {
    public static void main(String[] args) {
        // "local" master lets the job run inside IDEA without a cluster
        SparkConf conf = new SparkConf().setAppName("wordcount").setMaster("local");
        // JavaSparkContext is the entry point of a Java Spark program
        JavaSparkContext sc = new JavaSparkContext(conf);

        // read the file contents into an RDD of lines
        JavaRDD<String> fileRDD = sc.textFile("D:\\文档\\hello.txt");

        // flatMap: split each line, flatten the pieces, and return an Iterator,
        // giving an RDD with one element per word.
        // In FlatMapFunction<String, String>, the first String is the input type
        // and the second String is the output type.
        final JavaRDD<String> wordRDD = fileRDD.flatMap(new FlatMapFunction<String, String>() {
            // String s is the input; Iterator<String> is the output
            @Override
            public Iterator<String> call(String s) throws Exception {
                return Arrays.asList(s.split("\t")).iterator();
            }
        });

        // mapToPair (not map) is needed here: only a JavaPairRDD exposes
        // key-based operations such as reduceByKey in the Java API
        final JavaPairRDD<String, Integer> wordOneRDD = wordRDD.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                return new Tuple2<String, Integer>(s, 1);
            }
        });

        // sum the 1s per word
        final JavaPairRDD<String, Integer> wordCountRDD = wordOneRDD.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });

        // swap (word, count) to (count, word) so the result can be sorted by count
        final JavaPairRDD<Integer, String> resultRDD = wordCountRDD.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
            @Override
            public Tuple2<Integer, String> call(Tuple2<String, Integer> t) throws Exception {
                return new Tuple2<Integer, String>(t._2, t._1);
            }
        });

        // sort by count (ascending by default) and print each pair
        resultRDD.sortByKey().foreach(new VoidFunction<Tuple2<Integer, String>>() {
            @Override
            public void call(Tuple2<Integer, String> t) throws Exception {
                System.out.println("word: " + t._2 + "  count: " + t._1);
            }
        });

        sc.stop();
    }
}
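For a quick sanity check, suppose D:\文档\hello.txt contains the following two tab-separated lines (hypothetical sample data; the original post does not show the file's contents):

    hello   world
    hello   spark

The job then counts hello=2, world=1, spark=1, and since sortByKey() sorts ascending by count, it prints something like (ties may appear in either order):

    word: world  count: 1
    word: spark  count: 1
    word: hello  count: 2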



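One closing note: the flatMap signature above (returning an Iterator) indicates Spark 2.x, and if the project also targets Java 8, the same pipeline can be written far more compactly with lambdas. Below is a minimal sketch under those assumptions; the class name JavaWordCountLambda is ours, everything else mirrors the program above.

package lesson03;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.Arrays;

public class JavaWordCountLambda {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("wordcount").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaPairRDD<Integer, String> resultRDD = sc.textFile("D:\\文档\\hello.txt")
                // split each line on tabs and flatten into individual words
                .flatMap(line -> Arrays.asList(line.split("\t")).iterator())
                // pair each word with an initial count of 1
                .mapToPair(word -> new Tuple2<>(word, 1))
                // sum the counts per word
                .reduceByKey((a, b) -> a + b)
                // swap to (count, word) so the result can be sorted by count
                .mapToPair(t -> new Tuple2<>(t._2, t._1));

        resultRDD.sortByKey()
                 .foreach(t -> System.out.println("word: " + t._2 + "  count: " + t._1));

        sc.stop();
    }
}

The behavior is identical to the anonymous-inner-class version; the lambdas simply stand in for Spark's functional interfaces (FlatMapFunction, PairFunction, Function2, VoidFunction).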