Spark下的TopK实现(Java)
来源:互联网 发布:汕头淘宝村地址 编辑:程序博客网 时间:2024/05/22 14:36
import java.io.BufferedReader;import java.io.BufferedWriter;import java.io.FileReader;import java.io.FileWriter;import java.io.IOException;import java.io.Serializable;import java.util.Arrays;import java.util.Comparator;import java.util.HashSet;import java.util.List;import java.util.Set;import java.util.regex.Pattern;import org.apache.spark.api.java.JavaPairRDD;import org.apache.spark.api.java.JavaRDD;import org.apache.spark.api.java.JavaSparkContext;import org.apache.spark.api.java.function.FlatMapFunction;import org.apache.spark.api.java.function.Function;import org.apache.spark.api.java.function.Function2;import org.apache.spark.api.java.function.PairFunction;import scala.Tuple2;/** * * @author Liyoung * */public class Topk {private static final Pattern SPACE = Pattern.compile("\\s+");public static void main(String[] args) throws Exception {if (args.length != 3) {System.err.println("Usage: TopKCountWord <infile> <stopword> <outfile>");System.exit(1);}String inFile = args[0]; // Path of DataSet.String stopWordpath = args[1]; // Path of stopWords.String outFile = args[2]; // Path of the file of the final result.final Set<String> stopWord = new HashSet<>(); // Store all stopWord in a// HashSet.try (FileReader fr = new FileReader(stopWordpath);BufferedReader reader = new BufferedReader(fr);) {String line = null;// Each line of the file stopWords.txt contains a blank space,so we// use a loop to split this blank space;while ((line = reader.readLine()) != null) {line = line.trim();stopWord.add(line);}} catch (IOException e) {}JavaSparkContext context = new JavaSparkContext("local", "Topk");// Read the DataSet from the HDFS into a JavaRDD;JavaRDD<String> lines = context.textFile(inFile);/* * Filter all punctuation. 
*/JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {private static final long serialVersionUID = 1L;@Overridepublic Iterable<String> call(String s) {s = s.replaceAll("\\pP", " ");return Arrays.asList(SPACE.split(s));}});/* * Filter the words contained in stopWords. */JavaRDD<String> afterFilter = words.filter((new Function<String, Boolean>() {private static final long serialVersionUID = 1L;public Boolean call(String w) {return !stopWord.contains(w);}}));/* * The process of map, transform the RDD "afterFilter" into a PairRDD, * which consist of <key,value> sets. */JavaPairRDD<String, Integer> ones = afterFilter.mapToPair(new PairFunction<String, String, Integer>() {private static final long serialVersionUID = 1L;@Overridepublic Tuple2<String, Integer> call(String s) {return new Tuple2<String, Integer>(s, 1);}});/* * The process of reduce, reduce the value in the key-value pair * produced by the previous step. */JavaPairRDD<String, Integer> counts = ones.reduceByKey(new Function2<Integer, Integer, Integer>() {private static final long serialVersionUID = 1L;@Overridepublic Integer call(Integer i1, Integer i2) {return i1 + i2;}});// Take order and sorts the top 100.List<Tuple2<String, Integer>> output = (List<Tuple2<String, Integer>>) counts.takeOrdered(101, new TupleComparator());/* * Save the result to local file system. */try {FileWriter fw = new FileWriter(outFile);@SuppressWarnings("resource")BufferedWriter bw = new BufferedWriter(fw);for (int i = 1; i < output.size(); i++) {Tuple2<?, ?> tuple = output.get(i);bw.write(tuple._1() + "\n");}bw.flush();} catch (IOException e) {}context.stop();}/** * * @author Liyoung * The function of this class is to sort a set of tuple. */public static class TupleComparator implementsComparator<Tuple2<String, Integer>>, Serializable {private static final long serialVersionUID = 1L;@Overridepublic int compare(Tuple2<String, Integer> o1,Tuple2<String, Integer> o2) {return Integer.compare(o2._2(), o1._2());}}}
1 0
- Spark下的TopK实现(Java)
- topk问题java实现
- 关于海量数据TopK问题的一个具体实现(java)
- 多属性TopK——NRA算法(JAVA)实现
- Java解决TopK问题(使用集合和直接实现)
- MapReduce实现TopK的示例
- Scala堆的方式进行Spark topK词频查询(根据value进行TreeMap排序)
- TopK问题探索-最小堆JAVA实现
- java和scala分别实现TopK
- java 使用二叉堆实现 TopK 算法
- spark-TopK算法
- Spark TopK问题解法
- SPARK TOPK程序
- 大数据下的TopK算法
- 大数据下的多维TopK算法
- 海量数据下的TopK问题
- 最小堆的实现--topk算法
- Spark下的PageRank实现
- C++ Primer Plus第六版编程练习12.6解答
- 非最大抑制(NMS)
- Netty服务器部署
- poj 2001 Shortest Prefixes
- Linux 技巧: Bash 测试和比较函数
- Spark下的TopK实现(Java)
- Windows 驱动开发 - 基础1
- PHP中的魔术方法详解
- Ubuntu14.10中安装eclipse后打开无菜单栏解决办法
- java 重入锁死
- 黑马程序员------------JAVA基础----单例设计模式
- C# 基础进阶 HTML语言一 标签、表格
- C Primer Plus 第十二章程序清单……2015.5.10
- Spring mvc 中处理模型数据 ModelAndView