Spark 实现 PageRank

来源:互联网 发布:阴阳师神龙强化数据 编辑:程序博客网 时间:2024/05/17 01:11
import java.util.ArrayList;import java.util.List;import java.util.Iterator;import java.util.regex.Pattern;import scala.Tuple2;import com.google.common.collect.Iterables;import org.apache.spark.api.java.JavaPairRDD;import org.apache.spark.api.java.JavaRDD;import org.apache.spark.api.java.function.Function;import org.apache.spark.api.java.function.Function2;import org.apache.spark.api.java.function.PairFlatMapFunction;import org.apache.spark.api.java.function.PairFunction;import org.apache.spark.sql.SparkSession;/** * 数据格式如下: * URL         neighbor URL * URL         neighbor URL * URL         neighbor URL */public final class JavaPageRank {  private static final Pattern SPACES = Pattern.compile("\\s+");/** *  实现元素的相加,reduceByKey使用 根据key求和 */  private static class Sum implements Function2<Double, Double, Double> {    @Override    public Double call(Double a, Double b) {      return a + b;    }  }  public static void main(String[] args) throws Exception {    if (args.length < 2) {      System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");      System.exit(1);    }    SparkSession spark = SparkSession      .builder()      .appName("JavaPageRank")      .getOrCreate();    JavaRDD<String> lines = spark.read().textFile(args[0]).javaRDD();    /** links 是(String ,Iterable<String>)     *  先将数据转为key value形式的RDD,然后根据key进行分组  key是某一个url  value是这个key的的所有邻居url组成的集合      */    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(      new PairFunction<String, String, String>() {        @Override        public Tuple2<String, String> call(String s) {          String[] parts = SPACES.split(s);          return new Tuple2<>(parts[0], parts[1]);        }      }).distinct().groupByKey().cache();      // ranks的初始形式是(String ,Double)其中double一列初始都是1    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {      @Override      public Double call(Iterable<String> rs) {        return 1.0;      }    });    for (int current = 0; 
current < Integer.parseInt(args[1]); current++) {      // Calculates URL contributions to the rank of other URLs.        // links 是(String ,Iterable<String>) ranks是(String ,Double)  join之后是( String ,(Iterable<String>,Double))      JavaPairRDD<String, Double> contribs = links.join(ranks).values()        .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {          @Override          //对values进行flatMapToPair操作  ,value的格式Tuple2<Iterable<String>, Double>          //迭代元组中第一个元素 然后权重分别设置为第二个元素除以第一个元素的里面url的个数          //组成二元组Tuple2<String, Double>  ,因为第一个元素里面有多个url会迭代          // 所以call方法返回多个Tuple2<String, Double> 组成的一个Iterator,          // 最后会进行扁平化,所以flatMapToPair返回了JavaPairRDD<String, Double>          public Iterator<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {            int urlCount = Iterables.size(s._1);            List<Tuple2<String, Double>> results = new ArrayList<>();            for (String n : s._1) {              results.add(new Tuple2<>(n, s._2() / urlCount));            }            return results.iterator();          }      });        //contribs是JavaPairRDD<String, Double>  然后我们进行根据key操作对value求和        //那么得到了key及其自己对应的权重值,然后得到平滑后的每个url对应的权重得到新的ranks        //继续迭代,继续把links跟ranks进行join然后更新权重      ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {        @Override        public Double call(Double sum) {          return 0.15 + sum * 0.85;        }      });    }    //进行输出ranks存放了url及其对应的权重    List<Tuple2<String, Double>> output = ranks.collect();    for (Tuple2<?,?> tuple : output) {        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");    }    spark.stop();  }}