Spark实现简单的PageRank
来源:互联网 发布:博信软件 编辑:程序博客网 时间:2024/05/02 23:59
/** * Created by Administrator on 2016/8/9 . */import org.apache.spark.api.java.JavaPairRDD;import org.apache.spark.api.java.JavaSparkContext;import org.apache.spark.api.java.JavaRDD;import org.apache.spark.SparkConf;import org.apache.spark.api.java.function.Function;import org.apache.spark.api.java.function.Function2;import org.apache.spark.api.java.function.PairFunction;import org.apache.spark.api.java.function.PairFlatMapFunction;import scala.Tuple2;import java.util.*;public class PageRank{ public static void main(String[] args){ SparkConf conf=new SparkConf(); conf.setAppName("pagerank"); conf.setMaster("local"); conf.set("spark.testing.memory", "500000000");//设置运行内存大小 JavaSparkContext sc=new JavaSparkContext(conf); //partitionBy()只对kv RDD起作用, 进行该操作后,将相同key值的数据放到同一机器上,并进行持久化操作,对后续循环中的join操作进行优化,使得省去join操作 shuffle的开销 ArrayList<String> list=new ArrayList<String>(4); list.add("A,D");//网页之间的连接关系,A页面链接到网页D list.add("B,A"); list.add("C,A,B"); list.add("D,A,C"); JavaRDD<String> links=sc.parallelize(list); JavaPairRDD<Character,char[]> pairs =links.mapToPair(new PairFunction<String, Character, char[]>() { public Tuple2<Character,char[]> call(String s) { String[] str=s.split(","); char[] ch=new char[str.length]; for (int i=0;i<str.length;i++){ ch[i]=str[i].charAt(0); } return new Tuple2<Character,char[]>(s.charAt(0),ch ); }//将字符串中保存的页面的链接关系map转换成key-values形式,key当前页面,指向的页面集合用数组表示 }).cache();//持久化 JavaPairRDD<Character,Float> ranks=sc.parallelize(Arrays.asList('A','B','C','D')).mapToPair(new PairFunction<Character, Character, Float>() { public Tuple2<Character,Float> call(Character character) throws Exception { return new Tuple2<Character,Float>(character,new Float(1.0)); }//初始化页面权值是1.0 }); for(int i=0;i<10;i++){ JavaPairRDD<Character,Tuple2<char[],Float>> contribs=pairs.join(ranks); JavaPairRDD<Character,Float> con=contribs.flatMapToPair(new PairFlatMapFunction<Tuple2<Character,Tuple2<char[],Float>>,Character,Float>(){ public Iterator 
call(Tuple2<Character,Tuple2<char[],Float>> val) throws Exception{ List<Tuple2<Character,Float>> list=new ArrayList<Tuple2<Character, Float>>(); Float f=val._2._2; char[] ch=val._2._1(); int len=ch.length; for(int i=0;i<len;i++) { Tuple2<Character, Float> map = new Tuple2<Character, Float>(new Character(ch[i]), new Float(f / len)); list.add(map); } return list.iterator(); } });//将每个页面获得其他页面的pagerank值形成键值对的形式 ranks=con.reduceByKey(new Function2<Float, Float, Float>() { public Float call(Float a, Float b) { return a + b; } }).mapValues(new Function<Float, Float>() { public Float call(Float a) throws Exception { return new Float(0.15+0.85*a); } });//当前迭代的pagerank计算 } Map map=ranks.collectAsMap();//访问所有页面的pagerank值 Set set=map.keySet(); Iterator it=set.iterator(); while(it.hasNext()){ System.out.println(map.get(it.next()) ); } //rank.saveAsTextFile("hdfs://172.20.35.85:9000/output/pagerank/scala"); }}
# coding:utf-8
# Python (PySpark) version of the PageRank example.
import os
import sys

# Point SPARK_HOME / PYTHONPATH at the local Spark installation so that
# pyspark can be imported when this script is launched with plain python.
os.environ['SPARK_HOME'] = "/usr/local/spark-1.6.2-bin-hadoop2.6"
# os.environ['SPARK_HOME'] = "/home/jie/d2/spark-0.9.1"
sys.path.append("/usr/local/spark-1.6.2-bin-hadoop2.6/python")
# sys.path.append("/home/jie/d2/spark-0.9.1/python")


def f(x):
    """Distribute a page's rank over its outgoing links.

    x is a joined record (page, (outlinks_tuple, rank)); returns a list of
    (target_page, rank / num_outlinks) contribution pairs.
    """
    print(x)  # debug trace of the joined record
    contribs = []
    num_links = len(x[1][0])
    for target in x[1][0]:
        contribs.append((target, x[1][1] / num_links))
    return contribs


def p(x):
    """Print one (page, rank) pair (used via RDD.foreach)."""
    print(x)


if __name__ == "__main__":
    # Imported here (not at module level) so the helpers above can be
    # imported/tested without a Spark installation.
    from pyspark import SparkConf, SparkContext

    # Link graph: page -> tuple of pages it links to.
    # Renamed from `list` to avoid shadowing the builtin.
    link_data = [('A', ('D',)), ('B', ('A',)), ('C', ('A', 'B')), ('D', ('A', 'C'))]
    conf = SparkConf().setMaster("local").setAppName("pagerank")
    sc = SparkContext(conf=conf)
    # partitionBy hashes equal keys onto the same partition and cache() keeps the
    # result resident, so the join inside the loop avoids a repeated shuffle.
    pages = sc.parallelize(link_data).map(lambda x: (x[0], tuple(x[1]))).partitionBy(2).cache()
    # Every page starts with rank 1.0.
    links = sc.parallelize(['A', 'B', 'C', 'D']).map(lambda x: (x, 1.0))
    # NOTE(review): range(1, 10) runs 9 iterations while the Java version runs 10;
    # kept as-is to preserve the original behavior.
    for i in range(1, 10):
        rank = pages.join(links).flatMap(f)
        links = rank.reduceByKey(lambda x, y: x + y)
        # Damping step: 0.15 + 0.85 * (sum of incoming contributions).
        links = links.mapValues(lambda x: 0.15 + 0.85 * x)
    links.foreach(p)
    links.saveAsSequenceFile("/pagerank")
0 0
- spark实现简单的pagerank
- PageRank算法在spark上的简单实现
- Spark下的PageRank实现
- Spark上PageRank的简单应用
- PageRank的一个简单实现
- PageRank算法的简单实现.
- PageRank的php简单实现
- Spark GraphX实现PageRank
- spark实现PageRank
- PageRank的简单实现(scala版)
- spark-rdd 实现简易pagerank
- PageRank简单实现
- spark-scala版的PageRank
- 简单PageRank的理解
- 谷歌pageRank算法简单实现
- hadoop实现的一个简单的Pagerank例子
- PageRank算法原理剖析及Spark实现
- Spark PageRank
- 文件模式与文件操作
- shell脚本特殊用法
- 【NIO学习序列】
- smarty的block function(块函数)
- Java中文乱码解决之道:认识字符集
- spark实现简单的pagerank
- MFC OCX控件实现安全初始化和脚本安全的方法http://blog.csdn.net/xiliang_pan/article/details/8264685
- 包含中划线的英文正则
- 判断是ios还是安卓访问
- js控制checkbox全选/取消全选
- Ubuntu下创建eclipse的桌面快捷方式
- android 6.0 权限问题的解决
- Qt关于设置子窗口样式表不生效问题
- AsyncTask粗糙讲解