The Sum of PageRank Values Is 1


The preconditions for a graph's PageRank values to sum to 1 are:

1. The initial values assigned to the nodes sum to 1.

2. In the computation formula, the alpha value is divided by the number of nodes.

3. No node has an out-degree of 0.


The first and second conditions we can satisfy when programming; the third is not ours to decide. If the third condition is not met, there is a black hole: whatever value is distributed onto that node is lost. We call a node whose out-degree is 0 a black-hole node. There are also special cases where the sum exceeds 1, for example when all nodes point to one single node and there are no other edges: each node's value stays fixed, and every iteration adds that fixed value to the node it points to, so the sum grows beyond 1. This is, of course, a rather special case.
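To make the leak concrete, here is a minimal sketch in plain Java (a hypothetical three-node toy graph in a class I am calling BlackHoleDemo, separate from the Spark program below): node 2 is a black hole, and after a few iterations of the divided-by-N formula the printed sum has drifted below 1.

public class BlackHoleDemo
{
    public static void main( String[] args )
    {
        double alpha = 0.15;
        double[] rank = { 1.0/3, 1.0/3, 1.0/3 }; // initial values sum to 1
        int n = rank.length;
        for (int iter = 0; iter < 10; iter++)
        {
            double[] incoming = new double[n];
            incoming[1] += rank[0]; // edge 0 -> 1 (node 0 has out-degree 1)
            incoming[2] += rank[1]; // edge 1 -> 2 (node 1 has out-degree 1)
            // node 2 has out-degree 0: its entire value is lost here
            for (int i = 0; i < n; i++)
            {
                rank[i] = alpha/n + (1.0 - alpha)*incoming[i];
            }
        }
        System.out.println( rank[0] + rank[1] + rank[2] ); // prints a sum < 1.0
    }
}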

Actually, whether the sum is 1 does not matter much: in GraphX the initial value is the alpha value, and when computing PageRank, alpha is not divided by the number of nodes. In Page's paper it is divided by the number of nodes.
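For reference, writing α for the reset probability (alpha) and N for the number of nodes, the two update rules differ only in the reset term:

GraphX: PR(v) = α + (1 − α) · Σ over u→v of PR(u)/outDeg(u)
Paper:  PR(v) = α/N + (1 − α) · Σ over u→v of PR(u)/outDeg(u)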

If we must make the PageRank values sum to 1, we can do this: artificially add an edge from every black-hole node to every other node. That way all of the value takes part in the computation, and the PageRank values sum to 1.

If we added these edges directly in the program, the edge set would grow enormously; instead, we can simply add an extra value at computation time. Each black-hole node contributes its own value divided by (number of nodes − 1), since a black-hole node distributes to all other nodes.
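A quick worked example: with count = 4 and a single black-hole node holding the value 0.3, it contributes 0.3/(4 − 1) = 0.1 to each of the other three nodes; with several black-hole nodes, the per-node adjustment is the sum of these individual contributions.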

Each iteration's adjustment is then the sum of all black-hole nodes' contributions. The code is

final double delta = outCount == count ? 0.0 : rankGraph.vertices( ).toJavaRDD( )
        .filter( s -> !s._2( )._2( ) )
        .map( s -> s._2( )._1( )/(count - 1) )
        .reduce( (s1, s2) -> s1 + s2 );

The outCount == count check is needed because, when there are no black-hole nodes, the filtered RDD is empty and reduce fails with an "empty collection" error in the Java API; Scala does not have this problem.
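As a design note, the guard could be avoided by using fold with a zero element instead of reduce, since fold is well-defined on an empty RDD (a sketch using the same variables as above):

final double delta = rankGraph.vertices( ).toJavaRDD( )
        .filter( s -> !s._2( )._2( ) )
        .map( s -> s._2( )._1( )/(count - 1) )
        .fold( 0.0, (s1, s2) -> s1 + s2 );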

In the final computation we add the adjustment on top. If the node is itself a black-hole node, we need to subtract the node's own contribution (its value divided by (number of nodes − 1)), because only the other black-hole nodes contribute to it. The code is

if (t2.isDefined( ))
{
    // The node received messages: add the black-hole adjustment; a black-hole
    // node must not count its own contribution, so subtract it back out.
    double countValue = !t1._2( ) ? t2.get( ) + delta - t1._1( )/(count - 1) : t2.get( ) + delta;
    double value = resetProb/count + (1.0 - resetProb)*countValue;
    return new Tuple2<Double, Boolean>( value, t1._2( ) );
}
else
{
    // No incoming messages (in-degree 0): only the adjustment applies.
    double value = resetProb/count + (1.0 - resetProb)*delta;
    return new Tuple2<Double, Boolean>( value, t1._2( ) );
}

If a node's in-degree is 0, its computed value is just the adjustment term. There is also the case where both in-degree and out-degree are 0; such a node is useless and should be removed from the graph beforehand, so we do not consider it.

Also, at initialization the black-hole nodes should receive a smaller value; the program uses the alpha value divided by the number of nodes, which speeds up convergence.

The code is

// v1: initial value for nodes with out-degree > 0; v2: for black-hole nodes.
final double v1 = 1.0/outCount - resetProb*(count - outCount)/count/outCount;
final double v2 = resetProb/count;
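A quick check that these initial values satisfy condition 1, i.e. that they sum to 1:

outCount · v1 + (count − outCount) · v2
    = (1 − resetProb·(count − outCount)/count) + resetProb·(count − outCount)/count
    = 1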


The complete program is as follows:


import java.io.Serializable;
import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.graphx.Edge;
import org.apache.spark.graphx.EdgeContext;
import org.apache.spark.graphx.EdgeTriplet;
import org.apache.spark.graphx.Graph;
import org.apache.spark.graphx.TripletFields;
import org.apache.spark.graphx.VertexRDD;
import org.apache.spark.storage.StorageLevel;

import scala.Option;
import scala.Tuple2;
import scala.reflect.ClassManifestFactory;
import scala.reflect.ClassTag;
import scala.runtime.AbstractFunction0;
import scala.runtime.AbstractFunction1;
import scala.runtime.AbstractFunction2;
import scala.runtime.AbstractFunction3;
import scala.runtime.BoxedUnit;

/**
 * Static PageRank whose values sum to 1: black-hole (out-degree 0) nodes are
 * handled with a per-iteration adjustment instead of extra edges.
 */
public class FinalRageFunction
{
    private static final ClassTag<String> tagString = ClassManifestFactory.classType( String.class );
    private static final ClassTag<Object> tagObject = ClassManifestFactory.classType( Object.class );
    private static final ClassTag<Double> tagDouble = ClassManifestFactory.classType( Double.class );

    public static void main( String[] args )
    {
        SparkConf conf = new SparkConf().setAppName( "page rank" ).setMaster( "local" );
        JavaSparkContext ctx = new JavaSparkContext( conf );

        JavaRDD<Tuple2<Object, String>> vertices = ctx.parallelize( Arrays.asList(
                new Tuple2<Object, String>( 1L, "a" ),
                new Tuple2<Object, String>( 2L, "b" ),
                new Tuple2<Object, String>( 3L, "c" ),
                new Tuple2<Object, String>( 4L, "d" )
        ) );

        // Edge 1L -> 5L points at a vertex that is not in the vertex list;
        // Graph.apply creates vertex 5 with the default attribute "".
        JavaRDD<Edge<Double>> edges = ctx.parallelize( Arrays.asList(
                new Edge<Double>( 1L, 2L, 10.0 ),
                new Edge<Double>( 2L, 3L, 10.0 ),
                new Edge<Double>( 2L, 4L, 10.0 ),
                new Edge<Double>( 1L, 5L, 10.0 ),
                new Edge<Double>( 1L, 4L, 10.0 )
        ) );

        Graph<String, Double> g = Graph.apply( vertices.rdd( ), edges.rdd( ), "",
                StorageLevel.MEMORY_ONLY( ), StorageLevel.MEMORY_ONLY( ), tagString, tagDouble );
        int numIter = 10;
        double resetProb = 0.15;
        Graph<Double, Double> g2 = calePageRank1( g, numIter, resetProb );

        // Print the ranks in vertex-id order, then verify that they sum to 1.
        g2.vertices( ).toJavaRDD( ).sortBy( new Function<Tuple2<Object, Double>, Object>( )
        {
            @Override
            public Object call( Tuple2<Object, Double> v1 ) throws Exception
            {
                return v1._1( );
            }
        }, true, 1 ).foreach( s -> System.out.println( s ) );

        double sum = g2.vertices( ).toJavaRDD( ).map( s -> s._2( ) ).reduce( (s1, s2) -> s1 + s2 );
        System.out.println( sum );
    }

    private static Graph<Double, Double> calePageRank1( Graph<String, Double> g, int numIter, double resetProb )
    {
        final long count = g.vertices( ).count( );
        // outDegrees only contains vertices that have at least one out-edge,
        // so count - outCount is the number of black-hole nodes.
        VertexRDD<Object> outDegrees = g.ops( ).outDegrees( );
        final long outCount = outDegrees.count( );

        // v1: initial value for nodes with out-degree > 0; v2: initial value
        // for black-hole nodes. All initial values together sum to exactly 1.
        final double v1 = 1.0/outCount - resetProb*(count - outCount)/count/outCount;
        final double v2 = resetProb/count;

        // Attach (outDegree, hasOutEdges) to every vertex; vertices missing
        // from outDegrees are the black-hole nodes and get (-1, false).
        Graph<Tuple2<Double, Boolean>, Double> rankGraph = g.outerJoinVertices( outDegrees,
                new MyFunction3<Object, String, Option<Object>, Tuple2<Double, Boolean>>( )
        {
            @Override
            public Tuple2<Double, Boolean> apply( Object arg0, String arg1, Option<Object> t )
            {
                int retValue = t.getOrElse( new MyFunction0<Object, Integer>( -1 ) );
                return new Tuple2<Double, Boolean>( Double.valueOf( retValue ), retValue != -1 );
            }
        }, tagObject, ClassManifestFactory.classType( Tuple2.class ), null );

        // Weight each edge with 1/outDegree(src), then replace every vertex
        // attribute with its initial rank (v1 or v2).
        rankGraph = rankGraph.mapTriplets( new MyFunction1<EdgeTriplet<Tuple2<Double, Boolean>, Double>, Double>( )
        {
            @Override
            public Double apply( EdgeTriplet<Tuple2<Double, Boolean>, Double> t )
            {
                return 1.0/t.srcAttr( )._1( );
            }
        }, tagDouble ).mapVertices( new MyFunction2<Object, Tuple2<Double, Boolean>, Tuple2<Double, Boolean>>( )
        {
            @Override
            public Tuple2<Double, Boolean> apply( Object t0, Tuple2<Double, Boolean> t1 )
            {
                if (!t1._2( ))
                {
                    return new Tuple2<Double, Boolean>( v2, t1._2( ) );
                }
                else
                {
                    return new Tuple2<Double, Boolean>( v1, t1._2( ) );
                }
            }
        }, ClassManifestFactory.classType( Tuple2.class ), null );

        int iteration = 0;
        Graph<Tuple2<Double, Boolean>, Double> prevRankGraph = null;
        while (iteration < numIter)
        {
            rankGraph.cache( );

            // Per-node adjustment: the sum of all black-hole contributions.
            // The outCount == count guard avoids reduce on an empty RDD.
            final double delta = outCount == count ? 0.0 : rankGraph.vertices( ).toJavaRDD( )
                    .filter( s -> !s._2( )._2( ) )
                    .map( s -> s._2( )._1( )/(count - 1) )
                    .reduce( (s1, s2) -> s1 + s2 );

            // Identity copy of the vertex attributes.
            rankGraph = rankGraph.mapVertices( new MyFunction2<Object, Tuple2<Double, Boolean>, Tuple2<Double, Boolean>>( )
            {
                @Override
                public Tuple2<Double, Boolean> apply( Object t0, Tuple2<Double, Boolean> t1 )
                {
                    return new Tuple2<Double, Boolean>( t1._1( ), t1._2( ) );
                }
            }, ClassManifestFactory.classType( Tuple2.class ), null );

            // Each destination vertex collects the sum of rank(src) * weight
            // over its in-edges.
            VertexRDD<Double> rankUpdates = rankGraph.aggregateMessages(
                    new MyFunction1<EdgeContext<Tuple2<Double, Boolean>, Double, Double>, BoxedUnit>( )
            {
                @Override
                public BoxedUnit apply( EdgeContext<Tuple2<Double, Boolean>, Double, Double> t )
                {
                    t.sendToDst( t.srcAttr( )._1( ) * t.attr( ) );
                    return BoxedUnit.UNIT;
                }
            }, new MyFunction2<Double, Double, Double>( )
            {
                @Override
                public Double apply( Double t1, Double t2 )
                {
                    return t1 + t2;
                }
            }, TripletFields.Src, tagDouble );
            prevRankGraph = rankGraph;

            rankGraph = rankGraph.outerJoinVertices( rankUpdates,
                    new MyFunction3<Object, Tuple2<Double, Boolean>, Option<Double>, Tuple2<Double, Boolean>>( )
            {
                @Override
                public Tuple2<Double, Boolean> apply( Object t0, Tuple2<Double, Boolean> t1, Option<Double> t2 )
                {
                    if (t2.isDefined( ))
                    {
                        // Add the adjustment; a black-hole node must not count
                        // its own contribution, so subtract it back out.
                        double countValue = !t1._2( ) ? t2.get( ) + delta - t1._1( )/(count - 1) : t2.get( ) + delta;
                        double value = resetProb/count + (1.0 - resetProb)*countValue;
                        return new Tuple2<Double, Boolean>( value, t1._2( ) );
                    }
                    else
                    {
                        // No incoming messages (in-degree 0): only the adjustment.
                        double value = resetProb/count + (1.0 - resetProb)*delta;
                        return new Tuple2<Double, Boolean>( value, t1._2( ) );
                    }
                }
            }, tagDouble, ClassManifestFactory.classType( Tuple2.class ), null );
            rankGraph.edges( ).toJavaRDD( ).foreachPartition( x -> {} ); // also materializes rankGraph.vertices
            prevRankGraph.vertices( ).unpersist( false );
            prevRankGraph.edges( ).unpersist( false );

            iteration++;
        }

        // Drop the bookkeeping flag and return plain Double ranks.
        return rankGraph.mapVertices( new MyFunction2<Object, Tuple2<Double, Boolean>, Double>( )
        {
            @Override
            public Double apply( Object t0, Tuple2<Double, Boolean> t1 )
            {
                return t1._1( );
            }
        }, tagDouble, null );
    }

    public static abstract class MyFunction2<T1, T2, R> extends AbstractFunction2<T1, T2, R> implements Serializable
    {
    }

    public static abstract class MyFunction3<T1, T2, T3, R> extends AbstractFunction3<T1, T2, T3, R> implements Serializable
    {
    }

    public static class MyFunction0<T1, R> extends AbstractFunction0<R> implements Serializable
    {
        private R r;

        public MyFunction0( R r )
        {
            this.r = r;
        }

        @Override
        public R apply( )
        {
            return r;
        }
    }

    public static abstract class MyFunction1<T1, R> extends AbstractFunction1<T1, R> implements Serializable
    {
    }
}