PageRank介绍

来源:互联网 发布:win7注册表修改mac地址 编辑:程序博客网 时间:2024/06/06 21:45

PageRank介绍

我的环境是MyEclipse7.0,jdk为1.7

 

PageRank算法包括两个类HtmlEntity和HtmlPageRank,其中HtmlPageRank需要用到htmlParser.jar和htmllexer.jar这两个包。本文是根据http://duyunfei.iteye.com/blog/1532798的说明调试的.

 

首先,我们准备了7个测试网页,这几个网页的链接情况如下:  

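| i\j   | test1 | test2 | test3 | test4 | test5 | test6 | test7 |
|-------|-------|-------|-------|-------|-------|-------|-------|
| test1 | 0     | 1     | 1     | 0     | 0     | 0     | 0     |
| test2 | 1     | 0     | 0     | 1     | 0     | 0     | 0     |
| test3 | 0     | 0     | 0     | 1     | 1     | 1     | 0     |
| test4 | 0     | 1     | 0     | 0     | 1     | 0     | 1     |
| test5 | 0     | 0     | 1     | 1     | 0     | 0     | 0     |
| test6 | 1     | 0     | 0     | 0     | 1     | 0     | 0     |
| test7 | 0     | 1     | 0     | 1     | 0     | 0     | 1     |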

表格中第 i 行第 j 列为 1 表示网页 i 链接到网页 j,例如 test1 链接到 test2 和 test3,依此类推。我们大致地根据上面两个原则可以猜一下:哪个将会是排名第一的网页?哪个最不重要?

貌似是test4和test6?

 

Html代码我都是放在E:/MyEclipse/workspace/PageRank/WebRoot/htmlDoc

Test1.html链接代码:<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test2.html">test2</a><a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test3.html">test3</a>
Test2.html:<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test1.html">test1</a><a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html">test4</a>
Test3.html:<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html">test4</a><a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test5.html">test5</a><a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test6.html">test6</a>
Test4.html:<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test2.html">test2</a><a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test5.html">test5</a><a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test7.html">test7</a>
Test5.html:<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test3.html">test3</a><a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html">test4</a>
Test6.html:<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test1.html">test1</a><a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test5.html">test5</a>
Test7.html:<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test2.html">test2</a><a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html">test4</a><a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test7.html">test7</a>

运行结果:

HtmlPageRank类import java.io.*;import java.util.*;import org.htmlparser.*;import org.htmlparser.filters.TagNameFilter;import org.htmlparser.tags.LinkTag;import org.htmlparser.util.NodeList;import org.htmlparser.visitors.HtmlPage;/** *pagerank算法实现 * *@authorafei * */publicclassHtmlPageRank{    /*阀值 */    publicstaticdoubleMAX=0.00000000001;    /*阻尼系数 */    publicstaticdoublealpha=0.85;    publicstaticStringhtmldoc="E:/MyEclipse/workspace/PageRank/WebRoot/htmlDoc";    publicstaticMap<String,HtmlEntity>map=newHashMap<String,HtmlEntity>();    publicstaticList<HtmlEntity>list=newArrayList<HtmlEntity>();    publicstaticdouble[]init;    publicstaticdouble[]pr;    publicstaticvoidmain(String[]args)throws Exception {       loadHtml();       pr=doPageRank();        while(!(checkMax())){           System.arraycopy(pr,0,init,0,init.length);           pr=doPageRank();       }       for(inti=0;i<pr.length;i++){           HtmlEntityhe=list.get(i);           he.setPr(pr[i]);       }       List<HtmlEntity>finalList=newArrayList<HtmlEntity>();       Collections.sort(list,newComparator(){           publicintcompare(Objecto1,Objecto2){              HtmlEntityh1=(HtmlEntity)o1;              HtmlEntityh2=(HtmlEntity)o2;              intem=0;              if(h1.getPr()>h2.getPr()){                  em=-1;              }else{                  em=1;              }              returnem;           }       });       for(HtmlEntityhe:list){           System.out.println(he.getPath()+" : "+he.getPr());       }    }     /**     *加载文件夹下的网页文件,并且初始化pr值(即init数组),计算每个网页的外链和内链     */    publicstaticvoidloadHtml()throws Exception {       Filefile=newFile(htmldoc);       File[]htmlfiles=file.listFiles(newFileFilter(){           publicbooleanaccept(Filepathname){              if(pathname.getPath().endsWith(".html")){                  return true;              }              return false;           }       });       init=newdouble[htmlfiles.length];       for(inti=0;i<htmlfiles.length;i++){           
Filef=htmlfiles[i];           BufferedReaderbr=newBufferedReader(newInputStreamReader(                  newFileInputStream(f)));           Stringline=br.readLine();           StringBufferhtml=newStringBuffer();           while(line!=null){              line=br.readLine();              html.append(line);           }           HtmlEntityhe=newHtmlEntity();           he.setPath(f.getAbsolutePath());           he.setContent(html.toString());           Parserparser=Parser.createParser(html.toString(),"gb2312");           HtmlPagepage=newHtmlPage(parser);           parser.visitAllNodesWith(page);           NodeListnodelist=page.getBody();           nodelist=nodelist.extractAllNodesThatMatch(                  newTagNameFilter("A"),true);           for(intj=0;j<nodelist.size();j++){              LinkTagoutlink=(LinkTag)nodelist.elementAt(j);              he.getOutLinks().add(outlink.getAttribute("href"));           }            map.put(he.getPath(),he);           list.add(he);           init[i]=0.0;       }       for(inti=0;i<list.size();i++){           HtmlEntityhe=list.get(i);           List<String>outlink=he.getOutLinks();            for(Stringol:outlink){              HtmlEntityhe0=map.get(ol);              try{                  he0.getInLinks().add(he.getPath());              }catch(NullPointerExceptione){//如果网页的链接路径不正确,则报NullPointerException错误,并且你会发现heo=null,也就是说map.get(ol)取到的值为null,但是事实上map不为null,ol的值在map中不存在导致的,这是由于html中路径设置不正确                  e.printStackTrace();              }          }       }    }     /**     *计算pagerank     *     *@paraminit     *@paramalpho     *@return     */    privatestaticdouble[]doPageRank(){       double[]pr=newdouble[init.length];        for(inti=0;i<init.length;i++){           doubletemp=0;           HtmlEntityhe0=list.get(i);           for(intj=0;j<init.length;j++){              HtmlEntityhe=list.get(j);              //计算对本页面链接相关总值              if(i!=j&&he.getOutLinks().size()!=0&&he.getOutLinks().contains(he0.getPath())){          
        temp=temp+init[j]/he.getOutLinks().size();              }           }           //经典的pr公式           pr[i]=alpha+(1-alpha)*temp;       }       returnpr;    }     /**     *判断前后两次的pr数组之间的差别是否大于我们定义的阀值假如大于,那么返回false,继续迭代计算pr     *     *@parampr     *@paraminit     *@parammax     *@return     */    privatestaticbooleancheckMax(){       booleanflag=true;        for(inti=0;i<pr.length;i++){           if(Math.abs(pr[i]-init[i])>MAX){              flag=false;              break;           }       }       return flag;    }}HtmlEntity类import java.util.*;/** *网页entity * *@authorafei * */classHtmlEntity{    privateStringpath;    privateStringcontent;    /*外链(本页面链接的其他页面) */    privateList<String>outLinks=newArrayList<String>();    /*内链(另外页面链接本页面) */    privateList<String>inLinks=newArrayList<String>();    privatedoublepr;    publicStringgetPath(){       returnpath;    }    publicvoidsetPath(Stringpath){       this.path=path;    }    publicStringgetContent(){       returncontent;    }    publicvoidsetContent(Stringcontent){       this.content=content;    }    publicdoublegetPr(){       returnpr;    }    publicvoidsetPr(doublepr){       this.pr=pr;    }    publicList<String>getOutLinks(){       returnoutLinks;    }    publicvoidsetOutLinks(List<String>outLinks){       this.outLinks=outLinks;    }    publicList<String>getInLinks(){       returninLinks;    }    publicvoidsetInLinks(List<String>inLinks){       this.inLinks=inLinks;    }}
运行结果

E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html: 1.0988562616424633

E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test2.html: 1.024767124729736

E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test5.html: 1.0225108328175456

E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test3.html: 1.0012654834548864

E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test1.html: 0.994362279917484

E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test7.html: 0.9049428130819769

E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test6.html: 0.9000632741726616

 


原创粉丝点击