PageRank介绍
来源:互联网 发布:win7注册表修改mac地址 编辑:程序博客网 时间:2024/06/06 21:45
PageRank介绍
我的环境是MyEclipse7.0,jdk为1.7
PageRank算法包括两个类HtmlEntity和HtmlPageRank,其中HtmlPageRank需要用到htmlParser.jar和htmllexer.jar这两个包。本文是根据http://duyunfei.iteye.com/blog/1532798的说明调试的.
首先,我们准备了7个测试网页,这几个网页的链接情况如下:
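| i\j   | test1 | test2 | test3 | test4 | test5 | test6 | test7 |
|-------|-------|-------|-------|-------|-------|-------|-------|
| test1 | 0     | 1     | 1     | 0     | 0     | 0     | 0     |
| test2 | 1     | 0     | 0     | 1     | 0     | 0     | 0     |
| test3 | 0     | 0     | 0     | 1     | 1     | 1     | 0     |
| test4 | 0     | 1     | 0     | 0     | 1     | 0     | 1     |
| test5 | 0     | 0     | 1     | 1     | 0     | 0     | 0     |
| test6 | 1     | 0     | 0     | 0     | 1     | 0     | 0     |
| test7 | 0     | 1     | 0     | 1     | 0     | 0     | 1     |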
表格中第 i 行第 j 列为 1,表示网页 i 中有指向网页 j 的链接;例如 test1 链接到 test2 和 test3,其余依次类推。根据 PageRank 的两个基本原则(被链接得越多的网页越重要;来自重要网页的链接权重更高),我们可以大致地猜一下:哪个将会是排名第一的网页?哪个最不重要?
貌似是test4和test6?
Html代码我都是放在E:/MyEclipse/workspace/PageRank/WebRoot/htmlDoc
各网页中的链接代码如下:
Test1.html:<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test2.html">test2</a> <a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test3.html">test3</a>
Test2.html:<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test1.html">test1</a> <a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html">test4</a>
Test3.html:<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html">test4</a> <a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test5.html">test5</a> <a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test6.html">test6</a>
Test4.html:<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test2.html">test2</a> <a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test5.html">test5</a> <a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test7.html">test7</a>
Test5.html:<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test3.html">test3</a> <a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html">test4</a>
Test6.html:<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test1.html">test1</a> <a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test5.html">test5</a>
Test7.html:<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test2.html">test2</a> <a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html">test4</a> <a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test7.html">test7</a>
运行结果:
HtmlPageRank类import java.io.*;import java.util.*;import org.htmlparser.*;import org.htmlparser.filters.TagNameFilter;import org.htmlparser.tags.LinkTag;import org.htmlparser.util.NodeList;import org.htmlparser.visitors.HtmlPage;/** *pagerank算法实现 * *@authorafei * */publicclassHtmlPageRank{ /*阀值 */ publicstaticdoubleMAX=0.00000000001; /*阻尼系数 */ publicstaticdoublealpha=0.85; publicstaticStringhtmldoc="E:/MyEclipse/workspace/PageRank/WebRoot/htmlDoc"; publicstaticMap<String,HtmlEntity>map=newHashMap<String,HtmlEntity>(); publicstaticList<HtmlEntity>list=newArrayList<HtmlEntity>(); publicstaticdouble[]init; publicstaticdouble[]pr; publicstaticvoidmain(String[]args)throws Exception { loadHtml(); pr=doPageRank(); while(!(checkMax())){ System.arraycopy(pr,0,init,0,init.length); pr=doPageRank(); } for(inti=0;i<pr.length;i++){ HtmlEntityhe=list.get(i); he.setPr(pr[i]); } List<HtmlEntity>finalList=newArrayList<HtmlEntity>(); Collections.sort(list,newComparator(){ publicintcompare(Objecto1,Objecto2){ HtmlEntityh1=(HtmlEntity)o1; HtmlEntityh2=(HtmlEntity)o2; intem=0; if(h1.getPr()>h2.getPr()){ em=-1; }else{ em=1; } returnem; } }); for(HtmlEntityhe:list){ System.out.println(he.getPath()+" : "+he.getPr()); } } /** *加载文件夹下的网页文件,并且初始化pr值(即init数组),计算每个网页的外链和内链 */ publicstaticvoidloadHtml()throws Exception { Filefile=newFile(htmldoc); File[]htmlfiles=file.listFiles(newFileFilter(){ publicbooleanaccept(Filepathname){ if(pathname.getPath().endsWith(".html")){ return true; } return false; } }); init=newdouble[htmlfiles.length]; for(inti=0;i<htmlfiles.length;i++){ Filef=htmlfiles[i]; BufferedReaderbr=newBufferedReader(newInputStreamReader( newFileInputStream(f))); Stringline=br.readLine(); StringBufferhtml=newStringBuffer(); while(line!=null){ line=br.readLine(); html.append(line); } HtmlEntityhe=newHtmlEntity(); he.setPath(f.getAbsolutePath()); he.setContent(html.toString()); Parserparser=Parser.createParser(html.toString(),"gb2312"); HtmlPagepage=newHtmlPage(parser); 
parser.visitAllNodesWith(page); NodeListnodelist=page.getBody(); nodelist=nodelist.extractAllNodesThatMatch( newTagNameFilter("A"),true); for(intj=0;j<nodelist.size();j++){ LinkTagoutlink=(LinkTag)nodelist.elementAt(j); he.getOutLinks().add(outlink.getAttribute("href")); } map.put(he.getPath(),he); list.add(he); init[i]=0.0; } for(inti=0;i<list.size();i++){ HtmlEntityhe=list.get(i); List<String>outlink=he.getOutLinks(); for(Stringol:outlink){ HtmlEntityhe0=map.get(ol); try{ he0.getInLinks().add(he.getPath()); }catch(NullPointerExceptione){//如果网页的链接路径不正确,则报NullPointerException错误,并且你会发现heo=null,也就是说map.get(ol)取到的值为null,但是事实上map不为null,ol的值在map中不存在导致的,这是由于html中路径设置不正确 e.printStackTrace(); } } } } /** *计算pagerank * *@paraminit *@paramalpho *@return */ privatestaticdouble[]doPageRank(){ double[]pr=newdouble[init.length]; for(inti=0;i<init.length;i++){ doubletemp=0; HtmlEntityhe0=list.get(i); for(intj=0;j<init.length;j++){ HtmlEntityhe=list.get(j); //计算对本页面链接相关总值 if(i!=j&&he.getOutLinks().size()!=0&&he.getOutLinks().contains(he0.getPath())){ temp=temp+init[j]/he.getOutLinks().size(); } } //经典的pr公式 pr[i]=alpha+(1-alpha)*temp; } returnpr; } /** *判断前后两次的pr数组之间的差别是否大于我们定义的阀值假如大于,那么返回false,继续迭代计算pr * *@parampr *@paraminit *@parammax *@return */ privatestaticbooleancheckMax(){ booleanflag=true; for(inti=0;i<pr.length;i++){ if(Math.abs(pr[i]-init[i])>MAX){ flag=false; break; } } return flag; }}HtmlEntity类import java.util.*;/** *网页entity * *@authorafei * */classHtmlEntity{ privateStringpath; privateStringcontent; /*外链(本页面链接的其他页面) */ privateList<String>outLinks=newArrayList<String>(); /*内链(另外页面链接本页面) */ privateList<String>inLinks=newArrayList<String>(); privatedoublepr; publicStringgetPath(){ returnpath; } publicvoidsetPath(Stringpath){ this.path=path; } publicStringgetContent(){ returncontent; } publicvoidsetContent(Stringcontent){ this.content=content; } publicdoublegetPr(){ returnpr; } publicvoidsetPr(doublepr){ this.pr=pr; } publicList<String>getOutLinks(){ returnoutLinks; } 
publicvoidsetOutLinks(List<String>outLinks){ this.outLinks=outLinks; } publicList<String>getInLinks(){ returninLinks; } publicvoidsetInLinks(List<String>inLinks){ this.inLinks=inLinks; }}运行结果
E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html: 1.0988562616424633
E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test2.html: 1.024767124729736
E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test5.html: 1.0225108328175456
E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test3.html: 1.0012654834548864
E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test1.html: 0.994362279917484
E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test7.html: 0.9049428130819769
E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test6.html: 0.9000632741726616
- PageRank介绍
- PageRank介绍
- Pagerank 算法介绍
- Google PageRank(网页级别)介绍
- PageRank排序算法详细介绍
- pagerank
- pagerank
- PageRank
- PageRank
- PageRank
- PageRank
- PageRank
- pagerank
- PageRank
- PageRank
- pageRank
- PageRank
- pagerank
- 错误
- Jboss6 配置数据源
- .def
- Oracle COALESCE函数
- Python实现“已知三角形两个直角边,求斜边”
- PageRank介绍
- mysql中创建数据表
- IntentService和AsyncTask
- 输出俄文字母表
- ListView滑动时 item出现黑色背景的问题的解决方法
- 占位
- linux下完全删除Oracle
- [设计模式]外观模式(Facade)
- 20130307