用Java、MySQL、PHP轻松构建跨平台的搜索引擎
来源:互联网 发布:下载加速器软件 编辑:程序博客网 时间:2024/06/05 11:36
<script type="text/javascript">google_ad_client = "pub-8800625213955058";/* 336x280, 创建于 07-11-21 */google_ad_slot = "0989131976";google_ad_width = 336;google_ad_height = 280;//</script><script type="text/javascript"src="http://pagead2.googlesyndication.com/pagead/show_ads.js"></script>此搜索引擎适于在一个中等规模的局域网中使用,由于找到的网页存在数据库中,不仅可以搜索静态的HTML页面,还可以搜索php、asp等动态页面。对于一个拥有5万个网页的系统(使用PII-400作为服务器),搜索响应时间在2-10秒左右,完全可以满足要求。由于Java、MySQL、PHP都是跨平台的软件,所以此搜索引擎不仅可以工作在Windows服务器上,而且也可以工作在Linux等其他系统中。 一、建立搜索引擎需要的数据库和数据表。 首先建立数据库: c:/mysql/bin/> mysqladmin -uroot -pmypassword create Spider 然后建立数据库中的表结构 c:/mysql/bin/> mysql -uroot -pmypassword Spider < Spider.mysql 其中Spider.mysql为一个文本文件,其内容如下:
# Table link: one row per URL discovered by the spider (LinkToDB.java).
#   Class        - crawl level: 0 for the seed URL; links found on a page of
#                  level N are stored with level N+1
#   IsSearchLink - 0 until the page has been analysed for links, then set to 1
# Url is the primary key, so inserting an already known URL fails; the spider
# uses that failure to recognise duplicates.  The primary key already indexes
# Url, so no separate KEY on Url is declared.
CREATE TABLE link (
    Id int(10) unsigned NOT NULL auto_increment,
    Url varchar(120) NOT NULL,
    Class tinyint(3) unsigned NOT NULL default 0,
    IsSearchLink tinyint(3) unsigned NOT NULL default 0,
    PRIMARY KEY (Url),
    UNIQUE Id (Id),
    KEY Class (Class)
);
# Seed row: the start page of the local network.  The spider begins here and
# follows links to every other page.
INSERT INTO link (Id, Url, Class, IsSearchLink)
VALUES (1, 'HTTP://102.211.69.1/', 0, 0);

# Table webpagelocal: the full text of every downloaded page.
# MEDIUMTEXT is used because the spider accepts pages of up to 200000 bytes,
# which does not fit into the 65535 bytes of a TEXT column.
# The primary key already indexes Url, so no separate KEY on Url is declared.
CREATE TABLE webpagelocal (
    Id int(10) unsigned NOT NULL auto_increment,
    Url varchar(120) NOT NULL,
    Content mediumtext NOT NULL,
    PRIMARY KEY (Url),
    UNIQUE Id (Id)
);

# Table webpagefindfast: the short search index.
# MakeFast.php fills it from webpagelocal with the page title and the first
# 512 bytes of tag-free text of each page.
CREATE TABLE webpagefindfast (
    Id int(10) unsigned NOT NULL,
    Url varchar(120) NOT NULL,
    Title varchar(64),
    Content blob,
    PRIMARY KEY (Url),
    KEY Title (Title)
);
二、以下为搜索网页和下载网页至本地数据库的Java程序LinkToDB.java,它也是此搜索引擎的核心和基础
/***************************** LinkToDB.java *************************************
 * Analyses the http links of a page, converts relative paths into absolute     *
 * ones and collects them in sorted order.  main() stores every link that is    *
 * new to the table link and downloads its page into the table webpagelocal.    *
 *********************************************************************************/
import java.io.*;
import java.util.*;
import java.net.*;
import java.lang.String;
import java.sql.*;
import java.text.*;

/** Occurrence counter of one link; it starts at 1 for the first sighting. */
class Counter {
    private int i = 1;
    int read() { return i; }
    void increment() { i++; }
}

public class LinkToDB {
    /** Pages whose length reaches this limit are neither analysed nor downloaded. */
    static final int MAX_PAGE_LENGTH = 200000;

    String UrlHost = "";
    String UrlFile = "";
    String UrlPath = "";
    static String StartWith = null;
    boolean outsideTag = true;              // true while the tokenizer is outside <...>
    static char[] buffer = new char[4096];  // scratch buffer for data read from a URL
    InputStreamReader read = null;
    BufferedReader reader = null;
    URLConnection uc = null;
    private URL url = null;
    private StreamTokenizer st;
    private TreeMap counts = new TreeMap(); // links found so far, kept sorted

    /**
     * Opens the page at myurl and prepares a tokenizer that splits it at '<' and '>'.
     * When the page cannot be opened, is too large, or does not start with
     * StartOnly (if given), the field reader stays null and the page must be skipped.
     *
     * @param myurl     absolute URL of the page to analyse
     * @param StartOnly prefix every accepted URL must start with, or null for no restriction
     */
    LinkToDB(String myurl, String StartOnly) {
        try {
            StartWith = StartOnly;
            if (StartOnly != null && !myurl.startsWith(StartOnly)) return; // only crawl this site
            url = new URL(myurl);
            UrlHost = url.getHost().toUpperCase();
            // Keep an explicit port, otherwise relative links would be resolved
            // against the default port of the host.
            if (url.getPort() != -1) UrlHost = UrlHost + ":" + url.getPort();
            UrlFile = url.getFile();
            int v = UrlFile.lastIndexOf("/");
            if (v != -1) UrlPath = UrlFile.substring(0, v);
            System.out.println("分析文件:" + myurl);

            boolean connected = false;
            try {
                uc = url.openConnection();
                uc.setUseCaches(false);
                uc.connect();
                connected = true;
            } catch (IOException io) {
                System.out.println("打不开待分析网页:" + myurl);
            }
            if (connected) {
                if (uc.getContentLength() < MAX_PAGE_LENGTH) {
                    try {
                        // Read from the connection that is already open instead of
                        // requesting the page a second time.
                        read = new InputStreamReader(uc.getInputStream());
                    } catch (IOException io) {
                        System.out.println("流打开错误:" + myurl);
                    }
                } else {
                    System.out.println("文件太大,不分析");
                }
            }
            if (read != null) {
                reader = new BufferedReader(read);
                st = new StreamTokenizer(reader);
                st.resetSyntax();     // start from an empty syntax table
                st.wordChars(0, 255); // every character is part of a word ...
                st.ordinaryChar('<'); // ... except the two tag delimiters
                st.ordinaryChar('>');
            }
        } catch (MalformedURLException e) {
            System.out.println("Malformed URL String!");
        }
    }

    /** Closes the stream of the analysed page, if one was opened. */
    void cleanup() {
        if (read == null) return;
        try {
            read.close();
        } catch (IOException e) {
            System.out.println("流关闭错误");
        }
    }

    /**
     * Reads the page token by token and counts every hyperlink found inside a tag.
     * Only text between '<' and '>' is examined, and only tags that yield a usable
     * link (see extractLink) are recorded.
     */
    void countWords() {
        if (st == null) return;
        try {
            while (st.nextToken() != StreamTokenizer.TT_EOF) {
                switch (st.ttype) {
                    case '<':  // entering a tag
                        outsideTag = false;
                        continue;
                    case '>':  // leaving a tag
                        outsideTag = true;
                        continue;
                    case StreamTokenizer.TT_WORD:
                        break;
                    default:
                        continue;
                }
                if (outsideTag) continue; // plain page text carries no links
                String link = extractLink(st.sval);
                if (link == null) continue;
                Counter seen = (Counter) counts.get(link);
                if (seen != null) seen.increment();
                else counts.put(link, new Counter());
            }
        } catch (IOException e) {
            System.out.println("st.nextToken() unsuccessful");
        }
    }

    /**
     * Extracts the target of an A, AREA, FRAME or IFRAME tag and converts it into
     * the form "HTTP://host/path".
     *
     * @param tagText the text between '<' and '>'
     * @return the absolute link, or null when the tag is not a link or the target
     *         is rejected (javascript:, ftp://, mailto:, *.swf, a pure fragment,
     *         or anything containing a quote or "..")
     */
    private String extractLink(String tagText) {
        String link = tagText.trim();
        String attribute;
        if (startsWithIgnoreCase(link, "FRAME ") || startsWithIgnoreCase(link, "IFRAME ")) {
            attribute = "SRC=";
        } else if (startsWithIgnoreCase(link, "A ") || startsWithIgnoreCase(link, "AREA ")) {
            attribute = "HREF=";
        } else {
            return null;
        }
        int pos = indexOfIgnoreCase(link, attribute);
        if (pos == -1) return null; // e.g. <a name="..."> has no target
        link = link.substring(pos + attribute.length()).trim();

        // Drop the opening quote, then cut at the closing quote or the first blank.
        if (link.startsWith("\"")) link = link.substring(1);
        int cut = link.indexOf('"');
        if (cut != -1) link = link.substring(0, cut).trim();
        cut = link.indexOf(' ');
        if (cut != -1) link = link.substring(0, cut).trim();

        if (link.length() == 0 || link.startsWith("#")) return null; // nothing but the page itself
        // ".." means a parent directory; only this level and the levels below are searched.
        if (link.indexOf('\'') != -1 || link.indexOf("..") != -1
                || indexOfIgnoreCase(link, "JAVASCRIPT:") != -1) return null;
        // Prefixes and suffixes that usually do not denote a web page.
        if (startsWithIgnoreCase(link, "FTP://") || startsWithIgnoreCase(link, "MAILTO:")
                || endsWithIgnoreCase(link, ".SWF")) return null;

        String rest; // the link without its "HTTP://" prefix
        if (startsWithIgnoreCase(link, "HTTP://")) rest = link.substring(7);
        else if (link.startsWith("/")) rest = UrlHost + link;        // relative to the server root
        else rest = UrlHost + UrlPath + "/" + link;                  // relative to the current directory

        int hash = rest.indexOf('#'); // only the part before '#' addresses the page
        if (hash != -1) rest = rest.substring(0, hash).trim();
        int at;
        while ((at = rest.indexOf("/./")) != -1) rest = rest.substring(0, at) + rest.substring(at + 2);
        while ((at = rest.indexOf("//")) != -1) rest = rest.substring(0, at) + rest.substring(at + 1);
        if (rest.length() == 0 || rest.startsWith("/")) return null; // no host left
        if (rest.indexOf('/') == -1) rest = rest + "/";              // xx.xx.xx.xxx -> xx.xx.xx.xxx/
        return "HTTP://" + rest;
    }

    /** Case-insensitive String.startsWith. */
    private static boolean startsWithIgnoreCase(String text, String prefix) {
        return text.regionMatches(true, 0, prefix, 0, prefix.length());
    }

    /** Case-insensitive String.endsWith. */
    private static boolean endsWithIgnoreCase(String text, String suffix) {
        return text.regionMatches(true, text.length() - suffix.length(), suffix, 0, suffix.length());
    }

    /** Case-insensitive String.indexOf; returns -1 when what does not occur in text. */
    private static int indexOfIgnoreCase(String text, String what) {
        for (int i = 0; i + what.length() <= text.length(); i++) {
            if (text.regionMatches(true, i, what, 0, what.length())) return i;
        }
        return -1;
    }

    Collection values() { return counts.values(); }
    Set keySet() { return counts.keySet(); }
    Counter getCounter(String s) { return (Counter) counts.get(s); }

    /**
     * Downloads the page at key.
     *
     * @return the page text ("" when reading failed half-way), or null when the
     *         page could not be opened or is too large
     */
    private static String download(String key) {
        System.out.println("下载网页:" + key);
        URL rurl;
        try {
            rurl = new URL(key);
        } catch (MalformedURLException mu) {
            System.out.println("打开下载网页出错:" + mu.getMessage());
            return null;
        }
        InputStreamReader in;
        try {
            URLConnection urlc = rurl.openConnection();
            urlc.connect();
            if (urlc.getContentLength() >= MAX_PAGE_LENGTH) {
                System.out.println("网页太大,我不下载了。" + key);
                return null;
            }
            in = new InputStreamReader(urlc.getInputStream());
        } catch (IOException io) {
            System.out.println("下载网页打不开:" + key);
            return null;
        }
        StringBuffer content = new StringBuffer();
        try {
            int length; // number of characters read
            while ((length = in.read(buffer)) != -1) {
                content.append(buffer, 0, length);
                // getContentLength() is -1 when the server sends no length, so the
                // limit has to be enforced while reading as well.
                if (content.length() >= MAX_PAGE_LENGTH) {
                    System.out.println("网页太大,我不下载了。" + key);
                    return null;
                }
            }
        } catch (IOException io) {
            content.setLength(0);
            System.out.println("不能读入URL文件");
        } finally {
            try {
                in.close();
            } catch (IOException ignored) {
                // the page has been read (or given up on); nothing useful is left to do
            }
        }
        return content.toString();
    }

    /**
     * Crawls six levels deep, starting from the rows of table link that have not
     * been analysed yet.
     *
     * @param argv optional argv[0]: prefix every crawled URL must start with
     */
    public static void main(String[] argv) throws FileNotFoundException {
        try {
            Class.forName("org.gjt.mm.mysql.Driver").newInstance();
        } catch (Exception E) {
            System.out.println("加载Jdbc驱动程序失败");
            E.printStackTrace();
        }
        String StartOnly = argv.length > 0 ? argv[0] : null;
        Connection conn = null;
        try {
            conn = DriverManager.getConnection("jdbc:mysql://localhost/Spider?user=root&password=mypassword");
            // Table names are written exactly as in the CREATE TABLE statements,
            // because MySQL table names are case-sensitive on most non-Windows systems.
            PreparedStatement pending = conn.prepareStatement(
                    "SELECT Url FROM link WHERE IsSearchLink=0 AND Class=? ORDER BY Url");
            PreparedStatement markDone = conn.prepareStatement(
                    "UPDATE link SET IsSearchLink=1 WHERE Class=? AND Url=?");
            PreparedStatement addLink = conn.prepareStatement(
                    "INSERT INTO link(Id,Url,Class) VALUES(0,?,?)");
            PreparedStatement addPage = conn.prepareStatement(
                    "INSERT INTO webpagelocal(Id,Url,Content) VALUES(0,?,?)");

            for (int i = 1; i <= 6; i++) {
                // Fetch the whole level before working on it, so that no result set
                // stays open while other statements run.
                ArrayList urls = new ArrayList();
                pending.setInt(1, i - 1);
                ResultSet rs = pending.executeQuery();
                while (rs.next()) urls.add(rs.getString("Url"));
                rs.close();

                for (Iterator u = urls.iterator(); u.hasNext();) {
                    String myurl = (String) u.next();
                    LinkToDB wc = new LinkToDB(myurl, StartOnly);
                    if (wc.reader == null) continue;
                    try {
                        markDone.setInt(1, i - 1);
                        markDone.setString(2, myurl);
                        markDone.executeUpdate();
                        wc.countWords();
                        for (Iterator keys = wc.keySet().iterator(); keys.hasNext();) {
                            String key = (String) keys.next();
                            System.out.println("分析找到链接:" + key + ": " + wc.getCounter(key).read());
                            if (StartOnly != null && !key.startsWith(StartOnly)) continue;
                            try {
                                // Url is the primary key of link: a known URL fails here.
                                addLink.setString(1, key);
                                addLink.setInt(2, i);
                                addLink.executeUpdate();
                            } catch (SQLException ex) {
                                System.out.println("插入数据错 SQLException: " + ex.getMessage());
                                continue;
                            }
                            // The link is new, so download its page into webpagelocal.
                            String page = download(key);
                            if (page == null) continue;
                            try {
                                addPage.setString(1, key);
                                addPage.setString(2, page);
                                addPage.executeUpdate();
                            } catch (SQLException ex) {
                                System.out.println("插入数据错:" + ex.getMessage());
                            }
                        }
                    } finally {
                        wc.cleanup();
                    }
                }
            }
            pending.close();
            markDone.close();
            addLink.close();
            addPage.close();
        } catch (SQLException ex) {
            System.out.println("SQL异常:" + ex.getMessage());
        } finally {
            if (conn != null) {
                try {
                    conn.close();
                } catch (SQLException ignored) {
                    // the program is about to end; a failed close changes nothing
                }
            }
        }
    } // end of main()
} // end of class LinkToDB
三、编译和运行此Java程序
d:/Spider/> set CLASSPATH=d:/j/mm.mysql.jdbc2;
d:/Spider/> d:/j/bin/javac LinkToDB.java
d:/Spider/> d:/j/bin/java LinkToDB
其中第一行命令是设置MySQL的JDBC驱动程序路径。
四、由于网页中含有大量的无用的格式信息,直接用它来搜索要浪费大量的时间,所以需要去掉其中的HTML格式控制信息,并将太长的网页截短,然后将整理后的用于搜索的信息存到另一个数据表中。由于PHP4中有一个很方便的函数strip_tags可以去掉其中的HTML格式标记,所以我们用PHP来整理。 MakeFast.php的内容如下:
<?php
// MakeFast.php - fills the search table webpagefindfast from webpagelocal.
// For every stored page with n1 <= Id < n2 it stores the Id, the URL, the text
// of the <TITLE> element and the first 511 bytes of the tag-free page body.
// Request parameters:
//   n1 - first Id to process (inclusive)
//   n2 - Id at which to stop (exclusive), so consecutive batches such as
//        n1=1&n2=10000 and n1=10000&n2=20000 cover every Id exactly once

// Take the range from the request (works with register_globals off) and force
// both bounds to integers so they cannot change the SQL statement.
$n1 = isset($_REQUEST["n1"]) ? (int)$_REQUEST["n1"] : 0;
$n2 = isset($_REQUEST["n2"]) ? (int)$_REQUEST["n2"] : 0;

mysql_connect("localhost","root","mypassword") or die("Cannot connect to MySQL");
mysql_select_db("Spider") or die("Cannot select database Spider");
// Table names are written exactly as in the CREATE TABLE statements, because
// MySQL table names are case-sensitive on most non-Windows systems.
$result = mysql_query("select Id,Url,Content from webpagelocal where Id>=$n1 and Id<$n2")
    or die(mysql_error());

while ($mt = mysql_fetch_array($result)) {
    $Page = $mt[2];
    $mt2  = strtoupper($Page);   // upper-case copy for case-insensitive tag search
    $PosTitleL = strpos($mt2, "<TITLE>");
    $PosTitleR = strpos($mt2, "</TITLE>");
    $PosBody   = strpos($mt2, "<BODY");
    $PosHeadR  = strpos($mt2, "</HEAD>");

    // strpos() returns false for "not found" and 0 for a match at the very
    // start, so every result is compared with false explicitly.
    $Title = "";
    if ($PosTitleL !== false && $PosTitleR !== false && $PosTitleR > $PosTitleL) {
        $Title = substr($Page, $PosTitleL + 7, $PosTitleR - $PosTitleL - 7);
    }
    $Title = str_replace("'", "’", $Title);
    $Title = (string)substr($Title, 0, 64);   // the Title column holds 64 bytes

    // The searchable text starts at <BODY, or failing that after the head or the title.
    if ($PosBody !== false)        $Body = substr($Page, $PosBody);
    else if ($PosHeadR !== false)  $Body = substr($Page, $PosHeadR + 7);
    else if ($PosTitleR !== false) $Body = substr($Page, $PosTitleR + 8);
    else if ($PosTitleL !== false) $Body = substr($Page, $PosTitleL);
    else                           $Body = $Page;

    $BodyText    = strip_tags($Body);
    $BodyNoSpace = str_replace(" ", "", $BodyText);
    $BodyNoQuote = str_replace("'", "", $BodyNoSpace);
    $Body512     = substr($BodyNoQuote, 0, 511)." ";

    $Id  = (int)$mt[0];
    $Url = $mt[1];
    // Every string is escaped: page text may contain backslashes or end in half
    // of a double-byte character, either of which would break a quoted literal.
    $sql = "Insert Into webpagefindfast(Id,Url,Title,Content) VALUES($Id,'"
         . mysql_real_escape_string($Url) . "','"
         . mysql_real_escape_string($Title) . "','"
         . mysql_real_escape_string($Body512) . "')";
    mysql_query($sql) or die($sql);
    echo $Id." ";
}
?>
使用方式: 在浏览器中输入http://mywebsite/Spider/MakeFast.php?n1=1&n2=10000五、以上是建立搜索引擎所用到的数据,下面编制用于用户搜索的网页和PHP脚本文件。首先是用于搜索的表单页面SearchForm.htm,其内容如下。
<!-- SearchForm.htm: search form of the Red Spider search engine. It posts three fields to Search.php: KeyWords (text, at most 40 characters), SearchIn ("Content" or "Title") and Speed ("Fast" or "Slow"). The page is declared as gb2312. -->
<html><head><title>红蜘蛛搜索引擎-V0.1</title><meta http-equiv="Content-Type" content="text/html; charset=gb2312"><link rel="stylesheet" href="../All.css" type="text/css"></head><body bgcolor="#eeffee" text="#000000"><table width="600" border="0" cellspacing="2" cellpadding="2" align="center" bgcolor="#99CC00"><tr><td><div align="center"><font color="#FF0000"><b><span class="pt16">红蜘蛛搜索引擎</span> <span class="pt12">V0.1</span></b></font></div></td></tr></table><form name="form1" method="post" action="Search.php"><table width="600" border="1" cellspacing="1" cellpadding="1" align="center" bordercolor="#99CC00"><tr> <td> <div align="left"><span class="pt15"><font color="#FF0000"><b>关 键 字</b>:</font></span> <input type="text" name="KeyWords" size="40" maxlength="40"></div></td><td> <div align="left"><span class="pt15"><font color="#FF0000"><b>查找范围</b>:</font></span> <select name="SearchIn"><option value="Content" selected>网页正文</option><option value="Title">网页标题</option></select></div></td></tr><tr> <td colspan="2"><span class="pt15"><font color="#FF0000"><b>查找方式</b>:</font></span> <select name="Speed"><option value="Fast" selected>快速查找</option><option value="Slow">更深查找</option></select></td></tr><tr> <td colspan="2"> <div align="left"> <input type="submit" name="Submit" value="搜索"></div></td></tr></table></form><table width="600" border="0" cellspacing="2" cellpadding="2" align="center"><tr> <td height="18"> <p class="pt12"><font color="#FF0000"><b>使用方法</b></font>:仅需在关键字一栏输入查询内容并按回车键(Enter)即可。 </p><blockquote> <p align="left" class="pt12">如果希望输入多个条件,只需要用空格分隔即可:</p><p class="pt12">例如:要查询同时包含“西昌”和“卫星”的网页,只需输入[西昌 卫星]。</p><p class="pt12">又如:要查询只包含关键字“西昌”而不包含“卫星”的网页,只需要输入[西昌 -卫星]。注意中间的空格不能少。</p></blockquote><p class="pt12"><font color="#FF0000"><b>查找范围</b></font>:你可以选择从“网页标题”中查找或者从“网页正文”中查找。</p><p class="pt12"><font color="#FF0000"><b>查找方式</b></font>:“快速查找”速度快但找到的网页数可能较少,因为:</p><blockquote> <p class="pt12">“快速查找”只搜索网页正文的前512个字符。</p><p class="pt12">“更深查找”搜索网页正文的前2048个字符。</p><div 
align="right"><a href="mailto:zdyhlp@263.net"><font color="#FF0000" class="pt13"><b>欢迎提出宝贵意见</b></font></a></div></blockquote></td></tr></table></body></html>
Search.php根据用户输入的条件,完成搜索,显示找到的网页的链接地址、标题和提要信息。内容如下:
<?php
// Search.php - looks the submitted keywords up in the search table and lists
// up to 20 matching pages (link, title, summary) per request.
// Request parameters:
//   KeyWords - words separated by blanks; a leading "-" excludes a word
//   SearchIn - "Content" (default) or "Title": the column to search
//   Speed    - "Fast" (default) or "Slow": which search table to use
//   CurPos   - Id at which the next block of results starts (optional)

// Returns a request parameter as a string ("" when absent).  It does not depend
// on register_globals and removes the slashes added by magic_quotes_gpc.
function spider_param($name) {
    if (!isset($_REQUEST[$name]) || is_array($_REQUEST[$name])) return "";
    $value = $_REQUEST[$name];
    return get_magic_quotes_gpc() ? stripslashes($value) : $value;
}

// Escapes text for output inside HTML.  The replacement is byte-wise on
// purpose: summaries and titles are cut at byte offsets and may end in half of
// a double-byte character, which must still be printed.
function spider_html($text) {
    return str_replace(array("&", "<", ">", "\"", "'"),
                       array("&amp;", "&lt;", "&gt;", "&quot;", "&#039;"), $text);
}

$KeyWords = trim(spider_param("KeyWords"));
// Only the two known column names and table choices may reach the SQL text.
$SearchIn = spider_param("SearchIn") == "Title" ? "Title" : "Content";
$Speed    = spider_param("Speed") == "Slow" ? "Slow" : "Fast";
$CurPos   = spider_param("CurPos");
?>
<TITLE>红蜘蛛正在搜索关键词为[<?php echo spider_html($KeyWords)?>]的网页</TITLE>
<link rel="stylesheet" href="../All.css" type="text/css">
<body bgcolor="#eeffee">
<table width="96%" border="0" cellspacing="2" cellpadding="2" align="center" bgcolor="#99CC00">
<tr> <td> <div align="center"><font color="#FF0000"><b><span class="pt16">红蜘蛛搜索引擎</span> <span class="pt12">V0.1</span></b></font></div></td></tr>
</table>
<form name="form1" method="post" action="Search.php">
<table width="96%" border="1" cellspacing="1" cellpadding="1" align="center" bordercolor="#99CC00">
<tr>
<td valign="top"> <font color="#FF0000"><b><span class="pt13">关键字</span></b><span class="pt13">:</span></font> <input type="text" name="KeyWords" value="<?php echo spider_html($KeyWords)?>" size="30" maxlength="30"><input type="submit" name="Submit" value="重新搜索"></td>
<td valign="top"><font color="#FF0000"><b><span class="pt13">查找范围</span></b><span class="pt13">:</span></font> <select name="SearchIn"><option value="Content" <?php if ($SearchIn=="Content") echo "selected";?>>网页正文</option><option value="Title" <?php if ($SearchIn=="Title") echo "selected";?>>网页标题</option></select></td>
<td valign="top"> <div align="left"><font color="#FF0000"><b><span class="pt13">查找方式</span></b><span class="pt13">:</span></font> <select name="Speed"><option value="Fast" <?php if ($Speed=="Fast") echo "selected";?>>快速查找</option><option value="Slow" <?php if ($Speed=="Slow") echo "selected";?>>更深查找</option></select></div></td>
</tr>
</table>
</form>
<?php
// Title searches always use the fast table; content searches use the table
// selected by Speed.  Table names are lower-case like the CREATE TABLE
// statements, because MySQL table names are case-sensitive on most
// non-Windows systems.
// NOTE(review): only webpagefindfast is created by the schema of this article;
// the table for "Slow" searches is assumed to be named webpagefindslow - confirm.
$Table = ($SearchIn == "Title" || $Speed == "Fast") ? "webpagefindfast" : "webpagefindslow";

$Tokens = preg_split('/\s+/', $KeyWords, -1, PREG_SPLIT_NO_EMPTY);
if (count($Tokens) == 0) { echo "关键字不能为空"; exit(); }

mysql_connect("localhost","root","mypassword") or die("Cannot connect to MySQL");
mysql_select_db("Spider") or die("Cannot select database Spider");

$Words = array();        // wanted words; they are highlighted in the summaries
$Conditions = array();   // one LIKE / NOT LIKE test per keyword
foreach ($Tokens as $key) {
    $Exclude = (substr($key, 0, 1) == "-");
    if ($Exclude) $key = (string)substr($key, 1);
    if ($key == "") continue;   // a lone "-" names no word
    // First protect the LIKE wildcards, then escape for the quoted literal.
    $Pattern = mysql_real_escape_string(addcslashes($key, "\\%_"));
    $Conditions[] = $SearchIn . ($Exclude ? " NOT LIKE '%" : " LIKE '%") . $Pattern . "%'";
    if (!$Exclude) $Words[] = $key;
}
if (count($Conditions) == 0) { echo "关键字不能为空"; exit(); }

$SQL = "SELECT Id,Url,Title,Content FROM $Table WHERE " . implode(" AND ", $Conditions);
if ($CurPos != "") $SQL .= " AND Id>=" . (int)$CurPos;
// Paging continues from an Id, so the rows must arrive ordered by Id.
$SQL .= " ORDER BY Id LIMIT 100";
$result = mysql_query($SQL) or die("Search query failed");
$RowCount = mysql_num_rows($result);
$FindCount = 0;
$Pos = "";
?>
<table border=0 align=center width="96%">
<tr> <th nowrap width="41%"> <div align="left" class="pt12">共找到关键字为 <font color=red> <?php echo spider_html($KeyWords)?></font> 的网页共 <font color=red> <?php echo $RowCount;?></font> 个</div></th><td nowrap></td></tr>
<tr bgcolor="#FF0000"> <th nowrap colspan="2" height="3"></th></tr>
<?php
while ($row = mysql_fetch_array($result)) {
    $Pos = $row[0];   // when the loop stops at row 21, this Id starts the next block
    $FindCount++;
    if ($FindCount > 20) break;

    if ($SearchIn == "Title") {
        $ZhaiYao = substr($row[3], 0, 1024);
    } else {
        if ($Speed == "Fast" || strlen($row[3]) < 1024) {
            $ZhaiYao = $row[3];
        } else {
            // Long text: show the 1024 bytes around the first wanted word.
            $PosWord1 = count($Words) > 0 ? strpos($row[3], $Words[0]) : false;
            if ($PosWord1 === false || $PosWord1 < 512) {
                $ZhaiYao = substr($row[3], 0, 1024);
            } else {
                // Start at an ASCII byte 24 to 499 bytes before the word, so that
                // no double-byte character is cut in half.
                $CutPos = 0;
                for ($i = 24; $i < 500; $i++) {
                    if (ord(substr($row[3], $PosWord1 - $i, 1)) < 128) { $CutPos = $i; break; }
                }
                $ZhaiYao = substr($row[3], $PosWord1 - $CutPos, 1024);
            }
        }
        for ($i = 0; $i < count($Words); $i++) {
            $ZhaiYao = str_replace($Words[$i], "<font color=red>".$Words[$i]."</font>", $ZhaiYao);
        }
    }
?>
<tr> <td nowrap colspan="2"> <?php echo $FindCount;?><a href="<?php echo spider_html($row[1])?>" target=_blank> <?php if ($row[2] != "") echo spider_html($row[2]); else echo spider_html(substr($row[3], 0, 64));?></a></td></tr>
<tr> <td colspan="2" ><span class="pt13">摘要:</span> <?php echo $ZhaiYao;?></td></tr>
<tr> <td colspan="2" align="right"><a href="One.php?num=<?php echo (int)$row[0]?>" target=_blank> <font color="#0033FF" class="pt12">本地镜像</font></a></td></tr>
<tr bgcolor="#999933"> <td nowrap colspan="2" height="1"></td></tr>
<?php } ?>
<?php if ($RowCount > 20) { ?>
<tr> <td align="right" colspan="2" height="10"><form name="form2" method="post" action="Search.php"><input type="hidden" name="KeyWords" value="<?php echo spider_html($KeyWords);?>"><input type="hidden" name="SearchIn" value="<?php echo $SearchIn;?>"><input type="hidden" name="Speed" value="<?php echo $Speed;?>"><input type="hidden" name="CurPos" value="<?php echo (int)$Pos;?>"><input type="submit" name="Submit" value="下20个网页"></form> </td></tr>
<?php } ?>
</table>
One.php用于从本地镜像中显示一个找到的网页。由于网页的原始信息已经在WebPageLocal中存储,所以只需简单的读出,发给用户。
<?php
// One.php - sends the locally stored copy of one page to the browser.
// Request parameter: num - Id of the page in table webpagelocal.

// Take the Id from the request (works with register_globals off) and accept
// digits only, so the value cannot change the SQL statement.
$num = isset($_REQUEST["num"]) ? $_REQUEST["num"] : "";
if (!is_string($num) || !preg_match('/^[0-9]+$/', $num)) exit();

mysql_connect("localhost","root","mypassword") or die("Cannot connect to MySQL");
mysql_select_db("Spider") or die("Cannot select database Spider");
// The table name is written exactly as in the CREATE TABLE statement, because
// MySQL table names are case-sensitive on most non-Windows systems.
$result = mysql_query("select Url,Content from webpagelocal where Id=$num") or die("Query failed");
$mt = mysql_fetch_row($result);
if ($mt) echo $mt[1];   // nothing is sent when no page has this Id
?>