A Web Crawler Built with an In-Memory Database and a Bloom Filter


The "in-memory database" here is Berkeley DB JE, which stores the URLs waiting to be visited; a Bloom filter records the URLs that have already been visited. Previously, the pending URLs were held in plain memory and the visited set was implemented with a HashSet, both of which grow without bound as the crawl expands.
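
For a sense of scale (my arithmetic, not the original author's): the SimpleBloomFilter below allocates DEFAULT_SIZE = 2 << 24 = 2^25 bits, a fixed 4 MB BitSet however many URLs it records, while a HashSet<String> keeps every visited URL string on the heap. With m = 2^25 bits and k = 6 hash functions, the false-positive rate after n insertions is roughly (1 - e^(-kn/m))^k, which for n = 1,000,000 URLs comes to about 2 in 100,000. The cost of that saving is that a false positive makes the crawler silently skip a URL it never actually visited.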

The Bloom Filter

package hashfilter;

import java.util.BitSet;

import bdb.CrawlUrl;

public class SimpleBloomFilter {

    // 2 << 24 = 2^25 bits (~4 MB); must stay a power of two because
    // SimpleHash masks with (cap - 1).
    private static final int DEFAULT_SIZE = 2 << 24;
    // One seed per hash function; small primes keep the hashes reasonably independent.
    private static final int[] seeds = {7, 11, 13, 31, 37, 61};

    private BitSet bits = new BitSet(DEFAULT_SIZE);
    private SimpleHash[] func = new SimpleHash[seeds.length];

    public SimpleBloomFilter() {
        for (int i = 0; i < seeds.length; i++) {
            func[i] = new SimpleHash(DEFAULT_SIZE, seeds[i]);
        }
    }

    public void add(CrawlUrl crawlUrl) {
        add(crawlUrl.getOriUrl());
    }

    private void add(String value) {
        if (value != null) {
            // Set one bit per hash function.
            for (SimpleHash f : func) {
                bits.set(f.hash(value), true);
            }
        }
    }

    public boolean contains(CrawlUrl crawlUrl) {
        return contains(crawlUrl.getOriUrl());
    }

    private boolean contains(String value) {
        if (value == null) {
            return false;
        }
        // Present only if every hash function's bit is set. This may return
        // true for a URL that was never added (a false positive), but never
        // false for one that was.
        boolean ret = true;
        for (SimpleHash f : func) {
            ret = ret && bits.get(f.hash(value));
        }
        return ret;
    }
}
package hashfilter;

public class SimpleHash {

    private int cap;  // capacity of the bit array; must be a power of two
    private int seed; // multiplier that differentiates the hash functions

    public SimpleHash(int cap, int seed) {
        this.cap = cap;
        this.seed = seed;
    }

    // Polynomial rolling hash, folded into [0, cap) by masking with cap - 1.
    public int hash(String value) {
        int result = 0;
        for (int i = 0; i < value.length(); i++) {
            result = result * seed + value.charAt(i);
        }
        return (cap - 1) & result;
    }
}
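
A minimal smoke test for the two classes above; this is my sketch rather than code from the original post, and it assumes the CrawlUrl class from the next section is on the classpath:

import bdb.CrawlUrl;
import hashfilter.SimpleBloomFilter;

public class BloomFilterDemo {
    public static void main(String[] args) {
        SimpleBloomFilter filter = new SimpleBloomFilter();
        CrawlUrl visited = new CrawlUrl();
        visited.setOriUrl("http://www.baidu.com");
        filter.add(visited);
        System.out.println(filter.contains(visited)); // true
        CrawlUrl unseen = new CrawlUrl();
        unseen.setOriUrl("http://www.example.com");
        System.out.println(filter.contains(unseen));  // false, barring a false positive
    }
}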

The In-Memory Database (Berkeley DB JE)

package bdb;

import java.io.Serializable;
import java.sql.Timestamp; // standard Timestamp, not the internal com.sleepycat.je.utilint.Timestamp
import java.util.Date;

public class CrawlUrl implements Serializable {

    private static final long serialVersionUID = 7931672194843948629L;

    public CrawlUrl() {
    }

    private String oriUrl;            // original URL; the host part is a domain name
    private String url;               // resolved URL; the host part is an IP, to avoid duplicate hosts
    private int urlNo;                // URL number
    private int statusCode;           // HTTP status code returned when fetching the URL
    private int hitNum;               // number of times this URL is referenced by other pages
    private String charSet;           // character encoding of the page at this URL
    private String abstractText;      // page abstract
    private String author;            // author
    private int weight;               // page weight (carries guide-word information)
    private String description;       // page description
    private int fileSize;             // page size
    private Timestamp lastUpdateTime; // last modification time
    private Date timeToLive;          // expiry time
    private String title;             // page title
    private String type;              // page type
    private String[] urlReferences;   // outgoing links
    private int layer;                // crawl depth: the seed is layer 0, its links layer 1, and so on

    public int getLayer() { return layer; }
    public void setLayer(int layer) { this.layer = layer; }
    public String getUrl() { return url; }
    public void setUrl(String url) { this.url = url; }
    public int getUrlNo() { return urlNo; }
    public void setUrlNo(int urlNo) { this.urlNo = urlNo; }
    public int getStatusCode() { return statusCode; }
    public void setStatusCode(int statusCode) { this.statusCode = statusCode; }
    public int getHitNum() { return hitNum; }
    public void setHitNum(int hitNum) { this.hitNum = hitNum; }
    public String getCharSet() { return charSet; }
    public void setCharSet(String charSet) { this.charSet = charSet; }
    public String getAbstractText() { return abstractText; }
    public void setAbstractText(String abstractText) { this.abstractText = abstractText; }
    public String getAuthor() { return author; }
    public void setAuthor(String author) { this.author = author; }
    public int getWeight() { return weight; }
    public void setWeight(int weight) { this.weight = weight; }
    public String getDescription() { return description; }
    public void setDescription(String description) { this.description = description; }
    public int getFileSize() { return fileSize; }
    public void setFileSize(int fileSize) { this.fileSize = fileSize; }
    public Timestamp getLastUpdateTime() { return lastUpdateTime; }
    public void setLastUpdateTime(Timestamp lastUpdateTime) { this.lastUpdateTime = lastUpdateTime; }
    public Date getTimeToLive() { return timeToLive; }
    public void setTimeToLive(Date timeToLive) { this.timeToLive = timeToLive; }
    public String getTitle() { return title; }
    public void setTitle(String title) { this.title = title; }
    public String getType() { return type; }
    public void setType(String type) { this.type = type; }
    public String[] getUrlReferences() { return urlReferences; }
    public void setUrlReferences(String[] urlReferences) { this.urlReferences = urlReferences; }
    public final String getOriUrl() { return oriUrl; }
    public void setOriUrl(String oriUrl) { this.oriUrl = oriUrl; }
}

package bdb;

// The frontier: the queue of URLs waiting to be crawled.
public interface Frontier {
    CrawlUrl getNext() throws Exception;
    boolean putUrl(CrawlUrl url) throws Exception;
}
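
To see the Frontier contract in its simplest form before the Berkeley DB version, here is an illustrative in-heap implementation; it is my sketch, not part of the original post:

package bdb;

import java.util.LinkedList;

// Illustrative only: a volatile frontier that satisfies the same interface.
public class SimpleFrontier implements Frontier {

    private final LinkedList<CrawlUrl> queue = new LinkedList<CrawlUrl>();

    public CrawlUrl getNext() {
        // FIFO: oldest pending URL first; null when the frontier is empty.
        return queue.isEmpty() ? null : queue.removeFirst();
    }

    public boolean putUrl(CrawlUrl url) {
        queue.addLast(url);
        return true;
    }
}

Everything it holds vanishes with the process, which is precisely what the Berkeley DB version below avoids.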

package bdb;

import java.io.File;

import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;

public abstract class AbstractFrontier {

    private static final String CLASS_CATALOG = "java_class_catalog";

    private Environment env;
    protected StoredClassCatalog javaCatalog;
    protected Database catalogDatabase;
    protected Database database;

    public AbstractFrontier(String homeDirectory) {
        System.out.println("Opening environment in: " + homeDirectory);
        EnvironmentConfig envConfig = new EnvironmentConfig();
        envConfig.setTransactional(true);
        envConfig.setAllowCreate(true);
        env = new Environment(new File(homeDirectory), envConfig);

        // A single StoredClassCatalog is normally shared by all databases
        // that store serialized objects; it holds the class descriptions.
        DatabaseConfig dbConfig = new DatabaseConfig();
        dbConfig.setAllowCreate(true);
        dbConfig.setTransactional(true);
        catalogDatabase = env.openDatabase(null, CLASS_CATALOG, dbConfig);
        javaCatalog = new StoredClassCatalog(catalogDatabase);

        // The database that actually stores the pending URLs, keyed by URL.
        DatabaseConfig dbConfig0 = new DatabaseConfig();
        dbConfig0.setAllowCreate(true);
        dbConfig0.setTransactional(true);
        database = env.openDatabase(null, "URL", dbConfig0);
    }

    public void close() {
        database.close();
        javaCatalog.close();
        env.close();
    }

    protected abstract void put(Object key, Object value);
    protected abstract Object get(Object key);
    protected abstract Object delete(Object key);
}

package bdb;

import java.util.Map.Entry;

import com.sleepycat.bind.EntryBinding;
import com.sleepycat.bind.serial.SerialBinding;
import com.sleepycat.collections.StoredMap;

public class BDBFrontier extends AbstractFrontier implements Frontier {

    private StoredMap pendingUrisDB = null;

    public BDBFrontier(String homeDirectory) {
        super(homeDirectory);
        // A DatabaseEntry can be built either from an object's bytes via its
        // constructor, or through an EntryBinding; SerialBinding serializes
        // keys and values using the shared class catalog.
        EntryBinding keyBinding = new SerialBinding(javaCatalog, String.class);
        EntryBinding valueBinding = new SerialBinding(javaCatalog, CrawlUrl.class);
        // Create a java.util.Map view of the underlying Database.
        pendingUrisDB = new StoredMap(database, keyBinding, valueBinding, true);
    }

    @Override
    public CrawlUrl getNext() throws Exception {
        // Take the first entry the map hands us and delete it. Note that
        // StoredMap iterates in the B-tree's key order, not insertion order,
        // so this frontier is not strictly FIFO.
        CrawlUrl result = null;
        if (!pendingUrisDB.isEmpty()) {
            Entry<String, CrawlUrl> entry =
                    (Entry<String, CrawlUrl>) pendingUrisDB.entrySet().iterator().next();
            result = entry.getValue();
            delete(entry.getKey());
        }
        return result;
    }

    @Override
    public boolean putUrl(CrawlUrl url) throws Exception {
        put(url.getOriUrl(), url);
        return true;
    }

    @Override
    protected void put(Object key, Object value) {
        pendingUrisDB.put(key, value);
    }

    @Override
    protected Object get(Object key) {
        return pendingUrisDB.get(key);
    }

    @Override
    protected Object delete(Object key) {
        return pendingUrisDB.remove(key);
    }

    // Derives the database key from a URL; any digest, MD5 included, could be
    // used here to shorten the keys. For now the URL itself is the key.
    private String calculateUrl(String url) {
        return url;
    }

    public boolean contains(CrawlUrl url) {
        return pendingUrisDB.containsKey(url.getOriUrl());
    }

    public boolean isEmpty() {
        return pendingUrisDB.isEmpty();
    }

    // Simple smoke test.
    public static void main(String[] args) {
        BDBFrontier bDBFrontier = new BDBFrontier("D:\\bdb");
        CrawlUrl url = new CrawlUrl();
        url.setOriUrl("http://www.baidu.com");
        try {
            bDBFrontier.putUrl(url);
            System.out.println(bDBFrontier.getNext().getOriUrl());
            bDBFrontier.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
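
calculateUrl above is left as a pass-through. A possible MD5 version, in the spirit of its comment (my sketch, untested against the rest of the code):

import java.security.MessageDigest;

// Hypothetical replacement for calculateUrl: digest the URL so keys have a
// fixed, short length regardless of URL length.
private String calculateUrl(String url) {
    try {
        MessageDigest md = MessageDigest.getInstance("MD5");
        byte[] digest = md.digest(url.getBytes("UTF-8"));
        StringBuilder hex = new StringBuilder();
        for (byte b : digest) {
            hex.append(String.format("%02x", b));
        }
        return hex.toString();
    } catch (Exception e) {
        return url; // fall back to the raw URL as the key
    }
}

Note that if keys are digested on the way in, contains() must digest them the same way, or lookups and inserts will disagree about the key.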

Wrapping the Pending and Visited URL Sets
import bdb.BDBFrontier;
import bdb.CrawlUrl;
import hashfilter.SimpleBloomFilter;

public class NewLinkQueue {

    // Visited URLs go into the Bloom filter; pending URLs into Berkeley DB.
    private static SimpleBloomFilter visitedUrl = new SimpleBloomFilter();
    private static BDBFrontier unvisitedUrl = new BDBFrontier("D:\\bdb");

    public static Object unvisitedUrlDeQueue() throws Exception {
        CrawlUrl next = unvisitedUrl.getNext();
        return next == null ? null : next.getOriUrl();
    }

    public static void addUnvisitedUrl(String url) {
        if (url == null || url.trim().equals("")) {
            return;
        }
        CrawlUrl crawlUrl = new CrawlUrl();
        crawlUrl.setOriUrl(url);
        // Enqueue only URLs that are neither pending nor already visited.
        if (!unvisitedUrl.contains(crawlUrl) && !visitedUrl.contains(crawlUrl)) {
            try {
                unvisitedUrl.putUrl(crawlUrl);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    public static boolean unvisitedUrlIsEmpty() {
        return unvisitedUrl.isEmpty();
    }

    public static void addVisitedUrl(String url) {
        CrawlUrl crawlUrl = new CrawlUrl();
        crawlUrl.setOriUrl(url);
        visitedUrl.add(crawlUrl);
    }
}
Downloading Web Pages
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

public class DownLoadFile {

    private String filePath;
    private final CloseableHttpClient httpClient;

    DownLoadFile() {
        filePath = null;
        // 10-second connect and socket timeouts so one dead server cannot
        // stall the whole crawl.
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(10000)
                .setSocketTimeout(10000)
                .build();
        httpClient = HttpClients.custom().setDefaultRequestConfig(config).build();
    }

    // Build a file name for the saved page from its URL and content type,
    // replacing characters that are illegal in file names.
    public String getFileNameByUrl(String url, String contentType) {
        url = url.replaceFirst("https?://", "");
        if (contentType.indexOf("html") != -1) {
            // text/html pages get an .html suffix.
            return url.replaceAll("[\\?/:|<>\"]", "_") + ".html";
        }
        // Everything else keeps the subtype of its content type as suffix.
        return url.replaceAll("[\\?/:|<>\"]", "_") + "."
                + contentType.substring(contentType.lastIndexOf("/") + 1);
    }

    // Download the page the URL points to and return the local file path.
    public String downloadFile(String url) {
        System.out.println("link:" + url);
        try {
            HttpGet httpGet = new HttpGet(url);
            HttpResponse response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                String contentType = response.getFirstHeader("Content-Type").getValue();
                System.out.println("Content-Type: " + contentType);
                filePath = "E:\\temp\\" + getFileNameByUrl(url, contentType);
                System.out.println("File path: " + filePath);
                // The body is copied as raw bytes, so no charset detection is
                // needed at this stage.
                saveToLocal(response.getEntity().getContent(), filePath);
            } else {
                System.err.println("Method failed: " + response.getStatusLine().getStatusCode());
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return filePath;
    }

    // Copy the response body to a local file in 1 KB chunks, writing only the
    // bytes actually read on each pass.
    private void saveToLocal(InputStream responseBody, String filePath) throws IOException {
        FileOutputStream outputStream = new FileOutputStream(new File(filePath));
        byte[] b = new byte[1024];
        int length;
        while ((length = responseBody.read(b)) != -1) {
            outputStream.write(b, 0, length);
        }
        responseBody.close();
        outputStream.close();
    }
}
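
The original draft also experimented, in code that was left commented out, with sniffing the page charset from the response headers and the first bytes of the body. Since the page is saved as raw bytes, that step is unnecessary here, but with HttpClient 4.3+ the header-based part reduces to a few lines; a hedged sketch, assuming response and its entity are in scope as in downloadFile above:

import java.nio.charset.Charset;
import org.apache.http.HttpEntity;
import org.apache.http.entity.ContentType;

// Inside downloadFile, after the 200-OK check (illustrative only):
HttpEntity entity = response.getEntity();
Charset charset = ContentType.getOrDefault(entity).getCharset();
if (charset == null) {
    // No charset in the Content-Type header; fall back to a default, or
    // sniff the <meta charset=...> tag as the original draft attempted.
    charset = Charset.forName("UTF-8");
}
System.out.println("Detected charset: " + charset);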
Extracting Links
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.util.HashSet;
import java.util.Set;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;

public class HtmlParserTool {

    // Parse the saved HTML file and collect the URLs of its <a> and <frame> tags.
    public static Set<String> extractLinks(String filePath) {
        Set<String> links = new HashSet<String>();
        // Accept <frame src=...> tags as well as <a> tags; without the frame
        // predicate, the frame branch below could never match.
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
        OrFilter lastFilter = new OrFilter();
        lastFilter.setPredicates(new NodeFilter[] { frameFilter, linkFilter });
        try {
            // Read the saved file into one string, then hand it to HtmlParser.
            StringBuffer sb = new StringBuffer();
            BufferedReader br = new BufferedReader(new FileReader(filePath));
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line);
            }
            br.close();
            Parser parser = Parser.createParser(sb.toString(), "utf-8");
            NodeList nodeList = parser.parse(lastFilter);
            Node[] nodes = nodeList.toNodeArray();
            for (int i = 0; i < nodes.length; i++) {
                if (nodes[i] instanceof LinkTag) {
                    // <a> tag
                    LinkTag linkNode = (LinkTag) nodes[i];
                    links.add(linkNode.getLink());
                } else {
                    // <frame> tag: pull the src attribute out of the raw tag
                    // text, e.g. <frame src="test.html"/>. This string surgery
                    // is fragile and assumes the src="..." form.
                    String frame = nodes[i].getText();
                    int start = frame.indexOf("src");
                    int end = frame.indexOf(" ", start);
                    if (end == -1) {
                        end = frame.indexOf(">");
                    }
                    String frameUrl = frame.substring(start + 5, end - 1);
                    links.add(frameUrl);
                }
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return links;
    }
}
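
One gap worth flagging: LinkTag.getLink() can return relative URLs such as test.html, which the downloader cannot fetch directly. A small helper for resolving them against the page's own URL; this is my addition, not code from the original post:

import java.net.MalformedURLException;
import java.net.URL;

public class UrlResolver {
    // Resolve a possibly-relative link against the URL of the page it was
    // found on; returns null when no valid absolute URL can be formed.
    public static String resolve(String pageUrl, String link) {
        try {
            return new URL(new URL(pageUrl), link).toString();
        } catch (MalformedURLException e) {
            return null;
        }
    }
}

The crawling loop below would then enqueue UrlResolver.resolve(visitUrl, link) rather than the raw link.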

The Main Program
import java.util.Set;

public class MyCrawler {

    // Seed the frontier with the starting URLs.
    private void initCrawlerWithSeeds(String[] seeds) {
        for (int i = 0; i < seeds.length; i++) {
            NewLinkQueue.addUnvisitedUrl(seeds[i]);
        }
    }

    public void crawling(String[] seeds) {
        // A LinkFilter could be installed here to keep the crawl inside one
        // site, e.g. accepting only URLs that start with http://www.baidu.com.
        initCrawlerWithSeeds(seeds);
        DownLoadFile downLoader = new DownLoadFile();
        while (!NewLinkQueue.unvisitedUrlIsEmpty()) {
            try {
                // Dequeue the next pending URL.
                String visitUrl = (String) NewLinkQueue.unvisitedUrlDeQueue();
                System.out.println("Fetched pending URL: " + visitUrl);
                if (visitUrl == null) {
                    continue;
                }
                // Download the page, then mark the URL as visited.
                String filePath = downLoader.downloadFile(visitUrl);
                NewLinkQueue.addVisitedUrl(visitUrl);
                // Extract the page's links and enqueue the new ones.
                Set<String> links = HtmlParserTool.extractLinks(filePath);
                System.out.println("Links found on the page: " + links.size());
                for (String link : links) {
                    NewLinkQueue.addUnvisitedUrl(link);
                    System.out.println(link);
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    public static void main(String[] args) {
        MyCrawler crawler = new MyCrawler();
        crawler.crawling(new String[] { "http://www.baidu.com" });
        System.out.println("done");
    }
}

References: Write Your Own Web Crawler (《自己动手写网络爬虫》), the Berkeley DB reference manual, and others.