简易的网络爬虫

来源:互联网 发布:人工智能程序语言 编辑:程序博客网 时间:2024/05/23 00:01

程序用到的类库:

commons-logging-1.1.1.jar

htmlparser.jar

httpclient-4.1.jar

httpcore-4.1.jar



package 简易网络爬虫;

import java.io.IOException;
import java.util.Set;

import org.apache.http.client.ClientProtocolException;
import org.htmlparser.util.ParserException;

/**
 * Entry point of the simple crawler. Seeds the URL queue with a root URL,
 * then crawls breadth-first: dequeue a URL, download it, mark it visited,
 * extract its out-links and enqueue the new ones — until the queue drains
 * or the visited-page limit is reached.
 */
public class Crawer {

    /** Crawl stops once more than this many pages have been visited. */
    private static final int MAX_PAGES = 50;

    /**
     * Runs a crawl starting from the given root URL.
     *
     * @param rootUrl the seed URL the crawl starts from
     */
    public static void crawlering(String rootUrl) throws ClientProtocolException, ParserException, IOException {
        // Initialize the seed and run the crawl loop.
        initCrawlerUrl(rootUrl);
    }

    /**
     * Seeds the queue and processes URLs breadth-first.
     */
    public static void initCrawlerUrl(String rootUrl) throws ClientProtocolException, IOException, ParserException {
        LinkDB.enQueue(rootUrl);
        // Accept every extracted URL; narrow this filter to stay inside one site.
        LinkFilter linkFilter = new LinkFilter() {
            public boolean accept(String url) {
                return true;
            }
        };
        while (!LinkDB.isQueueEmpty() && LinkDB.getVisitedUrl() <= MAX_PAGES) {
            String url = LinkDB.deQueue();
            FileDownLoader downLoader = new FileDownLoader();
            // Save pages as text ("html" mode).
            downLoader.setDownType("html");
            downLoader.downloadFile(url);
            LinkDB.addVisitedUrl(url);
            Set<String> links = ExtractLink.extractLinks(url, linkFilter);
            for (String link : links) {
                // BUG FIX: the original enqueued every extracted link
                // unconditionally, so the same URL could sit in the queue
                // (and be downloaded) many times. Skip links already queued.
                // (Already-visited URLs can still be re-queued here because
                // LinkDB exposes no visited-membership check.)
                if (!LinkDB.isQueueContains(link)) {
                    LinkDB.enQueue(link);
                }
            }
        }
    }

    public static void main(String[] args) throws ClientProtocolException, ParserException, IOException {
        // crawlering is static — call it on the class, not via an instance.
        Crawer.crawlering("http://dblp.uni-trier.de/");
    }
}

package 简易网络爬虫;

import java.util.HashSet;
import java.util.Set;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Extracts URLs (anchor links and frame sources) from a web page using
 * the HTMLParser library.
 */
public class ExtractLink {

    /**
     * Pulls the quoted value of the first {@code src} attribute out of a
     * raw tag string such as {@code frame src="menu.html" noresize>}.
     *
     * @return the src value, or "" when no quoted src attribute is found
     */
    public static String getFrameSrc(String text) {
        int srcPos = text.indexOf("src");
        if (srcPos < 0) {
            return ""; // no src attribute at all (original threw here)
        }
        // BUG FIX: the original did index arithmetic on the raw string and
        // dropped the last character of the URL when the attribute was
        // terminated by '>' instead of a space. Parse the quoted value
        // between the two quote characters directly instead.
        int open = text.indexOf('"', srcPos);
        int close = (open < 0) ? -1 : text.indexOf('"', open + 1);
        if (close < 0) {
            return "";
        }
        return text.substring(open + 1, close);
    }

    /**
     * Downloads and parses the page at {@code url}, returning every link
     * accepted by {@code linkFilter} plus every frame source.
     *
     * @throws ParserException if the page cannot be fetched or parsed
     */
    public static Set<String> extractLinks(String url, LinkFilter linkFilter) throws ParserException {
        Set<String> links = new HashSet<String>();
        Parser parser = new Parser(url);
        // Matches raw <frame src=...> tags, which HTMLParser does not
        // model as LinkTag nodes.
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().contains("frame src=");
            }
        };
        OrFilter orFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        // BUG FIX: the original built orFilter but then matched only
        // LinkTag nodes, so frame sources were never extracted and the
        // else-branch below was dead code. Use the combined filter.
        NodeList nodeList = parser.extractAllNodesThatMatch(orFilter);
        for (int i = 0; i < nodeList.size(); i++) {
            Node node = nodeList.elementAt(i);
            if (node instanceof LinkTag) {
                String link = ((LinkTag) node).getLink();
                if (linkFilter.accept(link)) {
                    links.add(link);
                }
            } else {
                String frameSrc = getFrameSrc(node.getText());
                if (!frameSrc.isEmpty()) {
                    links.add(frameSrc);
                }
            }
        }
        return links;
    }
}

package 简易网络爬虫;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;

/**
 * Downloads a URL with Apache HttpClient and saves the body under
 * F:\download — line by line as text, or as raw bytes for "zip" mode.
 */
public class FileDownLoader {

    /** Download mode: "zip" saves raw bytes, anything else saves as text. */
    private String downType;

    public String getDownType() {
        return downType;
    }

    public void setDownType(String downType) {
        this.downType = downType;
    }

    /**
     * Builds a file name from a URL and its MIME content type, replacing
     * characters that are illegal in Windows file names with '_'.
     */
    public String getFileNameByUrl(String url, String contentType) {
        // BUG FIX: the original unconditionally stripped exactly 7 characters
        // ("http://".length()), which corrupts https:// and scheme-less URLs.
        // Strip whatever scheme prefix is actually present.
        int schemeEnd = url.indexOf("://");
        if (schemeEnd >= 0) {
            url = url.substring(schemeEnd + 3);
        }
        String safeName = url.replaceAll("[\\?/:*|<>\"]", "_");
        if (contentType.indexOf("html") != -1) {
            return safeName + ".html";
        }
        // e.g. "application/zip" -> extension "zip"
        return safeName + "." + contentType.substring(contentType.lastIndexOf("/") + 1);
    }

    /**
     * Saves the stream to filePath as text, one line at a time.
     * BUG FIX: the original leaked the reader/writer/stream when copying
     * threw; close them in a finally block.
     * NOTE(review): uses the platform default charset, as the original did
     * — a page in another encoding will be transcoded lossily.
     */
    public void saveToLocal(InputStream is, String filePath) throws IOException {
        BufferedReader br = new BufferedReader(new InputStreamReader(is));
        BufferedWriter bw = null;
        try {
            bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(filePath))));
            String line;
            while ((line = br.readLine()) != null) {
                bw.write(line + "\n");
            }
        } finally {
            if (bw != null) {
                bw.close();
            }
            br.close(); // closing the reader also closes is
        }
    }

    /**
     * Saves the stream to filePath as raw bytes (for binary content such
     * as zip files). Streams are closed even if copying fails.
     */
    public void saveToLocal2(InputStream is, String filePath) throws IOException {
        FileOutputStream fos = null;
        try {
            fos = new FileOutputStream(new File(filePath));
            byte[] buffer = new byte[1024];
            int read;
            while ((read = is.read(buffer)) != -1) {
                fos.write(buffer, 0, read);
            }
        } finally {
            if (fos != null) {
                fos.close();
            }
            is.close();
        }
    }

    /**
     * Downloads the page at {@code url} and stores it locally.
     * Non-200 responses are silently skipped (original behavior).
     */
    public void downloadFile(String url) throws ClientProtocolException, IOException {
        HttpClient httpClient = new DefaultHttpClient();
        HttpGet getMethod = new HttpGet(url);
        System.out.println(url);
        getMethod.setHeader("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");
        HttpResponse response = httpClient.execute(getMethod);
        if (response.getStatusLine().getStatusCode() == 200) {
            HttpEntity entity = response.getEntity();
            InputStream is = entity.getContent();
            if (downType != null) {
                String filePath = "F:\\download\\"
                        + getFileNameByUrl(url, entity.getContentType().getValue());
                if (downType.equals("zip")) {
                    saveToLocal2(is, filePath);
                } else {
                    saveToLocal(is, filePath);
                }
            } else {
                // BUG FIX: the original left the entity stream open when no
                // download type was configured, leaking the connection.
                is.close();
            }
        }
    }

    public static void main(String[] args) throws ClientProtocolException, IOException {
        FileDownLoader fdl = new FileDownLoader();
        fdl.setDownType("html");
        fdl.downloadFile("http://www.baidu.com");
    }
}

package 简易网络爬虫;/** * URL过滤 * @author Administrator * */public interface LinkFilter {//抽取指定条件的URLpublic boolean accept(String url);}

package 简易网络爬虫;

import java.util.HashSet;
import java.util.Set;

/**
 * Shared crawl state: the set of already-downloaded URLs and the FIFO
 * queue of URLs still waiting to be downloaded.
 * NOTE(review): all state is static and unsynchronized — single-threaded
 * use only, as in the original.
 */
public class LinkDB {

    // URLs that have already been downloaded.
    private static Set<String> visitedUrl = new HashSet<String>();

    // URLs waiting to be downloaded, in discovery order.
    private static Queue<String> unvisitedUrl = new Queue<String>();

    /** Records a URL as downloaded. */
    public static void addVisitedUrl(String url) {
        visitedUrl.add(url);
    }

    /**
     * Enqueues a URL for crawling.
     * BUG FIX: the original enqueued every URL unconditionally, so pages
     * already visited — or already waiting in the queue — were downloaded
     * over and over. Nulls and duplicates are now skipped.
     */
    public static void enQueue(String url) {
        if (url != null && !visitedUrl.contains(url) && !unvisitedUrl.contains(url)) {
            unvisitedUrl.enQueue(url);
        }
    }

    /** Removes and returns the next URL to download. */
    public static String deQueue() {
        return unvisitedUrl.deQueue();
    }

    public static boolean isQueueEmpty() {
        return unvisitedUrl.isEmpty();
    }

    public static boolean isQueueContains(String url) {
        return unvisitedUrl.contains(url);
    }

    /**
     * Number of URLs visited so far.
     * (Historically misnamed — it returns a count, not a URL — but the
     * name is kept for caller compatibility.)
     */
    public static int getVisitedUrl() {
        return visitedUrl.size();
    }
}

package 简易网络爬虫;

import java.util.ArrayDeque;
import java.util.Deque;

/**
 * Minimal FIFO queue of items awaiting processing.
 *
 * Backed by ArrayDeque, the standard library's recommended deque
 * implementation (faster than the LinkedList the original used).
 * Null elements are rejected by ArrayDeque; the crawler never enqueues
 * null, so callers are unaffected.
 */
public class Queue<T> {

    private Deque<T> queue = new ArrayDeque<T>();

    /**
     * Removes and returns the head of the queue.
     * @throws java.util.NoSuchElementException if the queue is empty
     *         (same behavior as the original removeFirst on LinkedList)
     */
    public T deQueue() {
        return queue.removeFirst();
    }

    /** Appends an element at the tail. */
    public void enQueue(T t) {
        queue.addLast(t);
    }

    /** Linear-time membership test, as before. */
    public boolean contains(T t) {
        return queue.contains(t);
    }

    public boolean isEmpty() {
        return queue.isEmpty();
    }
}


0 0