Using an HTMLParser crawler to fetch the complete region information for a specified area

The crawler class. It seeds the queue with /zhejiang/hangzhoushi/, fetches each pending page over HTTP, and hands every line of HTML to geUrl, which collects district names and region codes and enqueues newly discovered links:

package com.hundsun.pc;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;

public class Url {

    // Base URL of the region-code site; relative paths from the queue are appended to it.
    final static String url = "http://www.diqudaima.com";

    public static void main(String[] args) {
        String visitor = null;
        try {
            LinkQueue queue = new LinkQueue();
            Url crawler = new Url();
            // Seed the crawl with the Hangzhou page.
            queue.addUnvisitedUrl("/zhejiang/hangzhoushi/");
            do {
                try {
                    HttpClient http = new DefaultHttpClient();
                    visitor = queue.unVisitedUrlDeQueue().toString();
                    System.out.println("Visiting: " + url + visitor);
                    HttpGet hg = new HttpGet(url + visitor);
                    HttpResponse hr = http.execute(hg);
                    HttpEntity he = hr.getEntity();
                    if (he != null) {
                        // The site serves Chinese text, so the stream is read as GBK.
                        InputStream is = he.getContent();
                        BufferedReader br = new BufferedReader(new InputStreamReader(is, "GBK"));
                        String line;
                        // Parse line by line; an <li> element that spans lines is missed.
                        while ((line = br.readLine()) != null) {
                            crawler.geUrl(line, queue);
                        }
                    }
                    queue.addVisitedUrl(visitor);
                    http.getConnectionManager().shutdown();
                    System.out.println("regions found: " + queue.getMap().size()
                            + "  links visited: " + queue.getVisitedUrl().size()
                            + "  failed links: " + queue.getErrorUrl().size());
                } catch (Exception e) {
                    // Record the failure and re-queue the URL for another try.
                    System.out.println("Request failed: " + url + visitor);
                    queue.adderrorUrl(visitor);
                    queue.addUnvisitedUrl(visitor);
                }
            } while (!queue.isUnvisitedUrlsEmpty());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // Extract <li> entries from one line of HTML. Each entry holds a district
    // link (its text becomes the key) and plain text carrying the region code.
    public void geUrl(String html, LinkQueue queue) {
        try {
            NodeFilter filter = new TagNameFilter("li");
            Parser p = new Parser();
            p.setInputHTML(html);
            NodeList list = p.extractAllNodesThatMatch(filter);
            for (int i = 0; i < list.size(); i++) {
                Node textnode = list.elementAt(i);
                NodeList listChildren = textnode.getChildren();
                String key = "";
                String value = "";
                for (int j = 0; j < listChildren.size(); j++) {
                    Node textnodeChildren = listChildren.elementAt(j);
                    if (textnodeChildren.getClass() == LinkTag.class) {
                        LinkTag nodeChildren = (LinkTag) textnodeChildren;
                        key = nodeChildren.getLinkText();
                        // Enqueue the sub-region page for a later visit.
                        queue.addUnvisitedUrl(nodeChildren.getLink());
                    } else {
                        // The literals below must stay in Chinese because they match
                        // the page text: "地区编码:" = region code, "邮编:" = postal code.
                        value = textnodeChildren.getText();
                        if (value.split("邮编").length > 1)
                            value = value.substring(value.indexOf("地区编码:") + 5, value.indexOf("邮编:"));
                        if (value.startsWith("[")) {
                            value = value.replace("[", "");
                            value = value.replace("]", "");
                        }
                        if (key.equals("")) {
                            // Leaf entries have no link, just "name[code]" in plain text.
                            String[] parts = value.split("\\[");
                            key = parts[0];
                            value = parts[1].replace("]", "");
                        }
                    }
                }
                System.out.println("key=" + key + "       value=" + value);
                if (!key.equals(""))
                    queue.getMap().put(key, value);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

Note: the original code called geUrl statically as Url.geUrl(line, queue), which does not compile since geUrl is an instance method; the crawler instance above restores the commented-out new Url() that the post evidently intended.
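To try the extraction in isolation, the short sketch below (not part of the original post) parses a hypothetical <li> fragment shaped like the listings on diqudaima.com and prints the nodes HTMLParser produces. The class name GeUrlDemo and the district name, href, and codes in the fragment are made-up placeholders; it assumes the same org.htmlparser library is on the classpath.

package com.hundsun.pc;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;

public class GeUrlDemo {
    public static void main(String[] args) throws Exception {
        // Hypothetical fragment in the shape the crawler expects:
        // a district link followed by plain text with the codes.
        String html = "<li><a href=\"/zhejiang/hangzhoushi/xihuqu/\">西湖区</a>"
                    + "[地区编码:330106 邮编:310000]</li>";
        Parser p = new Parser();
        p.setInputHTML(html);
        NodeFilter filter = new TagNameFilter("li");
        NodeList list = p.extractAllNodesThatMatch(filter);
        for (int i = 0; i < list.size(); i++) {
            Node li = list.elementAt(i);
            NodeList children = li.getChildren();
            for (int j = 0; j < children.size(); j++) {
                Node child = children.elementAt(j);
                if (child instanceof LinkTag) {
                    // The link text becomes the key; the href is enqueued.
                    LinkTag link = (LinkTag) child;
                    System.out.println("link text: " + link.getLinkText());
                    System.out.println("href:      " + link.getLink());
                } else {
                    // The plain text carries the region and postal codes.
                    System.out.println("text:      " + child.getText());
                }
            }
        }
    }
}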

The helper class. LinkQueue tracks visited, failed, and pending URLs, plus the map from region name to region code:

package com.hundsun.pc;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
 * Created by amosli on 14-7-9.
 */
public class LinkQueue {
    // URLs that have already been visited.
    private static Set<String> visitedUrl = new HashSet<String>();
    // URLs whose requests failed.
    private static Set<String> errorUrl = new HashSet<String>();
    // URLs waiting to be visited.
    private static Queue unVisitedUrl = new Queue();
    // Region name -> region code, filled in by Url.geUrl.
    private Map<String, String> map = new HashMap<String, String>();

    public static Queue getUnVisitedUrl() {
        return unVisitedUrl;
    }

    public static Set<String> getVisitedUrl() {
        return visitedUrl;
    }

    // Mark a URL as visited.
    public static void addVisitedUrl(String url) {
        visitedUrl.add(url);
    }

    public static void adderrorUrl(String url) {
        errorUrl.add(url);
    }

    // Forget that a URL was visited.
    public static void removeVisitedUrl(String url) {
        visitedUrl.remove(url);
    }

    // Take the next unvisited URL off the queue.
    public static Object unVisitedUrlDeQueue() {
        return unVisitedUrl.deQueue();
    }

    // Each URL is visited at most once: it must be non-blank, not yet
    // visited, and not already sitting in the queue.
    public static void addUnvisitedUrl(String url) {
        if (url != null && !url.trim().equals("")
                && !visitedUrl.contains(url) && !unVisitedUrl.contains(url))
            unVisitedUrl.enQueue(url);
    }

    // Number of URLs visited so far.
    public static int getVisitedUrlNum() {
        return visitedUrl.size();
    }

    // True when nothing is left to crawl.
    public static boolean isUnvisitedUrlsEmpty() {
        return unVisitedUrl.empty();
    }

    public Map<String, String> getMap() {
        return map;
    }

    public void setMap(Map<String, String> map) {
        this.map = map;
    }

    public static Set<String> getErrorUrl() {
        return errorUrl;
    }

    public static void setErrorUrl(Set<String> errorUrl) {
        LinkQueue.errorUrl = errorUrl;
    }
}
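The post never shows the Queue class that LinkQueue relies on; it is not java.util.Queue, as the enQueue/deQueue/empty/contains method names show. A minimal sketch that satisfies that interface, backed by a LinkedList, might look like the following (the field name and LinkedList choice are my own assumptions, not from the original post):

package com.hundsun.pc;

import java.util.LinkedList;

// Minimal FIFO queue with the method names LinkQueue expects.
public class Queue {
    private LinkedList<Object> queue = new LinkedList<Object>();

    // Append an element at the tail.
    public void enQueue(Object t) {
        queue.addLast(t);
    }

    // Remove and return the head element.
    public Object deQueue() {
        return queue.removeFirst();
    }

    // True if the queue holds no elements.
    public boolean empty() {
        return queue.isEmpty();
    }

    // True if the element is already queued; used to avoid duplicates.
    public boolean contains(Object t) {
        return queue.contains(t);
    }
}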


Sample run output (the original post showed a screenshot here): one key=.../value=... line per parsed entry, followed after each page by running counters of regions found, links visited, and failed links.


