Java web crawler


Note that this program needs htmlparser.jar, a library for parsing HTML. I actually picked this little program up on the Internet. The main idea is to use Lucene's full-text search to build a Baidu-like search engine. Nothing ambitious: none of the core ranking algorithms are involved. You just write a web crawler front end, use it to fetch pages, build an index and a word-segmentation dictionary with Lucene, and then write a user interface on top. Ideally your crawler is powerful, and your server needs to be powerful enough too. I tried running this program in Eclipse and it hung. Some sites also block crawlers.
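For the Lucene half of that plan, here is a minimal indexing sketch. It is not part of the original program: it assumes Lucene (roughly the 7.x API) on the classpath, and the index directory "indexdir" and the field names are made up for illustration. Note that StandardAnalyzer splits Chinese text character by character; a dedicated analyzer such as SmartChineseAnalyzer (from lucene-analyzers-smartcn) would be a better fit for the word-segmentation goal.

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;

public class LuceneIndexer {
    // Write one document per crawled page into an on-disk index.
    // "indexdir" and the field names are arbitrary choices for this sketch.
    public static void indexPage(String url, String title, String body) throws Exception {
        FSDirectory dir = FSDirectory.open(Paths.get("indexdir"));
        IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
        IndexWriter writer = new IndexWriter(dir, config);
        Document doc = new Document();
        doc.add(new StringField("url", url, Field.Store.YES));   // stored, not tokenized
        doc.add(new TextField("title", title, Field.Store.YES)); // tokenized and stored
        doc.add(new TextField("body", body, Field.Store.NO));    // tokenized, searchable only
        writer.addDocument(doc);
        writer.close();
    }
}

Opening and closing the IndexWriter once per page is wasteful; in a real crawl you would keep one writer open for the whole run and close it at the end.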

 

package cn.bit.Main;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import org.htmlparser.Node;
import org.htmlparser.Tag;
import org.htmlparser.nodes.RemarkNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.*;
import org.htmlparser.Parser;
import org.htmlparser.filters.StringFilter;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.NodeTreeWalker;
import org.htmlparser.util.ParserException;
import java.util.Queue;
import java.util.LinkedList;

public class Splider implements Runnable {

    boolean search_key_words = false;
    int count = 0;
    int limitsite = 10;  // site limit (not enforced in this version)
    int countsite = 1;   // site counter (not used in this version)
    String keyword = "中国"; // keyword to search for
    Parser parser = new Parser();
    String startsite = ""; // starting site of the crawl
    SearchResultBean srb; // holds the result for the page being parsed
    List<SearchResultBean> resultlist = new ArrayList<SearchResultBean>(); // pages containing the keyword
    List<String> searchedsite = new ArrayList<String>(); // sites already searched
    Queue<String> linklist = new LinkedList<String>(); // links waiting to be parsed

    HashMap<String, ArrayList<String>> disallowListCache = new HashMap<String, ArrayList<String>>();

    public Splider(String keyword, String startsite) {
        this.keyword = keyword;
        this.startsite = startsite;
        linklist.add(startsite);
    }
    
    public void run() {
        search(linklist);
    }

    public void search(Queue<String> queue) {
        while (!queue.isEmpty()) {
            // Take the next URL off the queue.
            String url = queue.remove();
            try {
                if (!isSearched(searchedsite, url)) {
                    if (isRobotAllowed(new URL(url))) // check robots.txt rules for this link
                        processHtml(url);
                    else
                        System.out.println("this page is disallowed to search");
                }
            } catch (Exception ex) {
                ex.printStackTrace();
            }
        }
    }
   
    /**
     * Parse the HTML page at the given URL.
     * @param url
     * @throws Exception
     */
    public void processHtml(String url) throws Exception {
        searchedsite.add(url);
        count = 0;
        System.out.println("searching ... :" + url);
        parser.setURL(url);
        parser.setEncoding("GBK"); // the target pages here are GBK-encoded
        URLConnection uc = parser.getConnection();
        uc.connect();
        srb = new SearchResultBean(); // a fresh bean for each page
        NodeIterator nit = parser.elements();
        while (nit.hasMoreNodes()) {
            Node node = nit.nextNode();
            parserNode(node);
        }
        srb.setKeywords(keyword);
        srb.setUrl(url);
        srb.setCount_key_words(count);
        resultlist.add(srb);
        System.out.println("count keywords is :" + count);

        for (SearchResultBean bean : resultlist) {
            System.out.println(bean.getUrl());
        }
        System.out.println("----------------------------------------------");
    }
   
    /**
     * Recursively process the children of an HTML tag.
     * @param tag
     * @throws Exception
     */
    public void dealTag(Tag tag) throws Exception {
        NodeList list = tag.getChildren();
        if (list != null) {
            NodeIterator it = list.elements();
            while (it.hasMoreNodes()) {
                Node node = it.nextNode();
                parserNode(node);
            }
        }
    }

    /**
     * Process a single HTML node.
     * @param node
     * @throws Exception
     */
    public void parserNode(Node node) throws Exception {
        if (node instanceof TextNode) { // text node: look for the keyword
            TextNode sNode = (TextNode) node;
            StringFilter sf = new StringFilter(keyword, false);
            search_key_words = sf.accept(sNode);
            if (search_key_words) {
                count++;
            }
        } else if (node instanceof Tag) { // tag node
            Tag atag = (Tag) node;
            if (atag instanceof TitleTag) { // TITLE tag: record the page title
                srb.setTitle(atag.getText());
            }
            if (atag instanceof LinkTag) { // link tag: queue the target URL
                LinkTag linkatag = (LinkTag) atag;
                checkLink(linkatag.getLink(), linklist);
            }
            dealTag(atag);
        } else if (node instanceof RemarkNode) { // HTML comment: ignore
        }
    }
  
    /**
     * Check whether a link should be added to the queue.
     */
    public void checkLink(String link, Queue<String> queue) {
        if (link != null && !link.equals("") && link.indexOf("#") == -1) {
            if (!link.startsWith("http://") && !link.startsWith("ftp://") && !link.startsWith("www.")) {
                link = "file:///" + link; // treat anything else as a local file
            } else if (link.startsWith("www.")) {
                link = "http://" + link;
            }
            if (queue.isEmpty()) {
                queue.add(link);
            } else {
                // Also compare against the variant with/without a trailing slash.
                String link_end_ = link.endsWith("/") ? link.substring(0, link.lastIndexOf("/")) : (link + "/");
                if (!queue.contains(link) && !queue.contains(link_end_)) {
                    queue.add(link);
                }
            }
        }
    }
  
    /**
     * Check whether a URL has already been crawled.
     * @param list
     * @param url
     * @return true if the URL (with or without a trailing slash) was seen before
     */
    public boolean isSearched(List<String> list, String url) {
        String url_end_ = "";
        if (url.endsWith("/")) {
            url_end_ = url.substring(0, url.lastIndexOf("/"));
        } else {
            url_end_ = url + "/";
        }
        if (list.size() > 0) {
            if (list.indexOf(url) != -1 || list.indexOf(url_end_) != -1) {
                return true;
            }
        }
        return false;
    }

    /**
     * Check whether the URL is allowed by the host's robots.txt.
     * @param urlToCheck
     * @return
     */
    private boolean isRobotAllowed(URL urlToCheck) {
        String host = urlToCheck.getHost().toLowerCase(); // host part of the URL
        System.out.println("^^^^^^" + host + "^^^^^^");
        // Look up the cached disallow list for this host.
        ArrayList<String> disallowList = disallowListCache.get(host);
        // Not cached yet: download robots.txt and cache its Disallow paths.
        if (disallowList == null) {
            disallowList = new ArrayList<String>();
            try {
                URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(robotsFileUrl.openStream()));
                // Read the robots file and collect the disallowed paths.
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.indexOf("Disallow:") == 0) { // a "Disallow:" rule
                        String disallowPath = line.substring("Disallow:".length());
                        // Strip a trailing comment, if any.
                        int commentIndex = disallowPath.indexOf("#");
                        if (commentIndex != -1) {
                            disallowPath = disallowPath.substring(0, commentIndex);
                        }
                        disallowPath = disallowPath.trim();
                        disallowList.add(disallowPath);
                    }
                }
                reader.close();
                for (Iterator<String> it = disallowList.iterator(); it.hasNext();) {
                    System.out.println("Disallow is :" + it.next());
                }
                // Cache the disallowed paths for this host.
                disallowListCache.put(host, disallowList);
            } catch (Exception e) {
                // No robots.txt at the site root: everything is allowed.
                return true;
            }
        }
        // Reject the URL if its path falls under any disallowed path.
        String file = urlToCheck.getFile();
        for (int i = 0; i < disallowList.size(); i++) {
            String disallow = disallowList.get(i);
            if (file.startsWith(disallow)) {
                return false;
            }
        }
        return true;
    }

    public static void main(String[] args) {
        Splider ph = new Splider("", "http://www.baidu.com");
        try {
            Thread search = new Thread(ph);
            search.start(); // run the crawl in its own thread
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}
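On the note above that some sites block crawlers: one frequent cause is requests that carry no User-Agent header. Below is a hedged sketch of a fetch helper that identifies itself and sets timeouts; the class name, agent string, and timeout values are illustrative choices, not part of the original program.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;

public class PoliteFetch {
    // Fetch a URL with an explicit User-Agent header. "MyCrawler/0.1"
    // is a placeholder name; use something that identifies your bot.
    public static String fetch(String url) throws Exception {
        URLConnection conn = new URL(url).openConnection();
        conn.setRequestProperty("User-Agent", "MyCrawler/0.1");
        conn.setConnectTimeout(5000); // avoid hanging on slow hosts
        conn.setReadTimeout(5000);
        StringBuilder sb = new StringBuilder();
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(conn.getInputStream()));
        String line;
        while ((line = reader.readLine()) != null) {
            sb.append(line).append('\n');
        }
        reader.close();
        return sb.toString();
    }
}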

----------------------------------------------------------------

The bean below is used to store a single search result.

package cn.bit.Main;

public class SearchResultBean {
    String url = "";
    String title = "";
    String keywords = "";
    int count_key_words = 0;

    public int getCount_key_words() {
        return count_key_words;
    }

    public void setCount_key_words(int count_key_words) {
        this.count_key_words = count_key_words;
    }

    public String getKeywords() {
        return keywords;
    }

    public void setKeywords(String keywords) {
        this.keywords = keywords;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }
}
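To round out the search-engine idea from the top of the post, here is a matching query-side sketch under the same assumptions: it reads the hypothetical "indexdir" index written by the earlier indexing sketch, and QueryParser ships in the separate lucene-queryparser artifact.

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class LuceneSearch {
    public static void main(String[] args) throws Exception {
        // Open the index written by the indexing sketch above.
        DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("indexdir")));
        IndexSearcher searcher = new IndexSearcher(reader);
        // Parse the user's keyword against the "title" field.
        Query query = new QueryParser("title", new StandardAnalyzer()).parse("中国");
        TopDocs hits = searcher.search(query, 10); // top 10 hits
        for (ScoreDoc sd : hits.scoreDocs) {
            Document doc = searcher.doc(sd.doc);
            System.out.println(doc.get("url") + " : " + doc.get("title"));
        }
        reader.close();
    }
}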
