Java网络爬虫crawler4j学习笔记<6> WebURL类

来源:互联网 发布:java清除jemetercookie 编辑:程序博客网 时间:2024/05/17 01:45

源代码分析

package edu.uci.ics.crawler4j.url;

import java.io.Serializable;

import com.sleepycat.persist.model.Entity;
import com.sleepycat.persist.model.PrimaryKey;

/**
 * A URL seen by the crawler together with the bookkeeping data attached to it
 * (document ids, crawl depth, anchor text, priority, ...).
 *
 * <p>The class is annotated as a Berkeley DB persistent entity whose primary key
 * is the URL string. {@link #equals(Object)} and {@link #hashCode()} are based on
 * the URL string only.
 *
 * <p>{@link #setURL(String)} also derives {@code domain}, {@code subDomain} and
 * {@code path} from the URL string.
 */
@Entity // Berkeley DB annotation
public class WebURL implements Serializable {

  private static final long serialVersionUID = 1L;

  @PrimaryKey
  private String url;       // URL of this page
  private int docid;        // document id assigned to this page
  private int parentDocid;  // if a link to b was found on page a, a's docid is b's parentDocid
  private String parentUrl; // if a link to b was found on page a, a's URL is b's parentUrl
  private short depth;      // crawl depth, counted from 0
  private String domain;    // main domain of this page
  private String subDomain; // sub-domain of this page
  private String path;      // resource path of this page inside its site
  private String anchor;    // text inside the hyperlink tag
  private byte priority;    // crawl priority; a lower value means a higher priority
  private String tag;       // tag in which the URL was found

  /**
   * @return unique document id assigned to this Url.
   */
  public int getDocid() {
    return docid;
  }

  public void setDocid(int docid) {
    this.docid = docid;
  }

  /**
   * @return Url string
   */
  public String getURL() {
    return url;
  }

  /**
   * Stores the URL string and recomputes {@code domain}, {@code subDomain} and
   * {@code path} from it.
   *
   * <p>The host part is the text between the leading {@code //} (or the start of
   * the string when there is none) and the first {@code /}, {@code ?} or
   * {@code #}. It is split on dots: the last two labels form the domain, or the
   * last three when the two-label candidate is listed in {@link TLDList}
   * (e.g. {@code www.sina.com.cn} gives {@code sina.com.cn}). The remaining
   * leading labels form the sub-domain. A port or user-info, if present, is
   * not stripped from the host part.
   *
   * <p>The path is whatever follows the host part, up to (not including) the
   * first {@code ?} or {@code #}.
   *
   * @param url the URL string; must not be {@code null}
   * @throws NullPointerException if {@code url} is {@code null}; the object is
   *         left unchanged in that case
   */
  public void setURL(String url) {
    if (url == null) {
      throw new NullPointerException("url must not be null");
    }
    this.url = url;

    int hostStart = indexAfterAuthorityMarker(url);
    // The host ends at the first '/', '?' or '#', not only at '/': a URL such as
    // "http://www.baidu.com?a=b" has no path but must still yield "baidu.com".
    int hostEnd = firstIndexOfAny(url, hostStart, "/?#");
    if (hostEnd < 0) {
      // No delimiter at all, e.g. "http://www.baidu.com".
      hostEnd = url.length();
    }

    String host = url.substring(hostStart, hostEnd);
    String[] parts = host.split("\\.");

    domain = host;
    subDomain = "";
    if (parts.length > 2) {
      // By default the domain is the last two labels, e.g. baidu.com in www.baidu.com.
      String candidate = parts[parts.length - 2] + "." + parts[parts.length - 1];
      int limit = 2;
      // Some domains need three labels, e.g. sina.com.cn in www.sina.com.cn.
      if (TLDList.getInstance().contains(candidate)) {
        candidate = parts[parts.length - 3] + "." + candidate;
        limit = 3;
      }

      // Everything in front of the domain labels is the sub-domain.
      StringBuilder sub = new StringBuilder();
      for (int i = 0; i < parts.length - limit; i++) {
        if (sub.length() > 0) {
          sub.append('.');
        }
        sub.append(parts[i]);
      }

      domain = candidate;
      subDomain = sub.toString();
    }

    // Query string and fragment are not part of the path.
    int pathEnd = firstIndexOfAny(url, hostEnd, "?#");
    path = url.substring(hostEnd, pathEnd < 0 ? url.length() : pathEnd);
  }

  /**
   * Returns the index just past the {@code //} that introduces the host part, or
   * 0 when the URL has no such marker. A {@code //} only counts when it is the
   * first of {@code /}, {@code ?}, {@code #} to appear, so a double slash inside
   * a path, query or fragment is not mistaken for it.
   */
  private static int indexAfterAuthorityMarker(String url) {
    int firstDelimiter = firstIndexOfAny(url, 0, "/?#");
    if (firstDelimiter >= 0 && url.startsWith("//", firstDelimiter)) {
      return firstDelimiter + 2;
    }
    return 0;
  }

  /**
   * Returns the index of the first character of {@code text}, at or after
   * {@code from}, that occurs in {@code delimiters}; -1 when there is none.
   */
  private static int firstIndexOfAny(String text, int from, String delimiters) {
    for (int i = from; i < text.length(); i++) {
      if (delimiters.indexOf(text.charAt(i)) >= 0) {
        return i;
      }
    }
    return -1;
  }

  /**
   * @return
   *      unique document id of the parent page. The parent page is the
   *      page in which the Url of this page is first observed.
   */
  public int getParentDocid() {
    return parentDocid;
  }

  public void setParentDocid(int parentDocid) {
    this.parentDocid = parentDocid;
  }

  /**
   * @return
   *      url of the parent page. The parent page is the page in which
   *      the Url of this page is first observed.
   */
  public String getParentUrl() {
    return parentUrl;
  }

  public void setParentUrl(String parentUrl) {
    this.parentUrl = parentUrl;
  }

  /**
   * @return
   *      crawl depth at which this Url is first observed. Seed Urls
   *      are at depth 0. Urls that are extracted from seed Urls are at depth 1, etc.
   */
  public short getDepth() {
    return depth;
  }

  public void setDepth(short depth) {
    this.depth = depth;
  }

  /**
   * @return
   *      domain of this Url. For 'http://www.example.com/sample.htm', domain will be 'example.com'
   */
  public String getDomain() {
    return domain;
  }

  /**
   * @return
   *      sub-domain of this Url. For 'http://www.example.com/sample.htm', sub-domain will be
   *      'www'; it is the empty string when the host has no labels in front of the domain.
   */
  public String getSubDomain() {
    return subDomain;
  }

  /**
   * @return
   *      path of this Url, including the leading slash. For
   *      'http://www.example.com/sample.htm', path will be '/sample.htm'
   */
  public String getPath() {
    return path;
  }

  public void setPath(String path) {
    this.path = path;
  }

  /**
   * @return
   *      anchor string. For example, in {@code <a href="example.com">A sample anchor</a>}
   *      the anchor string is 'A sample anchor'
   */
  public String getAnchor() {
    return anchor;
  }

  public void setAnchor(String anchor) {
    this.anchor = anchor;
  }

  /**
   * @return priority for crawling this URL. A lower number results in higher priority.
   */
  public byte getPriority() {
    return priority;
  }

  public void setPriority(byte priority) {
    this.priority = priority;
  }

  /**
   * @return tag in which this URL is found, like 'a', 'href', ...
   */
  public String getTag() {
    return tag;
  }

  public void setTag(String tag) {
    this.tag = tag;
  }

  /**
   * Hash of the URL string; 0 when no URL has been set yet, so that an
   * uninitialised instance can still be hashed without throwing.
   */
  @Override
  public int hashCode() {
    return url == null ? 0 : url.hashCode();
  }

  /**
   * Two instances are equal when they are of the same class and carry the same
   * non-null URL string. An instance without a URL is equal only to itself.
   */
  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }

    WebURL otherUrl = (WebURL) o;
    return url != null && url.equals(otherUrl.getURL());
  }

  /**
   * @return the URL string as passed to {@link #setURL(String)}
   */
  @Override
  public String toString() {
    return url;
  }
}

测试

这里写图片描述

0 0
原创粉丝点击