Java网络爬虫crawler4j学习笔记<21> Page 类

来源:互联网 发布:js 质数 编辑:程序博客网 时间:2024/06/05 15:14

简介

Page 类解析 HttpClient 包中的 HttpEntity 对象,获取当前页面的信息,包括 URL(封装为 WebURL)、response 的信息(status code、response header 等)以及解析后的内容信息等。

源代码

package edu.uci.ics.crawler4j.crawler;

import java.nio.charset.Charset;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.entity.ContentType;
import org.apache.http.util.EntityUtils;

import edu.uci.ics.crawler4j.parser.ParseData;
import edu.uci.ics.crawler4j.url.WebURL;

/**
 * This class contains the data for a fetched and parsed page: its URL,
 * redirect information, status code, response headers, raw content and the
 * data produced by parsers.
 *
 * @author Yasser Ganjisaffar [lastname at gmail dot com]
 */
public class Page {

  /**
   * The URL of this page.
   */
  protected WebURL url;

  /**
   * Redirection flag: whether this page is a redirect.
   */
  protected boolean redirect;

  /**
   * The URL to which this page will be redirected to.
   */
  protected String redirectedToUrl;

  /**
   * Status code of the page.
   */
  protected int statusCode;

  /**
   * The content of this page in binary format.
   */
  protected byte[] contentData;

  /**
   * The ContentType of this page.
   * For example: "text/html; charset=UTF-8"
   */
  protected String contentType;

  /**
   * The encoding of the content.
   * For example: "gzip"
   */
  protected String contentEncoding;

  /**
   * The charset of the content.
   * For example: "UTF-8"
   */
  protected String contentCharset;

  /**
   * Language of the content.
   */
  private String language;

  /**
   * Headers which were present in the response of the fetch request.
   */
  protected Header[] fetchResponseHeaders;

  /**
   * The parsed data populated by parsers.
   */
  protected ParseData parseData;

  /**
   * Creates a page for the given URL. All other fields keep their default
   * values until they are set or {@link #load(HttpEntity)} is called.
   *
   * @param url the URL of this page
   */
  public Page(WebURL url) {
    this.url = url;
  }

  /**
   * Loads the content of this page from a fetched HttpEntity.
   *
   * <p>The content type, content encoding and content charset are all cleared
   * first and then filled in only when the entity supplies a value, so nothing
   * from a previous call survives. Finally the entity body is read into
   * {@link #contentData}.
   *
   * @param entity HttpEntity
   * @throws Exception when load fails
   */
  public void load(HttpEntity entity) throws Exception {
    contentType = null;
    Header type = entity.getContentType();
    if (type != null) {
      contentType = type.getValue();
    }

    contentEncoding = null;
    Header encoding = entity.getContentEncoding();
    if (encoding != null) {
      contentEncoding = encoding.getValue();
    }

    // Reset like the two fields above; otherwise a charset left over from an
    // earlier load() would be kept whenever this entity reports none.
    contentCharset = null;
    Charset charset = ContentType.getOrDefault(entity).getCharset();
    if (charset != null) {
      contentCharset = charset.displayName();
    }

    contentData = EntityUtils.toByteArray(entity);
  }

  /**
   * @return the URL of this page
   */
  public WebURL getWebURL() {
    return url;
  }

  public void setWebURL(WebURL url) {
    this.url = url;
  }

  /**
   * @return the redirection flag of this page
   */
  public boolean isRedirect() {
    return redirect;
  }

  public void setRedirect(boolean redirect) {
    this.redirect = redirect;
  }

  /**
   * @return the URL to which this page will be redirected to
   */
  public String getRedirectedToUrl() {
    return redirectedToUrl;
  }

  public void setRedirectedToUrl(String redirectedToUrl) {
    this.redirectedToUrl = redirectedToUrl;
  }

  /**
   * @return the status code of this page
   */
  public int getStatusCode() {
    return statusCode;
  }

  public void setStatusCode(int statusCode) {
    this.statusCode = statusCode;
  }

  /**
   * Returns headers which were present in the response of the fetch request
   *
   * @return Header Array, the response headers
   */
  public Header[] getFetchResponseHeaders() {
    return fetchResponseHeaders;
  }

  public void setFetchResponseHeaders(Header[] headers) {
    fetchResponseHeaders = headers;
  }

  /**
   * @return parsed data generated for this page by parsers
   */
  public ParseData getParseData() {
    return parseData;
  }

  public void setParseData(ParseData parseData) {
    this.parseData = parseData;
  }

  /**
   * @return content of this page in binary format.
   */
  public byte[] getContentData() {
    return contentData;
  }

  public void setContentData(byte[] contentData) {
    this.contentData = contentData;
  }

  /**
   * @return ContentType of this page.
   * For example: "text/html; charset=UTF-8"
   */
  public String getContentType() {
    return contentType;
  }

  public void setContentType(String contentType) {
    this.contentType = contentType;
  }

  /**
   * @return encoding of the content.
   * For example: "gzip"
   */
  public String getContentEncoding() {
    return contentEncoding;
  }

  public void setContentEncoding(String contentEncoding) {
    this.contentEncoding = contentEncoding;
  }

  /**
   * @return charset of the content.
   * For example: "UTF-8"
   */
  public String getContentCharset() {
    return contentCharset;
  }

  public void setContentCharset(String contentCharset) {
    this.contentCharset = contentCharset;
  }

  /**
   * @return Language
   */
  public String getLanguage() {
    return language;
  }

  public void setLanguage(String language) {
    this.language = language;
  }
}
0 0