NekoHTML,JDom,httpClient综合应用

来源:互联网 发布:最新锐捷mac客户端 编辑:程序博客网 时间:2024/06/05 06:51

1. 获取html 内容

private String getPageContent(String url){
  String content = "";
  HttpClient httpClient = new HttpClient();
  GetMethod getMethod = new GetMethod( url );
  getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,new DefaultHttpMethodRetryHandler());
  try {
   int statusCode = httpClient.executeMethod(getMethod);
   if (statusCode != HttpStatus.SC_OK) {
    System.err.println("Method failed: "+ getMethod.getStatusLine());
   }
   byte[] responseBody = getMethod.getResponseBody();
   content =new String(responseBody) ;
  } catch (HttpException e) {
   System.out.println("Please check your provided http address!");
   e.printStackTrace();
  } catch (IOException e) {
   e.printStackTrace();
  } finally {
   getMethod.releaseConnection();
  }
  return content;
 }

所用依赖包

   <dependency>
    <groupId>commons-httpclient</groupId>
    <artifactId>commons-httpclient</artifactId>
    <version>3.1</version>
   </dependency>

 

2 NekoHTML 标签补偿应用

 

 

try {

  String content = "<tr><td>test</td><<td>hello</td></tr>
   InputSource inputSource = new InputSource(  new StringReader( content ));
   parser.parse( inputSource );
   org.w3c.dom.Document  doc = parser.getDocument();
  } catch ( Exception e) {
   e.printStackTrace();
  } 

所用依赖包

   <dependency>
    <groupId>net.sourceforge.nekohtml</groupId>
    <artifactId>nekohtml</artifactId>
    <version>1.9.9</version>
   </dependency>

 

3 org.w3c.dom.Document 转 org.jdom.Document

   org.w3c.dom.Document  doc = parser.getDocument();
   DOMBuilder builders = new DOMBuilder();
   org.jdom.Document jDoc = builders.build(doc);

所用依赖包

   <dependency>
    <groupId>org.jdom</groupId>
    <artifactId>jdom</artifactId>
    <version>1.1</version>
   </dependency>

 

4  jdom递归访问

 

 // Start the recursive walk at the root element of the JDOM document.
 process (jDoc.getRootElement());

 

 public  void process(Element element){
   inspect(element);
   List content=element.getContent();//取元素的所有内容
   Iterator iterator=content.iterator();
   while(iterator.hasNext()){
    Object o=iterator.next();
    if(o instanceof Element){//如果是子元素
     Element child=(Element)o;
     process(child);//递归调用
    }else if(o instanceof Comment){//如果是说明
     Comment c=(Comment)o;
     System.out.println(c.getText());
     // System.out.println();
    }
   }
  }
 
  /**
   * Per-element hook called by {@code process} for every element it visits.
   * The body is an empty placeholder for element-specific handling.
   *
   * @param element the element currently being visited
   */
  public   void inspect(Element element){ // element is an org.jdom.Element

    // do something with the element here

    //

}

原创粉丝点击