javax w3c 网页解析(一)

来源:互联网 发布:双网络设置 编辑:程序博客网 时间:2024/06/10 14:09

package test;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import javax.swing.text.AttributeSet;
import javax.swing.text.Document;
import javax.swing.text.EditorKit;
import javax.swing.text.SimpleAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;

 


/**
 * Command-line demo that parses an HTML page with Swing's {@link HTMLEditorKit} and prints
 * the {@code href} attribute of every anchor ({@code <a>}) tag found in it.
 */
public class javahtml {

    /** Page parsed by {@link #main(String[])} when no argument is supplied. */
    private static final String DEFAULT_URI = "http://hexun.com/kangojian/default.html";

    /**
     * Parses a page and prints one {@code href} value per line to standard output.
     *
     * @param args optional; {@code args[0]} is the URL or local file name to parse.
     *             When absent, {@link #DEFAULT_URI} is used.
     * @throws Exception if the page cannot be fetched, read, or parsed
     */
    public static void main(String[] args) throws Exception {
        String uri = (args != null && args.length > 0) ? args[0] : DEFAULT_URI;

        EditorKit kit = new HTMLEditorKit();
        Document doc = kit.createDefaultDocument();

        // Bytes are decoded by the Reader, so ask the parser to ignore any charset
        // directive embedded in the page itself.
        doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);

        // Close the reader (and the socket/file behind it) as soon as parsing is done.
        try (Reader rd = getReader(uri)) {
            kit.read(rd, doc, 0);
        }

        // The parsed HTML now lives in the document; walk over every <a> tag.
        HTMLDocument.Iterator it = ((HTMLDocument) doc).getIterator(HTML.Tag.A);
        for (; it.isValid(); it.next()) {
            // The iterator only promises an AttributeSet, so do not cast to a concrete class.
            AttributeSet attrs = it.getAttributes();
            if (attrs == null) {
                continue;
            }
            Object href = attrs.getAttribute(HTML.Attribute.HREF);
            // Anchors without an href (e.g. named anchors) have nothing to print.
            if (href != null) {
                System.out.println(href);
            }
        }
    }

    /**
     * Opens a character reader on HTML content.
     *
     * <p>If {@code uri} begins with {@code http:} or {@code https:} (case-insensitive) it is
     * fetched as a URL and decoded with the charset named in the response's
     * {@code Content-Type} header, falling back to UTF-8. Otherwise {@code uri} is treated as
     * a local file name and decoded as UTF-8.
     *
     * @param uri URL or local file name
     * @return a reader over the content; the caller must close it
     * @throws IOException if the URL cannot be opened or the file does not exist
     */
    static Reader getReader(String uri) throws IOException {
        if (isHttpUri(uri)) {
            // Retrieve from the Internet.
            HttpURLConnection conn = (HttpURLConnection) new URL(uri).openConnection();
            return new InputStreamReader(conn.getInputStream(), charsetOf(conn.getContentType()));
        }
        // Retrieve from a local file.
        return new InputStreamReader(new FileInputStream(uri), StandardCharsets.UTF_8);
    }

    /** Returns whether {@code uri} starts with the {@code http:} or {@code https:} scheme. */
    private static boolean isHttpUri(String uri) {
        return uri.regionMatches(true, 0, "http:", 0, 5)
                || uri.regionMatches(true, 0, "https:", 0, 6);
    }

    /**
     * Extracts the charset from a {@code Content-Type} header value such as
     * {@code text/html; charset=GB2312}; returns UTF-8 when the header is missing, has no
     * charset parameter, or names a charset this JVM does not support.
     */
    private static Charset charsetOf(String contentType) {
        if (contentType != null) {
            for (String rawParam : contentType.split(";")) {
                String param = rawParam.trim();
                if (!param.regionMatches(true, 0, "charset=", 0, 8)) {
                    continue;
                }
                String name = param.substring(8).trim();
                // The value may be a quoted string.
                if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                    name = name.substring(1, name.length() - 1);
                }
                try {
                    return Charset.forName(name);
                } catch (IllegalArgumentException ignored) {
                    // Malformed or unsupported charset name: use the default below.
                    break;
                }
            }
        }
        return StandardCharsets.UTF_8;
    }
}

 

原创粉丝点击